From 36755be9cc4f5e287e53be4dc0592ee00137a43f Mon Sep 17 00:00:00 2001
From: Jianbing Dong <jianbingd@nvidia.com>
Date: Thu, 23 Oct 2025 19:43:45 -0700
Subject: [PATCH 01/17] add fused_linear_cross_entropy interface

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 megatron/core/fusions/fused_linear_cross_entropy.py

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
new file mode 100644
index 00000000000..6f33bfaf72e
--- /dev/null
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -0,0 +1,74 @@
+"""
+Linear Cross Entropy API
+Fuse cross entropy with linear layer.
+"""
+
+import typing
+import torch
+
+class LinearCrossEntropy(torch.autograd.Function):
+    """
+    This class implements a custom autograd function for linear and cross entropy, whose equivalent logic in PyTorch is:
+        ```python
+        def torch_entropy(hidden, weight, labels):
+            logits = torch.matmul(hidden, weight)
+            logprobs = torch.nn.functional.cross_entropy(logits, labels)
+            return logprobs
+        ```
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: typing.Optional[str] = "mean",
+        dist_process_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+        ignore_index: typing.Optional[int] = -100,
+    ) -> torch.Tensor:
+        """
+        The forward pass of the Linear Cross Entropy.
+        If dist_process_group is passed for distributed loss calculation,
+        the weight tensor to each distributed rank should be (*, vocab_size / world_size).
+        Note that each of the ranks should get equal shards along the vocab_size dimension.
+
+        Args:
+            hidden (torch.Tensor): The input tensor of shape (num_tokens, hidden_size).
+            weight (torch.Tensor): The weight tensor of shape (hidden_size, vocab_size).
+            labels (torch.Tensor): The labels tensor of shape (num_tokens,).
+            reduction (str, optional): The reduction method. Defaults to "mean", and can be
+                one of "none", "sum", "mean".
+        Returns:
+            logprobs (torch.Tensor): The cross entropy.
+        """
+        with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
+            logprobs = torch.empty(
+                hidden.view(-1, hidden.shape[-1]).shape[0], 
+                device=hidden.device, 
+                dtype=torch.float32)
+
+        return logprobs
+
+    @staticmethod
+    def backward(ctx, dlogprobs: torch.Tensor) -> typing.List[torch.Tensor]:
+        """
+        The backward pass of the Linear Cross Entropy.
+        Args:
+            dlogprobs (torch.Tensor): The gradient of the cross entropy.
+        Returns:
+            dhidden (torch.Tensor): The gradient of the hidden.
+            dweight (torch.Tensor): The gradient of the weight.
+        """
+        with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
+            d_hidden = torch.empty(hidden.shape, device=hidden.device, dtype=hidden.dtype)
+            d_weight = torch.empty(weight.shape, device=weight.device, dtype=weight.dtype)
+        return d_hidden, d_weight, None, None, None, None
+
+
+linear_cross_entropy = LinearCrossEntropy.apply
+
+__all__ = [
+    "linear_cross_entropy",
+    "LinearCrossEntropy",
+]

From 5781d3dca80c510c3f27cda3e53462f160341567 Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Thu, 30 Oct 2025 17:07:47 +0800
Subject: [PATCH 02/17] Merge pull request #1 from
 shjwudp/jianbinc/fused_linear_ce

init fused linear cross-entropy interface
---
 .../common/language_module/language_module.py |  58 +++++-
 megatron/core/models/gpt/gpt_model.py         |  40 +++-
 megatron/core/models/mamba/mamba_model.py     |  18 +-
 .../core/transformer/transformer_config.py    |   3 +
 megatron/training/arguments.py                |   4 +
 tests/unit_tests/a2a_overlap/utils.py         |   9 +-
 .../test_fused_linear_cross_entropy.py        | 189 ++++++++++++++++++
 7 files changed, 303 insertions(+), 18 deletions(-)
 create mode 100644 tests/unit_tests/fusions/test_fused_linear_cross_entropy.py

diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index de2ecfb8011..b8e39693b22 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import logging
 import os
-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
 import torch
 from torch import Tensor
@@ -14,6 +14,7 @@
 except:
     te_parallel_cross_entropy = None
 from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
+from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
 from megatron.core.pipeline_parallel.utils import (
     is_pp_first_stage,
     is_pp_last_stage,
@@ -125,6 +126,61 @@ def check_and_set_env_variable(
             check_and_set_env_variable("NVTE_FUSED_ATTN", 1, AttnBackend.auto)
             check_and_set_env_variable("NVTE_UNFUSED_ATTN", 1, AttnBackend.auto)
 
+    def compute_language_model_loss_without_logits(
+        self,
+        hidden: Tensor,
+        labels: Optional[Tensor],
+        weight: Tensor = None,
+        column_parallel_linear: torch.nn.Module = None,
+        col_linear_kwargs: Dict[str, Any] = {},
+        reduction: Optional[str] = "mean",
+        ignore_index: Optional[int] = -100,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """Computes the language model logits and loss (Cross entropy across vocabulary)
+
+        Args:
+            hidden (Tensor): The hidden states from the transformer model
+            labels (Optional[Tensor]): The labels of dimension [batch size, seq length]
+            weight (Tensor): The weight tensor of shape [vocab size, hidden size].
+                Required if using fused linear cross entropy.
+            column_parallel_linear (torch.nn.Module): The column parallel linear
+                layer to use for computing logits when not using fused linear cross entropy.
+            col_linear_kwargs (Dict[str, Any]): Additional kwargs for column parallel linear layer
+            reduction (Optional[str]): The reduction method. Defaults to "mean", and can be
+                one of "none", "sum", "mean".
+            ignore_index (Optional[int]): The index to ignore in the loss calculation.
+                Defaults to -100.
+
+        Returns:
+            Tensor: Loss tensor of dimensions [batch size, sequence_length].
+        """
+        if self.config.linear_cross_entropy_fusion:
+
+            assert (
+                weight is not None
+            ), "weight cannot be None when using fused linear cross entropy."
+            # [b s] => [s b]
+            labels = labels.transpose(0, 1).contiguous()
+            loss = linear_cross_entropy(
+                hidden,
+                weight,
+                labels,
+                dist_process_group=self.pg_collection.tp,
+                reduction=reduction,
+                ignore_index=ignore_index,
+            )
+
+            # [s b] => [b, s]
+            loss = loss.transpose(0, 1).contiguous()
+            return loss
+        else:
+            assert (
+                column_parallel_linear is not None
+            ), "column_parallel_linear cannot be None when not using fused linear cross entropy."
+            logits, _ = column_parallel_linear(hidden, **col_linear_kwargs)
+
+            return self.compute_language_model_loss(labels, logits)
+
     def compute_language_model_loss(self, labels: Tensor, logits: Tensor) -> Tensor:
         """Computes the language model loss (Cross entropy across vocabulary)
 
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index a1156012106..b48dcec2078 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -568,18 +568,24 @@ def _postprocess(
                 # if loss_mask is not provided, use all ones as loss_mask
                 loss_mask = torch.ones_like(mtp_labels)
             for mtp_layer_number in range(self.config.mtp_num_layers):
-                # output
-                mtp_logits, _ = self.output_layer(
-                    hidden_states_list[mtp_layer_number + 1],
-                    weight=output_weight,
-                    runtime_gather_output=runtime_gather_output,
-                )
                 # Calc loss for the current Multi-Token Prediction (MTP) layers.
                 mtp_labels, _ = roll_tensor(mtp_labels, shifts=-1, dims=-1, cp_group=self.cp_group)
                 loss_mask, num_tokens = roll_tensor(
                     loss_mask, shifts=-1, dims=-1, cp_group=self.cp_group
                 )
-                mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits)
+
+                # Compute mtp loss without storing logits to save memory.
+                mtp_loss = self.compute_language_model_loss_without_logits(
+                    hidden_states_list[mtp_layer_number + 1],
+                    labels=mtp_labels,
+                    weight=output_weight,
+                    column_parallel_linear=self.output_layer,
+                    col_linear_kwargs={
+                        'weight': output_weight,
+                        'runtime_gather_output': runtime_gather_output,
+                    },
+                )
+
                 mtp_loss = loss_mask * mtp_loss
                 if self.training:
                     # TODO(shifangx): remove the use of parallel_state here
@@ -626,9 +632,12 @@ def _postprocess(
                     hidden_states.squeeze(1).unsqueeze(0)
                 ).unsqueeze(1)
 
-        logits, _ = self.output_layer(
-            hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
-        )
+        if has_config_logger_enabled(self.config) or labels is not None:
+            logits, _ = self.output_layer(
+                hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
+            )
+        else:
+            logits = None
 
         # Restore sequence parallel execution to the output layer if necessary.
         if sequence_parallel_override:
@@ -655,7 +664,16 @@ def _postprocess(
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
-        loss = self.compute_language_model_loss(labels, logits)
+        loss = self.compute_language_model_loss_without_logits(
+            hidden_states,
+            labels=labels,
+            weight=output_weight,
+            column_parallel_linear=self.output_layer,
+            col_linear_kwargs={
+                'weight': output_weight,
+                'runtime_gather_output': runtime_gather_output,
+            },
+        )
 
         return loss
 
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index fb3df5e23f2..533f4efc257 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -247,14 +247,22 @@ def forward(
         if in_inference_mode and inference_context.materialize_only_last_token_logits:
             hidden_states = hidden_states[-1, :, :].unsqueeze(0)
 
-        logits, _ = self.output_layer(
-            hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
-        )
-
         if labels is None:
+            logits, _ = self.output_layer(
+                hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
+            )
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
-        loss = self.compute_language_model_loss(labels, logits)
+        loss = self.compute_language_model_loss_without_logits(
+            hidden_states,
+            labels,
+            weight=output_weight,
+            column_parallel_linear=self.output_layer,
+            col_linear_kwargs={
+                "weight": output_weight,
+                "runtime_gather_output": runtime_gather_output,
+            },
+        )
 
         return loss
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index aab137b6430..55de1e07181 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -327,6 +327,9 @@ class TransformerConfig(ModelParallelConfig):
     fused_single_qkv_rope: bool = False
     """If set, avoid splitting QKV before ROPE forward and avoid concatenating ROPE dgrads."""
 
+    linear_cross_entropy_fusion: bool = False
+    """If True, fuses the linear layer and cross entropy loss calculation."""
+
     ####################
     # activation recomputation
     ####################
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 507c21e6883..439825aaf57 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -2254,6 +2254,10 @@ def _add_training_args(parser):
                        dest='bias_swiglu_fusion')
     group.add_argument('--use-fused-weighted-squared-relu', action='store_true',
                        help='Use fused weighted squared relu when using MoE.')
+    group.add_argument('--no-linear-cross-entropy-fusion', action='store_false',
+                       help='Disable fusion of linear layer and cross entropy '
+                       'loss calculation.',
+                       dest='linear_cross_entropy_fusion')
     group.add_argument('--no-bias-dropout-fusion', action='store_false',
                        help='Disable bias and dropout fusion.',
                        dest='bias_dropout_fusion')
diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py
index 7db4256a849..994998337d8 100644
--- a/tests/unit_tests/a2a_overlap/utils.py
+++ b/tests/unit_tests/a2a_overlap/utils.py
@@ -237,7 +237,14 @@ def get_valid_fp8_flags():
     recipes = []
     valid_flags = []
     if is_te_min_version("2.3.0.dev0"):
-        recipes.append(Fp8Recipe.blockwise)
+        props = torch.cuda.get_device_properties(torch.cuda.current_device())
+        compute_capability = (props.major, props.minor)
+        if (
+            compute_capability >= (9, 0)
+            and compute_capability < (10, 0)
+            and float(torch.version.cuda) >= 12.9
+        ):
+            recipes.append(Fp8Recipe.blockwise)
         recipes.append(Fp8Recipe.tensorwise)
 
     for fp8_type in fp8_types:
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
new file mode 100644
index 00000000000..4d0ae55b666
--- /dev/null
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+import contextlib
+from contextlib import ExitStack
+
+import numpy as np
+import pytest
+import torch
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.distributed import DistributedSampler
+
+import megatron.core.parallel_state as ps
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_decoder_block_spec,
+    get_gpt_mtp_block_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from tests.unit_tests.a2a_overlap.utils import (
+    deterministic_mode,
+    get_test_config,
+    get_valid_fp8_flags,
+    get_valid_token_dispatcher_types,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+class MockDataset(Dataset):
+    """
+    Mock dataset for torchtitan GPT training tests
+    Generates synthetic tokenized sequences on-the-fly
+    """
+
+    def __init__(
+        self,
+        num_samples=10000,
+        micro_batch_size=4,
+        sequence_length=2048,
+        vocab_size=128256,
+        seed=42,
+    ):
+        """
+        Initialize mock dataset
+
+        Args:
+            num_samples: Total number of samples
+            sequence_length: Length of each sequence
+            vocab_size: Size of vocabulary
+            seed: Random seed for reproducibility
+        """
+        self.num_samples = num_samples
+        self.micro_batch_size = micro_batch_size
+        self.sequence_length = sequence_length
+        self.vocab_size = vocab_size
+        self.seed = seed
+
+        # Set numpy seed for deterministic generation
+        np.random.seed(seed)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """
+        Generate a single training sample
+
+        Returns:
+            dict with 'tokens' and 'labels'
+        """
+        # Use idx as seed for reproducible but varied samples
+        rng = np.random.RandomState(self.seed + idx)
+
+        # Generate random token sequence
+        tokens = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64)
+
+        # Labels are tokens shifted by 1 (next token prediction)
+        labels = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64)
+
+        return {
+            'input_ids': torch.from_numpy(tokens.copy()),
+            'labels': torch.from_numpy(labels.copy()),
+            "attention_mask": torch.ones(
+                (1, self.sequence_length, self.sequence_length), dtype=bool
+            ),
+        }
+
+
+def build_model(config):
+    max_seq_len = 300
+
+    # build layer spec
+    transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True)
+    mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True)
+
+    # build model
+    gpt_model = GPTModel(
+        config=config,
+        transformer_layer_spec=transformer_layer_spec,
+        mtp_block_spec=mtp_block_spec,
+        vocab_size=100,
+        pre_process=True,
+        post_process=True,
+        max_sequence_length=max_seq_len,
+    )
+    return gpt_model
+
+
+# Define a reusable context manager
+@contextlib.contextmanager
+def init_model_parallel(tp=1, pp=1, ep=1):
+    try:
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
+            expert_model_parallel_size=ep,
+        )
+        yield
+    finally:
+        Utils.destroy_model_parallel()
+
+
+def init_gpt_dataloader(
+    dp_group, micro_batch_size=1, vocab_size=50257, sequence_length=128, batch_size=8
+):
+    dataset = MockDataset(
+        num_samples=1000,
+        micro_batch_size=micro_batch_size,
+        sequence_length=sequence_length,
+        vocab_size=vocab_size,
+        seed=42,
+    )
+    sampler = DistributedSampler(dataset, num_replicas=dp_group.size(), rank=dp_group.rank())
+    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
+    return dataloader
+
+
+class TestFusedLinearCrossEntropy:
+
+    @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags())
+    @pytest.mark.parametrize("mtp_layers", [0, 1])
+    @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types())
+    @pytest.mark.parametrize("layer_num", [2])
+    def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
+        with ExitStack() as stack:
+            gpu_count = torch.cuda.device_count()
+            tp = min(2, gpu_count)
+            ep = gpu_count // tp
+            stack.enter_context(init_model_parallel(tp=tp, ep=ep))
+            stack.enter_context(deterministic_mode())
+
+            # create TransformerConfig
+            extra_kwargs = {
+                "moe_token_dispatcher_type": dispatcher_type,
+                "sequence_parallel": tp > 1,
+                "tensor_model_parallel_size": tp,
+            }
+            if dispatcher_type == "flex":
+                extra_kwargs["moe_enable_deepep"] = True
+                extra_kwargs["moe_router_dtype"] = "fp32"
+            if fp8_flag is not None:
+                extra_kwargs["fp8"] = fp8_flag[0]
+                extra_kwargs["fp8_recipe"] = fp8_flag[1]
+            if mtp_layers > 0:
+                extra_kwargs["mtp_num_layers"] = mtp_layers
+                extra_kwargs["mtp_loss_scaling_factor"] = 1.1
+
+            # build config
+            config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs)
+            config.expert_model_parallel_size = ep
+
+            # build model
+            gpt_model = build_model(config)
+            gpt_model.cuda()
+
+            dataloader = init_gpt_dataloader(
+                ps.get_data_parallel_group(),
+                vocab_size=gpt_model.vocab_size,
+                micro_batch_size=1,
+                sequence_length=gpt_model.max_sequence_length,
+                batch_size=4,
+            )
+            # for batch in dataloder:
+            for batch in dataloader:
+                batch["position_ids"] = torch.arange(
+                    gpt_model.max_sequence_length, dtype=torch.int64
+                )
+                batch = {k: v.cuda() for k, v in batch.items()}
+                gpt_model.zero_grad()
+                output = gpt_model(**batch)
+                loss = output.sum()
+                loss.backward()

From 289847f3365c9f89096ca40bc8003e58e0602139 Mon Sep 17 00:00:00 2001
From: Jianbing-D <69858819+Jianbing-D@users.noreply.github.com>
Date: Thu, 6 Nov 2025 12:37:36 +0800
Subject: [PATCH 03/17] Feat linear cross entropy kernel dev (#2)

* add forward-mainloop and bwd_partial_dlogits kernel

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

* skip TestFusedLinearCrossEntropyOnGptModel for single GPU

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

* added unit-test for linear_cross_entropy on dp

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

---------

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     | 218 ++++-
 .../blackwell/bwd_partial_dlogits.py          | 926 ++++++++++++++++++
 .../linear_cross_entropy/blackwell/entry.py   | 385 ++++++++
 .../blackwell/fwd_mainloop.py                 | 892 +++++++++++++++++
 .../linear_cross_entropy/blackwell/triton.py  | 303 ++++++
 .../fusions/linear_cross_entropy/utils.py     |  35 +
 .../test_fused_linear_cross_entropy.py        | 380 ++++++-
 7 files changed, 3118 insertions(+), 21 deletions(-)
 create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
 create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
 create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
 create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
 create mode 100644 megatron/core/fusions/linear_cross_entropy/utils.py

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index 6f33bfaf72e..e3fccc92a4d 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -6,6 +6,24 @@
 import typing
 import torch
 
+def _setup_platform():
+    """
+    Setup the platform for the Linear Cross Entropy.
+    """
+    assert torch.cuda.is_available(), "CUDA is not available"
+    device = torch.cuda.current_device()
+    cc = torch.cuda.get_device_capability(device)
+    
+    global forward_func, backward_func
+    if cc[0] == 10:
+        # from linear_cross_entropy.blackwell import entry as platform
+        from .linear_cross_entropy.blackwell import entry as platform
+        forward_func = platform.forward
+        backward_func = platform.backward
+    else:
+        raise ValueError(f"Unsupported architecture: {cc[0]}")
+_setup_platform()
+
 class LinearCrossEntropy(torch.autograd.Function):
     """
     This class implements a custom autograd function for linear and cross entropy, whose equivalent logic in PyTorch is:
@@ -16,59 +34,221 @@ def torch_entropy(hidden, weight, labels):
             return logprobs
         ```
     """
-
     @staticmethod
     def forward(
         ctx,
         hidden: torch.Tensor,
         weight: torch.Tensor,
         labels: torch.Tensor,
+        tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
         reduction: typing.Optional[str] = "mean",
-        dist_process_group: typing.Optional[torch.distributed.ProcessGroup] = None,
         ignore_index: typing.Optional[int] = -100,
     ) -> torch.Tensor:
         """
         The forward pass of the Linear Cross Entropy.
-        If dist_process_group is passed for distributed loss calculation,
-        the weight tensor to each distributed rank should be (*, vocab_size / world_size).
+        If tp_group is not None, the weight tensor to each TP rank should be (vocab_size // world_size, dim).
         Note that each of the ranks should get equal shards along the vocab_size dimension.
 
         Args:
-            hidden (torch.Tensor): The input tensor of shape (num_tokens, hidden_size).
-            weight (torch.Tensor): The weight tensor of shape (hidden_size, vocab_size).
-            labels (torch.Tensor): The labels tensor of shape (num_tokens,).
-            reduction (str, optional): The reduction method. Defaults to "mean", and can be
-                one of "none", "sum", "mean".
+            @param hidden: the input tensor with shape (num_tokens, dim)
+            @param weight: the lm_head weight tensor with shape (vocab_size, dim)
+            @param labels: the labels tensor with shape (num_tokens,)
+            @param tp_group: the distributed process group for TP.
+            @param reduction: Default to "mean", and can be one of "none", "sum", "mean".
+            @param ignore_index: The index to ignore. Default to -100.
         Returns:
-            logprobs (torch.Tensor): The cross entropy.
+            @return: logprobs with shape
+                - either (num_tokens,) when reduction is "none"
+                - or (1,) when reduction is "mean" or "sum"
+
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
-            logprobs = torch.empty(
-                hidden.view(-1, hidden.shape[-1]).shape[0], 
-                device=hidden.device, 
-                dtype=torch.float32)
+            logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size = (
+                forward_func(
+                    hidden, weight, labels,
+                    tp_group, 
+                    reduction,
+                    ignore_index,
+                )
+            )
+            ctx.save_for_backward(
+                hidden, weight, labels,
+                _maximum, _acc, _num_valid_tokens,
+            )
+            ctx.tp_group = tp_group
+            ctx.ignore_index = ignore_index
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
 
         return logprobs
+            
 
     @staticmethod
-    def backward(ctx, dlogprobs: torch.Tensor) -> typing.List[torch.Tensor]:
+    def backward(
+        ctx,
+        dlogprobs: torch.Tensor
+    ) -> typing.List[torch.Tensor]:
         """
         The backward pass of the Linear Cross Entropy.
         Args:
-            dlogprobs (torch.Tensor): The gradient of the cross entropy.
+            dlogprobs (torch.Tensor): The gradient of the cross entropy, with shape
+                - either (num_tokens,) when reduction is "none"
+                - or (1,) when reduction is "mean" or "sum"
         Returns:
             dhidden (torch.Tensor): The gradient of the hidden.
             dweight (torch.Tensor): The gradient of the weight.
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
-            d_hidden = torch.empty(hidden.shape, device=hidden.device, dtype=hidden.dtype)
-            d_weight = torch.empty(weight.shape, device=weight.device, dtype=weight.dtype)
+            (hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
+
+            tp_group = ctx.tp_group
+            ignore_index = ctx.ignore_index
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+
+            d_hidden, d_weight = backward_func(
+                dlogprobs,
+                hidden,
+                weight,
+                labels,
+                _maximum,
+                _accu,
+                _num_valid_tokens,
+                reduction,
+                ignore_index,
+                tp_group,
+                tp_rank,
+                tp_world_size
+            )
+
         return d_hidden, d_weight, None, None, None, None
 
 
-linear_cross_entropy = LinearCrossEntropy.apply
+def linear_cross_entropy(
+    hidden: torch.Tensor,
+    weight: torch.Tensor,
+    labels: torch.Tensor,
+    tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+    reduction: typing.Optional[str] = "mean",
+    ignore_index: typing.Optional[int] = -100,
+) -> torch.Tensor:
+    """
+    helper function for linear cross entropy.
+    """
+    _impl = LinearCrossEntropy.apply
+    return _impl(hidden, weight, labels, tp_group, reduction, ignore_index)
 
 __all__ = [
     "linear_cross_entropy",
     "LinearCrossEntropy",
 ]
+
+
+# FIXME: move this unit-test to other place
+if __name__ == "__main__":
+    def test_dp():
+        # batch = 4
+        # seqlen = 2035
+        # vocab_size = 152063
+        # dim = 4096
+        batch = 1
+        seqlen = 80
+        vocab_size = 125
+        dim = 64
+        dtype = torch.float16
+        reduction = "none"
+
+        hidden = (
+            torch.empty((batch, seqlen, dim), device="cuda", dtype=dtype)
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocab_size, dim), device="cuda", dtype=dtype)
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+
+        labels = torch.randint(0, vocab_size, (batch, seqlen), device="cuda", dtype=torch.long)
+
+        logits = hidden @ weight.T
+        # print(logits)
+
+        _logits = logits.to(torch.float32)
+        _logits_view = _logits.view(-1, _logits.shape[-1])
+        maximum = _logits_view.max(dim=-1, keepdim=False).values
+        accu = torch.exp(_logits_view - maximum.unsqueeze(-1)).sum(dim=-1)
+        
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]),
+            labels.view(-1),
+            reduction=reduction,
+        )
+        
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels, 
+            reduction=reduction,
+        )
+
+        print(custom_logprobs)
+        print(logprobs)
+
+        # backward
+        g_logprobs = torch.rand_like(logprobs, dtype=dtype, device="cuda")
+
+        (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+            (logprobs,), 
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        # first way to do backward
+        if reduction == "mean":
+            _g_logprobs = torch.broadcast_to(g_logprobs / (batch * seqlen), (batch * seqlen,))
+        elif reduction == "sum":
+            _g_logprobs = torch.broadcast_to(g_logprobs, (batch * seqlen,))
+        else:
+            _g_logprobs = g_logprobs
+
+        intermediate = _logits_view - maximum.unsqueeze(-1)
+        exp_logits = torch.exp(intermediate)
+        d_logits = exp_logits / accu.unsqueeze(-1)
+        d_logits *= _g_logprobs.unsqueeze(-1)
+        # mask = torch.arange(vocab_size, dtype=torch.long, device="cuda")
+        # mask = torch.broadcast_to(mask, (batch * seqlen, vocab_size))
+        # mask = (labels.view(-1).unsqueeze(-1) == mask)
+
+        one_hot = torch.zeros_like(_logits_view)
+        one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
+
+        d_logits += one_hot * -_g_logprobs.unsqueeze(-1)
+        d_logits = d_logits.to(hidden.dtype)
+        # print(d_logits)
+        
+        d_hidden = d_logits @ weight
+        d_weight = d_logits.T @ hidden.view(-1, dim)
+
+        # print("first way to do backward")
+        # print(d_hidden.view(hidden.shape))
+        # print(d_torch_hidden)
+        # print(d_weight)
+        # print(d_torch_weight)
+        # print(d_logits)
+
+        (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+            (custom_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+        # print(d_torch_hidden)
+        # print(d_custom_hidden)
+        print(d_torch_weight)
+        print(d_custom_weight)
+
+    torch.manual_seed(42)
+
+    test_dp()
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
new file mode 100644
index 00000000000..2d5da82ab6a
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -0,0 +1,926 @@
+from typing import Optional, Type, Tuple, Union
+import cuda.bindings.driver as cuda
+
+import torch
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+from cutlass.cute.runtime import from_dlpack
+
+
+SM100_TMEM_CAPACITY_COLUMNS: int = 512
+
+def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
+    return pipeline.CooperativeGroup(
+        pipeline.Agent.Thread, size, 
+        alignment=alignment if alignment is not None else size)
+
+
+class BwdPartialDlogits:
+    """
+    This class implements the backward kernel for partial d_logits.
+    """
+    def __init__(self,
+                 reduction: int,
+                 acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+                 use_2cta_instrs: bool = False,
+                 mma_tiler_mn: Tuple[int, int] = (128, 256),
+                 rank: int = 0,
+                 vocab_per_split: int = 512):
+        self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
+        self.acc_dtype = acc_dtype
+        self.use_2cta_instrs = use_2cta_instrs
+        self.mma_tiler = (*mma_tiler_mn, 1)
+        self.rank = rank
+        self.vocab_per_split = vocab_per_split
+
+        self.cta_group = (
+            tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+        )
+        self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+        self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+        self.threads_per_warp: int = 32
+
+        self.epi_warp_ids = (0, 1, 2, 3)
+        self.load_warp_ids = 4
+        self.mma_warp_ids = 5
+        self.empty_warp_ids = (6, 7)
+
+        self.threads_per_cta: int = self.threads_per_warp * len(
+            (*self.epi_warp_ids,
+             self.load_warp_ids,
+             self.mma_warp_ids,
+             *self.empty_warp_ids)
+        )
+        self.cta_sync_barrier = pipeline.NamedBarrier(
+            barrier_id = 1,
+            num_threads = self.threads_per_cta
+        )
+
+        self.buffer_align_bytes: int = 1024
+        self.num_regs_other: int = 32
+        self.num_regs_epi: int = 192
+
+    def _compute_grid(
+        self,
+        problem_mnk: Tuple[int, int, int],
+        cluster_shape_mn: Tuple[int, int],
+        cta_tiler: Tuple[int, int, int],
+    ) -> Tuple[int, int, int]:
+        cluster_shape_mnk = (*cluster_shape_mn, 1)
+
+        grid = cute.round_up(
+            (
+                cute.ceil_div(problem_mnk[0], cta_tiler[0]),
+                cute.ceil_div(self.vocab_per_split, cta_tiler[1]),
+                1,
+            ),
+            cluster_shape_mnk
+        )
+        return grid
+
+    def _compute_stages(
+        self,
+        tiled_mma: cute.TiledMma,
+        mma_tiler: Tuple[int, int, int],
+        a_dtype: Type[cutlass.Numeric],
+        b_dtype: Type[cutlass.Numeric],
+    ):
+        num_acc_stage = 1
+        num_ab_stage = 4
+        num_epi_stage_per_tile = 4
+        return num_acc_stage, num_ab_stage, num_epi_stage_per_tile
+
+    def _setup_attributes(
+        self,
+        tiled_mma: cute.TiledMma,
+        a_dtype: Type[cutlass.Numeric],
+        b_dtype: Type[cutlass.Numeric],
+    ):
+        self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+        self.cluster_layout_vmnk = cute.tiled_divide(
+            cute.make_layout(self.cluster_shape_mnk),
+            (tiled_mma.thr_id.shape,),
+        )
+
+        mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+        # it requires k-mode to be 128B aligned
+        mma_inst_tile_k: int = 4
+        self.mma_tiler = (
+            self.mma_tiler[0],
+            self.mma_tiler[1],
+            mma_inst_shape_k * mma_inst_tile_k
+        )
+        
+        self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile =\
+            self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+        self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+        assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
+
+        self.cta_tile_shape_mnk = (
+            self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+            self.mma_tiler[1],
+            self.mma_tiler[2]
+        )
+
+    @cute.kernel
+    def kernel(
+        self,
+        split_idx: cutlass.Int32,
+        tiled_mma: cute.TiledMma,
+        tma_atom_a: cute.CopyAtom,
+        mA: cute.Tensor,
+        tma_atom_b: cute.CopyAtom,
+        mB: cute.Tensor,
+        mLabels: cute.Tensor,
+        mDlogprobs: cute.Tensor,
+        mMaximum: cute.Tensor,
+        mAccu: cute.Tensor,
+        mDlogits_partial: cute.Tensor,
+        scalarNumValidTokens: cute.Pointer,
+        ignore_index: cutlass.Int64,
+        a_smem_layout_staged: cute.ComposedLayout,
+        b_smem_layout_staged: cute.ComposedLayout,
+        cluster_layout_vmnk: cute.Layout,
+        problem_mnk: Tuple[int, int, int],
+    ) -> None:
+        warp_idx = cute.arch.make_warp_uniform(
+            cute.arch.warp_idx()
+        )
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, bidy, _ = cute.arch.block_idx()
+        # FIXME: block swizzling applied here
+        pidm, pidn = bidx, bidy
+
+        # FIXME: if 2 CTAs, modify here
+        cta_rank_in_cluster = 0
+        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(
+            cta_rank_in_cluster
+        )
+
+        # prefetch tma descriptors
+        if warp_idx == self.load_warp_ids:
+            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+        smem = utils.SmemAllocator()
+        storage = smem.allocate(self.shared_storage)
+
+        ab_pipeline = pipeline.PipelineTmaUmma.create(
+            num_stages=self.num_ab_stage,
+            producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+            consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+            tx_count=self.tma_copy_ab_bytes,
+            barrier_storage=storage.load_ab_mbar_ptr.data_ptr()
+        )
+        ab_producer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Producer,
+            self.num_ab_stage
+        )
+        ab_consumer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Consumer,
+            self.num_ab_stage
+        )
+
+        mma_pipeline = pipeline.PipelineUmmaAsync.create(
+            num_stages=self.num_acc_stage,
+            producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+            consumer_group=make_thread_cooperative_group(self.threads_per_warp * len(self.epi_warp_ids)),
+            barrier_storage=storage.mma_mbar_ptr.data_ptr()
+        )
+        mma_producer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Producer,
+            self.num_acc_stage
+        )
+        mma_consumer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Consumer,
+            self.num_acc_stage
+        )
+
+        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+        if warp_idx == self.empty_warp_ids[0]:
+            with cute.arch.elect_one():
+                cute.arch.mbarrier_init(
+                    tmem_dealloc_mbar_ptr,
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                )
+                cute.arch.mbarrier_init_fence()
+
+        # -------- tensor partition ------------ #
+        # swizzle o [(tileM, tileK), loopM, loopK, stage]
+        sA = storage.sA.get_tensor(
+            a_smem_layout_staged.outer,
+            swizzle=a_smem_layout_staged.inner
+        )
+        # swizzle o [(tileN, tileK), loopN, loopK, stage]
+        sB = storage.sB.get_tensor(
+            b_smem_layout_staged.outer,
+            swizzle=b_smem_layout_staged.inner
+        )
+        
+        # FIXME: if 2 CTAs, modify here
+        thr_mma = tiled_mma.get_slice(0)
+        # [MMA, loopM, loopK, stage]
+        tCsA = thr_mma.make_fragment_A(sA)
+        # [MMA, loopN, loopK, stage]
+        tCsB = thr_mma.make_fragment_B(sB)
+        
+        # [tileM, tileK, loopK]
+        gA = cute.local_tile(
+            mA,
+            (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
+            (pidm, None)
+        )
+        # [vocab_per_split, dim]
+        mB_n = cute.local_tile(
+            mB,
+            (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])),
+            (split_idx, 0)
+        )
+        # [tileN, tileK, loopK]
+        gB = cute.local_tile(
+            mB_n,
+            (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
+            (pidn, None)
+        )
+
+        a_cta_layout = cute.make_layout(
+            cute.slice_(
+                cluster_layout_vmnk,
+                (0, 0, None, 0)
+            ).shape
+        )
+        # just to make sure SMEM and GMEM tensor has the same size in the first rank
+        tCgA = thr_mma.partition_A(gA)
+        tCgB = thr_mma.partition_B(gB)
+        # [CPY, stage] & [CPY, loopK]
+        tTMAsA, tTMAgA = cpasync.tma_partition(
+            tma_atom_a,
+            block_in_cluster_coord_vmnk[2], # cta_coord,
+            a_cta_layout,
+            cute.group_modes(sA, 0, 3),
+            cute.group_modes(tCgA, 0, 3)
+        )
+        b_cta_layout = cute.make_layout(
+            cute.slice_(
+                cluster_layout_vmnk,
+                (0, None, 0, 0)
+            ).shape
+        )
+        # [CPY, stage] & [CPY, loopK]
+        tTMAsB, tTMAgB = cpasync.tma_partition(
+            tma_atom_b,
+            block_in_cluster_coord_vmnk[1], # cta_coord
+            b_cta_layout,
+            cute.group_modes(sB, 0, 3),
+            cute.group_modes(tCgB, 0, 3)
+        )
+
+        # ------ Allocate TMEM ------ #
+        tmem_holding_buf = storage.tmem_holding_buf
+        if warp_idx == self.empty_warp_ids[0]:
+            cute.arch.alloc_tmem(
+                self.tmem_alloc_cols,
+                tmem_holding_buf,
+                is_two_cta=self.use_2cta_instrs
+            )
+        self.cta_sync_barrier.arrive_and_wait()
+        tmem_ptr = cute.arch.retrieve_tmem_ptr(
+            self.acc_dtype,
+            alignment=16,
+            ptr_to_buffer_holding_addr=tmem_holding_buf
+        )
+
+        tmem_shape = (128, self.tmem_alloc_cols)
+        acc_shape = thr_mma.partition_shape_C(tmem_shape)
+        tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+        # [(tileM, tileN), loopM, loopN]
+        tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
+        
+        # ------ Empty ------ #
+        if warp_idx in self.empty_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+        # ------ Load ------ #
+        if warp_idx == self.load_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            for k in cutlass.range(cute.size(gA, mode=[2])):
+                ab_pipeline.producer_acquire(ab_producer_state)
+                cute.copy(
+                    tma_atom_a,
+                    tTMAgA[(None, k)],
+                    tTMAsA[(None, ab_producer_state.index)],
+                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                )
+                cute.copy(
+                    tma_atom_b,
+                    tTMAgB[(None, k)],
+                    tTMAsB[(None, ab_producer_state.index)],
+                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                )
+                ab_pipeline.producer_commit(ab_producer_state)
+                ab_producer_state.advance()
+
+        # ------ MMA ------ #
+        if warp_idx == self.mma_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+            mma_pipeline.producer_acquire(mma_producer_state)
+
+            for k in cutlass.range(cute.size(gA, mode=[2])):
+                ab_pipeline.consumer_wait(ab_consumer_state)
+
+                for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                    cute.gemm(
+                        tiled_mma,
+                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                        tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                        tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)])
+                    )
+                    tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                ab_pipeline.consumer_release(ab_consumer_state)
+                ab_consumer_state.advance()
+
+            mma_pipeline.producer_commit(mma_producer_state)
+            mma_producer_state.advance()
+            
+        # ------ EPI ------ #
+        if warp_idx in self.epi_warp_ids:
+            cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+            copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                self.cta_tile_shape_mnk,
+                utils.LayoutEnum.ROW_MAJOR,
+                self.acc_dtype,
+                self.acc_dtype,
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                self.use_2cta_instrs
+            )
+            # [tileM, subTileN, loopM, CntSubTileN, loopN]
+            tAcc_epi = cute.flat_divide(
+                tCtC[((None, None), 0, None)],
+                (self.epi_tile[0],
+                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+            )
+            tiled_copy_t2r = tcgen05.make_tmem_copy(
+                copy_atom_t2r,
+                tAcc_epi[(None, None, 0, 0, 0)]
+            )
+            thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+            tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+            tTMEM_load_tAcc = cute.group_modes(
+                tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
+            )
+
+            # predicates
+            cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+            tCcAcc = thr_mma.partition_C(cAcc)
+            tCcAcc_epi = cute.flat_divide(
+                tCcAcc[((None, None), 0, None)],
+                (self.epi_tile[0],
+                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+            )
+            tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+            tTMEM_load_cAcc_shape = cute.select(
+                tTMEM_load_cAcc.shape,
+                mode=[0, 1, 2]
+            )
+            tTMEM_load_rAcc = cute.make_fragment(
+                tTMEM_load_cAcc_shape,
+                self.acc_dtype
+            )
+
+            copy_atom_g2r_int64 = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(),
+                mLabels.element_type
+            )
+            copy_atom_g2r_fp32 = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(),
+                mDlogprobs.element_type
+            )
+            epilogue_thread_layout = cute.make_layout(
+                (128, 1),
+                stride=(1, 1))
+            tiled_copy_g2r_int64 = cute.make_tiled_copy_tv(
+                copy_atom_g2r_int64,
+                epilogue_thread_layout,
+                cute.make_layout((1, 1))
+            )
+            tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv(
+                copy_atom_g2r_fp32,
+                epilogue_thread_layout,
+                cute.make_layout((1, 1))
+            )
+            thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx)
+            thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx)
+
+            # [tileM]
+            gLabels = cute.local_tile(
+                mLabels,
+                (self.epi_tile[0],),
+                (pidm,)
+            )
+            gMaximum = cute.local_tile(
+                mMaximum,
+                (self.epi_tile[0],),
+                (pidm,)
+            )
+            gAccu = cute.local_tile(
+                mAccu,
+                (self.epi_tile[0],),
+                (pidm,)
+            )
+            
+            # slice along M direction
+            tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)]
+            # [(1, 1), 1]
+            tMCAcc_mask = cute.make_fragment(
+                tMCAcc.shape,
+                cutlass.Boolean
+            )
+            # to align shape with gMax and gAccu
+            tMCAcc_mask = cute.append_ones(tMCAcc_mask)
+            tMCAcc_mask[0] = cute.elem_less(
+                pidm * self.epi_tile[0] + tidx,
+                cute.size(mA, mode=[0])
+            )
+            # [(1, 1), 1, 1]
+            tMgLabels = thr_copy_g2r_int64.partition_S(
+                cute.append_ones(gLabels)
+            )
+            tMrLabels = cute.make_fragment(
+                tMgLabels.shape,
+                tMgLabels.element_type
+            )
+            cute.copy(
+                tiled_copy_g2r_int64,
+                tMgLabels,
+                tMrLabels,
+                pred=tMCAcc_mask
+            )
+            tMgMaximum = thr_copy_g2r_fp32.partition_S(
+                cute.append_ones(gMaximum)
+            )
+            tMrMaximum = cute.make_fragment(
+                tMgMaximum.layout,
+                tMgMaximum.element_type
+            )
+            cute.copy(
+                tiled_copy_g2r_fp32,
+                tMgMaximum,
+                tMrMaximum,
+                pred=tMCAcc_mask
+            )
+            tMgAccu = thr_copy_g2r_fp32.partition_S(
+                cute.append_ones(gAccu)
+            )
+            tMrAccu = cute.make_fragment(
+                tMgAccu.layout,
+                tMgAccu.element_type
+            )
+            cute.copy(
+                tiled_copy_g2r_fp32,
+                tMgAccu,
+                tMrAccu,
+                pred=tMCAcc_mask
+            )
+
+            tMrDlogprobs = cute.make_fragment(
+                tMgAccu.layout,
+                mDlogprobs.element_type
+            )
+            if cutlass.const_expr(self.REDUCTION == 2):
+                # mean reduction
+                num_valid_tokens = cute.make_tensor(
+                    scalarNumValidTokens,
+                    layout=(1,),
+                )
+                tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32)
+            elif cutlass.const_expr(self.REDUCTION == 1):
+                # sum reduction
+                tMrDlogprobs[0] = mDlogprobs[0]
+            else:
+                # no reduction
+                gDlogprobs = cute.local_tile(
+                    mDlogprobs,
+                    (self.epi_tile[0],),
+                    (pidm,)
+                )
+                tMgDlogprobs = thr_copy_g2r_fp32.partition_S(
+                    cute.append_ones(gDlogprobs)
+                )
+                cute.copy(
+                    tiled_copy_g2r_fp32,
+                    tMgDlogprobs,
+                    tMrDlogprobs,
+                    pred=tMCAcc_mask
+                )
+
+            tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0])
+            tMrDlogprobs[0] *= (tMrLabels[0] != ignore_index)
+            tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0]
+
+            # ------ Partial output ------ #
+            # [tileM, tileN]
+            gDlogits_partial = cute.local_tile(
+                mDlogits_partial,
+                (self.epi_tile[0], self.epi_tile[1]),
+                (pidm, pidn)
+            )
+            # blackwell supports STG.256
+            copy_atom_r2g = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(),
+                gDlogits_partial.element_type,
+                num_bits_per_copy=256
+            )
+            tiled_copy_r2g = cute.make_tiled_copy_tv(
+                copy_atom_r2g,
+                epilogue_thread_layout,
+                copy_atom_r2g.layout_dst_tv
+            )
+            thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+
+            # [CPY, loopM, loopN]
+            tR2GCAcc = thr_copy_r2g.partition_S(cAcc)
+            tR2GCAcc_pred = cute.make_fragment(
+                tR2GCAcc.shape,
+                cutlass.Boolean
+            )
+            for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])):
+                for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
+                    for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
+                        # tR2GCAcc_pred[elem, row, col] = cute.elem_less(
+                        #     pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0],
+                        #     cute.size(mDlogits_partial, mode=[0])
+                        # ) and cute.elem_less(
+                        #     pidn * self.epi_tile[1] + tR2GCAcc[elem, row, col][1],
+                        #     cute.size(mDlogits_partial, mode=[1])
+                        # )
+                        tR2GCAcc_pred[elem, row, col] = cute.elem_less(
+                            pidm * self.epi_tile[0]
+                            + tR2GCAcc[elem, row, col][0],
+                            problem_mnk[0]
+                        ) and cute.elem_less(
+                            split_idx * self.vocab_per_split
+                            + pidn * self.epi_tile[1]
+                            + tR2GCAcc[elem, row, col][1],
+                            problem_mnk[1]
+                        )
+
+            tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial)
+
+            # for type conversion
+            dLogits_half = cute.make_fragment(
+                tTMEM_load_rAcc.shape,
+                tR2GgDlogits.element_type
+            )          
+            dLogits_half = cute.tiled_divide(
+                dLogits_half,
+                (cute.size(tR2GgDlogits, mode=[0]), 1)
+            )
+            dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
+
+            mma_pipeline.consumer_wait(mma_consumer_state)
+
+            block_vocab_left_idx: cutlass.Int64 = (
+                split_idx * self.vocab_per_split
+                + pidn * self.epi_tile[1]
+            )
+            block_vocab_right_idx: cutlass.Int64 = (
+                min(
+                    split_idx * self.vocab_per_split
+                    + (pidn + 1) * self.epi_tile[1],
+                    min(
+                        (split_idx + 1) * self.vocab_per_split,
+                        problem_mnk[1]
+                    )
+                )
+            )
+            num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                (block_vocab_right_idx - block_vocab_left_idx),
+                cute.size(tTMEM_load_rAcc, mode=[0])
+            )
+            for n_subtile in cutlass.range(num_n_subtiles):
+                cute.copy(
+                    tiled_copy_t2r,
+                    tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                    tTMEM_load_rAcc
+                )
+
+                for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
+                    # exp_logits
+                    tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
+
+                    position: cutlass.Int64 = (
+                        self.rank * problem_mnk[1]
+                        + split_idx * self.vocab_per_split
+                        + pidn * self.epi_tile[1]
+                        + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                        + idx
+                    )
+                    mask: cutlass.Boolean = (
+                        position == tMrLabels[0]
+                        and tMrLabels[0] != ignore_index
+                    )
+                    # d_logits
+                    tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits
+                    tTMEM_load_rAcc[idx] += (mask * -tMrDlogprobs[0])
+                    dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type)
+
+                for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True):
+                    copy_id = n_subtile * cute.size(dLogits_half, mode=[1]) + idx
+                    cute.copy(
+                        tiled_copy_r2g,
+                        dLogits_half[(None, idx, None)],
+                        tR2GgDlogits[(None, None, copy_id)],
+                        pred=tR2GCAcc_pred[((0, None), None, copy_id)]
+                    )
+
+            mma_pipeline.consumer_release(mma_consumer_state)
+            mma_consumer_state.advance()
+
+
+        # ------ Deallocate TMEM ------ #
+        self.cta_sync_barrier.arrive_and_wait()
+        if warp_idx == self.empty_warp_ids[0]:
+            cute.arch.relinquish_tmem_alloc_permit()
+            cute.arch.dealloc_tmem(
+                tmem_ptr,
+                self.tmem_alloc_cols,
+                is_two_cta=self.use_2cta_instrs
+            )
+
+
+    @cute.jit
+    def __call__(
+        self,
+        split_idx: cutlass.Int32,
+        hidden: cute.Tensor,
+        weight: cute.Tensor,
+        labels: cute.Tensor,
+        dlogprobs: cute.Tensor,
+        maximum: cute.Tensor,
+        accu: cute.Tensor,
+        dlogits_partial: cute.Tensor,
+        scalarNumValidTokens: cute.Pointer,
+        ignore_index: cutlass.Int64,
+        stream: cuda.CUstream,
+    ) -> None:
+        a_dtype: Type[cutlass.Numeric] = hidden.element_type
+        b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+        if cutlass.const_expr(hidden.element_type != weight.element_type):
+            raise RuntimeError(f"data type don't match: {hidden.element_type} v.s. {weight.element_type}")
+        if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+            raise RuntimeError("hidden can only be FP16 or BF16")
+        if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+            raise RuntimeError("K dimension doesn't match")
+
+        problem_mnk = (
+            hidden.layout.shape[0],
+            weight.layout.shape[0],
+            hidden.layout.shape[1]
+        )
+        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+            raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+        if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0):
+            raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}")
+
+        grid = self._compute_grid(
+            problem_mnk = problem_mnk,
+            cluster_shape_mn = self.cluster_shape_mn,
+            cta_tiler = self.mma_tiler,
+        )
+        
+        a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+        b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+
+        tiled_mma = sm100_utils.make_trivial_tiled_mma(
+            a_dtype,
+            a_major_mode,
+            b_major_mode,
+            self.acc_dtype,
+            self.cta_group,
+            self.mma_tiler[:2]
+        )
+        self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+
+        self.epi_tile = self.cta_tile_shape_mnk[:2]
+        
+        # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+        a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+            tiled_mma,
+            self.mma_tiler,
+            a_dtype,
+            self.num_ab_stage
+        )
+        # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+        b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+            tiled_mma,
+            self.mma_tiler,
+            b_dtype,
+            self.num_ab_stage
+        )
+        tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+        tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+        # Swizzle o [(tileM, tileK), loopM, loopK]
+        a_smem_layout = cute.select(
+            a_smem_layout_staged,
+            mode=[0, 1, 2]
+        )
+        tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+            tma_load_op,
+            hidden,
+            a_smem_layout,
+            self.mma_tiler,
+            tiled_mma,
+            self.cluster_layout_vmnk.shape
+        )
+        # Swizzle o [(tileN, tileK), loopN, loopK]
+        b_smem_layout = cute.select(
+            b_smem_layout_staged,
+            mode=[0, 1, 2]
+        )
+        tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
+            tma_load_op,
+            weight,
+            b_smem_layout,
+            self.mma_tiler,
+            tiled_mma,
+            self.cluster_layout_vmnk.shape
+        )
+        a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+        b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+        self.tma_copy_ab_bytes = a_copy_size + b_copy_size
+
+        @cute.struct
+        class SharedStorage:
+            load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2]
+            mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+
+            tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+            tmem_holding_buf: cutlass.Int32
+
+            sA: cute.struct.Align[
+                cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                self.buffer_align_bytes,
+            ]
+            sB: cute.struct.Align[
+                cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                self.buffer_align_bytes,
+            ]
+        self.shared_storage = SharedStorage
+
+        self.kernel(
+            split_idx,
+            tiled_mma,
+            tma_atom_a,
+            tma_tensor_a,
+            tma_atom_b,
+            tma_tensor_b,
+            labels,
+            dlogprobs,
+            maximum,
+            accu,
+            dlogits_partial,
+            scalarNumValidTokens,
+            ignore_index,
+            a_smem_layout_staged,
+            b_smem_layout_staged,
+            self.cluster_layout_vmnk,
+            problem_mnk,
+        ).launch(
+            grid=grid,
+            block=[self.threads_per_cta, 1, 1],
+            cluster=self.cluster_shape_mnk,
+            stream=stream
+        )
+
+
+if __name__ == "__main__":
+    torch.manual_seed(1113)
+
+    batch = 4
+    seqlen = 1023
+    dim = 8192
+    vocab_size = 152064
+    dtype = torch.bfloat16
+    split_idx = 0
+    vocab_per_split = 512 * 6
+
+    hidden = torch.randn(batch, seqlen, dim, device="cuda", dtype=dtype)
+    weight = torch.randn(vocab_size, dim, device="cuda", dtype=dtype)
+    labels = torch.randint(0, vocab_size, (batch, seqlen), device="cuda", dtype=torch.long)
+    num_valid_tokens = torch.tensor(batch * seqlen, device="cuda", dtype=torch.int64)
+
+    dlogprobs = torch.randn(batch, seqlen, device="cuda", dtype=torch.float32)
+
+    def get_maximum_and_accu(hidden, weight):
+        logits = (hidden @ weight.T).to(torch.float32)
+        maximum, _ = torch.max(logits, dim=-1)
+        accu = torch.sum(torch.exp(logits - maximum.unsqueeze(-1)), dim=-1)
+        return maximum, accu
+    maximum, accu = get_maximum_and_accu(hidden, weight)
+
+    dlogits_partial = torch.empty(
+        (batch, seqlen, vocab_per_split),
+        device=hidden.device,
+        dtype=hidden.dtype
+    )
+
+    # compile kernel
+    bwd_kernel = BwdPartialDlogits(
+        vocab_per_split=vocab_per_split,
+        reduction=0
+    )
+
+    hidden_packed = from_dlpack(
+        hidden.view(-1, dim), 
+        assumed_align=16).mark_compact_shape_dynamic(mode=0)
+    weight_packed = from_dlpack(
+        weight,
+        assumed_align=16
+    )
+    labels_packed = from_dlpack(
+        labels.view(-1),
+        assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0)
+    dlogprobs_packed = from_dlpack(
+        dlogprobs.view(-1),
+        assumed_align=16
+    ).mark_compact_shape_dynamic(mode=0)
+    maximum_packed = from_dlpack(
+        maximum.view(-1),
+        assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0)
+    accu_packed = from_dlpack(
+        accu.view(-1),
+        assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0)
+    dlogits_partial_packed = from_dlpack(
+        dlogits_partial.view(-1, vocab_per_split),
+        assumed_align=32,
+    ).mark_compact_shape_dynamic(mode=0)
+    scalarNumValidTokens_packed = cute.runtime.make_ptr(
+        cutlass.Int64,
+        num_valid_tokens.data_ptr(),
+        cute.AddressSpace.gmem,
+        assumed_align=8
+    )
+
+    ignore_index = -100
+
+    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+    compiled = cute.compile(
+        bwd_kernel,
+        split_idx,
+        hidden_packed,
+        weight_packed,
+        labels_packed,
+        dlogprobs_packed,
+        maximum_packed,
+        accu_packed,
+        dlogits_partial_packed,
+        scalarNumValidTokens_packed,
+        ignore_index,
+        stream,
+    )
+
+    start, stop = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+    start.record(stream=torch.cuda.current_stream())
+    with torch.cuda.nvtx.range("BwdPartialDlogits"):
+        compiled(
+            split_idx,
+            hidden_packed,
+            weight_packed,
+            labels_packed,
+            dlogprobs_packed,
+            maximum_packed,
+            accu_packed,
+            dlogits_partial_packed,
+            scalarNumValidTokens_packed,
+            ignore_index,
+            stream
+        )
+    stop.record(stream=torch.cuda.current_stream())
+
+    torch.cuda.synchronize()
+
+    elapsed_time = start.elapsed_time(stop)
+
+    print(dlogits_partial)
+
+    print(f"Success, Elapsed time: {elapsed_time:.4f} ms")
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
new file mode 100644
index 00000000000..c59e7b40d95
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -0,0 +1,385 @@
+import torch
+import torch.distributed as dist
+import typing
+import triton
+
+import cutlass
+import cutlass.cute as cute
+from cutlass.cute.runtime import from_dlpack
+import cuda.bindings.driver as cuda
+
+import megatron.core.fusions.linear_cross_entropy.utils as utils
+import megatron.core.fusions.linear_cross_entropy.blackwell.fwd_mainloop as fwd_mainloop
+import megatron.core.fusions.linear_cross_entropy.blackwell.bwd_partial_dlogits as bwd_partial_dlogits
+import megatron.core.fusions.linear_cross_entropy.blackwell.triton as triton_kernels
+
+# import linear_cross_entropy.utils as utils
+# import linear_cross_entropy.blackwell.fwd_mainloop as fwd_mainloop
+# import linear_cross_entropy.blackwell.bwd_partial_dlogits as bwd_partial_dlogits
+# import linear_cross_entropy.blackwell.triton as triton_kernels
+
+def forward(
+    hidden: torch.Tensor,
+    weight: torch.Tensor,
+    labels: torch.Tensor,
+    tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+    reduction: typing.Optional[str] = "mean",
+    ignore_index: typing.Optional[int] = -100,
+) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    forward host function
+    """
+    assert hidden.is_cuda and weight.is_cuda and labels.is_cuda
+    assert weight.device == hidden.device and labels.device == hidden.device
+
+    # hidden could be [batch, seqlen, dim] or [seqlen, batch, dim] or [tokens, dim]
+    assert hidden.dim() == 2 or hidden.dim() == 3
+    # weight must be [vocab_size, dim]
+    assert weight.dim() == 2
+    # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
+    assert ((hidden.dim() == 2 and labels.dim() == 1) 
+            or (hidden.dim() == 3 and labels.dim() == 2))
+    assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
+
+    hidden_view = hidden.view(-1, hidden.shape[-1])
+    labels_view = labels.view(-1)
+
+    assert hidden_view.shape[0] == labels_view.shape[0]
+    assert hidden_view.shape[1] == weight.shape[1]
+    num_tokens, dim = hidden_view.shape
+    vocab_size, _ = weight.shape
+
+    tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+    tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+
+    if not hasattr(forward, "_initialized"):
+        global _dedicated_stream, _dedicated_events
+        _dedicated_stream = torch.cuda.Stream(hidden.device)
+        _dedicated_events = [torch.cuda.Event() for _ in range(2)]
+        forward._initialized = True
+
+    REDUCTION = utils.str_to_reduction_enum(reduction)
+    # declare logprobs
+    if REDUCTION == utils.EntropyReductionEnum.kNone:
+        logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+        if tp_group is not None:
+            logprobs.zero_()
+    else:
+        logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32)
+    # declare auxiliary tensors
+    maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+    accumulate = torch.empty_like(maximum, dtype=torch.float32)
+    num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
+    assert maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
+    # declare intermediate tensors
+    # NOTE: this is a parameter for tuning
+    vocab_per_split = 512 * 6
+    num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+    _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+    _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+    if REDUCTION == utils.EntropyReductionEnum.kNone:
+        _logprobs = logprobs
+    else:
+        _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+        if tp_group is not None:
+            _logprobs.zero_()
+    assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
+
+    triton_kernels.get_num_valid_tokens[(1,)](
+        num_tokens,
+        ignore_index,
+        labels_view,
+        labels_view.stride(0),
+        num_valid_tokens,
+    )
+    
+    if not hasattr(forward, "_fwd_mainloop_kernels"):
+        forward._fwd_mainloop_kernels = dict()
+
+    # need to compile the kernel for the first time
+    hidden_packed = from_dlpack(
+        hidden_view.detach(), assumed_align=16
+    ).mark_compact_shape_dynamic(mode=0)
+    weight_packed = from_dlpack(
+        weight.detach(), assumed_align=16
+    )
+    labels_packed = from_dlpack(
+        labels_view.detach(), assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0)
+    logprobs_packed = from_dlpack(
+        _logprobs, assumed_align=16
+    ).mark_compact_shape_dynamic(mode=0)
+    _max_packed = from_dlpack(
+        _max, assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+    _accu_packed = from_dlpack(
+        _accu, assumed_align=8
+    ).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
+    cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+    # VocabSize and Dim are fixed for a given model,
+    # only the number of tokens can vary
+    key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden.dtype}"
+    if forward._fwd_mainloop_kernels.get(key) is None:
+        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(
+            vocab_per_split=vocab_per_split,
+        )
+        fwd_mainloop_compiled_kernel = cute.compile(
+            fwd_mainloop_kernel,
+            hidden_packed,
+            weight_packed,
+            labels_packed,
+            logprobs_packed,
+            _max_packed,
+            _accu_packed,
+            ignore_index,
+            tp_rank,
+            cuda_stream
+        )
+        forward._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
+    else:
+        fwd_mainloop_compiled_kernel = forward._fwd_mainloop_kernels[key]
+    fwd_mainloop_compiled_kernel(
+        hidden_packed,
+        weight_packed,
+        labels_packed,
+        logprobs_packed,
+        _max_packed,
+        _accu_packed,
+        ignore_index,
+        tp_rank,
+        cuda_stream
+    )
+    
+    if tp_group is None:
+        def grid(meta):
+            return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+
+        triton_kernels.forward_dp_epilogue[grid](
+            num_tokens,
+            num_splits,
+            ignore_index,
+            labels_view,
+            labels_view.stride(0),
+            num_valid_tokens,
+            _max,
+            _max.stride(0),
+            _max.stride(1),
+            _accu,
+            _accu.stride(0),
+            _accu.stride(1),
+            maximum,
+            maximum.stride(0),
+            accumulate,
+            maximum.stride(0),
+            _logprobs,
+            _logprobs.stride(0),
+            logprobs,
+            triton.language.constexpr(REDUCTION),
+        )
+    else:
+        _max_backup = _max.clone()
+        dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+        torch.cuda.current_stream().record_event(_dedicated_events[0])
+        with torch.cuda.stream(_dedicated_stream):
+            _dedicated_stream.wait_event(_dedicated_events[0])
+            dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
+            _dedicated_stream.record_event(_dedicated_events[1])
+
+        def grid(meta):
+            return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+
+        triton_kernels.forward_tp_epilogue[grid](
+            num_tokens,
+            num_splits,
+            _max,
+            _max.stride(0),
+            _max.stride(1),
+            _max_backup,
+            _max_backup.stride(0),
+            _max_backup.stride(1),
+            _accu,
+            _accu.stride(0),
+            _accu.stride(1),
+            maximum,
+            maximum.stride(0),
+            accumulate,
+            maximum.stride(0),
+        )
+        # reduce accumulate
+        dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
+
+        # update logprobs
+        torch.cuda.current_stream().wait_event(_dedicated_events[1])
+        triton_kernels.forward_tp_epilogue_update_logprobs[grid](
+            num_tokens,
+            ignore_index,
+            num_valid_tokens,
+            labels_view,
+            labels_view.stride(0),
+            _logprobs,
+            _logprobs.stride(0),
+            maximum,
+            maximum.stride(0),
+            accumulate,
+            accumulate.stride(0),
+            logprobs,
+            REDUCTION,
+        )
+
+    return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size
+
+def backward(
+    dlogprobs: torch.Tensor,
+    hidden: torch.Tensor,
+    weight: torch.Tensor,
+    labels: torch.Tensor,
+    maximum: torch.Tensor,
+    accu: torch.Tensor,
+    num_valid_tokens: torch.Tensor,
+    reduction: typing.Optional[str] = "mean",
+    ignore_index: typing.Optional[int] = -100,
+    tp_group: typing.Optional[dist.ProcessGroup] = None,
+    tp_rank: typing.Optional[int] = 0,
+    tp_world_size: typing.Optional[int] = 1,
+) -> typing.List[torch.Tensor]:
+    """
+    backward host function
+    """
+    hidden_view = hidden.view(-1, hidden.shape[-1])
+    labels_view = labels.view(-1)
+
+    num_tokens, dim = hidden_view.shape
+    vocab_size, _ = weight.shape
+
+    REDUCTION = utils.str_to_reduction_enum(reduction)
+    dlogprobs_view = dlogprobs.view(-1)
+    assert (
+        (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,))
+        or (REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0)
+    )
+    assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
+
+    assert num_valid_tokens.dim() == 0 and num_valid_tokens.is_cuda and num_valid_tokens.dtype == torch.int64
+
+    d_hidden = torch.empty_like(hidden)
+    d_weight = torch.empty_like(weight)
+    assert d_hidden.is_contiguous() and d_weight.is_contiguous()
+
+    # FIXME: implement different backward methods
+    _backward = utils.BackwardMethodEnum.kDlogitsSplitN
+    if _backward == utils.BackwardMethodEnum.kDlogitsSplitN:
+        vocab_per_split = 512 * 6
+        num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+
+        _d_logits = torch.empty(
+            (num_tokens, vocab_per_split),
+            device=hidden.device,
+            dtype=hidden.dtype
+        )
+
+        hidden_packed = from_dlpack(
+            hidden_view.detach(),
+            assumed_align=16
+        ).mark_compact_shape_dynamic(mode=0)
+        weight_packed = from_dlpack(
+            weight.detach(),
+            assumed_align=16
+        )
+        labels_packed = from_dlpack(
+            labels_view.detach(),
+            assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        dlogprobs_packed = from_dlpack(
+            dlogprobs_view.detach(),
+            assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        maximum_packed = from_dlpack(
+            maximum.detach(),
+            assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        accu_packed = from_dlpack(
+            accu.detach(),
+            assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        dlogits_packed = from_dlpack(
+            _d_logits,
+            assumed_align=32
+        ).mark_compact_shape_dynamic(mode=0)
+        scalarNumValidTokens_packed = cute.runtime.make_ptr(
+            cutlass.Int64,
+            num_valid_tokens.data_ptr(),
+            cute.AddressSpace.gmem,
+            assumed_align=8
+        )
+
+        stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+        if not hasattr(backward, "_bwd_kernel"):
+            backward._bwd_kernel = dict()
+
+        key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden.dtype}"
+        if backward._bwd_kernel.get(key) is None:
+            bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
+                reduction=REDUCTION,
+                vocab_per_split=vocab_per_split,
+            )
+            bwd_kernel_compiled = cute.compile(
+                bwd_kernel,
+                0, # split_idx
+                hidden_packed,
+                weight_packed,
+                labels_packed,
+                dlogprobs_packed,
+                maximum_packed,
+                accu_packed,
+                dlogits_packed,
+                scalarNumValidTokens_packed,
+                ignore_index,
+                stream
+            )
+            backward._bwd_kernel[key] = bwd_kernel_compiled
+        else:
+            bwd_kernel_compiled = backward._bwd_kernel.get(key)
+
+        for split_idx in range(num_splits):
+            bwd_kernel_compiled(
+                split_idx,
+                hidden_packed,
+                weight_packed,
+                labels_packed,
+                dlogprobs_packed,
+                maximum_packed,
+                accu_packed,
+                dlogits_packed,
+                scalarNumValidTokens_packed,
+                ignore_index,
+                stream
+            )
+            vocab_right_bound = (
+                min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split
+            )
+            # remove padding areas
+            _d_logits = _d_logits[:, :vocab_right_bound].contiguous()
+
+            if split_idx == 0:
+                torch.matmul(
+                    _d_logits,
+                    weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                    out=d_hidden.view(num_tokens, dim)
+                )
+            else:
+                d_hidden += torch.matmul(
+                    _d_logits,
+                    weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                ).view(d_hidden.shape)
+            torch.matmul(
+                _d_logits.T,
+                hidden_view,
+                out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :]
+            )
+    else:
+        raise NotImplementedError(f"Unsupported backward method: {_backward}")
+    
+    return d_hidden, d_weight
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
new file mode 100644
index 00000000000..81346b0df81
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
@@ -0,0 +1,892 @@
+"""
+Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel
+"""
+
+from typing import Optional, Type, Tuple, Union
+import cuda.bindings.driver as cuda
+
+import torch
+
+import cutlass
+import cutlass.cute as cute
+import cutlass.utils as utils
+import cutlass.pipeline as pipeline
+from cutlass.cute.nvgpu import cpasync, tcgen05
+import cutlass.torch as cutlass_torch
+import cutlass.utils.blackwell_helpers as sm100_utils
+from cutlass.cute.runtime import from_dlpack
+
+
+SM100_TMEM_CAPACITY_COLUMNS: int = 512
+
+def make_thread_cooperative_group(size: int):
+    return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
+
+class FwdMainLoop:
+    """
+    This class implements the mainloop for forward process.
+
+    Traits stored as attributes.
+
+    :param acc_dtype: 
+    """
+
+    def __init__(self,
+                 acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+                 use_2cta_instrs: bool = False,
+                 mma_tiler_mn: Tuple[int, int] = (128, 256),
+                 vocab_per_split: int = 512):
+        """
+        Configuration including:
+            - MMA instruction settings
+            - Cluster Shape
+        """
+        self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
+        self.use_2cta_instrs = use_2cta_instrs
+        # This is the shape covered by tiledMMA, not just single MMA instruction
+        self.mma_tiler = (*mma_tiler_mn, 1)
+        self.cta_tiler = (
+            self.mma_tiler[0],
+            vocab_per_split,
+            self.mma_tiler[2]
+        )
+        self.vocab_per_split = vocab_per_split
+        
+        self.cta_group = (
+            tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+        )
+        self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+        self.occupancy = 1
+        # query SMEM capacity
+        self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+        # the maximum columns per MMA is 256, and there is only one GEMM, so we can fully
+        # assign TMEM for that GEMM of different tiles.
+        # so 512 = 2 * 256
+
+        self.threads_per_warp: int = 32
+        # 1 warp for loading, 1 warp for issuing MMA, 1 WG for storing
+        self.epi_warp_ids = (0, 1, 2, 3)
+        self.load_warp_ids = 4
+        self.mma_warp_ids = 5
+        self.empty_warp_ids = (6, 7)
+
+        self.threads_per_cta: int = self.threads_per_warp * len(
+            (*self.epi_warp_ids,
+             self.load_warp_ids,
+             self.mma_warp_ids,
+             *self.empty_warp_ids)
+        )
+
+        self.cta_sync_barrier = pipeline.NamedBarrier(
+            barrier_id = 1,
+            num_threads = self.threads_per_cta
+        )
+        self.tmem_alloc_barrier = pipeline.NamedBarrier(
+            barrier_id = 2,
+            num_threads = self.threads_per_cta
+        )
+
+        self.buffer_align_bytes: int = 1024
+        self.num_regs_other: int = 32
+        self.num_regs_epi: int = 192
+
+    def _compute_stages(
+        self,
+        tiled_mma: cute.TiledMma,
+        mma_tiler: Tuple[int, int, int],
+        a_dtype: Type[cutlass.Numeric],
+        b_dtype: Type[cutlass.Numeric]
+    ):
+        a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
+            tiled_mma,
+            mma_tiler,
+            a_dtype,
+            1, # only single stage
+        )
+        b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(
+            tiled_mma,
+            mma_tiler,
+            b_dtype,
+            1,
+        )
+        a_bytes_per_stage = cute.size_in_bytes(
+            a_dtype, a_smem_layout_stage_one
+        )
+        b_bytes_per_stage = cute.size_in_bytes(
+            b_dtype, b_smem_layout_stage_one
+        )
+        num_acc_stage = 2
+        num_a_stage = 4
+        num_b_stage = 4
+        num_epi_stage_per_tile = 4
+
+        return num_acc_stage, num_a_stage, num_b_stage, num_epi_stage_per_tile
+
+    def _setup_attributes(
+        self,
+        tiled_mma: cute.TiledMma,
+        a_dtype: Type[cutlass.Numeric],
+        b_dtype: Type[cutlass.Numeric],
+    ):
+        self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+        self.cluster_layout_vmnk = cute.tiled_divide(
+            cute.make_layout(self.cluster_shape_mnk),
+            (tiled_mma.thr_id.shape,),
+        )
+
+        # this is fixed for dense MMA, k=16
+        mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+        # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
+        mma_inst_tile_k: int = 4
+        self.mma_tiler = (
+            self.mma_tiler[0],
+            self.mma_tiler[1],
+            mma_inst_shape_k * mma_inst_tile_k
+        )
+
+        self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile =\
+            self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+        self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+        assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
+
+        self.cta_tile_shape_mnk = (
+            self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+            self.mma_tiler[1],
+            self.mma_tiler[2]
+        )    
+
+    @cute.kernel
+    def kernel(
+        self,
+        tiled_mma: cute.TiledMma,
+        tma_atom_a: cute.CopyAtom,
+        mA: cute.Tensor,
+        tma_atom_b: cute.CopyAtom,
+        mB: cute.Tensor,
+        mLabels: cute.Tensor,
+        mMax: cute.Tensor,
+        mAccu: cute.Tensor,
+        mLogprobs: cute.Tensor,
+        a_smem_layout_staged: cute.ComposedLayout,
+        b_smem_layout_staged: cute.ComposedLayout,
+        cluster_layout_vmnk: cute.Layout,
+        problem_mnk: Tuple[int, int, int],
+        ignore_index: cutlass.Int64,
+        rank: cutlass.Int32
+    ):
+        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+        tidx, _, _ = cute.arch.thread_idx()
+        bidx, bidy, _ = cute.arch.block_idx()
+        # FIXME: block swizzling applied here
+        pidm, pidn = bidx, bidy
+
+        # prefetch tma descriptors
+        if warp_idx == self.load_warp_ids:
+            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+        # declare SMEM
+        smem = utils.SmemAllocator()
+        storage = smem.allocate(self.shared_storage)
+
+        ab_pipeline = pipeline.PipelineTmaUmma.create(
+            num_stages=self.num_a_stage,
+            producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+            consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+            tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes,
+            barrier_storage=storage.load_ab_mbar_ptr.data_ptr()
+        )
+        ab_producer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Producer, self.num_a_stage
+        )
+        ab_consumer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Consumer, self.num_a_stage
+        )
+
+        mma_pipeline = pipeline.PipelineUmmaAsync.create(
+            num_stages=self.num_acc_stage,
+            producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+            consumer_group=make_thread_cooperative_group(
+                self.threads_per_warp * len(self.epi_warp_ids)
+            ),
+            barrier_storage=storage.mma_mbar_ptr.data_ptr()
+        )
+        mma_producer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Producer, self.num_acc_stage
+        )
+        mma_consumer_state = pipeline.make_pipeline_state(
+            pipeline.PipelineUserType.Consumer, self.num_acc_stage
+        )
+
+        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+        if warp_idx == self.empty_warp_ids[0]:
+            with cute.arch.elect_one():
+                cute.arch.mbarrier_init(
+                    tmem_dealloc_mbar_ptr,
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                )
+                cute.arch.mbarrier_init_fence()
+
+        # -------- SMEM partition ------------ #
+        # swizzle o [(tileM, tileK), loopM, loopK, Stage]
+        sA = storage.sA.get_tensor(
+            a_smem_layout_staged.outer,
+            swizzle=a_smem_layout_staged.inner
+        )
+        # swizzle o [(tileN, tileK), loopN, loopK, stage]
+        sB = storage.sB.get_tensor(
+            b_smem_layout_staged.outer,
+            swizzle=b_smem_layout_staged.inner
+        )
+        
+        # FIXME: if 2 CTAs, modify here
+        thr_mma = tiled_mma.get_slice(0)
+        # [MMA, loopM, loopK, stage]
+        tCsA = thr_mma.make_fragment_A(sA)
+        # [MMA, loopN, loopK, stage]
+        tCsB = thr_mma.make_fragment_B(sB)
+
+        # ---------- GMEM partition ----------- #
+        # [tileM, tileK, loopK]
+        gA = cute.local_tile(
+            mA,
+            (self.mma_tiler[0], self.mma_tiler[2]),
+            (pidm, None)
+        )
+
+        # [vocab_size_per_split, dim]
+        mB_n = cute.local_tile(
+            mB,
+            (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])),
+            (pidn, 0)
+        )
+
+        # [tileN, tileK, loopN, loopK]
+        gB = cute.local_tile(
+            mB_n,
+            (self.mma_tiler[1], self.mma_tiler[2]),
+            (None, None)
+        )
+        
+        # [MMA, tileCntM, tileCntK, loopK]
+        tCgA = thr_mma.partition_A(gA)
+        # [MMA, tileCntN, tileCntK, loopN, loopK]
+        tCgB = thr_mma.partition_B(gB)
+
+        a_cta_layout = cute.make_layout(
+            cute.slice_(
+                cluster_layout_vmnk, 
+                (0, 0, None, 0)).shape
+        )
+        # FIXME: if 2 CTAs, modify here
+        cta_rank_in_cluster = 0
+        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(
+            cta_rank_in_cluster
+        )
+        tTMAsA, tTMAgA = cpasync.tma_partition(
+            tma_atom_a,
+            block_in_cluster_coord_vmnk[2], # cta_coord,
+            a_cta_layout,
+            cute.group_modes(sA, 0, 3), # SMEM tensor
+            cute.group_modes(tCgA, 0, 3) # GMEM tensor
+        )
+        b_cta_layout = cute.make_layout(
+            cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
+        )
+        tTMAsB, tTMAgB = cpasync.tma_partition(
+            tma_atom_b,
+            block_in_cluster_coord_vmnk[1], # cta_coord
+            b_cta_layout,
+            cute.group_modes(sB, 0, 3),
+            cute.group_modes(tCgB, 0, 3)
+        )
+
+        # Allocate TMEM
+        tmem_holding_buf = storage.tmem_holding_buf
+        if warp_idx == self.empty_warp_ids[0]:
+            cute.arch.alloc_tmem(
+                self.tmem_alloc_cols, 
+                tmem_holding_buf,
+                is_two_cta=self.use_2cta_instrs
+            )
+        self.cta_sync_barrier.arrive_and_wait()
+        tmem_ptr = cute.arch.retrieve_tmem_ptr(
+            self.acc_dtype,
+            alignment=16,
+            ptr_to_buffer_holding_addr=tmem_holding_buf
+        )
+
+        # [(tileM, tileN), loopM, loopN]
+        tmem_shape = (128, self.tmem_alloc_cols)
+        acc_shape = thr_mma.partition_shape_C(tmem_shape)
+        tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+        tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
+        
+        block_vocab_left_idx: cutlass.Int64 = (
+            pidn * self.vocab_per_split
+        )
+        block_vocab_right_idx: cutlass.Int64 = (
+            min((pidn + 1) * self.vocab_per_split, problem_mnk[1])
+        )
+        num_n_tiles: cutlass.Int64 = cute.ceil_div(
+            (block_vocab_right_idx - block_vocab_left_idx), 
+            self.mma_tiler[1])
+
+        # ///////
+        # empty
+        # ///////
+        if warp_idx in self.empty_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+        # ///////
+        # load
+        # ///////
+        if warp_idx == self.load_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            for n in cutlass.range(num_n_tiles):
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.producer_acquire(ab_producer_state)
+                    cute.copy(
+                        tma_atom_a,
+                        tTMAgA[(None, k)],
+                        tTMAsA[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                    )
+                    cute.copy(
+                        tma_atom_b,
+                        tTMAgB[(None, n, k)],
+                        tTMAsB[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                    )
+                    ab_pipeline.producer_commit(ab_producer_state)
+                    ab_producer_state.advance()
+
+        # ///////
+        # mma
+        # ///////
+        if warp_idx == self.mma_warp_ids:
+            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            for n in cutlass.range(num_n_tiles):
+                # disable accumulate for the first tile
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+                mma_pipeline.producer_acquire(mma_producer_state)
+
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.consumer_wait(ab_consumer_state)
+
+                    for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                        cute.gemm(
+                            tiled_mma,
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                            tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                            tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)])
+                        )
+                        # enable accumulate for the next tile
+                        tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                    ab_pipeline.consumer_release(ab_consumer_state)
+                    ab_consumer_state.advance()
+
+                mma_pipeline.producer_commit(mma_producer_state)
+                mma_producer_state.advance()
+
+        # //////////
+        # epilogue
+        # //////////
+        if warp_idx in self.epi_warp_ids:
+            cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+            # epilog TMEM copy and partition
+            copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                self.cta_tile_shape_mnk,
+                utils.LayoutEnum.ROW_MAJOR, # This is hard-coded
+                self.acc_dtype,
+                self.acc_dtype,
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                self.use_2cta_instrs
+            )
+            # [tileM, subTileN, loopM, CntSubTileN, loopN]
+            tAcc_epi = cute.flat_divide(
+                tCtC[((None, None), 0, None)],
+                (self.epi_tile[0],
+                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+            )
+            tiled_copy_t2r = tcgen05.make_tmem_copy(
+                copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+            )
+            thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+            tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+            # [(pattern), loopM, loopN, CntTileM, CntTileN]
+            tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
+
+            cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+            tCcAcc = thr_mma.partition_C(cAcc)
+            # [tileM, subTileN, loopM, CntSubTileN, CntTileN]
+            tCcAcc_epi = cute.flat_divide(
+                tCcAcc[((None, None), 0, None)],
+                (self.epi_tile[0],
+                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+            )
+            tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+            tTMEM_load_cAcc_shape = cute.select(
+                tTMEM_load_cAcc.shape,
+                mode=[0, 1, 2]
+            )
+
+            # epilogue layouts
+            epilogue_thread_layout = cute.make_layout((128, 1))
+            copy_atom_g2r = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(),
+                mLabels.element_type
+            )
+            tiled_copy_g2r = cute.make_tiled_copy(
+                copy_atom_g2r,
+                epilogue_thread_layout,
+                (128, 1)
+            )
+            thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
+
+            copy_atom_r2g = cute.make_copy_atom(
+                cute.nvgpu.CopyUniversalOp(),
+                cutlass.Float32
+            )
+            tiled_copy_r2g = cute.make_tiled_copy(
+                copy_atom_r2g,
+                epilogue_thread_layout,
+                (128, 1)
+            )
+            thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+            
+
+            # auxiliary tensors
+            # [tileM]
+            gLabels = cute.local_tile(
+                mLabels,
+                (self.epi_tile[0],),
+                (pidm,)
+            )
+
+            tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)]
+            tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean)
+            # [(1, 1), 1]
+            tLabelsCAcc_mask[0] = cute.elem_less(
+                pidm * self.epi_tile[0] + tidx,
+                problem_mnk[0]
+            )
+            # to align shape with gMax and gAccu
+            tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask)
+
+            # [(1, 1), 1, 1]
+            tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
+            tLabelsrLabels = cute.make_fragment(tLabelsgLabels.shape, tLabelsgLabels.element_type)
+            cute.copy(
+                tiled_copy_g2r,
+                tLabelsgLabels,
+                tLabelsrLabels,
+                pred=tLabelsCAcc_mask
+            )
+            valid_mask: cutlass.Boolean =\
+                (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[0]
+
+            # [tileM, 1]
+            gMax = cute.local_tile(
+                mMax,
+                (self.epi_tile[0], 1),
+                (pidm, pidn)
+            )
+            # [(CPYM, CPYN), loopM, loopN]
+            tR2GgMax = thr_copy_r2g.partition_D(gMax)
+            tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type)
+            tR2GrMax.fill(-1e30)
+
+            # [tileM, 1]
+            gAccu = cute.local_tile(
+                mAccu,
+                (self.epi_tile[0], 1),
+                (pidm, pidn)
+            )
+            # [(CPYM, CPYN), loopM, loopN]
+            tR2GgAccu = thr_copy_r2g.partition_D(gAccu)
+            tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type)
+            tR2GrAccu.fill(0.0)
+            
+            # [tileM, 1]
+            gLogprobs = cute.append_ones(cute.local_tile(
+                mLogprobs,
+                (self.epi_tile[0],),
+                (pidm,)
+            ))
+            # [(CPYM, CPYN), loopM, loopN]
+            tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
+            tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
+            tR2GrLogprobs.fill(0.0)
+
+            # [(tileN // num_epi_stage_per_tile, 1), 1, 1]
+            tTMEM_load_rAcc = cute.make_fragment(
+                tTMEM_load_cAcc_shape,
+                self.acc_dtype
+            )
+
+            for n in cutlass.range(num_n_tiles):
+                mma_pipeline.consumer_wait(mma_consumer_state)
+
+                left: cutlass.Int64 = (
+                    block_vocab_left_idx + n * self.epi_tile[1]
+                )
+                right: cutlass.Int64 = (
+                    min((n + 1) * self.epi_tile[1] + block_vocab_left_idx, 
+                        block_vocab_right_idx)
+                )
+                num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                    (right - left),
+                    cute.size(tTMEM_load_rAcc, mode=[0])
+                )
+                for n_subtile in cutlass.range(num_n_subtiles):
+                    cute.copy(
+                        tiled_copy_t2r,
+                        tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                        tTMEM_load_rAcc
+                    )
+
+                    for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
+                        local_position: cutlass.Int64 = (
+                            n * self.epi_tile[1]
+                            + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                            + idx
+                        )
+                        if (block_vocab_left_idx + local_position) < block_vocab_right_idx:
+                            _max_old = tR2GrMax[0]
+                            tR2GrMax[0] = cute.arch.fmax(tR2GrMax[0], tTMEM_load_rAcc[idx])
+                            exp_logits = cute.exp(tTMEM_load_rAcc[idx] - tR2GrMax[0])
+                            coeff = cute.exp(_max_old - tR2GrMax[0])
+                            tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
+
+                            position: cutlass.Int64 = (
+                                rank * problem_mnk[1]
+                                + pidn * self.vocab_per_split
+                                + local_position
+                            )
+                            mask: cutlass.Boolean = valid_mask and (position == tLabelsrLabels[0])
+                            tR2GrLogprobs[0] += (mask * tTMEM_load_rAcc[idx])
+
+                mma_pipeline.consumer_release(mma_consumer_state)
+                mma_consumer_state.advance()
+
+            cute.copy(
+                tiled_copy_r2g,
+                tR2GrMax,
+                tR2GgMax,
+                pred=tLabelsCAcc_mask
+            )
+            cute.copy(
+                tiled_copy_r2g,
+                tR2GrAccu,
+                tR2GgAccu,
+                pred=tLabelsCAcc_mask
+            )
+
+            vocab_left_idx: cutlass.Int64 = (
+                rank * problem_mnk[1]
+                + pidn * self.vocab_per_split
+            )
+            vocab_right_idx: cutlass.Int64 = (
+                rank * problem_mnk[1]
+                + min((pidn + 1) * self.vocab_per_split, problem_mnk[1])
+            )
+            valid: cutlass.Boolean = (
+                tLabelsrLabels[0] >= vocab_left_idx
+                and tLabelsrLabels[0] < vocab_right_idx
+            )
+            tLabelsCAcc_mask[0] &= valid
+
+            cute.copy(
+                tiled_copy_r2g,
+                tR2GrLogprobs,
+                tR2GgLogprobs,
+                pred=tLabelsCAcc_mask
+            )
+
+        # Dealloc TMEM
+        self.cta_sync_barrier.arrive_and_wait()
+        if warp_idx == self.empty_warp_ids[0]:
+            cute.arch.relinquish_tmem_alloc_permit()
+            cute.arch.dealloc_tmem(
+                tmem_ptr, 
+                self.tmem_alloc_cols,
+                is_two_cta=self.use_2cta_instrs
+            )
+
+    @staticmethod
+    def _compute_grid(
+        problem_mnk: Tuple[int, int, int],
+        cluster_shape_mn: Tuple[int, int],
+        cta_tiler: Tuple[int, int, int],
+        num_splits: int
+    ) -> Tuple[int, int, int]:
+
+        cluster_shape = (*cluster_shape_mn, 1)
+
+        grid = cute.round_up(
+            (
+                cute.ceil_div(problem_mnk[0], cta_tiler[0]),
+                num_splits,
+                1,
+            ),
+            cluster_shape
+        )
+        return grid
+
+    @cute.jit
+    def __call__(
+        self,
+        hidden: cute.Tensor,
+        weight: cute.Tensor,
+        labels: cute.Tensor,
+        _logprobs: cute.Tensor,
+        _max: cute.Tensor,
+        _accu: cute.Tensor,
+        ignore_index: cutlass.Int64,
+        rank: cutlass.Int32,
+        stream: cuda.CUstream,
+    ) -> None:
+        a_dtype: Type[cutlass.Numeric] = hidden.element_type
+        b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+        if cutlass.const_expr(hidden.element_type != weight.element_type):
+            raise RuntimeError(f"data type don't match: {hidden.element_type} v.s. {weight.element_type}")
+        if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+            raise RuntimeError("hidden can only be FP16 or BF16")
+        if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+            raise RuntimeError("K dimension doesn't match")
+        
+        problem_mnk = (
+            hidden.layout.shape[0],
+            weight.layout.shape[0],
+            hidden.layout.shape[1],
+        )
+        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+            raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+
+        num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split)
+        # if cutlass.const_expr(_max.layout.shape != (hidden.layout.shape[0], num_splits)):
+        #     raise RuntimeError(f"max shape mismatch: {_max.layout.shape} != ({hidden.layout.shape[0]}, {num_splits})")
+        # if cutlass.const_expr(_accu.layout.shape != (hidden.layout.shape[0], num_splits)):
+        #     raise RuntimeError(f"accu shape mismatch: {_accu.layout.shape} != ({hidden.layout.shape[0]}, {num_splits})")
+
+        grid = self._compute_grid(
+            problem_mnk = problem_mnk,
+            cluster_shape_mn = self.cluster_shape_mn,
+            cta_tiler = self.cta_tiler,
+            num_splits = num_splits
+        )
+        a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+        b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+        
+        tiled_mma = sm100_utils.make_trivial_tiled_mma(
+            a_dtype,
+            a_major_mode,
+            b_major_mode,
+            self.acc_dtype,
+            self.cta_group,
+            self.mma_tiler[:2]
+        )
+
+        self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 128 != 0):
+            raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}")
+
+        self.epi_tile = self.mma_tiler[:2]
+        
+        # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+        a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+            tiled_mma,
+            self.mma_tiler,
+            a_dtype,
+            self.num_a_stage
+        )
+        # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+        b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+            tiled_mma,
+            self.mma_tiler,
+            b_dtype,
+            self.num_b_stage
+        )
+
+        # TMA loading
+        tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+        tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+        # Swizzle o [(tileM, tileK), loopM, loopK]
+        a_smem_layout = cute.select(
+            a_smem_layout_staged,
+            mode=[0, 1, 2]
+        )
+        # create tma copy atom for hidden,
+        # and the cooresponding tma descriptor tensor
+        tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A(
+            tma_load_op,
+            hidden, # gmem_tensor
+            a_smem_layout, # SMEM layout
+            self.mma_tiler, # MMA tiler
+            tiled_mma, # TiledMMA
+            self.cluster_layout_vmnk.shape # cluster_shape_vmnk
+        )
+        # Swizzle o [(tileN, tileK), loopN, loopK]
+        b_smem_layout = cute.select(
+            b_smem_layout_staged,
+            mode=[0, 1, 2]
+        )
+        tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B(
+            tma_load_op,
+            weight, # gmem_tensor
+            b_smem_layout, # SMEM layout
+            self.mma_tiler, # MMA tiler
+            tiled_mma, # TiledMMA
+            self.cluster_layout_vmnk.shape # cluster_shape_vmnk
+        )
+        a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+        b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+        self.tma_copy_a_bytes = a_copy_size
+        self.tma_copy_b_bytes = b_copy_size
+
+        assert self.num_a_stage == self.num_b_stage
+        @cute.struct
+        class SharedStorage:
+            # pipeline barriers, 2 = producer + consumer
+            load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2]
+            mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+            tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+            # tmem holding buffer
+            tmem_holding_buf: cutlass.Int32
+            # SMEM tensors
+            sA: cute.struct.Align[
+                cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                self.buffer_align_bytes,
+            ]
+            sB: cute.struct.Align[
+                cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                self.buffer_align_bytes,
+            ]
+        self.shared_storage = SharedStorage
+
+        # launch kernel
+        self.kernel(
+            tiled_mma,
+            tma_atom_a,
+            tma_desc_a,
+            tma_atom_b,
+            tma_desc_b,
+            labels,
+            _max,
+            _accu,
+            _logprobs,
+            a_smem_layout_staged,
+            b_smem_layout_staged,
+            self.cluster_layout_vmnk,
+            problem_mnk,
+            ignore_index,
+            rank,
+        ).launch(
+            grid=grid,
+            block=[self.threads_per_cta, 1, 1],
+            cluster=self.cluster_shape_mnk,
+            stream=stream,
+        )
+        return None
+
+
+if __name__ == "__main__":
+    rank = 0
+
+    vocab_per_split = 512 * 6
+    fwd_mainloop = FwdMainLoop(
+        vocab_per_split=vocab_per_split
+    ) # use default arguments
+
+    torch.manual_seed(1111)
+
+    num_tokens = 13092
+    hidden_size = 4096
+    vocab_size = 152064
+    # num_tokens = 4
+    # hidden_size = 64
+    # vocab_size = 512
+    dtype = torch.bfloat16
+    ignore_index = -100
+
+    hidden = (
+        torch.empty((num_tokens, hidden_size), dtype=dtype, device="cuda")
+        .uniform_(-0.5, 0.5)
+    )
+    weight = (
+        torch.empty((vocab_size, hidden_size), dtype=dtype, device="cuda")
+        .uniform_(-0.5, 0.5)
+    )
+    # hidden = torch.ones((num_tokens, hidden_size), dtype=dtype, device="cuda")
+    # weight = torch.ones((vocab_size, hidden_size), dtype=dtype, device="cuda")
+    labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")
+
+    # pad 1 ignore_index to the right
+    padded_labels = torch.nn.functional.pad(
+        labels, (0, 1), value=ignore_index
+    )
+    # remove first element
+    labels = padded_labels[..., 1:].contiguous()
+
+    # allocate output tensor
+    logprobs = torch.empty((num_tokens), dtype=torch.float32, device="cuda")
+    
+    # allocate intermediate tensors
+    num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+    _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+    _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+
+
+    # compile kernel
+    _hidden = from_dlpack(hidden, assumed_align=16).mark_compact_shape_dynamic(mode=0, divisibility=1)
+    _weight = from_dlpack(weight, assumed_align=16)
+    _labels = from_dlpack(labels, assumed_align=8).mark_compact_shape_dynamic(mode=0)
+    _logprobs = from_dlpack(logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+    _max_ = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(mode=0)
+    _accu_ = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(mode=0)
+    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+    compiled = cute.compile(fwd_mainloop, 
+        _hidden, _weight, _labels, _logprobs, 
+        _max_, _accu_, 
+        ignore_index,
+        rank,
+        stream)
+    
+    # launch kernel
+    start, stop = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
+
+    with torch.cuda.nvtx.range("FwdMainLoop"):
+        start.record(stream=torch.cuda.current_stream())
+        compiled(_hidden, _weight, _labels, _logprobs, _max_, _accu_, ignore_index, rank, stream)
+        stop.record(stream=torch.cuda.current_stream())
+
+    torch.cuda.synchronize()
+
+    elapsed_time = start.elapsed_time(stop)
+
+    gemm = torch.matmul(hidden.to(torch.float32), weight.T.to(torch.float32))
+    # print(gemm)
+
+    # print(_max)
+    # print(_accu)
+    # print(logprobs)
+
+    cut_max, _ = torch.max(_max, dim=1)
+    print(cut_max)
+    # for i in range(cut_max.shape[0]):
+    #     print(i, cut_max[i])
+
+    torch_max, _ = torch.max(gemm, dim=1)
+    print(torch_max)
+
+    print(f"Success, Elapsed time: {elapsed_time:.4f} ms")
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
new file mode 100644
index 00000000000..fd3f14236fb
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
@@ -0,0 +1,303 @@
+import triton
+import triton.language as tl
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE_M": 1024}, num_stages=3, num_warps=32),
+        triton.Config({"BLOCK_SIZE_M": 2048}, num_stages=3, num_warps=32),
+    ],
+    key=["num_tokens"],
+)
+@triton.jit
+def get_num_valid_tokens(
+    num_tokens: tl.int64,
+    ignore_index: tl.int64,
+    labels_ptr: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
+    BLOCK_SIZE_M: tl.constexpr,
+):
+    """
+    Calculate the number of valid tokens in the labels tensor.
+    """
+    num_pid_m: tl.int64 = tl.cdiv(num_tokens, BLOCK_SIZE_M)
+
+    num_valid_tokens: tl.int64 = tl.zeros((), dtype=tl.int64)
+    for m in range(0, num_pid_m):
+        offs_am = m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+        labels = tl.load(
+            labels_ptr + offs_am * stride_labels,
+            mask=offs_am < num_tokens,
+            other=ignore_index
+        )
+
+        valid_labels_mask = labels != ignore_index
+        num_valid_tokens += (tl.sum(valid_labels_mask.to(tl.int32), axis=0)).to(tl.int64)
+    tl.store(num_valid_tokens_ptr, num_valid_tokens)
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})
+    ],
+    key=["num_tokens", "num_splits"]
+)
+@triton.jit
+def forward_dp_epilogue(
+    num_tokens: tl.int64,
+    num_splits: tl.int64, # TODO: maybe this could be a constexpr
+    ignore_index: tl.int64,
+    labels_ptr: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
+    max_ptr: tl.pointer_type(tl.float32),
+    stride_max_m: tl.int64,
+    stride_max_n: tl.int64,
+    accu_ptr: tl.pointer_type(tl.float32),
+    stride_accu_m: tl.int64,
+    stride_accu_n: tl.int64,
+    global_max_ptr: tl.pointer_type(tl.float32),
+    stride_global_max: tl.int64,
+    global_accu_ptr: tl.pointer_type(tl.float32),
+    stride_global_accu: tl.int64,
+    global_logprobs_ptr: tl.pointer_type(tl.float32),
+    stride_global_logprobs: tl.int64,
+    global_logprobs_scalar_ptr: tl.pointer_type(tl.float32),
+    REDUCTION: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """
+    forward epilogue in dp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+    global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+
+    for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)):
+        offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+        _max = tl.load(
+            max_ptr + offs_m[:, None] * stride_max_m + offs_n[None, :] * stride_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _accu = tl.load(
+            accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+
+        # local reduction
+        _max_old = global_max
+        _local_max = tl.max(_max, axis=1, return_indices=False)
+        global_max = tl.maximum(global_max, _local_max)
+
+        _scale = tl.exp(_max - global_max[:, None])
+        _coeff = tl.exp(_max_old - global_max)
+        global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
+
+    # store maximum
+    tl.store(
+        global_max_ptr + offs_m * stride_global_max,
+        global_max,
+        mask=offs_m < num_tokens,
+    )
+    # store accumulate
+    tl.store(
+        global_accu_ptr + offs_m * stride_global_accu,
+        global_accu,
+        mask=offs_m < num_tokens,
+    )
+    # update logprobs
+    labels = tl.load(
+        labels_ptr + offs_m * stride_labels,
+        mask=offs_m < num_tokens,
+        other=ignore_index,
+    )
+    global_logprobs_ptrs = global_logprobs_ptr + offs_m * stride_global_logprobs
+    global_logprobs = tl.load(
+        global_logprobs_ptrs,
+        mask=offs_m < num_tokens,
+    )
+    global_logprobs = global_max + tl.log(global_accu) - global_logprobs
+    label_mask = labels != ignore_index
+    global_logprobs = tl.where(label_mask, global_logprobs, 0.0)
+
+    if REDUCTION == 0: # no-reduction
+        tl.store(
+            global_logprobs_ptrs,
+            global_logprobs,
+            mask=offs_m < num_tokens,
+        )
+    elif REDUCTION == 1: # sum
+        global_logprobs_scalar = tl.sum(global_logprobs, axis=0)
+        tl.atomic_add(
+            global_logprobs_scalar_ptr,
+            global_logprobs_scalar
+        )
+    elif REDUCTION == 2: # mean
+        num_valid_tokens = tl.load(num_valid_tokens_ptr)
+        global_logprobs_scalar = tl.fdiv(
+            tl.sum(global_logprobs, axis=0),
+            num_valid_tokens.to(tl.float32),
+        )
+        tl.atomic_add(
+            global_logprobs_scalar_ptr,
+            global_logprobs_scalar
+        )
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64}),
+    ],
+    key=["num_tokens", "num_splits"]
+)
+@triton.jit
+def forward_tp_epilogue(
+    num_tokens: tl.int64,
+    num_splits: tl.int64,
+    reduced_max_ptr: tl.pointer_type(tl.float32),
+    stride_reduced_max_m: tl.int64,
+    stride_reduced_max_n: tl.int64,
+    original_max_ptr: tl.pointer_type(tl.float32),
+    stride_original_max_m: tl.int64,
+    stride_original_max_n: tl.int64,
+    accu_ptr: tl.pointer_type(tl.float32),
+    stride_accu_m: tl.int64,
+    stride_accu_n: tl.int64,
+    global_max_ptr: tl.pointer_type(tl.float32),
+    stride_global_max: tl.int64,
+    global_accu_ptr: tl.pointer_type(tl.float32),
+    stride_global_accu: tl.int64,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """
+    forward epilogue in tp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+    global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+    global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+
+    for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)):
+        offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+        _reduced_max = tl.load(
+            reduce_max_ptr + offs_m[:, None] * stride_reduce_max_m + offs_n[None, :] * stride_reduce_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _original_max = tl.load(
+            original_max_ptr + offs_m[:, None] * stride_original_max_m + offs_n[None, :] * stride_original_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _accu = tl.load(
+            accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+
+        # local reduction
+        _max_old = global_max
+        _local_max = tl.max(_reduced_max, axis=1)
+        global_max = tl.maximum(global_max, _local_max)
+
+        # update accumulate
+        _coeff = tl.exp(_max_old - global_max)
+        _scale = tl.exp(_original_max - global_max[:, None])
+        global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
+
+    # store
+    tl.store(
+        global_max_ptr + offs_m * stride_global_max,
+        global_max,
+        mask=offs_m < num_tokens,
+    )
+    tl.store(
+        global_accu_ptr + offs_m * stride_global_accu,
+        global_accu,
+        mask=offs_m < num_tokens
+    )
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE_M": 16})
+    ],
+    key=["num_tokens"]
+)
+@triton.jit
+def forward_tp_epilogue_update_logprobs(
+    num_tokens: tl.int64,
+    ignore_index: tl.int64,
+    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
+    labels_ptr: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    logprobs_ptr: tl.pointer_type(tl.float32),
+    stride_logprobs: tl.int64,
+    maximum_ptr: tl.pointer_type(tl.float32),
+    stride_maximum: tl.int64,
+    accumulate_ptr: tl.pointer_type(tl.float32),
+    stride_accumulate: tl.int64,
+    logprobs_scalar_ptr: tl.pointer_type(tl.float32),
+    REDUCTION: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+):
+    """
+    update logprobs in tp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+    logprobs = tl.load(
+        logprobs_ptr + offs_m * stride_logprobs,
+        mask=offs_m < num_tokens,
+    )
+    maximum = tl.load(
+        maximum_ptr + offs_m * stride_maximum,
+        mask=offs_m < num_tokens,
+    )
+    accumulate = tl.load(
+        accumulate_ptr + offs_m * stride_accumulate,
+        mask=offs_m < num_tokens,
+    )
+
+    labels = tl.load(
+        labels_ptr + offs_m * stride_labels,
+        mask=offs_m < num_tokens,
+        other=ignore_index,
+    )
+    label_mask = labels != ignore_index
+
+    logprobs = maximum + tl.log(accumulate) - logprobs
+    logprobs = tl.where(label_mask, logprobs, 0.0)
+
+    if REDUCTION == 0: # no-reduction
+        tl.store(
+            logprobs_ptr + offs_m * stride_logprobs,
+            logprobs,
+            mask=offs_m < num_tokens,
+        )
+    elif REDUCTION == 1: # sum
+        logprobs_scalar = tl.sum(logprobs, axis=0)
+        tl.atomic_add(
+            logprobs_scalar_ptr,
+            logprobs_scalar
+        )
+    elif REDUCTION == 2: # mean
+        num_valid_tokens = tl.load(num_valid_tokens_ptr)
+        logprobs_scalar = tl.fdiv(
+            tl.sum(logprobs, axis=0),
+            num_valid_tokens.to(tl.float32),
+        )
+        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/utils.py b/megatron/core/fusions/linear_cross_entropy/utils.py
new file mode 100644
index 00000000000..642a6b3b230
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/utils.py
@@ -0,0 +1,35 @@
+import typing
+from dataclasses import dataclass
+
+@dataclass
+class EntropyReductionEnum:
+    """
+    Enum for the reduction method of cross entropy.
+    """
+    kNone = 0
+    kSum = 1
+    kMean = 2
+
+def str_to_reduction_enum(reduction: str) -> EntropyReductionEnum:
+    """
+    str -> EntropyReductionEnum
+    """
+    _enum = EntropyReductionEnum.kNone
+    if reduction == "none":
+        _enum = EntropyReductionEnum.kNone
+    elif reduction == "sum":
+        _enum = EntropyReductionEnum.kSum
+    elif reduction == "mean":
+        _enum = EntropyReductionEnum.kMean
+    else:
+        raise ValueError(f"Invalid reduction: {reduction}")
+    return _enum
+
+@dataclass
+class BackwardMethodEnum:
+    # two separate kernels for d_hidden and d_weight, respectively
+    kTwoKernels = 0
+    # calculate partial d_logits along its N dimension
+    kDlogitsSplitN = 1
+    # fuse d_hidden and d_weight into a single kernel
+    kFused = 2
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index 4d0ae55b666..a4d759046f9 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -22,6 +22,9 @@
 )
 from tests.unit_tests.test_utilities import Utils
 
+from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
+
+import os
 
 class MockDataset(Dataset):
     """
@@ -132,8 +135,11 @@ def init_gpt_dataloader(
     return dataloader
 
 
-class TestFusedLinearCrossEntropy:
-
+@pytest.mark.skipif(
+    "WORLD_SIZE" not in os.environ or os.environ["WORLD_SIZE"] < "2",
+    reason="Requires torchrun with multiple GPUs"
+)
+class TestFusedLinearCrossEntropyOnGptModel:
     @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags())
     @pytest.mark.parametrize("mtp_layers", [0, 1])
     @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types())
@@ -187,3 +193,373 @@ def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
                 output = gpt_model(**batch)
                 loss = output.sum()
                 loss.backward()
+
+
+@pytest.mark.skipif(
+    "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1",
+    reason="Requires single GPU"
+)
+class TestFusedLinearCrossEntropyDataParallel:
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: str,
+        ignore_index: int
+    ):
+        # NOTE: need to convert to fp32 to fp32 accumulation,
+        # thus assure accuracy
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]),
+            labels.view(-1),
+            reduction=reduction,
+            ignore_index=ignore_index,
+        )
+        return logprobs.to(torch.float32)
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 152063, 8192),
+            ((1, 4096), 152064, 8192),
+            ((2, 4096), 152064, 8192),
+        ]
+
+    @staticmethod
+    def get_ignore_index():
+        return [-100, 4]
+
+    def test_kernel_launch(self):
+        """
+        Check if the compiled kernel can be
+        launched with different problem sizes
+        """
+        self.cleanup()
+
+        num_tokens = [15, 26, 128, 513, 2048, 8192]
+        vocab_size = 152064
+        dim = 4096
+        dtype = torch.bfloat16
+        reduction = "mean"
+        ignore_index = -100
+
+        weight = torch.randn(vocab_size, dim, dtype=dtype, device="cuda").requires_grad_()
+        for num_token in num_tokens:
+            hidden = torch.randn(num_token, dim, dtype=dtype, device="cuda").requires_grad_()
+            labels = torch.randint(0, vocab_size, (num_token,), dtype=torch.long, device="cuda")
+            
+            logprobs = linear_cross_entropy(hidden, weight, labels, reduction=reduction, ignore_index=ignore_index)
+            assert not torch.isnan(logprobs).any()
+
+            gLogprobs = torch.randn_like(logprobs)
+            (d_hidden, d_weight) = torch.autograd.grad(
+                (logprobs,),
+                (hidden, weight),
+                (gLogprobs,),
+                retain_graph=False
+            )
+            assert not torch.isnan(d_hidden).any()
+            assert not torch.isnan(d_weight).any()
+
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("problem", get_problems())
+    @pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
+    @pytest.mark.parametrize("ignore_index", get_ignore_index())
+    def test_correctness(
+        self,
+        dtype,
+        problem,
+        reduction,
+        ignore_index
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+        
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+        if ignore_index >=0 and ignore_index < vocabsize:
+            pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+            labels = pad_labels[..., 1:].contiguous()
+
+        # forward
+        torch_logprobs = self.torch_linear_cross_entropy(hidden, weight, labels, 
+            reduction=reduction, ignore_index=ignore_index)
+
+        custom_logprobs = linear_cross_entropy(hidden, weight, labels, 
+            reduction=reduction, ignore_index=ignore_index)
+
+        torch.testing.assert_close(
+            torch_logprobs,
+            custom_logprobs
+        )
+
+        # backward
+        g_logprobs = (
+            torch.empty_like(torch_logprobs)
+            .uniform_(-0.1, 0.1)
+        )
+
+        (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+            (torch_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+            (custom_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False)
+
+        torch.testing.assert_close(
+            d_torch_hidden,
+            d_custom_hidden,
+            atol=1e-3,
+            rtol=1e-3
+        )
+        torch.testing.assert_close(
+            d_torch_weight,
+            d_custom_weight,
+            atol=1e-3,
+            rtol=1e-3
+        )
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    @pytest.mark.parametrize("ignore_index", [-100])
+    def test_performance(
+        self,
+        problem,
+        dtype,
+        reduction,
+        ignore_index
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >=0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            # -------- forward -------- #
+            start_event.record()
+            torch_logprobs = self.torch_linear_cross_entropy(
+                hidden, weight, labels,
+                reduction=reduction, 
+                ignore_index=ignore_index
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(
+                start_event.elapsed_time(end_event)
+            )
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                reduction=reduction, 
+                ignore_index=ignore_index
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(
+                start_event.elapsed_time(end_event)
+            )
+
+            # -------- backward -------- #
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+
+            start_event.record()
+            (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(
+                start_event.elapsed_time(end_event)
+            )
+
+            start_event.record()
+            (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(
+                start_event.elapsed_time(end_event)
+            )
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        print()
+        print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:")
+        print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
+        print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
+        print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
+        print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    @pytest.mark.parametrize("ignore_index", [-100])
+    def test_storage(
+        self,
+        problem,
+        dtype,
+        reduction,
+        ignore_index
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+        print()
+        print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:")
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >=0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.torch_linear_cross_entropy(
+                hidden, weight, labels,
+                reduction=reduction, 
+                ignore_index=ignore_index
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+
+            torch.cuda.reset_peak_memory_stats()
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Torch Backward pass peak memory: {torch_backward_max_memory:.2f} MB")
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >=0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                reduction=reduction, 
+                ignore_index=ignore_index
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+
+            torch.cuda.reset_peak_memory_stats()
+            g_logprobs = (
+                torch.empty_like(custom_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Custom Backward pass peak memory: {custom_backward_max_memory:.2f} MB")
+
+        
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
\ No newline at end of file

From 14e5360008cb22b76edd4293712a575861c956e7 Mon Sep 17 00:00:00 2001
From: Jianbing-D <69858819+Jianbing-D@users.noreply.github.com>
Date: Thu, 6 Nov 2025 16:36:49 +0800
Subject: [PATCH 04/17] remove tl.pointer_type() type annotation for triton
 kernels (#3)

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../linear_cross_entropy/blackwell/triton.py  | 44 ++++++++++---------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
index fd3f14236fb..436ede683d0 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
@@ -1,6 +1,8 @@
 import triton
 import triton.language as tl
 
+# NOTE: tl.pointer_type() is not available in Triton 3.3.0
+
 @triton.autotune(
     configs=[
         triton.Config({"BLOCK_SIZE_M": 1024}, num_stages=3, num_warps=32),
@@ -12,9 +14,9 @@
 def get_num_valid_tokens(
     num_tokens: tl.int64,
     ignore_index: tl.int64,
-    labels_ptr: tl.pointer_type(tl.int64),
+    labels_ptr,#: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
+    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
     BLOCK_SIZE_M: tl.constexpr,
 ):
     """
@@ -48,22 +50,22 @@ def forward_dp_epilogue(
     num_tokens: tl.int64,
     num_splits: tl.int64, # TODO: maybe this could be a constexpr
     ignore_index: tl.int64,
-    labels_ptr: tl.pointer_type(tl.int64),
+    labels_ptr,#: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
-    max_ptr: tl.pointer_type(tl.float32),
+    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
+    max_ptr,#: tl.pointer_type(tl.float32),
     stride_max_m: tl.int64,
     stride_max_n: tl.int64,
-    accu_ptr: tl.pointer_type(tl.float32),
+    accu_ptr,#: tl.pointer_type(tl.float32),
     stride_accu_m: tl.int64,
     stride_accu_n: tl.int64,
-    global_max_ptr: tl.pointer_type(tl.float32),
+    global_max_ptr,#: tl.pointer_type(tl.float32),
     stride_global_max: tl.int64,
-    global_accu_ptr: tl.pointer_type(tl.float32),
+    global_accu_ptr,#: tl.pointer_type(tl.float32),
     stride_global_accu: tl.int64,
-    global_logprobs_ptr: tl.pointer_type(tl.float32),
+    global_logprobs_ptr,#: tl.pointer_type(tl.float32),
     stride_global_logprobs: tl.int64,
-    global_logprobs_scalar_ptr: tl.pointer_type(tl.float32),
+    global_logprobs_scalar_ptr,#: tl.pointer_type(tl.float32),
     REDUCTION: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -161,18 +163,18 @@ def forward_dp_epilogue(
 def forward_tp_epilogue(
     num_tokens: tl.int64,
     num_splits: tl.int64,
-    reduced_max_ptr: tl.pointer_type(tl.float32),
+    reduced_max_ptr,#: tl.pointer_type(tl.float32),
     stride_reduced_max_m: tl.int64,
     stride_reduced_max_n: tl.int64,
-    original_max_ptr: tl.pointer_type(tl.float32),
+    original_max_ptr,#: tl.pointer_type(tl.float32),
     stride_original_max_m: tl.int64,
     stride_original_max_n: tl.int64,
-    accu_ptr: tl.pointer_type(tl.float32),
+    accu_ptr,#: tl.pointer_type(tl.float32),
     stride_accu_m: tl.int64,
     stride_accu_n: tl.int64,
-    global_max_ptr: tl.pointer_type(tl.float32),
+    global_max_ptr,#: tl.pointer_type(tl.float32),
     stride_global_max: tl.int64,
-    global_accu_ptr: tl.pointer_type(tl.float32),
+    global_accu_ptr,#: tl.pointer_type(tl.float32),
     stride_global_accu: tl.int64,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -239,16 +241,16 @@ def forward_tp_epilogue(
 def forward_tp_epilogue_update_logprobs(
     num_tokens: tl.int64,
     ignore_index: tl.int64,
-    num_valid_tokens_ptr: tl.pointer_type(tl.int64),
-    labels_ptr: tl.pointer_type(tl.int64),
+    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
+    labels_ptr,#: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    logprobs_ptr: tl.pointer_type(tl.float32),
+    logprobs_ptr,#: tl.pointer_type(tl.float32),
     stride_logprobs: tl.int64,
-    maximum_ptr: tl.pointer_type(tl.float32),
+    maximum_ptr,#: tl.pointer_type(tl.float32),
     stride_maximum: tl.int64,
-    accumulate_ptr: tl.pointer_type(tl.float32),
+    accumulate_ptr,#: tl.pointer_type(tl.float32),
     stride_accumulate: tl.int64,
-    logprobs_scalar_ptr: tl.pointer_type(tl.float32),
+    logprobs_scalar_ptr,#: tl.pointer_type(tl.float32),
     REDUCTION: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
 ):

From 45dfe42f8f8270d73a4b156d1476ffcb93c02d73 Mon Sep 17 00:00:00 2001
From: Jianbing-D <69858819+Jianbing-D@users.noreply.github.com>
Date: Fri, 7 Nov 2025 20:57:27 +0800
Subject: [PATCH 05/17] Support Tensor Parallel and Sequence Parallel (#4)

* added unit-test for TP

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

* add sequence-parallel and its unit-test

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

---------

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     |   87 +-
 .../blackwell/bwd_partial_dlogits.py          |   11 +-
 .../linear_cross_entropy/blackwell/entry.py   |   70 +-
 .../linear_cross_entropy/blackwell/triton.py  |    2 +-
 .../test_fused_linear_cross_entropy.py        | 1178 ++++++++++++++++-
 5 files changed, 1319 insertions(+), 29 deletions(-)

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index e3fccc92a4d..a08735952dc 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -43,36 +43,101 @@ def forward(
         tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
         reduction: typing.Optional[str] = "mean",
         ignore_index: typing.Optional[int] = -100,
+        sequence_parallel: typing.Optional[bool] = False,
     ) -> torch.Tensor:
         """
         The forward pass of the Linear Cross Entropy.
-        If tp_group is not None, the weight tensor to each TP rank should be (vocab_size // world_size, dim).
+        If tp_group is not None, the weight tensor to each TP rank should be (global_vocab_size // world_size, dim).
         Note that each of the ranks should get equal shards along the vocab_size dimension.
 
         Args:
             @param hidden: the input tensor with shape (num_tokens, dim)
-            @param weight: the lm_head weight tensor with shape (vocab_size, dim)
+            @param weight: the lm_head weight tensor with shape (local_vocab_size, dim)
             @param labels: the labels tensor with shape (num_tokens,)
             @param tp_group: the distributed process group for TP.
             @param reduction: Default to "mean", and can be one of "none", "sum", "mean".
             @param ignore_index: The index to ignore. Default to -100.
+            @param sequence_parallel: Whether to use sequence parallel. Default to False.
         Returns:
             @return: logprobs with shape
                 - either (num_tokens,) when reduction is "none"
                 - or (1,) when reduction is "mean" or "sum"
 
+        tp_group is None ----------------------------------> DP
+                B
+            A   C
+        tp_group is not None & sequence_parallel is False -> TP
+                B0  B1
+            A   C0  C1
+        tp_group is not None & sequence_parallel is True --> SP
+                B0  B1
+            A0  C0  XX
+            A1  XX  C1
+
+        When tp_group is not None, the weight tensor will be split along the vocab_size dimension, 
+        which means each rank will get equal shards along the global_vocab_size dimension.
+        Specifically, the weight tensor to each rank will be (local_vocab_size, dim). 
+        And there is an assumption that each rank will get the same local_vocab_size.
+
+        When sequence_parallel is True, the hidden tensor will be split along the sequence length dimension,
+        which means each rank will get equal shards along the sequence length dimension.
+        Specifically, the hidden tensor to each rank will be (local_num_tokens, dim).
+        And there is an assumption that each rank will get the same local_num_tokens.
+
+        In TP forward pass, the hidden tensor and label tensor shall be identical among all TP ranks,
+        and it's user's responsibility to ensure the hidden tensor is identical among all TP ranks.
+        Then this operation will produce identical logprobs among all TP ranks.
+
+        In TP backward pass, the gradient of the logprobs shall be identical among all TP ranks,
+        and it's user's responsibility to ensure the gradient of the logprobs is identical among all TP ranks.
+        Then this operation will produce distinct gradients for the local weight tensor,
+        and identical gradients for the hidden tensor. 
+
+        ```python
+        # ------------ forward pass ------------ #
+        hidden = tp_group.broadcast(hidden, src=0) # handled by framework
+        labels = tp_group.broadcast(labels, src=0) # handled by framework
+        logprobs = linear_cross_entropy(...)
+        # each rank will get the same logprobs
+
+        # ------------ backward pass ------------ #
+        g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework
+        d_hidden, d_weight = torch.autograd.grad(...)
+        # each rank will get the same d_hidden, 
+        # and distinct d_weight for local weight shard
+        ```
+
+        In SP forward pass, the hidden tensor shall be split along the sequence length dimension, 
+        and the label tensor shall be identical among all TP ranks.
+        Then this operation will produce identical logprobs among all TP ranks.
+
+        In SP backward pass, the gradient of the logprobs shall be identical among all TP ranks,
+        Then this operation will produce distinct gradients for the local hidden tensor and weight tensor.
+        ```python
+        # ------------ forward pass ------------ #
+        hidden = global_hidden[tp_rank] # handled by framework
+        labels = tp_group.broadcast(labels, src=0) # handled by framework
+        logprobs = linear_cross_entropy(...)
+        # each rank will get the same logprobs
+
+        # ------------ backward pass ------------ #
+        g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework
+        d_hidden, d_weight = torch.autograd.grad(...)
+        # each rank will get distinct local d_hidden and d_weight
+        ```
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
-            logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size = (
+            logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size, global_hidden = (
                 forward_func(
                     hidden, weight, labels,
                     tp_group, 
                     reduction,
                     ignore_index,
+                    sequence_parallel,
                 )
             )
             ctx.save_for_backward(
-                hidden, weight, labels,
+                global_hidden, weight, labels,
                 _maximum, _acc, _num_valid_tokens,
             )
             ctx.tp_group = tp_group
@@ -80,6 +145,7 @@ def forward(
             ctx.reduction = reduction
             ctx.tp_rank = tp_rank
             ctx.tp_world_size = tp_world_size
+            ctx.sequence_parallel = sequence_parallel
 
         return logprobs
             
@@ -100,17 +166,18 @@ def backward(
             dweight (torch.Tensor): The gradient of the weight.
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
-            (hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
+            (global_hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
 
             tp_group = ctx.tp_group
             ignore_index = ctx.ignore_index
             reduction = ctx.reduction
             tp_rank = ctx.tp_rank
             tp_world_size = ctx.tp_world_size
+            sequence_parallel = ctx.sequence_parallel
 
             d_hidden, d_weight = backward_func(
                 dlogprobs,
-                hidden,
+                global_hidden,
                 weight,
                 labels,
                 _maximum,
@@ -120,10 +187,11 @@ def backward(
                 ignore_index,
                 tp_group,
                 tp_rank,
-                tp_world_size
+                tp_world_size,
+                sequence_parallel,
             )
 
-        return d_hidden, d_weight, None, None, None, None
+        return d_hidden, d_weight, None, None, None, None, None
 
 
 def linear_cross_entropy(
@@ -133,12 +201,13 @@ def linear_cross_entropy(
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
     reduction: typing.Optional[str] = "mean",
     ignore_index: typing.Optional[int] = -100,
+    sequence_parallel: typing.Optional[bool] = False,
 ) -> torch.Tensor:
     """
     helper function for linear cross entropy.
     """
     _impl = LinearCrossEntropy.apply
-    return _impl(hidden, weight, labels, tp_group, reduction, ignore_index)
+    return _impl(hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel)
 
 __all__ = [
     "linear_cross_entropy",
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
index 2d5da82ab6a..97e7c5ab493 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -30,13 +30,11 @@ def __init__(self,
                  acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
                  use_2cta_instrs: bool = False,
                  mma_tiler_mn: Tuple[int, int] = (128, 256),
-                 rank: int = 0,
                  vocab_per_split: int = 512):
         self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
         self.acc_dtype = acc_dtype
         self.use_2cta_instrs = use_2cta_instrs
         self.mma_tiler = (*mma_tiler_mn, 1)
-        self.rank = rank
         self.vocab_per_split = vocab_per_split
 
         self.cta_group = (
@@ -150,6 +148,7 @@ def kernel(
         b_smem_layout_staged: cute.ComposedLayout,
         cluster_layout_vmnk: cute.Layout,
         problem_mnk: Tuple[int, int, int],
+        rank: cutlass.Int32,
     ) -> None:
         warp_idx = cute.arch.make_warp_uniform(
             cute.arch.warp_idx()
@@ -623,7 +622,7 @@ def kernel(
                     tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
 
                     position: cutlass.Int64 = (
-                        self.rank * problem_mnk[1]
+                        rank * problem_mnk[1]
                         + split_idx * self.vocab_per_split
                         + pidn * self.epi_tile[1]
                         + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
@@ -675,6 +674,7 @@ def __call__(
         dlogits_partial: cute.Tensor,
         scalarNumValidTokens: cute.Pointer,
         ignore_index: cutlass.Int64,
+        rank: cutlass.Int32,
         stream: cuda.CUstream,
     ) -> None:
         a_dtype: Type[cutlass.Numeric] = hidden.element_type
@@ -801,6 +801,7 @@ class SharedStorage:
             b_smem_layout_staged,
             self.cluster_layout_vmnk,
             problem_mnk,
+            rank,
         ).launch(
             grid=grid,
             block=[self.threads_per_cta, 1, 1],
@@ -884,6 +885,8 @@ def get_maximum_and_accu(hidden, weight):
 
     stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
+    rank = 0
+
     compiled = cute.compile(
         bwd_kernel,
         split_idx,
@@ -896,6 +899,7 @@ def get_maximum_and_accu(hidden, weight):
         dlogits_partial_packed,
         scalarNumValidTokens_packed,
         ignore_index,
+        rank,
         stream,
     )
 
@@ -913,6 +917,7 @@ def get_maximum_and_accu(hidden, weight):
             dlogits_partial_packed,
             scalarNumValidTokens_packed,
             ignore_index,
+            rank,
             stream
         )
     stop.record(stream=torch.cuda.current_stream())
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index c59e7b40d95..e26661ca06a 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -25,10 +25,15 @@ def forward(
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
     reduction: typing.Optional[str] = "mean",
     ignore_index: typing.Optional[int] = -100,
+    sequence_parallel: typing.Optional[bool] = False,
 ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     forward host function
     """
+    tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+    tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+    in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
+
     assert hidden.is_cuda and weight.is_cuda and labels.is_cuda
     assert weight.device == hidden.device and labels.device == hidden.device
 
@@ -44,14 +49,33 @@ def forward(
     hidden_view = hidden.view(-1, hidden.shape[-1])
     labels_view = labels.view(-1)
 
-    assert hidden_view.shape[0] == labels_view.shape[0]
+    assert ((sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0])
+            or (not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]))
     assert hidden_view.shape[1] == weight.shape[1]
+
+    global_hidden = hidden
+    if in_tp_mode and sequence_parallel:
+        partial_hidden_shape = hidden.shape
+        global_hidden_shape = (
+            partial_hidden_shape[0] * tp_world_size,
+            *partial_hidden_shape[1:]
+        )
+        global_hidden = torch.empty(
+            global_hidden_shape,
+            dtype=hidden.dtype,
+            device=hidden.device
+        )
+        dist.all_gather_into_tensor(
+            global_hidden,
+            hidden,
+            group=tp_group
+        )
+        assert global_hidden.is_contiguous()
+        hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
+
     num_tokens, dim = hidden_view.shape
     vocab_size, _ = weight.shape
 
-    tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
-    tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
-
     if not hasattr(forward, "_initialized"):
         global _dedicated_stream, _dedicated_events
         _dedicated_stream = torch.cuda.Stream(hidden.device)
@@ -62,7 +86,7 @@ def forward(
     # declare logprobs
     if REDUCTION == utils.EntropyReductionEnum.kNone:
         logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
-        if tp_group is not None:
+        if in_tp_mode:
             logprobs.zero_()
     else:
         logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32)
@@ -81,7 +105,7 @@ def forward(
         _logprobs = logprobs
     else:
         _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
-        if tp_group is not None:
+        if in_tp_mode:
             _logprobs.zero_()
     assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
 
@@ -119,7 +143,7 @@ def forward(
 
     # VocabSize and Dim are fixed for a given model,
     # only the number of tokens can vary
-    key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden.dtype}"
+    key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
     if forward._fwd_mainloop_kernels.get(key) is None:
         fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(
             vocab_per_split=vocab_per_split,
@@ -151,7 +175,7 @@ def forward(
         cuda_stream
     )
     
-    if tp_group is None:
+    if not in_tp_mode:
         def grid(meta):
             return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
 
@@ -228,11 +252,11 @@ def grid(meta):
             REDUCTION,
         )
 
-    return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size
+    return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
 
 def backward(
     dlogprobs: torch.Tensor,
-    hidden: torch.Tensor,
+    global_hidden: torch.Tensor,
     weight: torch.Tensor,
     labels: torch.Tensor,
     maximum: torch.Tensor,
@@ -243,11 +267,14 @@ def backward(
     tp_group: typing.Optional[dist.ProcessGroup] = None,
     tp_rank: typing.Optional[int] = 0,
     tp_world_size: typing.Optional[int] = 1,
+    sequence_parallel: typing.Optional[bool] = False,
 ) -> typing.List[torch.Tensor]:
     """
     backward host function
     """
-    hidden_view = hidden.view(-1, hidden.shape[-1])
+    in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
+
+    hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
     labels_view = labels.view(-1)
 
     num_tokens, dim = hidden_view.shape
@@ -263,7 +290,7 @@ def backward(
 
     assert num_valid_tokens.dim() == 0 and num_valid_tokens.is_cuda and num_valid_tokens.dtype == torch.int64
 
-    d_hidden = torch.empty_like(hidden)
+    d_hidden = torch.empty_like(global_hidden)
     d_weight = torch.empty_like(weight)
     assert d_hidden.is_contiguous() and d_weight.is_contiguous()
 
@@ -275,8 +302,8 @@ def backward(
 
         _d_logits = torch.empty(
             (num_tokens, vocab_per_split),
-            device=hidden.device,
-            dtype=hidden.dtype
+            device=global_hidden.device,
+            dtype=global_hidden.dtype
         )
 
         hidden_packed = from_dlpack(
@@ -319,7 +346,7 @@ def backward(
         if not hasattr(backward, "_bwd_kernel"):
             backward._bwd_kernel = dict()
 
-        key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden.dtype}"
+        key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
         if backward._bwd_kernel.get(key) is None:
             bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
                 reduction=REDUCTION,
@@ -337,6 +364,7 @@ def backward(
                 dlogits_packed,
                 scalarNumValidTokens_packed,
                 ignore_index,
+                tp_rank,
                 stream
             )
             backward._bwd_kernel[key] = bwd_kernel_compiled
@@ -355,6 +383,7 @@ def backward(
                 dlogits_packed,
                 scalarNumValidTokens_packed,
                 ignore_index,
+                tp_rank,
                 stream
             )
             vocab_right_bound = (
@@ -381,5 +410,16 @@ def backward(
             )
     else:
         raise NotImplementedError(f"Unsupported backward method: {_backward}")
+
+    if in_tp_mode:
+        dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
+        if sequence_parallel:
+            partial_hidden_shape = (
+                global_hidden.shape[0] // tp_world_size,
+                *global_hidden.shape[1:]
+            )
+            partial_num_tokens = num_tokens // tp_world_size
+            d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :]
+            d_hidden = d_hidden.view(partial_hidden_shape).clone()
     
     return d_hidden, d_weight
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
index 436ede683d0..d7f45d152c2 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
@@ -193,7 +193,7 @@ def forward_tp_epilogue(
         offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
 
         _reduced_max = tl.load(
-            reduce_max_ptr + offs_m[:, None] * stride_reduce_max_m + offs_n[None, :] * stride_reduce_max_n,
+            reduced_max_ptr + offs_m[:, None] * stride_reduced_max_m + offs_n[None, :] * stride_reduced_max_n,
             mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
             other=0.0,
         )
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index a4d759046f9..130a2bb5a71 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -7,6 +7,7 @@
 import torch
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
+import torch.distributed as dist
 
 import megatron.core.parallel_state as ps
 from megatron.core.models.gpt.gpt_layer_specs import (
@@ -25,6 +26,7 @@
 from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
 
 import os
+import typing
 
 class MockDataset(Dataset):
     """
@@ -136,7 +138,7 @@ def init_gpt_dataloader(
 
 
 @pytest.mark.skipif(
-    "WORLD_SIZE" not in os.environ or os.environ["WORLD_SIZE"] < "2",
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),# or True,
     reason="Requires torchrun with multiple GPUs"
 )
 class TestFusedLinearCrossEntropyOnGptModel:
@@ -559,6 +561,1180 @@ def custom_storage():
             print(f"[INFO]: Custom Backward pass peak memory: {custom_backward_max_memory:.2f} MB")
 
         
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
+
+
+@pytest.mark.skipif(
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),# or True,
+    reason="Requires torchrun with multiple GPUs"
+)
+class TestFusedLinearCrossEntropyTensorParallel:
+    @classmethod
+    def setup_class(cls):
+        if dist.is_initialized():
+            cls.must_teardown = False
+        else:
+            dist.init_process_group(
+                backend="nccl",
+                init_method="env://",
+                world_size=int(os.environ["WORLD_SIZE"]),
+                rank=int(os.environ["RANK"])
+            )
+            cls.must_teardown = True
+        cls.tp_group = dist.group.WORLD
+
+        cls.tp_rank = dist.get_rank(cls.tp_group)
+        cls.tp_world_size = dist.get_world_size(cls.tp_group)
+        cls.is_chief = (cls.tp_rank == 0)
+        device = torch.device(f"cuda:{cls.tp_rank}")
+        torch.cuda.set_device(device)
+        print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
+
+    @classmethod
+    def teardown_class(cls):
+        if cls.must_teardown:
+            dist.destroy_process_group()
+
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy_single_gpu(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: typing.Optional[str] = "mean",
+    ):
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]),
+            labels.view(-1),
+            reduction=reduction,
+        )
+        return logprobs.to(torch.float32)
+
+    class TorchLinearCrossEntropy(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            hidden: torch.Tensor,
+            weight: torch.Tensor,
+            labels: torch.Tensor,
+            tp_group: torch.distributed.ProcessGroup,
+            reduction: typing.Optional[str] = "mean",
+        ):
+            tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+            tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+
+            logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            logprobs = torch.nn.functional.cross_entropy(
+                whole_logits.view(-1, whole_logits.shape[-1]),
+                labels.view(-1),
+                reduction=reduction,
+            )
+
+            # If we don't preserve whole_logits,
+            # we need to re-compute it in the backward pass
+            ctx.save_for_backward(hidden, weight, labels)
+            ctx.tp_group = tp_group
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
+
+            return logprobs.to(torch.float32)
+
+        @staticmethod
+        def backward(
+            ctx,
+            g_logprobs: torch.Tensor,
+        ):
+            hidden, weight, labels = ctx.saved_tensors
+            tp_group = ctx.tp_group
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+
+            num_tokens, dim = hidden.shape
+
+            if reduction == "mean":
+                _g_logprobs = torch.broadcast_to(
+                    g_logprobs / num_tokens,
+                    (num_tokens,)
+                )
+            elif reduction == "sum":
+                _g_logprobs = torch.broadcast_to(
+                    g_logprobs,
+                    (num_tokens,)
+                )
+            else:
+                _g_logprobs = g_logprobs
+
+            # re-compute whole_logits
+            logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            one_hot = torch.zeros_like(whole_logits)
+            one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
+
+            pd = torch.nn.functional.softmax(whole_logits, dim=-1)
+            d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1)
+            d_logits = d_logits.to(hidden.dtype)
+
+            local_size = weight.size(0)
+            local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size]
+
+            local_d_hidden = local_d_logits @ weight
+            local_d_weight = local_d_logits.T @ hidden
+
+            dist.all_reduce(
+                local_d_hidden,
+                op=dist.ReduceOp.SUM,
+                group=tp_group
+            )
+
+            return local_d_hidden, local_d_weight, None, None, None
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", [(4096, 129280, 8192)])
+    def test_torch_tp_vs_single_gpu(
+        self,
+        dtype,
+        reduction,
+        problem,
+    ):
+        num_tokens, vocabsize, dim = problem
+
+        hidden = (
+            torch.empty((num_tokens, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, (num_tokens,), dtype=torch.long, device="cuda")
+
+        # ------------ forward pass ------------ #
+        dist.broadcast(hidden, src=0, group=self.tp_group)
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        # single GPU
+        whole_weight = torch.empty(
+            (vocabsize * self.tp_world_size, dim),
+            dtype=dtype,
+            device="cuda"
+        )
+        whole_weight_view = [
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :]
+            for i in range(self.tp_world_size)
+        ]
+        dist.all_gather(
+            whole_weight_view, 
+            weight,
+            group=self.tp_group
+        )
+        whole_weight = whole_weight.clone().requires_grad_()
+        logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
+            hidden, whole_weight, labels,
+            reduction=reduction,
+        )
+
+        # TP
+        logprobs_tp = self.TorchLinearCrossEntropy.apply(
+            hidden, weight, labels,
+            self.tp_group,
+            reduction,
+        )
+        torch.testing.assert_close(
+            logprobs_single_gpu,
+            logprobs_tp,
+        )
+
+        # ------------ backward pass ------------ #
+        g_logprobs = (
+            torch.empty_like(logprobs_single_gpu)
+            .uniform_(-0.1, 0.1)
+        )
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        # single GPU
+        (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
+            (logprobs_single_gpu,),
+            (hidden, whole_weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        # TP
+        (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
+            (logprobs_tp,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+        torch.testing.assert_close(
+            d_hidden_single_gpu,
+            d_hidden_tp,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+        local_d_weight_single_gpu = d_weight_single_gpu[self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :]
+        torch.testing.assert_close(
+            local_d_weight_single_gpu,
+            d_weight_tp,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 152063, 8192),
+            ((1, 4096), 152064, 8192),
+            ((2, 4096), 152064, 8192),
+        ]
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", get_problems())
+    def test_correctness(
+        self,
+        dtype,
+        reduction,
+        problem,
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+
+        # ------ forward pass ------ #
+        dist.broadcast(hidden, src=0, group=self.tp_group)
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        torch_logprobs = self.TorchLinearCrossEntropy.apply(
+            hidden.view(-1, dim), weight, labels,
+            self.tp_group,
+            reduction,
+        )
+
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels,
+            tp_group=self.tp_group,
+            reduction=reduction,
+        )
+
+        torch.testing.assert_close(
+            torch_logprobs,
+            custom_logprobs,
+        )
+
+        # ------- backward pass ------- #
+        g_logprobs = (
+            torch.empty_like(torch_logprobs)
+            .uniform_(-0.1, 0.1)
+        )
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+            (torch_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+        (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+            (custom_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+        torch.testing.assert_close(
+            d_hidden_torch,
+            d_hidden_custom,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+        torch.testing.assert_close(
+            d_weight_torch,
+            d_weight_custom,
+            atol=1e-4,
+            rtol=1e-4,
+        )
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_performance(
+        self,
+        problem,
+        dtype,
+        reduction
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            # ------ forward pass ------ #
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            start_event.record()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels,
+                self.tp_group,
+                reduction,
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            # ------- backward pass ------- #
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            start_event.record()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        if self.is_chief:
+            print()
+            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:")
+            print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
+            print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
+            print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
+            print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
+
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_storage(
+        self,
+        problem,
+        dtype,
+        reduction
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        if self.is_chief:
+            print()
+            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:")
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels,
+                self.tp_group,
+                reduction,
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB")
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+
+            g_logprobs = (
+                torch.empty_like(custom_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB")
+
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
+
+
+
+@pytest.mark.skipif(
+    "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
+    reason="Requires torchrun with multiple GPUs"
+)
+class TestFusedLinearCrossEntropySequenceParallel:
+    @classmethod
+    def setup_class(cls):
+        if dist.is_initialized():
+            cls.must_teardown = False
+        else:
+            dist.init_process_group(
+                backend="nccl",
+                init_method="env://",
+                world_size=int(os.environ["WORLD_SIZE"]),
+                rank=int(os.environ["RANK"])
+            )
+            cls.must_teardown = True
+        cls.tp_group = dist.group.WORLD
+
+        cls.tp_rank = dist.get_rank(cls.tp_group)
+        cls.tp_world_size = dist.get_world_size(cls.tp_group)
+        cls.is_chief = (cls.tp_rank == 0)
+        device = torch.device(f"cuda:{cls.tp_rank}")
+        torch.cuda.set_device(device)
+        print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
+
+    @classmethod
+    def teardown_class(cls):
+        if cls.must_teardown:
+            dist.destroy_process_group()
+
+    @staticmethod
+    def timed_barrier(timeout_s=10):
+        import time
+        work = torch.distributed.barrier(async_op=True)
+        t0 = time.time()
+        while not work.is_completed():
+            if time.time() - t0 > timeout_s:
+                exit(1)
+            time.sleep(0.05)
+        work.wait()
+
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy_single_gpu(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: typing.Optional[str] = "mean",
+    ):
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]),
+            labels.view(-1),
+            reduction=reduction,
+        )
+        return logprobs.to(torch.float32)
+
+    class TorchLinearCrossEntropy(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            hidden: torch.Tensor,
+            weight: torch.Tensor,
+            labels: torch.Tensor,
+            tp_group: torch.distributed.ProcessGroup,
+            reduction: typing.Optional[str] = "mean",
+        ):
+            tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+            tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+
+            whole_hidden = torch.empty(
+                (hidden.shape[0] * tp_world_size, hidden.shape[-1]),
+                dtype=hidden.dtype,
+                device=hidden.device
+            )
+            dist.all_gather_into_tensor(
+                whole_hidden,
+                hidden,
+                group=tp_group
+            )
+
+            logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32)
+
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            logprobs = torch.nn.functional.cross_entropy(
+                whole_logits.view(-1, whole_logits.shape[-1]),
+                labels.view(-1),
+                reduction=reduction,
+            )
+
+            # If we don't preserve whole_logits,
+            # we need to re-compute it in the backward pass
+            ctx.save_for_backward(whole_hidden, weight, labels)
+            ctx.tp_group = tp_group
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
+
+            return logprobs.to(torch.float32)
+
+        @staticmethod
+        def backward(
+            ctx,
+            g_logprobs: torch.Tensor,
+        ):
+            whole_hidden, weight, labels = ctx.saved_tensors
+            tp_group = ctx.tp_group
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+
+            num_tokens, dim = whole_hidden.shape
+
+            if reduction == "mean":
+                _g_logprobs = torch.broadcast_to(
+                    g_logprobs / num_tokens,
+                    (num_tokens,)
+                )
+            elif reduction == "sum":
+                _g_logprobs = torch.broadcast_to(
+                    g_logprobs,
+                    (num_tokens,)
+                )
+            else:
+                _g_logprobs = g_logprobs
+
+            # re-compute whole_logits
+            logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32)
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            one_hot = torch.zeros_like(whole_logits)
+            one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
+
+            pd = torch.nn.functional.softmax(whole_logits, dim=-1)
+            d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1)
+            d_logits = d_logits.to(whole_hidden.dtype)
+
+            local_size = weight.size(0)
+            local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size]
+
+            d_hidden = local_d_logits @ weight
+            local_d_weight = local_d_logits.T @ whole_hidden
+
+            # dist.all_reduce(
+            #     local_d_hidden,
+            #     op=dist.ReduceOp.SUM,
+            #     group=tp_group
+            # )
+
+            # split the local_d_hidden along the sequence length dimension
+            local_num_tokens = num_tokens // tp_world_size
+            # local_d_hidden = local_d_hidden[tp_rank * local_num_tokens : (tp_rank + 1) * local_num_tokens, :]
+
+            local_d_hidden = torch.empty(
+                (local_num_tokens, dim),
+                dtype=weight.dtype,
+                device=weight.device
+            )
+            dist.reduce_scatter_tensor(
+                local_d_hidden,
+                d_hidden,
+                op=dist.ReduceOp.SUM,
+                group=tp_group
+            )
+            return local_d_hidden, local_d_weight, None, None, None
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", [(256, 12928, 8192)])
+    def test_torch_tp_vs_single_gpu(
+        self,
+        dtype,
+        reduction,
+        problem,
+    ):
+        num_tokens, vocabsize, dim = problem
+
+        hidden = (
+            torch.empty((num_tokens, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, (num_tokens * self.tp_world_size,), 
+                                dtype=torch.long, device="cuda")
+
+        # ------------ forward pass ------------ #
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        # single GPU
+        whole_hidden = torch.empty(
+            (num_tokens * self.tp_world_size, dim),
+            dtype=dtype,
+            device="cuda"
+        )
+        dist.all_gather_into_tensor(
+            whole_hidden,
+            hidden,
+            group=self.tp_group
+        )
+        whole_hidden = whole_hidden.clone().requires_grad_()
+
+        whole_weight = torch.empty(
+            (vocabsize * self.tp_world_size, dim),
+            dtype=dtype,
+            device="cuda"
+        )
+        whole_weight_view = [
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :]
+            for i in range(self.tp_world_size)
+        ]
+        dist.all_gather(
+            whole_weight_view, 
+            weight,
+            group=self.tp_group
+        )
+        whole_weight = whole_weight.clone().requires_grad_()
+        logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
+            whole_hidden, whole_weight, labels,
+            reduction=reduction,
+        )
+
+        # TP
+        logprobs_tp = self.TorchLinearCrossEntropy.apply(
+            hidden, weight, labels,
+            self.tp_group,
+            reduction,
+        )
+        torch.testing.assert_close(
+            logprobs_single_gpu,
+            logprobs_tp,
+        )
+
+        # ------------ backward pass ------------ #
+        g_logprobs = (
+            torch.empty_like(logprobs_single_gpu)
+            .uniform_(-0.1, 0.1)
+        )
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        # single GPU
+        (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
+            (logprobs_single_gpu,),
+            (whole_hidden, whole_weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        # TP
+        (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
+            (logprobs_tp,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        local_d_hidden_single_gpu = d_hidden_single_gpu[self.tp_rank * hidden.shape[0] : (self.tp_rank + 1) * hidden.shape[0], :]
+        torch.testing.assert_close(
+            local_d_hidden_single_gpu,
+            d_hidden_tp,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+        local_d_weight_single_gpu = d_weight_single_gpu[self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :]
+        torch.testing.assert_close(
+            local_d_weight_single_gpu,
+            d_weight_tp,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+
+        self.cleanup()
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 15206, 1024),
+            ((1, 4096), 15206, 1024),
+            ((4, 1024), 15206, 1024),
+        ]
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", get_problems())
+    def test_correctness(
+        self,
+        dtype,
+        reduction,
+        problem,
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+        # ------ forward pass ------ #
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        torch_logprobs = self.TorchLinearCrossEntropy.apply(
+            hidden.view(-1, dim), weight, labels,
+            self.tp_group,
+            reduction,
+        )
+
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels,
+            tp_group=self.tp_group,
+            reduction=reduction,
+            sequence_parallel=True,
+        )
+
+        torch.testing.assert_close(
+            torch_logprobs,
+            custom_logprobs,
+        )
+
+        # ------- backward pass ------- #
+        g_logprobs = (
+            torch.empty_like(torch_logprobs)
+            .uniform_(-0.1, 0.1)
+        )
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+            (torch_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+        (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+            (custom_logprobs,),
+            (hidden, weight),
+            (g_logprobs,),
+            retain_graph=False
+        )
+
+        # in case one GPU failed, and leading to hang
+        torch.testing.assert_close(
+            d_hidden_torch,
+            d_hidden_custom,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+        torch.testing.assert_close(
+            d_weight_torch,
+            d_weight_custom,
+            atol=1e-3,
+            rtol=1e-3,
+        )
+        self.timed_barrier()
+
+        self.cleanup()
+
+    @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_performance(
+        self,
+        problem,
+        dtype,
+        reduction
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            # ------ forward pass ------ #
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            start_event.record()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels,
+                self.tp_group,
+                reduction,
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+                sequence_parallel=True,
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            # ------- backward pass ------- #
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            start_event.record()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        if self.is_chief:
+            print()
+            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:")
+            print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
+            print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
+            print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
+            print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
+
+
+    @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_storage(
+        self,
+        problem,
+        dtype,
+        reduction
+    ):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+
+        if self.is_chief:
+            print()
+            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:")
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels,
+                self.tp_group,
+                reduction,
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+
+            g_logprobs = (
+                torch.empty_like(torch_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB")
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+                sequence_parallel=True,
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+
+            g_logprobs = (
+                torch.empty_like(custom_logprobs)
+                .uniform_(-0.1, 0.1)
+            )
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,),
+                (hidden, weight),
+                (g_logprobs,),
+                retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB")
+
         self.cleanup()
         torch_storage()
         self.cleanup()

From 0d12e006e546b94aa0657603d41cc186b1a53394 Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Mon, 10 Nov 2025 16:37:21 +0800
Subject: [PATCH 06/17] Fixed several bugs and added support for
 sequence-parallel (#5)

* 1. fix weight is None issue
2. API compatible fix

* 1. fix weight is None issue
2. API compatible fix

* fix fused linear-ce fusion loss issue

* fix typo in fused_linear_ce triton

* 1. fix weight is None issue
2. API compatible fix

* fix fused linear-ce fusion loss issue

* add sequence_parallel option on compute_language_model_loss_without_logits

* Linear cross-entropy fusion is not used by default.
---
 .../models/common/language_module/language_module.py  | 11 ++++++-----
 megatron/core/models/gpt/gpt_model.py                 |  6 ++++--
 megatron/core/models/mamba/mamba_model.py             |  3 ++-
 megatron/training/arguments.py                        |  4 ++--
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index b8e39693b22..15352075661 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -131,9 +131,10 @@ def compute_language_model_loss_without_logits(
         hidden: Tensor,
         labels: Optional[Tensor],
         weight: Tensor = None,
+        sequence_parallel_enabled: bool = False,
         column_parallel_linear: torch.nn.Module = None,
         col_linear_kwargs: Dict[str, Any] = {},
-        reduction: Optional[str] = "mean",
+        reduction: Optional[str] = "none",
         ignore_index: Optional[int] = -100,
     ) -> Tuple[Tensor, Optional[Tensor]]:
         """Computes the language model logits and loss (Cross entropy across vocabulary)
@@ -146,7 +147,7 @@ def compute_language_model_loss_without_logits(
             column_parallel_linear (torch.nn.Module): The column parallel linear
                 layer to use for computing logits when not using fused linear cross entropy.
             col_linear_kwargs (Dict[str, Any]): Additional kwargs for column parallel linear layer
-            reduction (Optional[str]): The reduction method. Defaults to "mean", and can be
+            reduction (Optional[str]): The reduction method. Defaults to "none", and can be
                 one of "none", "sum", "mean".
             ignore_index (Optional[int]): The index to ignore in the loss calculation.
                 Defaults to -100.
@@ -155,7 +156,6 @@ def compute_language_model_loss_without_logits(
             Tensor: Loss tensor of dimensions [batch size, sequence_length].
         """
         if self.config.linear_cross_entropy_fusion:
-
             assert (
                 weight is not None
             ), "weight cannot be None when using fused linear cross entropy."
@@ -165,13 +165,14 @@ def compute_language_model_loss_without_logits(
                 hidden,
                 weight,
                 labels,
-                dist_process_group=self.pg_collection.tp,
+                tp_group=self.pg_collection.tp,
+                sequence_parallel=sequence_parallel_enabled,
                 reduction=reduction,
                 ignore_index=ignore_index,
             )
 
             # [s b] => [b, s]
-            loss = loss.transpose(0, 1).contiguous()
+            loss = loss.view_as(labels).transpose(0, 1).contiguous()
             return loss
         else:
             assert (
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index b48dcec2078..5e3950d0003 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -578,7 +578,8 @@ def _postprocess(
                 mtp_loss = self.compute_language_model_loss_without_logits(
                     hidden_states_list[mtp_layer_number + 1],
                     labels=mtp_labels,
-                    weight=output_weight,
+                    weight=self.shared_embedding_or_output_weight(),
+                    sequence_parallel_enabled=self.output_layer.sequence_parallel,
                     column_parallel_linear=self.output_layer,
                     col_linear_kwargs={
                         'weight': output_weight,
@@ -667,7 +668,8 @@ def _postprocess(
         loss = self.compute_language_model_loss_without_logits(
             hidden_states,
             labels=labels,
-            weight=output_weight,
+            weight=self.shared_embedding_or_output_weight(),
+            sequence_parallel_enabled=self.output_layer.sequence_parallel,
             column_parallel_linear=self.output_layer,
             col_linear_kwargs={
                 'weight': output_weight,
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 533f4efc257..98d918ce448 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -257,7 +257,8 @@ def forward(
         loss = self.compute_language_model_loss_without_logits(
             hidden_states,
             labels,
-            weight=output_weight,
+            weight=self.shared_embedding_or_output_weight(),
+            sequence_parallel_enabled=self.output_layer.sequence_parallel,
             column_parallel_linear=self.output_layer,
             col_linear_kwargs={
                 "weight": output_weight,
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 439825aaf57..ad34c3e5e0a 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -2254,8 +2254,8 @@ def _add_training_args(parser):
                        dest='bias_swiglu_fusion')
     group.add_argument('--use-fused-weighted-squared-relu', action='store_true',
                        help='Use fused weighted squared relu when using MoE.')
-    group.add_argument('--no-linear-cross-entropy-fusion', action='store_false',
-                       help='Disable fusion of linear layer and cross entropy '
+    group.add_argument('--linear-cross-entropy-fusion', action='store_true',
+                       help='Enable fusion of linear layer and cross entropy '
                        'loss calculation.',
                        dest='linear_cross_entropy_fusion')
     group.add_argument('--no-bias-dropout-fusion', action='store_false',

From 24a54659e91da429db9410206a6080f47ca80a5b Mon Sep 17 00:00:00 2001
From: Jianbing-D <69858819+Jianbing-D@users.noreply.github.com>
Date: Tue, 11 Nov 2025 15:59:17 +0800
Subject: [PATCH 07/17] formatting and fixing lints (#6)

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     | 241 ++----
 .../blackwell/bwd_partial_dlogits.py          | 576 ++++---------
 .../linear_cross_entropy/blackwell/entry.py   | 281 +++----
 .../blackwell/fwd_mainloop.py                 | 527 ++++--------
 .../linear_cross_entropy/blackwell/triton.py  | 183 ++---
 .../fusions/linear_cross_entropy/utils.py     |  13 +-
 .../common/language_module/language_module.py |  11 +-
 megatron/core/models/gpt/gpt_model.py         |  14 +-
 megatron/core/models/mamba/mamba_model.py     |   2 +-
 tests/unit_tests/a2a_overlap/utils.py         |   4 +-
 .../test_fused_linear_cross_entropy.py        | 772 ++++++------------
 11 files changed, 837 insertions(+), 1787 deletions(-)

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index a08735952dc..74d38da8243 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -1,32 +1,53 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 """
 Linear Cross Entropy API
 Fuse cross entropy with linear layer.
 """
 
 import typing
+
 import torch
 
-def _setup_platform():
+
+class Platform:
     """
-    Setup the platform for the Linear Cross Entropy.
+    Singleton class for targeted GPU platform.
     """
-    assert torch.cuda.is_available(), "CUDA is not available"
-    device = torch.cuda.current_device()
-    cc = torch.cuda.get_device_capability(device)
-    
-    global forward_func, backward_func
-    if cc[0] == 10:
-        # from linear_cross_entropy.blackwell import entry as platform
-        from .linear_cross_entropy.blackwell import entry as platform
-        forward_func = platform.forward
-        backward_func = platform.backward
-    else:
-        raise ValueError(f"Unsupported architecture: {cc[0]}")
-_setup_platform()
+
+    _instance: typing.Optional["Platform"] = None
+
+    def __new__(cls) -> "Platform":
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self) -> None:
+        if getattr(self, "_initialized", False):
+            return
+
+        assert torch.cuda.is_available(), "CUDA is not available"
+        device = torch.cuda.current_device()
+        cc = torch.cuda.get_device_capability(device)
+
+        if cc[0] == 10:
+            from .linear_cross_entropy.blackwell import entry as gpu_entry
+
+            self.forward_func: typing.Callable[..., typing.Any] = gpu_entry.forward
+            self.backward_func: typing.Callable[..., typing.Any] = gpu_entry.backward
+        else:
+            raise ValueError(f"Unsupported architecture: {cc[0]}")
+
+        self._initialized = True
+
+
+_platform = Platform()
+
 
 class LinearCrossEntropy(torch.autograd.Function):
     """
-    This class implements a custom autograd function for linear and cross entropy, whose equivalent logic in PyTorch is:
+    This class implements a custom autograd function for linear and cross entropy,
+    whose equivalent logic in PyTorch is:
         ```python
         def torch_entropy(hidden, weight, labels):
             logits = torch.matmul(hidden, weight)
@@ -34,6 +55,7 @@ def torch_entropy(hidden, weight, labels):
             return logprobs
         ```
     """
+
     @staticmethod
     def forward(
         ctx,
@@ -41,13 +63,14 @@ def forward(
         weight: torch.Tensor,
         labels: torch.Tensor,
         tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-        reduction: typing.Optional[str] = "mean",
-        ignore_index: typing.Optional[int] = -100,
-        sequence_parallel: typing.Optional[bool] = False,
+        reduction: str = "mean",
+        ignore_index: int = -100,
+        sequence_parallel: bool = False,
     ) -> torch.Tensor:
         """
         The forward pass of the Linear Cross Entropy.
-        If tp_group is not None, the weight tensor to each TP rank should be (global_vocab_size // world_size, dim).
+        If tp_group is not None, the weight tensor to each TP rank should be
+        (global_vocab_size // world_size, dim).
         Note that each of the ranks should get equal shards along the vocab_size dimension.
 
         Args:
@@ -74,24 +97,26 @@ def forward(
             A0  C0  XX
             A1  XX  C1
 
-        When tp_group is not None, the weight tensor will be split along the vocab_size dimension, 
-        which means each rank will get equal shards along the global_vocab_size dimension.
-        Specifically, the weight tensor to each rank will be (local_vocab_size, dim). 
+        When tp_group is not None, the weight tensor will be split along the vocab_size
+        dimension, which means each rank will get equal shards along the global_vocab_size
+        dimension. Specifically, the weight tensor to each rank will be (local_vocab_size, dim).
         And there is an assumption that each rank will get the same local_vocab_size.
 
-        When sequence_parallel is True, the hidden tensor will be split along the sequence length dimension,
-        which means each rank will get equal shards along the sequence length dimension.
-        Specifically, the hidden tensor to each rank will be (local_num_tokens, dim).
-        And there is an assumption that each rank will get the same local_num_tokens.
+        When sequence_parallel is True, the hidden tensor will be split along the
+        sequence length dimension, which means each rank will get equal shards along
+        the sequence length dimension. Specifically, the hidden tensor to each rank
+        will be (local_num_tokens, dim). And there is an assumption that each rank
+        will get the same local_num_tokens.
 
-        In TP forward pass, the hidden tensor and label tensor shall be identical among all TP ranks,
-        and it's user's responsibility to ensure the hidden tensor is identical among all TP ranks.
-        Then this operation will produce identical logprobs among all TP ranks.
+        In TP forward pass, the hidden tensor and label tensor shall be identical
+        among all TP ranks, and it's user's responsibility to ensure the hidden tensor
+        is identical among all TP ranks. Then this operation will produce identical
+        logprobs among all TP ranks.
 
-        In TP backward pass, the gradient of the logprobs shall be identical among all TP ranks,
-        and it's user's responsibility to ensure the gradient of the logprobs is identical among all TP ranks.
-        Then this operation will produce distinct gradients for the local weight tensor,
-        and identical gradients for the hidden tensor. 
+        In TP backward pass, the gradient of the logprobs shall be identical among all
+        TP ranks, and it's user's responsibility to ensure the gradient of the logprobs
+        is identical among all TP ranks. Then this operation will produce distinct gradients
+        for the local weight tensor, and identical gradients for the hidden tensor.
 
         ```python
         # ------------ forward pass ------------ #
@@ -103,16 +128,17 @@ def forward(
         # ------------ backward pass ------------ #
         g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework
         d_hidden, d_weight = torch.autograd.grad(...)
-        # each rank will get the same d_hidden, 
+        # each rank will get the same d_hidden,
         # and distinct d_weight for local weight shard
         ```
 
-        In SP forward pass, the hidden tensor shall be split along the sequence length dimension, 
+        In SP forward pass, the hidden tensor shall be split along the sequence length dimension,
         and the label tensor shall be identical among all TP ranks.
         Then this operation will produce identical logprobs among all TP ranks.
 
         In SP backward pass, the gradient of the logprobs shall be identical among all TP ranks,
-        Then this operation will produce distinct gradients for the local hidden tensor and weight tensor.
+        Then this operation will produce distinct gradients for the local hidden tensor
+        and local weight tensor.
         ```python
         # ------------ forward pass ------------ #
         hidden = global_hidden[tp_rank] # handled by framework
@@ -128,18 +154,11 @@ def forward(
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
             logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size, global_hidden = (
-                forward_func(
-                    hidden, weight, labels,
-                    tp_group, 
-                    reduction,
-                    ignore_index,
-                    sequence_parallel,
+                _platform.forward_func(
+                    hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel
                 )
             )
-            ctx.save_for_backward(
-                global_hidden, weight, labels,
-                _maximum, _acc, _num_valid_tokens,
-            )
+            ctx.save_for_backward(global_hidden, weight, labels, _maximum, _acc, _num_valid_tokens)
             ctx.tp_group = tp_group
             ctx.ignore_index = ignore_index
             ctx.reduction = reduction
@@ -148,13 +167,11 @@ def forward(
             ctx.sequence_parallel = sequence_parallel
 
         return logprobs
-            
 
     @staticmethod
     def backward(
-        ctx,
-        dlogprobs: torch.Tensor
-    ) -> typing.List[torch.Tensor]:
+        ctx, dlogprobs: torch.Tensor
+    ) -> typing.Tuple[torch.Tensor, torch.Tensor, None, None, None, None, None]:
         """
         The backward pass of the Linear Cross Entropy.
         Args:
@@ -175,7 +192,7 @@ def backward(
             tp_world_size = ctx.tp_world_size
             sequence_parallel = ctx.sequence_parallel
 
-            d_hidden, d_weight = backward_func(
+            d_hidden, d_weight = _platform.backward_func(
                 dlogprobs,
                 global_hidden,
                 weight,
@@ -199,9 +216,9 @@ def linear_cross_entropy(
     weight: torch.Tensor,
     labels: torch.Tensor,
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-    reduction: typing.Optional[str] = "mean",
-    ignore_index: typing.Optional[int] = -100,
-    sequence_parallel: typing.Optional[bool] = False,
+    reduction: str = "mean",
+    ignore_index: int = -100,
+    sequence_parallel: bool = False,
 ) -> torch.Tensor:
     """
     helper function for linear cross entropy.
@@ -209,115 +226,5 @@ def linear_cross_entropy(
     _impl = LinearCrossEntropy.apply
     return _impl(hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel)
 
-__all__ = [
-    "linear_cross_entropy",
-    "LinearCrossEntropy",
-]
-
-
-# FIXME: move this unit-test to other place
-if __name__ == "__main__":
-    def test_dp():
-        # batch = 4
-        # seqlen = 2035
-        # vocab_size = 152063
-        # dim = 4096
-        batch = 1
-        seqlen = 80
-        vocab_size = 125
-        dim = 64
-        dtype = torch.float16
-        reduction = "none"
-
-        hidden = (
-            torch.empty((batch, seqlen, dim), device="cuda", dtype=dtype)
-            .uniform_(-0.1, 0.1)
-            .requires_grad_()
-        )
-        weight = (
-            torch.empty((vocab_size, dim), device="cuda", dtype=dtype)
-            .uniform_(-0.1, 0.1)
-            .requires_grad_()
-        )
-
-        labels = torch.randint(0, vocab_size, (batch, seqlen), device="cuda", dtype=torch.long)
-
-        logits = hidden @ weight.T
-        # print(logits)
-
-        _logits = logits.to(torch.float32)
-        _logits_view = _logits.view(-1, _logits.shape[-1])
-        maximum = _logits_view.max(dim=-1, keepdim=False).values
-        accu = torch.exp(_logits_view - maximum.unsqueeze(-1)).sum(dim=-1)
-        
-        logprobs = torch.nn.functional.cross_entropy(
-            logits.view(-1, logits.shape[-1]),
-            labels.view(-1),
-            reduction=reduction,
-        )
-        
-        custom_logprobs = linear_cross_entropy(
-            hidden, weight, labels, 
-            reduction=reduction,
-        )
-
-        print(custom_logprobs)
-        print(logprobs)
-
-        # backward
-        g_logprobs = torch.rand_like(logprobs, dtype=dtype, device="cuda")
-
-        (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
-            (logprobs,), 
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
-        )
-
-        # first way to do backward
-        if reduction == "mean":
-            _g_logprobs = torch.broadcast_to(g_logprobs / (batch * seqlen), (batch * seqlen,))
-        elif reduction == "sum":
-            _g_logprobs = torch.broadcast_to(g_logprobs, (batch * seqlen,))
-        else:
-            _g_logprobs = g_logprobs
-
-        intermediate = _logits_view - maximum.unsqueeze(-1)
-        exp_logits = torch.exp(intermediate)
-        d_logits = exp_logits / accu.unsqueeze(-1)
-        d_logits *= _g_logprobs.unsqueeze(-1)
-        # mask = torch.arange(vocab_size, dtype=torch.long, device="cuda")
-        # mask = torch.broadcast_to(mask, (batch * seqlen, vocab_size))
-        # mask = (labels.view(-1).unsqueeze(-1) == mask)
-
-        one_hot = torch.zeros_like(_logits_view)
-        one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
-
-        d_logits += one_hot * -_g_logprobs.unsqueeze(-1)
-        d_logits = d_logits.to(hidden.dtype)
-        # print(d_logits)
-        
-        d_hidden = d_logits @ weight
-        d_weight = d_logits.T @ hidden.view(-1, dim)
-
-        # print("first way to do backward")
-        # print(d_hidden.view(hidden.shape))
-        # print(d_torch_hidden)
-        # print(d_weight)
-        # print(d_torch_weight)
-        # print(d_logits)
-
-        (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
-            (custom_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
-        )
-        # print(d_torch_hidden)
-        # print(d_custom_hidden)
-        print(d_torch_weight)
-        print(d_custom_weight)
-
-    torch.manual_seed(42)
-
-    test_dp()
\ No newline at end of file
+
+__all__ = ["linear_cross_entropy", "LinearCrossEntropy"]
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
index 97e7c5ab493..8a6e03601bf 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -1,45 +1,47 @@
-from typing import Optional, Type, Tuple, Union
-import cuda.bindings.driver as cuda
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-import torch
+from typing import Optional, Tuple, Type
 
+import cuda.bindings.driver as cuda  # type: ignore
 import cutlass
 import cutlass.cute as cute
-import cutlass.utils as utils
-import cutlass.pipeline as pipeline
+import cutlass.pipeline as pipeline  # type: ignore
+import cutlass.utils as utils  # type: ignore
+import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
 from cutlass.cute.nvgpu import cpasync, tcgen05
-import cutlass.torch as cutlass_torch
-import cutlass.utils.blackwell_helpers as sm100_utils
-from cutlass.cute.runtime import from_dlpack
-
 
 SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
+
 def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
+    """
+    Create a thread cooperative group.
+    """
     return pipeline.CooperativeGroup(
-        pipeline.Agent.Thread, size, 
-        alignment=alignment if alignment is not None else size)
+        pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size
+    )
 
 
 class BwdPartialDlogits:
     """
     This class implements the backward kernel for partial d_logits.
     """
-    def __init__(self,
-                 reduction: int,
-                 acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
-                 use_2cta_instrs: bool = False,
-                 mma_tiler_mn: Tuple[int, int] = (128, 256),
-                 vocab_per_split: int = 512):
+
+    def __init__(
+        self,
+        reduction: int,
+        acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+        use_2cta_instrs: bool = False,
+        mma_tiler_mn: Tuple[int, int] = (128, 256),
+        vocab_per_split: int = 512,
+    ):
         self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
         self.acc_dtype = acc_dtype
         self.use_2cta_instrs = use_2cta_instrs
         self.mma_tiler = (*mma_tiler_mn, 1)
         self.vocab_per_split = vocab_per_split
 
-        self.cta_group = (
-            tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
-        )
+        self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
         self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
 
         self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
@@ -52,14 +54,10 @@ def __init__(self,
         self.empty_warp_ids = (6, 7)
 
         self.threads_per_cta: int = self.threads_per_warp * len(
-            (*self.epi_warp_ids,
-             self.load_warp_ids,
-             self.mma_warp_ids,
-             *self.empty_warp_ids)
+            (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
         )
         self.cta_sync_barrier = pipeline.NamedBarrier(
-            barrier_id = 1,
-            num_threads = self.threads_per_cta
+            barrier_id=1, num_threads=self.threads_per_cta
         )
 
         self.buffer_align_bytes: int = 1024
@@ -80,7 +78,7 @@ def _compute_grid(
                 cute.ceil_div(self.vocab_per_split, cta_tiler[1]),
                 1,
             ),
-            cluster_shape_mnk
+            cluster_shape_mnk,
         )
         return grid
 
@@ -104,28 +102,24 @@ def _setup_attributes(
     ):
         self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
         self.cluster_layout_vmnk = cute.tiled_divide(
-            cute.make_layout(self.cluster_shape_mnk),
-            (tiled_mma.thr_id.shape,),
+            cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
         )
 
         mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
         # it requires k-mode to be 128B aligned
         mma_inst_tile_k: int = 4
-        self.mma_tiler = (
-            self.mma_tiler[0],
-            self.mma_tiler[1],
-            mma_inst_shape_k * mma_inst_tile_k
+        self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
+
+        self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = self._compute_stages(
+            tiled_mma, self.mma_tiler, a_dtype, b_dtype
         )
-        
-        self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile =\
-            self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
         self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
         assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
 
         self.cta_tile_shape_mnk = (
             self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
             self.mma_tiler[1],
-            self.mma_tiler[2]
+            self.mma_tiler[2],
         )
 
     @cute.kernel
@@ -150,9 +144,10 @@ def kernel(
         problem_mnk: Tuple[int, int, int],
         rank: cutlass.Int32,
     ) -> None:
-        warp_idx = cute.arch.make_warp_uniform(
-            cute.arch.warp_idx()
-        )
+        """
+        The backward kernel for partial d_logits.
+        """
+        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         tidx, _, _ = cute.arch.thread_idx()
         bidx, bidy, _ = cute.arch.block_idx()
         # FIXME: block swizzling applied here
@@ -160,9 +155,7 @@ def kernel(
 
         # FIXME: if 2 CTAs, modify here
         cta_rank_in_cluster = 0
-        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(
-            cta_rank_in_cluster
-        )
+        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
 
         # prefetch tma descriptors
         if warp_idx == self.load_warp_ids:
@@ -177,124 +170,95 @@ def kernel(
             producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
             consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
             tx_count=self.tma_copy_ab_bytes,
-            barrier_storage=storage.load_ab_mbar_ptr.data_ptr()
+            barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
         )
         ab_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer,
-            self.num_ab_stage
+            pipeline.PipelineUserType.Producer, self.num_ab_stage
         )
         ab_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer,
-            self.num_ab_stage
+            pipeline.PipelineUserType.Consumer, self.num_ab_stage
         )
 
         mma_pipeline = pipeline.PipelineUmmaAsync.create(
             num_stages=self.num_acc_stage,
             producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
-            consumer_group=make_thread_cooperative_group(self.threads_per_warp * len(self.epi_warp_ids)),
-            barrier_storage=storage.mma_mbar_ptr.data_ptr()
+            consumer_group=make_thread_cooperative_group(
+                self.threads_per_warp * len(self.epi_warp_ids)
+            ),
+            barrier_storage=storage.mma_mbar_ptr.data_ptr(),
         )
         mma_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer,
-            self.num_acc_stage
+            pipeline.PipelineUserType.Producer, self.num_acc_stage
         )
         mma_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer,
-            self.num_acc_stage
+            pipeline.PipelineUserType.Consumer, self.num_acc_stage
         )
 
         tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
         if warp_idx == self.empty_warp_ids[0]:
             with cute.arch.elect_one():
                 cute.arch.mbarrier_init(
-                    tmem_dealloc_mbar_ptr,
-                    self.threads_per_warp * len(self.epi_warp_ids)
+                    tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
                 )
                 cute.arch.mbarrier_init_fence()
 
         # -------- tensor partition ------------ #
         # swizzle o [(tileM, tileK), loopM, loopK, stage]
-        sA = storage.sA.get_tensor(
-            a_smem_layout_staged.outer,
-            swizzle=a_smem_layout_staged.inner
-        )
+        sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
         # swizzle o [(tileN, tileK), loopN, loopK, stage]
-        sB = storage.sB.get_tensor(
-            b_smem_layout_staged.outer,
-            swizzle=b_smem_layout_staged.inner
-        )
-        
+        sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+
         # FIXME: if 2 CTAs, modify here
         thr_mma = tiled_mma.get_slice(0)
         # [MMA, loopM, loopK, stage]
         tCsA = thr_mma.make_fragment_A(sA)
         # [MMA, loopN, loopK, stage]
         tCsB = thr_mma.make_fragment_B(sB)
-        
+
         # [tileM, tileK, loopK]
         gA = cute.local_tile(
-            mA,
-            (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]),
-            (pidm, None)
+            mA, (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]), (pidm, None)
         )
         # [vocab_per_split, dim]
         mB_n = cute.local_tile(
-            mB,
-            (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])),
-            (split_idx, 0)
+            mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (split_idx, 0)
         )
         # [tileN, tileK, loopK]
         gB = cute.local_tile(
-            mB_n,
-            (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]),
-            (pidn, None)
+            mB_n, (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]), (pidn, None)
         )
 
-        a_cta_layout = cute.make_layout(
-            cute.slice_(
-                cluster_layout_vmnk,
-                (0, 0, None, 0)
-            ).shape
-        )
+        a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
         # just to make sure SMEM and GMEM tensor has the same size in the first rank
         tCgA = thr_mma.partition_A(gA)
         tCgB = thr_mma.partition_B(gB)
         # [CPY, stage] & [CPY, loopK]
         tTMAsA, tTMAgA = cpasync.tma_partition(
             tma_atom_a,
-            block_in_cluster_coord_vmnk[2], # cta_coord,
+            block_in_cluster_coord_vmnk[2],  # cta_coord,
             a_cta_layout,
             cute.group_modes(sA, 0, 3),
-            cute.group_modes(tCgA, 0, 3)
-        )
-        b_cta_layout = cute.make_layout(
-            cute.slice_(
-                cluster_layout_vmnk,
-                (0, None, 0, 0)
-            ).shape
+            cute.group_modes(tCgA, 0, 3),
         )
+        b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
         # [CPY, stage] & [CPY, loopK]
         tTMAsB, tTMAgB = cpasync.tma_partition(
             tma_atom_b,
-            block_in_cluster_coord_vmnk[1], # cta_coord
+            block_in_cluster_coord_vmnk[1],  # cta_coord
             b_cta_layout,
             cute.group_modes(sB, 0, 3),
-            cute.group_modes(tCgB, 0, 3)
+            cute.group_modes(tCgB, 0, 3),
         )
 
         # ------ Allocate TMEM ------ #
         tmem_holding_buf = storage.tmem_holding_buf
         if warp_idx == self.empty_warp_ids[0]:
             cute.arch.alloc_tmem(
-                self.tmem_alloc_cols,
-                tmem_holding_buf,
-                is_two_cta=self.use_2cta_instrs
+                self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
             )
         self.cta_sync_barrier.arrive_and_wait()
         tmem_ptr = cute.arch.retrieve_tmem_ptr(
-            self.acc_dtype,
-            alignment=16,
-            ptr_to_buffer_holding_addr=tmem_holding_buf
+            self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
         )
 
         tmem_shape = (128, self.tmem_alloc_cols)
@@ -302,7 +266,7 @@ def kernel(
         tCtC_fake = thr_mma.make_fragment_C(acc_shape)
         # [(tileM, tileN), loopM, loopN]
         tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
-        
+
         # ------ Empty ------ #
         if warp_idx in self.empty_warp_ids:
             cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
@@ -317,13 +281,13 @@ def kernel(
                     tma_atom_a,
                     tTMAgA[(None, k)],
                     tTMAsA[(None, ab_producer_state.index)],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                 )
                 cute.copy(
                     tma_atom_b,
                     tTMAgB[(None, k)],
                     tTMAsB[(None, ab_producer_state.index)],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                 )
                 ab_pipeline.producer_commit(ab_producer_state)
                 ab_producer_state.advance()
@@ -344,7 +308,7 @@ def kernel(
                         cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
                         tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
                         tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
-                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)])
+                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
                     )
                     tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
 
@@ -353,7 +317,7 @@ def kernel(
 
             mma_pipeline.producer_commit(mma_producer_state)
             mma_producer_state.advance()
-            
+
         # ------ EPI ------ #
         if warp_idx in self.epi_warp_ids:
             cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
@@ -364,257 +328,139 @@ def kernel(
                 self.acc_dtype,
                 self.acc_dtype,
                 (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
-                self.use_2cta_instrs
+                self.use_2cta_instrs,
             )
             # [tileM, subTileN, loopM, CntSubTileN, loopN]
             tAcc_epi = cute.flat_divide(
                 tCtC[((None, None), 0, None)],
-                (self.epi_tile[0],
-                 self.epi_tile[1] // self.num_epi_stage_per_tile)
-            )
-            tiled_copy_t2r = tcgen05.make_tmem_copy(
-                copy_atom_t2r,
-                tAcc_epi[(None, None, 0, 0, 0)]
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
             )
+            tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
             thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
             tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
-            tTMEM_load_tAcc = cute.group_modes(
-                tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
-            )
+            tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
 
             # predicates
             cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
             tCcAcc = thr_mma.partition_C(cAcc)
             tCcAcc_epi = cute.flat_divide(
                 tCcAcc[((None, None), 0, None)],
-                (self.epi_tile[0],
-                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
             )
             tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
-            tTMEM_load_cAcc_shape = cute.select(
-                tTMEM_load_cAcc.shape,
-                mode=[0, 1, 2]
-            )
-            tTMEM_load_rAcc = cute.make_fragment(
-                tTMEM_load_cAcc_shape,
-                self.acc_dtype
-            )
+            tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
+            tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
 
             copy_atom_g2r_int64 = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mLabels.element_type
+                cute.nvgpu.CopyUniversalOp(), mLabels.element_type
             )
             copy_atom_g2r_fp32 = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mDlogprobs.element_type
+                cute.nvgpu.CopyUniversalOp(), mDlogprobs.element_type
             )
-            epilogue_thread_layout = cute.make_layout(
-                (128, 1),
-                stride=(1, 1))
+            epilogue_thread_layout = cute.make_layout((128, 1), stride=(1, 1))
             tiled_copy_g2r_int64 = cute.make_tiled_copy_tv(
-                copy_atom_g2r_int64,
-                epilogue_thread_layout,
-                cute.make_layout((1, 1))
+                copy_atom_g2r_int64, epilogue_thread_layout, cute.make_layout((1, 1))
             )
             tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv(
-                copy_atom_g2r_fp32,
-                epilogue_thread_layout,
-                cute.make_layout((1, 1))
+                copy_atom_g2r_fp32, epilogue_thread_layout, cute.make_layout((1, 1))
             )
             thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx)
             thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx)
 
             # [tileM]
-            gLabels = cute.local_tile(
-                mLabels,
-                (self.epi_tile[0],),
-                (pidm,)
-            )
-            gMaximum = cute.local_tile(
-                mMaximum,
-                (self.epi_tile[0],),
-                (pidm,)
-            )
-            gAccu = cute.local_tile(
-                mAccu,
-                (self.epi_tile[0],),
-                (pidm,)
-            )
-            
+            gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
+            gMaximum = cute.local_tile(mMaximum, (self.epi_tile[0],), (pidm,))
+            gAccu = cute.local_tile(mAccu, (self.epi_tile[0],), (pidm,))
+
             # slice along M direction
             tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)]
             # [(1, 1), 1]
-            tMCAcc_mask = cute.make_fragment(
-                tMCAcc.shape,
-                cutlass.Boolean
-            )
+            tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean)
             # to align shape with gMax and gAccu
             tMCAcc_mask = cute.append_ones(tMCAcc_mask)
-            tMCAcc_mask[0] = cute.elem_less(
-                pidm * self.epi_tile[0] + tidx,
-                cute.size(mA, mode=[0])
-            )
+            tMCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0]))
             # [(1, 1), 1, 1]
-            tMgLabels = thr_copy_g2r_int64.partition_S(
-                cute.append_ones(gLabels)
-            )
-            tMrLabels = cute.make_fragment(
-                tMgLabels.shape,
-                tMgLabels.element_type
-            )
-            cute.copy(
-                tiled_copy_g2r_int64,
-                tMgLabels,
-                tMrLabels,
-                pred=tMCAcc_mask
-            )
-            tMgMaximum = thr_copy_g2r_fp32.partition_S(
-                cute.append_ones(gMaximum)
-            )
-            tMrMaximum = cute.make_fragment(
-                tMgMaximum.layout,
-                tMgMaximum.element_type
-            )
-            cute.copy(
-                tiled_copy_g2r_fp32,
-                tMgMaximum,
-                tMrMaximum,
-                pred=tMCAcc_mask
-            )
-            tMgAccu = thr_copy_g2r_fp32.partition_S(
-                cute.append_ones(gAccu)
-            )
-            tMrAccu = cute.make_fragment(
-                tMgAccu.layout,
-                tMgAccu.element_type
-            )
-            cute.copy(
-                tiled_copy_g2r_fp32,
-                tMgAccu,
-                tMrAccu,
-                pred=tMCAcc_mask
-            )
-
-            tMrDlogprobs = cute.make_fragment(
-                tMgAccu.layout,
-                mDlogprobs.element_type
-            )
+            tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels))
+            tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type)
+            cute.copy(tiled_copy_g2r_int64, tMgLabels, tMrLabels, pred=tMCAcc_mask)
+            tMgMaximum = thr_copy_g2r_fp32.partition_S(cute.append_ones(gMaximum))
+            tMrMaximum = cute.make_fragment(tMgMaximum.layout, tMgMaximum.element_type)
+            cute.copy(tiled_copy_g2r_fp32, tMgMaximum, tMrMaximum, pred=tMCAcc_mask)
+            tMgAccu = thr_copy_g2r_fp32.partition_S(cute.append_ones(gAccu))
+            tMrAccu = cute.make_fragment(tMgAccu.layout, tMgAccu.element_type)
+            cute.copy(tiled_copy_g2r_fp32, tMgAccu, tMrAccu, pred=tMCAcc_mask)
+
+            tMrDlogprobs = cute.make_fragment(tMgAccu.layout, mDlogprobs.element_type)
             if cutlass.const_expr(self.REDUCTION == 2):
                 # mean reduction
-                num_valid_tokens = cute.make_tensor(
-                    scalarNumValidTokens,
-                    layout=(1,),
-                )
+                num_valid_tokens = cute.make_tensor(scalarNumValidTokens, layout=(1,))
                 tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32)
             elif cutlass.const_expr(self.REDUCTION == 1):
                 # sum reduction
                 tMrDlogprobs[0] = mDlogprobs[0]
             else:
                 # no reduction
-                gDlogprobs = cute.local_tile(
-                    mDlogprobs,
-                    (self.epi_tile[0],),
-                    (pidm,)
-                )
-                tMgDlogprobs = thr_copy_g2r_fp32.partition_S(
-                    cute.append_ones(gDlogprobs)
-                )
-                cute.copy(
-                    tiled_copy_g2r_fp32,
-                    tMgDlogprobs,
-                    tMrDlogprobs,
-                    pred=tMCAcc_mask
-                )
+                gDlogprobs = cute.local_tile(mDlogprobs, (self.epi_tile[0],), (pidm,))
+                tMgDlogprobs = thr_copy_g2r_fp32.partition_S(cute.append_ones(gDlogprobs))
+                cute.copy(tiled_copy_g2r_fp32, tMgDlogprobs, tMrDlogprobs, pred=tMCAcc_mask)
 
             tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0])
-            tMrDlogprobs[0] *= (tMrLabels[0] != ignore_index)
+            tMrDlogprobs[0] *= tMrLabels[0] != ignore_index
             tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0]
 
             # ------ Partial output ------ #
             # [tileM, tileN]
             gDlogits_partial = cute.local_tile(
-                mDlogits_partial,
-                (self.epi_tile[0], self.epi_tile[1]),
-                (pidm, pidn)
+                mDlogits_partial, (self.epi_tile[0], self.epi_tile[1]), (pidm, pidn)
             )
             # blackwell supports STG.256
             copy_atom_r2g = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                gDlogits_partial.element_type,
-                num_bits_per_copy=256
+                cute.nvgpu.CopyUniversalOp(), gDlogits_partial.element_type, num_bits_per_copy=256
             )
             tiled_copy_r2g = cute.make_tiled_copy_tv(
-                copy_atom_r2g,
-                epilogue_thread_layout,
-                copy_atom_r2g.layout_dst_tv
+                copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv
             )
             thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
 
             # [CPY, loopM, loopN]
             tR2GCAcc = thr_copy_r2g.partition_S(cAcc)
-            tR2GCAcc_pred = cute.make_fragment(
-                tR2GCAcc.shape,
-                cutlass.Boolean
-            )
+            tR2GCAcc_pred = cute.make_fragment(tR2GCAcc.shape, cutlass.Boolean)
             for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])):
                 for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
                     for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
-                        # tR2GCAcc_pred[elem, row, col] = cute.elem_less(
-                        #     pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0],
-                        #     cute.size(mDlogits_partial, mode=[0])
-                        # ) and cute.elem_less(
-                        #     pidn * self.epi_tile[1] + tR2GCAcc[elem, row, col][1],
-                        #     cute.size(mDlogits_partial, mode=[1])
-                        # )
                         tR2GCAcc_pred[elem, row, col] = cute.elem_less(
-                            pidm * self.epi_tile[0]
-                            + tR2GCAcc[elem, row, col][0],
-                            problem_mnk[0]
+                            pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0], problem_mnk[0]
                         ) and cute.elem_less(
                             split_idx * self.vocab_per_split
                             + pidn * self.epi_tile[1]
                             + tR2GCAcc[elem, row, col][1],
-                            problem_mnk[1]
+                            problem_mnk[1],
                         )
 
             tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial)
 
             # for type conversion
-            dLogits_half = cute.make_fragment(
-                tTMEM_load_rAcc.shape,
-                tR2GgDlogits.element_type
-            )          
-            dLogits_half = cute.tiled_divide(
-                dLogits_half,
-                (cute.size(tR2GgDlogits, mode=[0]), 1)
-            )
+            dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type)
+            dLogits_half = cute.tiled_divide(dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1))
             dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
 
             mma_pipeline.consumer_wait(mma_consumer_state)
 
             block_vocab_left_idx: cutlass.Int64 = (
-                split_idx * self.vocab_per_split
-                + pidn * self.epi_tile[1]
+                split_idx * self.vocab_per_split + pidn * self.epi_tile[1]
             )
-            block_vocab_right_idx: cutlass.Int64 = (
-                min(
-                    split_idx * self.vocab_per_split
-                    + (pidn + 1) * self.epi_tile[1],
-                    min(
-                        (split_idx + 1) * self.vocab_per_split,
-                        problem_mnk[1]
-                    )
-                )
+            block_vocab_right_idx: cutlass.Int64 = min(
+                split_idx * self.vocab_per_split + (pidn + 1) * self.epi_tile[1],
+                min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]),
             )
             num_n_subtiles: cutlass.Int64 = cute.ceil_div(
-                (block_vocab_right_idx - block_vocab_left_idx),
-                cute.size(tTMEM_load_rAcc, mode=[0])
+                (block_vocab_right_idx - block_vocab_left_idx), cute.size(tTMEM_load_rAcc, mode=[0])
             )
             for n_subtile in cutlass.range(num_n_subtiles):
                 cute.copy(
                     tiled_copy_t2r,
                     tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
-                    tTMEM_load_rAcc
+                    tTMEM_load_rAcc,
                 )
 
                 for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
@@ -629,12 +475,11 @@ def kernel(
                         + idx
                     )
                     mask: cutlass.Boolean = (
-                        position == tMrLabels[0]
-                        and tMrLabels[0] != ignore_index
+                        position == tMrLabels[0] and tMrLabels[0] != ignore_index
                     )
                     # d_logits
                     tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits
-                    tTMEM_load_rAcc[idx] += (mask * -tMrDlogprobs[0])
+                    tTMEM_load_rAcc[idx] += mask * -tMrDlogprobs[0]
                     dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type)
 
                 for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True):
@@ -643,23 +488,17 @@ def kernel(
                         tiled_copy_r2g,
                         dLogits_half[(None, idx, None)],
                         tR2GgDlogits[(None, None, copy_id)],
-                        pred=tR2GCAcc_pred[((0, None), None, copy_id)]
+                        pred=tR2GCAcc_pred[((0, None), None, copy_id)],
                     )
 
             mma_pipeline.consumer_release(mma_consumer_state)
             mma_consumer_state.advance()
 
-
         # ------ Deallocate TMEM ------ #
         self.cta_sync_barrier.arrive_and_wait()
         if warp_idx == self.empty_warp_ids[0]:
             cute.arch.relinquish_tmem_alloc_permit()
-            cute.arch.dealloc_tmem(
-                tmem_ptr,
-                self.tmem_alloc_cols,
-                is_two_cta=self.use_2cta_instrs
-            )
-
+            cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
 
     @cute.jit
     def __call__(
@@ -681,85 +520,66 @@ def __call__(
         b_dtype: Type[cutlass.Numeric] = weight.element_type
 
         if cutlass.const_expr(hidden.element_type != weight.element_type):
-            raise RuntimeError(f"data type don't match: {hidden.element_type} v.s. {weight.element_type}")
+            raise RuntimeError(
+                f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+            )
         if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
             raise RuntimeError("hidden can only be FP16 or BF16")
         if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
             raise RuntimeError("K dimension doesn't match")
 
-        problem_mnk = (
-            hidden.layout.shape[0],
-            weight.layout.shape[0],
-            hidden.layout.shape[1]
-        )
+        problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
         if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
             raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
         if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0):
             raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}")
 
         grid = self._compute_grid(
-            problem_mnk = problem_mnk,
-            cluster_shape_mn = self.cluster_shape_mn,
-            cta_tiler = self.mma_tiler,
+            problem_mnk=problem_mnk,
+            cluster_shape_mn=self.cluster_shape_mn,
+            cta_tiler=self.mma_tiler,
         )
-        
+
         a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
         b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
 
         tiled_mma = sm100_utils.make_trivial_tiled_mma(
-            a_dtype,
-            a_major_mode,
-            b_major_mode,
-            self.acc_dtype,
-            self.cta_group,
-            self.mma_tiler[:2]
+            a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
         )
         self._setup_attributes(tiled_mma, a_dtype, b_dtype)
 
         self.epi_tile = self.cta_tile_shape_mnk[:2]
-        
+
         # Swizzle o [(tileM, tileK), loopM, loopK, stage]
         a_smem_layout_staged = sm100_utils.make_smem_layout_a(
-            tiled_mma,
-            self.mma_tiler,
-            a_dtype,
-            self.num_ab_stage
+            tiled_mma, self.mma_tiler, a_dtype, self.num_ab_stage
         )
         # Swizzle o [(tileN, tileK), loopN, loopK, stage]
         b_smem_layout_staged = sm100_utils.make_smem_layout_b(
-            tiled_mma,
-            self.mma_tiler,
-            b_dtype,
-            self.num_ab_stage
+            tiled_mma, self.mma_tiler, b_dtype, self.num_ab_stage
         )
         tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
         tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
 
         # Swizzle o [(tileM, tileK), loopM, loopK]
-        a_smem_layout = cute.select(
-            a_smem_layout_staged,
-            mode=[0, 1, 2]
-        )
+        a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
         tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
             tma_load_op,
             hidden,
             a_smem_layout,
             self.mma_tiler,
             tiled_mma,
-            self.cluster_layout_vmnk.shape
+            self.cluster_layout_vmnk.shape,
         )
         # Swizzle o [(tileN, tileK), loopN, loopK]
-        b_smem_layout = cute.select(
-            b_smem_layout_staged,
-            mode=[0, 1, 2]
-        )
+        b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
         tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
             tma_load_op,
             weight,
             b_smem_layout,
             self.mma_tiler,
             tiled_mma,
-            self.cluster_layout_vmnk.shape
+            self.cluster_layout_vmnk.shape,
         )
         a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
         b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
@@ -767,6 +587,10 @@ def __call__(
 
         @cute.struct
         class SharedStorage:
+            """
+            The shared storage for the backward kernel.
+            """
+
             load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2]
             mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
 
@@ -781,6 +605,7 @@ class SharedStorage:
                 cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
                 self.buffer_align_bytes,
             ]
+
         self.shared_storage = SharedStorage
 
         self.kernel(
@@ -806,126 +631,5 @@ class SharedStorage:
             grid=grid,
             block=[self.threads_per_cta, 1, 1],
             cluster=self.cluster_shape_mnk,
-            stream=stream
+            stream=stream,
         )
-
-
-if __name__ == "__main__":
-    torch.manual_seed(1113)
-
-    batch = 4
-    seqlen = 1023
-    dim = 8192
-    vocab_size = 152064
-    dtype = torch.bfloat16
-    split_idx = 0
-    vocab_per_split = 512 * 6
-
-    hidden = torch.randn(batch, seqlen, dim, device="cuda", dtype=dtype)
-    weight = torch.randn(vocab_size, dim, device="cuda", dtype=dtype)
-    labels = torch.randint(0, vocab_size, (batch, seqlen), device="cuda", dtype=torch.long)
-    num_valid_tokens = torch.tensor(batch * seqlen, device="cuda", dtype=torch.int64)
-
-    dlogprobs = torch.randn(batch, seqlen, device="cuda", dtype=torch.float32)
-
-    def get_maximum_and_accu(hidden, weight):
-        logits = (hidden @ weight.T).to(torch.float32)
-        maximum, _ = torch.max(logits, dim=-1)
-        accu = torch.sum(torch.exp(logits - maximum.unsqueeze(-1)), dim=-1)
-        return maximum, accu
-    maximum, accu = get_maximum_and_accu(hidden, weight)
-
-    dlogits_partial = torch.empty(
-        (batch, seqlen, vocab_per_split),
-        device=hidden.device,
-        dtype=hidden.dtype
-    )
-
-    # compile kernel
-    bwd_kernel = BwdPartialDlogits(
-        vocab_per_split=vocab_per_split,
-        reduction=0
-    )
-
-    hidden_packed = from_dlpack(
-        hidden.view(-1, dim), 
-        assumed_align=16).mark_compact_shape_dynamic(mode=0)
-    weight_packed = from_dlpack(
-        weight,
-        assumed_align=16
-    )
-    labels_packed = from_dlpack(
-        labels.view(-1),
-        assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0)
-    dlogprobs_packed = from_dlpack(
-        dlogprobs.view(-1),
-        assumed_align=16
-    ).mark_compact_shape_dynamic(mode=0)
-    maximum_packed = from_dlpack(
-        maximum.view(-1),
-        assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0)
-    accu_packed = from_dlpack(
-        accu.view(-1),
-        assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0)
-    dlogits_partial_packed = from_dlpack(
-        dlogits_partial.view(-1, vocab_per_split),
-        assumed_align=32,
-    ).mark_compact_shape_dynamic(mode=0)
-    scalarNumValidTokens_packed = cute.runtime.make_ptr(
-        cutlass.Int64,
-        num_valid_tokens.data_ptr(),
-        cute.AddressSpace.gmem,
-        assumed_align=8
-    )
-
-    ignore_index = -100
-
-    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-
-    rank = 0
-
-    compiled = cute.compile(
-        bwd_kernel,
-        split_idx,
-        hidden_packed,
-        weight_packed,
-        labels_packed,
-        dlogprobs_packed,
-        maximum_packed,
-        accu_packed,
-        dlogits_partial_packed,
-        scalarNumValidTokens_packed,
-        ignore_index,
-        rank,
-        stream,
-    )
-
-    start, stop = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-    start.record(stream=torch.cuda.current_stream())
-    with torch.cuda.nvtx.range("BwdPartialDlogits"):
-        compiled(
-            split_idx,
-            hidden_packed,
-            weight_packed,
-            labels_packed,
-            dlogprobs_packed,
-            maximum_packed,
-            accu_packed,
-            dlogits_partial_packed,
-            scalarNumValidTokens_packed,
-            ignore_index,
-            rank,
-            stream
-        )
-    stop.record(stream=torch.cuda.current_stream())
-
-    torch.cuda.synchronize()
-
-    elapsed_time = start.elapsed_time(stop)
-
-    print(dlogits_partial)
-
-    print(f"Success, Elapsed time: {elapsed_time:.4f} ms")
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index e26661ca06a..786f0fd9b3b 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -1,32 +1,55 @@
-import torch
-import torch.distributed as dist
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 import typing
-import triton
 
+import cuda.bindings.driver as cuda  # type: ignore
 import cutlass
 import cutlass.cute as cute
+import torch
+import torch.distributed as dist
+import triton  # type: ignore
 from cutlass.cute.runtime import from_dlpack
-import cuda.bindings.driver as cuda
 
 import megatron.core.fusions.linear_cross_entropy.utils as utils
-import megatron.core.fusions.linear_cross_entropy.blackwell.fwd_mainloop as fwd_mainloop
-import megatron.core.fusions.linear_cross_entropy.blackwell.bwd_partial_dlogits as bwd_partial_dlogits
-import megatron.core.fusions.linear_cross_entropy.blackwell.triton as triton_kernels
+from megatron.core.fusions.linear_cross_entropy.blackwell import (
+    bwd_partial_dlogits as bwd_partial_dlogits,
+)
+from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop
+from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
+
+
+class FwdConfig:
+    """
+    The configuration for the forward pass.
+    """
+
+    _dedicated_stream: torch.cuda.Stream = None
+    _dedicated_events: typing.List[torch.cuda.Event] = list()
+    _initialized: bool = False
+    _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = dict()
+
+
+class BwdConfig:
+    """
+    The configuration for the backward pass.
+    """
+
+    _bwd_kernel: typing.Dict[str, cute.kernel] = dict()
+
+
+_fwd_config = FwdConfig()
+_bwd_config = BwdConfig()
 
-# import linear_cross_entropy.utils as utils
-# import linear_cross_entropy.blackwell.fwd_mainloop as fwd_mainloop
-# import linear_cross_entropy.blackwell.bwd_partial_dlogits as bwd_partial_dlogits
-# import linear_cross_entropy.blackwell.triton as triton_kernels
 
 def forward(
     hidden: torch.Tensor,
     weight: torch.Tensor,
     labels: torch.Tensor,
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-    reduction: typing.Optional[str] = "mean",
-    ignore_index: typing.Optional[int] = -100,
-    sequence_parallel: typing.Optional[bool] = False,
-) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    reduction: str = "mean",
+    ignore_index: int = -100,
+    sequence_parallel: bool = False,
+) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor]:
     """
     forward host function
     """
@@ -42,45 +65,34 @@ def forward(
     # weight must be [vocab_size, dim]
     assert weight.dim() == 2
     # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
-    assert ((hidden.dim() == 2 and labels.dim() == 1) 
-            or (hidden.dim() == 3 and labels.dim() == 2))
+    assert (hidden.dim() == 2 and labels.dim() == 1) or (hidden.dim() == 3 and labels.dim() == 2)
     assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
 
     hidden_view = hidden.view(-1, hidden.shape[-1])
     labels_view = labels.view(-1)
 
-    assert ((sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0])
-            or (not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]))
+    assert (sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]) or (
+        not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]
+    )
     assert hidden_view.shape[1] == weight.shape[1]
 
     global_hidden = hidden
     if in_tp_mode and sequence_parallel:
         partial_hidden_shape = hidden.shape
-        global_hidden_shape = (
-            partial_hidden_shape[0] * tp_world_size,
-            *partial_hidden_shape[1:]
-        )
-        global_hidden = torch.empty(
-            global_hidden_shape,
-            dtype=hidden.dtype,
-            device=hidden.device
-        )
-        dist.all_gather_into_tensor(
-            global_hidden,
-            hidden,
-            group=tp_group
-        )
+        global_hidden_shape = (partial_hidden_shape[0] * tp_world_size, *partial_hidden_shape[1:])
+        global_hidden = torch.empty(global_hidden_shape, dtype=hidden.dtype, device=hidden.device)
+        dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group)
         assert global_hidden.is_contiguous()
         hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
 
     num_tokens, dim = hidden_view.shape
     vocab_size, _ = weight.shape
 
-    if not hasattr(forward, "_initialized"):
-        global _dedicated_stream, _dedicated_events
-        _dedicated_stream = torch.cuda.Stream(hidden.device)
-        _dedicated_events = [torch.cuda.Event() for _ in range(2)]
-        forward._initialized = True
+    global _fwd_config
+    if not _fwd_config._initialized:
+        _fwd_config._dedicated_stream = torch.cuda.Stream(hidden.device)
+        _fwd_config._dedicated_events = [torch.cuda.Event() for _ in range(2)]
+        _fwd_config._initialized = True
 
     REDUCTION = utils.str_to_reduction_enum(reduction)
     # declare logprobs
@@ -94,7 +106,9 @@ def forward(
     maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
     accumulate = torch.empty_like(maximum, dtype=torch.float32)
     num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
-    assert maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
+    assert (
+        maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
+    )
     # declare intermediate tensors
     # NOTE: this is a parameter for tuning
     vocab_per_split = 512 * 6
@@ -110,44 +124,31 @@ def forward(
     assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
 
     triton_kernels.get_num_valid_tokens[(1,)](
-        num_tokens,
-        ignore_index,
-        labels_view,
-        labels_view.stride(0),
-        num_valid_tokens,
+        num_tokens, ignore_index, labels_view, labels_view.stride(0), num_valid_tokens
     )
-    
-    if not hasattr(forward, "_fwd_mainloop_kernels"):
-        forward._fwd_mainloop_kernels = dict()
 
     # need to compile the kernel for the first time
-    hidden_packed = from_dlpack(
-        hidden_view.detach(), assumed_align=16
-    ).mark_compact_shape_dynamic(mode=0)
-    weight_packed = from_dlpack(
-        weight.detach(), assumed_align=16
+    hidden_packed = from_dlpack(hidden_view.detach(), assumed_align=16).mark_compact_shape_dynamic(
+        mode=0
+    )
+    weight_packed = from_dlpack(weight.detach(), assumed_align=16)
+    labels_packed = from_dlpack(labels_view.detach(), assumed_align=8).mark_compact_shape_dynamic(
+        mode=0
+    )
+    logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+    _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(
+        mode=0, stride_order=(0, 1)
+    )
+    _accu_packed = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(
+        mode=0, stride_order=(0, 1)
     )
-    labels_packed = from_dlpack(
-        labels_view.detach(), assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0)
-    logprobs_packed = from_dlpack(
-        _logprobs, assumed_align=16
-    ).mark_compact_shape_dynamic(mode=0)
-    _max_packed = from_dlpack(
-        _max, assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
-    _accu_packed = from_dlpack(
-        _accu, assumed_align=8
-    ).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1))
     cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
     # VocabSize and Dim are fixed for a given model,
     # only the number of tokens can vary
     key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
-    if forward._fwd_mainloop_kernels.get(key) is None:
-        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(
-            vocab_per_split=vocab_per_split,
-        )
+    if _fwd_config._fwd_mainloop_kernels.get(key) is None:
+        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=vocab_per_split)
         fwd_mainloop_compiled_kernel = cute.compile(
             fwd_mainloop_kernel,
             hidden_packed,
@@ -158,11 +159,11 @@ def forward(
             _accu_packed,
             ignore_index,
             tp_rank,
-            cuda_stream
+            cuda_stream,
         )
-        forward._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
+        _fwd_config._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
     else:
-        fwd_mainloop_compiled_kernel = forward._fwd_mainloop_kernels[key]
+        fwd_mainloop_compiled_kernel = _fwd_config._fwd_mainloop_kernels[key]
     fwd_mainloop_compiled_kernel(
         hidden_packed,
         weight_packed,
@@ -172,10 +173,11 @@ def forward(
         _accu_packed,
         ignore_index,
         tp_rank,
-        cuda_stream
+        cuda_stream,
     )
-    
+
     if not in_tp_mode:
+
         def grid(meta):
             return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
 
@@ -205,11 +207,11 @@ def grid(meta):
         _max_backup = _max.clone()
         dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
 
-        torch.cuda.current_stream().record_event(_dedicated_events[0])
-        with torch.cuda.stream(_dedicated_stream):
-            _dedicated_stream.wait_event(_dedicated_events[0])
+        torch.cuda.current_stream().record_event(_fwd_config._dedicated_events[0])
+        with torch.cuda.stream(_fwd_config._dedicated_stream):
+            _fwd_config._dedicated_stream.wait_event(_fwd_config._dedicated_events[0])
             dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
-            _dedicated_stream.record_event(_dedicated_events[1])
+            _fwd_config._dedicated_stream.record_event(_fwd_config._dedicated_events[1])
 
         def grid(meta):
             return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
@@ -235,7 +237,7 @@ def grid(meta):
         dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
 
         # update logprobs
-        torch.cuda.current_stream().wait_event(_dedicated_events[1])
+        torch.cuda.current_stream().wait_event(_fwd_config._dedicated_events[1])
         triton_kernels.forward_tp_epilogue_update_logprobs[grid](
             num_tokens,
             ignore_index,
@@ -254,6 +256,7 @@ def grid(meta):
 
     return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
 
+
 def backward(
     dlogprobs: torch.Tensor,
     global_hidden: torch.Tensor,
@@ -262,13 +265,13 @@ def backward(
     maximum: torch.Tensor,
     accu: torch.Tensor,
     num_valid_tokens: torch.Tensor,
-    reduction: typing.Optional[str] = "mean",
-    ignore_index: typing.Optional[int] = -100,
+    reduction: str = "mean",
+    ignore_index: int = -100,
     tp_group: typing.Optional[dist.ProcessGroup] = None,
-    tp_rank: typing.Optional[int] = 0,
-    tp_world_size: typing.Optional[int] = 1,
-    sequence_parallel: typing.Optional[bool] = False,
-) -> typing.List[torch.Tensor]:
+    tp_rank: int = 0,
+    tp_world_size: int = 1,
+    sequence_parallel: bool = False,
+) -> typing.Tuple[torch.Tensor, torch.Tensor]:
     """
     backward host function
     """
@@ -282,13 +285,16 @@ def backward(
 
     REDUCTION = utils.str_to_reduction_enum(reduction)
     dlogprobs_view = dlogprobs.view(-1)
-    assert (
-        (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,))
-        or (REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0)
+    assert (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)) or (
+        REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0
     )
     assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
 
-    assert num_valid_tokens.dim() == 0 and num_valid_tokens.is_cuda and num_valid_tokens.dtype == torch.int64
+    assert (
+        num_valid_tokens.dim() == 0
+        and num_valid_tokens.is_cuda
+        and num_valid_tokens.dtype == torch.int64
+    )
 
     d_hidden = torch.empty_like(global_hidden)
     d_weight = torch.empty_like(weight)
@@ -301,60 +307,38 @@ def backward(
         num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
 
         _d_logits = torch.empty(
-            (num_tokens, vocab_per_split),
-            device=global_hidden.device,
-            dtype=global_hidden.dtype
+            (num_tokens, vocab_per_split), device=global_hidden.device, dtype=global_hidden.dtype
         )
 
         hidden_packed = from_dlpack(
-            hidden_view.detach(),
-            assumed_align=16
+            hidden_view.detach(), assumed_align=16
         ).mark_compact_shape_dynamic(mode=0)
-        weight_packed = from_dlpack(
-            weight.detach(),
-            assumed_align=16
-        )
+        weight_packed = from_dlpack(weight.detach(), assumed_align=16)
         labels_packed = from_dlpack(
-            labels_view.detach(),
-            assumed_align=8
+            labels_view.detach(), assumed_align=8
         ).mark_compact_shape_dynamic(mode=0)
         dlogprobs_packed = from_dlpack(
-            dlogprobs_view.detach(),
-            assumed_align=8
-        ).mark_compact_shape_dynamic(mode=0)
-        maximum_packed = from_dlpack(
-            maximum.detach(),
-            assumed_align=8
-        ).mark_compact_shape_dynamic(mode=0)
-        accu_packed = from_dlpack(
-            accu.detach(),
-            assumed_align=8
-        ).mark_compact_shape_dynamic(mode=0)
-        dlogits_packed = from_dlpack(
-            _d_logits,
-            assumed_align=32
+            dlogprobs_view.detach(), assumed_align=8
         ).mark_compact_shape_dynamic(mode=0)
+        maximum_packed = from_dlpack(maximum.detach(), assumed_align=8).mark_compact_shape_dynamic(
+            mode=0
+        )
+        accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(mode=0)
+        dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(mode=0)
         scalarNumValidTokens_packed = cute.runtime.make_ptr(
-            cutlass.Int64,
-            num_valid_tokens.data_ptr(),
-            cute.AddressSpace.gmem,
-            assumed_align=8
+            cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8
         )
 
         stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
-        if not hasattr(backward, "_bwd_kernel"):
-            backward._bwd_kernel = dict()
-
         key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
-        if backward._bwd_kernel.get(key) is None:
+        if _bwd_config._bwd_kernel.get(key) is None:
             bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
-                reduction=REDUCTION,
-                vocab_per_split=vocab_per_split,
+                reduction=REDUCTION, vocab_per_split=vocab_per_split
             )
             bwd_kernel_compiled = cute.compile(
                 bwd_kernel,
-                0, # split_idx
+                0,  # split_idx
                 hidden_packed,
                 weight_packed,
                 labels_packed,
@@ -365,11 +349,11 @@ def backward(
                 scalarNumValidTokens_packed,
                 ignore_index,
                 tp_rank,
-                stream
+                stream,
             )
-            backward._bwd_kernel[key] = bwd_kernel_compiled
+            _bwd_config._bwd_kernel[key] = bwd_kernel_compiled
         else:
-            bwd_kernel_compiled = backward._bwd_kernel.get(key)
+            bwd_kernel_compiled = _bwd_config._bwd_kernel.get(key)
 
         for split_idx in range(num_splits):
             bwd_kernel_compiled(
@@ -384,29 +368,28 @@ def backward(
                 scalarNumValidTokens_packed,
                 ignore_index,
                 tp_rank,
-                stream
+                stream,
             )
+            # remove padding areas
+            # cublas can handle non-contiguous tensors
+            # therefore, we do not need to contiguous the tensor
             vocab_right_bound = (
                 min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split
             )
-            # remove padding areas
-            _d_logits = _d_logits[:, :vocab_right_bound].contiguous()
-
-            if split_idx == 0:
-                torch.matmul(
-                    _d_logits,
-                    weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
-                    out=d_hidden.view(num_tokens, dim)
-                )
-            else:
-                d_hidden += torch.matmul(
-                    _d_logits,
-                    weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
-                ).view(d_hidden.shape)
+            valid_d_logits = _d_logits[:, :vocab_right_bound]
+
+            torch.addmm(
+                input=d_hidden.view(-1, dim),
+                mat1=valid_d_logits,
+                mat2=weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                beta=(split_idx != 0),
+                alpha=1.0,
+                out=d_hidden.view(-1, dim),
+            )
             torch.matmul(
-                _d_logits.T,
+                valid_d_logits.T,
                 hidden_view,
-                out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :]
+                out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
             )
     else:
         raise NotImplementedError(f"Unsupported backward method: {_backward}")
@@ -416,10 +399,12 @@ def backward(
         if sequence_parallel:
             partial_hidden_shape = (
                 global_hidden.shape[0] // tp_world_size,
-                *global_hidden.shape[1:]
+                *global_hidden.shape[1:],
             )
             partial_num_tokens = num_tokens // tp_world_size
-            d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :]
+            d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[
+                tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :
+            ]
             d_hidden = d_hidden.view(partial_hidden_shape).clone()
-    
-    return d_hidden, d_weight
\ No newline at end of file
+
+    return d_hidden, d_weight
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
index 81346b0df81..ebb9709822c 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
@@ -1,41 +1,45 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 """
 Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel
 """
 
-from typing import Optional, Type, Tuple, Union
-import cuda.bindings.driver as cuda
-
-import torch
+from typing import Tuple, Type
 
+import cuda.bindings.driver as cuda  # type: ignore
 import cutlass
 import cutlass.cute as cute
-import cutlass.utils as utils
-import cutlass.pipeline as pipeline
+import cutlass.pipeline as pipeline  # type: ignore
+import cutlass.utils as utils  # type: ignore
+import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
 from cutlass.cute.nvgpu import cpasync, tcgen05
-import cutlass.torch as cutlass_torch
-import cutlass.utils.blackwell_helpers as sm100_utils
-from cutlass.cute.runtime import from_dlpack
-
 
 SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
+
 def make_thread_cooperative_group(size: int):
+    """
+    Create a thread cooperative group.
+    """
     return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
 
+
 class FwdMainLoop:
     """
     This class implements the mainloop for forward process.
 
     Traits stored as attributes.
 
-    :param acc_dtype: 
+    :param acc_dtype:
     """
 
-    def __init__(self,
-                 acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
-                 use_2cta_instrs: bool = False,
-                 mma_tiler_mn: Tuple[int, int] = (128, 256),
-                 vocab_per_split: int = 512):
+    def __init__(
+        self,
+        acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+        use_2cta_instrs: bool = False,
+        mma_tiler_mn: Tuple[int, int] = (128, 256),
+        vocab_per_split: int = 512,
+    ):
         """
         Configuration including:
             - MMA instruction settings
@@ -45,16 +49,10 @@ def __init__(self,
         self.use_2cta_instrs = use_2cta_instrs
         # This is the shape covered by tiledMMA, not just single MMA instruction
         self.mma_tiler = (*mma_tiler_mn, 1)
-        self.cta_tiler = (
-            self.mma_tiler[0],
-            vocab_per_split,
-            self.mma_tiler[2]
-        )
+        self.cta_tiler = (self.mma_tiler[0], vocab_per_split, self.mma_tiler[2])
         self.vocab_per_split = vocab_per_split
-        
-        self.cta_group = (
-            tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
-        )
+
+        self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
         self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
 
         self.occupancy = 1
@@ -73,19 +71,14 @@ def __init__(self,
         self.empty_warp_ids = (6, 7)
 
         self.threads_per_cta: int = self.threads_per_warp * len(
-            (*self.epi_warp_ids,
-             self.load_warp_ids,
-             self.mma_warp_ids,
-             *self.empty_warp_ids)
+            (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
         )
 
         self.cta_sync_barrier = pipeline.NamedBarrier(
-            barrier_id = 1,
-            num_threads = self.threads_per_cta
+            barrier_id=1, num_threads=self.threads_per_cta
         )
         self.tmem_alloc_barrier = pipeline.NamedBarrier(
-            barrier_id = 2,
-            num_threads = self.threads_per_cta
+            barrier_id=2, num_threads=self.threads_per_cta
         )
 
         self.buffer_align_bytes: int = 1024
@@ -97,26 +90,14 @@ def _compute_stages(
         tiled_mma: cute.TiledMma,
         mma_tiler: Tuple[int, int, int],
         a_dtype: Type[cutlass.Numeric],
-        b_dtype: Type[cutlass.Numeric]
+        b_dtype: Type[cutlass.Numeric],
     ):
         a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
-            tiled_mma,
-            mma_tiler,
-            a_dtype,
-            1, # only single stage
-        )
-        b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(
-            tiled_mma,
-            mma_tiler,
-            b_dtype,
-            1,
-        )
-        a_bytes_per_stage = cute.size_in_bytes(
-            a_dtype, a_smem_layout_stage_one
-        )
-        b_bytes_per_stage = cute.size_in_bytes(
-            b_dtype, b_smem_layout_stage_one
+            tiled_mma, mma_tiler, a_dtype, 1  # only single stage
         )
+        b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(tiled_mma, mma_tiler, b_dtype, 1)
+        a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
+        b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one)
         num_acc_stage = 2
         num_a_stage = 4
         num_b_stage = 4
@@ -132,30 +113,26 @@ def _setup_attributes(
     ):
         self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
         self.cluster_layout_vmnk = cute.tiled_divide(
-            cute.make_layout(self.cluster_shape_mnk),
-            (tiled_mma.thr_id.shape,),
+            cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
         )
 
         # this is fixed for dense MMA, k=16
         mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
         # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
         mma_inst_tile_k: int = 4
-        self.mma_tiler = (
-            self.mma_tiler[0],
-            self.mma_tiler[1],
-            mma_inst_shape_k * mma_inst_tile_k
-        )
+        self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
 
-        self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile =\
+        self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = (
             self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+        )
         self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
         assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
 
         self.cta_tile_shape_mnk = (
             self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
             self.mma_tiler[1],
-            self.mma_tiler[2]
-        )    
+            self.mma_tiler[2],
+        )
 
     @cute.kernel
     def kernel(
@@ -174,8 +151,11 @@ def kernel(
         cluster_layout_vmnk: cute.Layout,
         problem_mnk: Tuple[int, int, int],
         ignore_index: cutlass.Int64,
-        rank: cutlass.Int32
+        rank: cutlass.Int32,
     ):
+        """
+        The forward kernel for the mainloop.
+        """
         warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
         tidx, _, _ = cute.arch.thread_idx()
         bidx, bidy, _ = cute.arch.block_idx()
@@ -196,7 +176,7 @@ def kernel(
             producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
             consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
             tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes,
-            barrier_storage=storage.load_ab_mbar_ptr.data_ptr()
+            barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
         )
         ab_producer_state = pipeline.make_pipeline_state(
             pipeline.PipelineUserType.Producer, self.num_a_stage
@@ -211,7 +191,7 @@ def kernel(
             consumer_group=make_thread_cooperative_group(
                 self.threads_per_warp * len(self.epi_warp_ids)
             ),
-            barrier_storage=storage.mma_mbar_ptr.data_ptr()
+            barrier_storage=storage.mma_mbar_ptr.data_ptr(),
         )
         mma_producer_state = pipeline.make_pipeline_state(
             pipeline.PipelineUserType.Producer, self.num_acc_stage
@@ -224,23 +204,16 @@ def kernel(
         if warp_idx == self.empty_warp_ids[0]:
             with cute.arch.elect_one():
                 cute.arch.mbarrier_init(
-                    tmem_dealloc_mbar_ptr,
-                    self.threads_per_warp * len(self.epi_warp_ids)
+                    tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
                 )
                 cute.arch.mbarrier_init_fence()
 
         # -------- SMEM partition ------------ #
         # swizzle o [(tileM, tileK), loopM, loopK, Stage]
-        sA = storage.sA.get_tensor(
-            a_smem_layout_staged.outer,
-            swizzle=a_smem_layout_staged.inner
-        )
+        sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
         # swizzle o [(tileN, tileK), loopN, loopK, stage]
-        sB = storage.sB.get_tensor(
-            b_smem_layout_staged.outer,
-            swizzle=b_smem_layout_staged.inner
-        )
-        
+        sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+
         # FIXME: if 2 CTAs, modify here
         thr_mma = tiled_mma.get_slice(0)
         # [MMA, loopM, loopK, stage]
@@ -250,72 +223,50 @@ def kernel(
 
         # ---------- GMEM partition ----------- #
         # [tileM, tileK, loopK]
-        gA = cute.local_tile(
-            mA,
-            (self.mma_tiler[0], self.mma_tiler[2]),
-            (pidm, None)
-        )
+        gA = cute.local_tile(mA, (self.mma_tiler[0], self.mma_tiler[2]), (pidm, None))
 
         # [vocab_size_per_split, dim]
         mB_n = cute.local_tile(
-            mB,
-            (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])),
-            (pidn, 0)
+            mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (pidn, 0)
         )
 
         # [tileN, tileK, loopN, loopK]
-        gB = cute.local_tile(
-            mB_n,
-            (self.mma_tiler[1], self.mma_tiler[2]),
-            (None, None)
-        )
-        
+        gB = cute.local_tile(mB_n, (self.mma_tiler[1], self.mma_tiler[2]), (None, None))
+
         # [MMA, tileCntM, tileCntK, loopK]
         tCgA = thr_mma.partition_A(gA)
         # [MMA, tileCntN, tileCntK, loopN, loopK]
         tCgB = thr_mma.partition_B(gB)
 
-        a_cta_layout = cute.make_layout(
-            cute.slice_(
-                cluster_layout_vmnk, 
-                (0, 0, None, 0)).shape
-        )
+        a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
         # FIXME: if 2 CTAs, modify here
         cta_rank_in_cluster = 0
-        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(
-            cta_rank_in_cluster
-        )
+        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
         tTMAsA, tTMAgA = cpasync.tma_partition(
             tma_atom_a,
-            block_in_cluster_coord_vmnk[2], # cta_coord,
+            block_in_cluster_coord_vmnk[2],  # cta_coord,
             a_cta_layout,
-            cute.group_modes(sA, 0, 3), # SMEM tensor
-            cute.group_modes(tCgA, 0, 3) # GMEM tensor
-        )
-        b_cta_layout = cute.make_layout(
-            cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
+            cute.group_modes(sA, 0, 3),  # SMEM tensor
+            cute.group_modes(tCgA, 0, 3),  # GMEM tensor
         )
+        b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
         tTMAsB, tTMAgB = cpasync.tma_partition(
             tma_atom_b,
-            block_in_cluster_coord_vmnk[1], # cta_coord
+            block_in_cluster_coord_vmnk[1],  # cta_coord
             b_cta_layout,
             cute.group_modes(sB, 0, 3),
-            cute.group_modes(tCgB, 0, 3)
+            cute.group_modes(tCgB, 0, 3),
         )
 
         # Allocate TMEM
         tmem_holding_buf = storage.tmem_holding_buf
         if warp_idx == self.empty_warp_ids[0]:
             cute.arch.alloc_tmem(
-                self.tmem_alloc_cols, 
-                tmem_holding_buf,
-                is_two_cta=self.use_2cta_instrs
+                self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
             )
         self.cta_sync_barrier.arrive_and_wait()
         tmem_ptr = cute.arch.retrieve_tmem_ptr(
-            self.acc_dtype,
-            alignment=16,
-            ptr_to_buffer_holding_addr=tmem_holding_buf
+            self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
         )
 
         # [(tileM, tileN), loopM, loopN]
@@ -323,16 +274,14 @@ def kernel(
         acc_shape = thr_mma.partition_shape_C(tmem_shape)
         tCtC_fake = thr_mma.make_fragment_C(acc_shape)
         tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
-        
-        block_vocab_left_idx: cutlass.Int64 = (
-            pidn * self.vocab_per_split
-        )
-        block_vocab_right_idx: cutlass.Int64 = (
-            min((pidn + 1) * self.vocab_per_split, problem_mnk[1])
+
+        block_vocab_left_idx: cutlass.Int64 = pidn * self.vocab_per_split
+        block_vocab_right_idx: cutlass.Int64 = min(
+            (pidn + 1) * self.vocab_per_split, problem_mnk[1]
         )
         num_n_tiles: cutlass.Int64 = cute.ceil_div(
-            (block_vocab_right_idx - block_vocab_left_idx), 
-            self.mma_tiler[1])
+            (block_vocab_right_idx - block_vocab_left_idx), self.mma_tiler[1]
+        )
 
         # ///////
         # empty
@@ -353,13 +302,13 @@ def kernel(
                         tma_atom_a,
                         tTMAgA[(None, k)],
                         tTMAsA[(None, ab_producer_state.index)],
-                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                     )
                     cute.copy(
                         tma_atom_b,
                         tTMAgB[(None, n, k)],
                         tTMAsB[(None, ab_producer_state.index)],
-                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state)
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                     )
                     ab_pipeline.producer_commit(ab_producer_state)
                     ab_producer_state.advance()
@@ -384,7 +333,7 @@ def kernel(
                             cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
                             tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
                             tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
-                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)])
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
                         )
                         # enable accumulate for the next tile
                         tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
@@ -404,21 +353,18 @@ def kernel(
             # epilog TMEM copy and partition
             copy_atom_t2r = sm100_utils.get_tmem_load_op(
                 self.cta_tile_shape_mnk,
-                utils.LayoutEnum.ROW_MAJOR, # This is hard-coded
+                utils.LayoutEnum.ROW_MAJOR,  # This is hard-coded
                 self.acc_dtype,
                 self.acc_dtype,
                 (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
-                self.use_2cta_instrs
+                self.use_2cta_instrs,
             )
             # [tileM, subTileN, loopM, CntSubTileN, loopN]
             tAcc_epi = cute.flat_divide(
                 tCtC[((None, None), 0, None)],
-                (self.epi_tile[0],
-                 self.epi_tile[1] // self.num_epi_stage_per_tile)
-            )
-            tiled_copy_t2r = tcgen05.make_tmem_copy(
-                copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
             )
+            tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
             thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
             tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
             # [(pattern), loopM, loopN, CntTileM, CntTileN]
@@ -429,131 +375,84 @@ def kernel(
             # [tileM, subTileN, loopM, CntSubTileN, CntTileN]
             tCcAcc_epi = cute.flat_divide(
                 tCcAcc[((None, None), 0, None)],
-                (self.epi_tile[0],
-                 self.epi_tile[1] // self.num_epi_stage_per_tile)
+                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
             )
             tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
-            tTMEM_load_cAcc_shape = cute.select(
-                tTMEM_load_cAcc.shape,
-                mode=[0, 1, 2]
-            )
+            tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
 
             # epilogue layouts
             epilogue_thread_layout = cute.make_layout((128, 1))
-            copy_atom_g2r = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                mLabels.element_type
-            )
-            tiled_copy_g2r = cute.make_tiled_copy(
-                copy_atom_g2r,
-                epilogue_thread_layout,
-                (128, 1)
-            )
+            copy_atom_g2r = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mLabels.element_type)
+            tiled_copy_g2r = cute.make_tiled_copy(copy_atom_g2r, epilogue_thread_layout, (128, 1))
             thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
 
-            copy_atom_r2g = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(),
-                cutlass.Float32
-            )
-            tiled_copy_r2g = cute.make_tiled_copy(
-                copy_atom_r2g,
-                epilogue_thread_layout,
-                (128, 1)
-            )
+            copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32)
+            tiled_copy_r2g = cute.make_tiled_copy(copy_atom_r2g, epilogue_thread_layout, (128, 1))
             thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
-            
 
             # auxiliary tensors
             # [tileM]
-            gLabels = cute.local_tile(
-                mLabels,
-                (self.epi_tile[0],),
-                (pidm,)
-            )
+            gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
 
             tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)]
             tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean)
             # [(1, 1), 1]
-            tLabelsCAcc_mask[0] = cute.elem_less(
-                pidm * self.epi_tile[0] + tidx,
-                problem_mnk[0]
-            )
+            tLabelsCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, problem_mnk[0])
             # to align shape with gMax and gAccu
             tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask)
 
             # [(1, 1), 1, 1]
             tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
             tLabelsrLabels = cute.make_fragment(tLabelsgLabels.shape, tLabelsgLabels.element_type)
-            cute.copy(
-                tiled_copy_g2r,
-                tLabelsgLabels,
-                tLabelsrLabels,
-                pred=tLabelsCAcc_mask
-            )
-            valid_mask: cutlass.Boolean =\
-                (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[0]
+            cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask)
+            valid_mask: cutlass.Boolean = (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[
+                0
+            ]
 
             # [tileM, 1]
-            gMax = cute.local_tile(
-                mMax,
-                (self.epi_tile[0], 1),
-                (pidm, pidn)
-            )
+            gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn))
             # [(CPYM, CPYN), loopM, loopN]
             tR2GgMax = thr_copy_r2g.partition_D(gMax)
             tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type)
             tR2GrMax.fill(-1e30)
 
             # [tileM, 1]
-            gAccu = cute.local_tile(
-                mAccu,
-                (self.epi_tile[0], 1),
-                (pidm, pidn)
-            )
+            gAccu = cute.local_tile(mAccu, (self.epi_tile[0], 1), (pidm, pidn))
             # [(CPYM, CPYN), loopM, loopN]
             tR2GgAccu = thr_copy_r2g.partition_D(gAccu)
             tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type)
             tR2GrAccu.fill(0.0)
-            
+
             # [tileM, 1]
-            gLogprobs = cute.append_ones(cute.local_tile(
-                mLogprobs,
-                (self.epi_tile[0],),
-                (pidm,)
-            ))
+            gLogprobs = cute.append_ones(cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,)))
             # [(CPYM, CPYN), loopM, loopN]
             tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
             tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
             tR2GrLogprobs.fill(0.0)
 
             # [(tileN // num_epi_stage_per_tile, 1), 1, 1]
-            tTMEM_load_rAcc = cute.make_fragment(
-                tTMEM_load_cAcc_shape,
-                self.acc_dtype
-            )
+            tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
 
             for n in cutlass.range(num_n_tiles):
                 mma_pipeline.consumer_wait(mma_consumer_state)
 
-                left: cutlass.Int64 = (
-                    block_vocab_left_idx + n * self.epi_tile[1]
-                )
-                right: cutlass.Int64 = (
-                    min((n + 1) * self.epi_tile[1] + block_vocab_left_idx, 
-                        block_vocab_right_idx)
+                left: cutlass.Int64 = block_vocab_left_idx + n * self.epi_tile[1]
+                right: cutlass.Int64 = min(
+                    (n + 1) * self.epi_tile[1] + block_vocab_left_idx, block_vocab_right_idx
                 )
                 num_n_subtiles: cutlass.Int64 = cute.ceil_div(
-                    (right - left),
-                    cute.size(tTMEM_load_rAcc, mode=[0])
+                    (right - left), cute.size(tTMEM_load_rAcc, mode=[0])
                 )
                 for n_subtile in cutlass.range(num_n_subtiles):
                     cute.copy(
                         tiled_copy_t2r,
                         tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
-                        tTMEM_load_rAcc
+                        tTMEM_load_rAcc,
                     )
 
-                    for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
+                    for idx in cutlass.range(
+                        cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
+                    ):
                         local_position: cutlass.Int64 = (
                             n * self.epi_tile[1]
                             + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
@@ -567,77 +466,46 @@ def kernel(
                             tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
 
                             position: cutlass.Int64 = (
-                                rank * problem_mnk[1]
-                                + pidn * self.vocab_per_split
-                                + local_position
+                                rank * problem_mnk[1] + pidn * self.vocab_per_split + local_position
                             )
                             mask: cutlass.Boolean = valid_mask and (position == tLabelsrLabels[0])
-                            tR2GrLogprobs[0] += (mask * tTMEM_load_rAcc[idx])
+                            tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx]
 
                 mma_pipeline.consumer_release(mma_consumer_state)
                 mma_consumer_state.advance()
 
-            cute.copy(
-                tiled_copy_r2g,
-                tR2GrMax,
-                tR2GgMax,
-                pred=tLabelsCAcc_mask
-            )
-            cute.copy(
-                tiled_copy_r2g,
-                tR2GrAccu,
-                tR2GgAccu,
-                pred=tLabelsCAcc_mask
-            )
+            cute.copy(tiled_copy_r2g, tR2GrMax, tR2GgMax, pred=tLabelsCAcc_mask)
+            cute.copy(tiled_copy_r2g, tR2GrAccu, tR2GgAccu, pred=tLabelsCAcc_mask)
 
-            vocab_left_idx: cutlass.Int64 = (
-                rank * problem_mnk[1]
-                + pidn * self.vocab_per_split
-            )
-            vocab_right_idx: cutlass.Int64 = (
-                rank * problem_mnk[1]
-                + min((pidn + 1) * self.vocab_per_split, problem_mnk[1])
+            vocab_left_idx: cutlass.Int64 = rank * problem_mnk[1] + pidn * self.vocab_per_split
+            vocab_right_idx: cutlass.Int64 = rank * problem_mnk[1] + min(
+                (pidn + 1) * self.vocab_per_split, problem_mnk[1]
             )
             valid: cutlass.Boolean = (
-                tLabelsrLabels[0] >= vocab_left_idx
-                and tLabelsrLabels[0] < vocab_right_idx
+                tLabelsrLabels[0] >= vocab_left_idx and tLabelsrLabels[0] < vocab_right_idx
             )
             tLabelsCAcc_mask[0] &= valid
 
-            cute.copy(
-                tiled_copy_r2g,
-                tR2GrLogprobs,
-                tR2GgLogprobs,
-                pred=tLabelsCAcc_mask
-            )
+            cute.copy(tiled_copy_r2g, tR2GrLogprobs, tR2GgLogprobs, pred=tLabelsCAcc_mask)
 
         # Dealloc TMEM
         self.cta_sync_barrier.arrive_and_wait()
         if warp_idx == self.empty_warp_ids[0]:
             cute.arch.relinquish_tmem_alloc_permit()
-            cute.arch.dealloc_tmem(
-                tmem_ptr, 
-                self.tmem_alloc_cols,
-                is_two_cta=self.use_2cta_instrs
-            )
+            cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
 
     @staticmethod
     def _compute_grid(
         problem_mnk: Tuple[int, int, int],
         cluster_shape_mn: Tuple[int, int],
         cta_tiler: Tuple[int, int, int],
-        num_splits: int
+        num_splits: int,
     ) -> Tuple[int, int, int]:
 
         cluster_shape = (*cluster_shape_mn, 1)
 
         grid = cute.round_up(
-            (
-                cute.ceil_div(problem_mnk[0], cta_tiler[0]),
-                num_splits,
-                1,
-            ),
-            cluster_shape
+            (cute.ceil_div(problem_mnk[0], cta_tiler[0]), num_splits, 1), cluster_shape
         )
         return grid
 
@@ -658,42 +526,31 @@ def __call__(
         b_dtype: Type[cutlass.Numeric] = weight.element_type
 
         if cutlass.const_expr(hidden.element_type != weight.element_type):
-            raise RuntimeError(f"data type don't match: {hidden.element_type} v.s. {weight.element_type}")
+            raise RuntimeError(
+                f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+            )
         if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
             raise RuntimeError("hidden can only be FP16 or BF16")
         if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
             raise RuntimeError("K dimension doesn't match")
-        
-        problem_mnk = (
-            hidden.layout.shape[0],
-            weight.layout.shape[0],
-            hidden.layout.shape[1],
-        )
+
+        problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
         if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
             raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
 
         num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split)
-        # if cutlass.const_expr(_max.layout.shape != (hidden.layout.shape[0], num_splits)):
-        #     raise RuntimeError(f"max shape mismatch: {_max.layout.shape} != ({hidden.layout.shape[0]}, {num_splits})")
-        # if cutlass.const_expr(_accu.layout.shape != (hidden.layout.shape[0], num_splits)):
-        #     raise RuntimeError(f"accu shape mismatch: {_accu.layout.shape} != ({hidden.layout.shape[0]}, {num_splits})")
 
         grid = self._compute_grid(
-            problem_mnk = problem_mnk,
-            cluster_shape_mn = self.cluster_shape_mn,
-            cta_tiler = self.cta_tiler,
-            num_splits = num_splits
+            problem_mnk=problem_mnk,
+            cluster_shape_mn=self.cluster_shape_mn,
+            cta_tiler=self.cta_tiler,
+            num_splits=num_splits,
         )
         a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
         b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
-        
+
         tiled_mma = sm100_utils.make_trivial_tiled_mma(
-            a_dtype,
-            a_major_mode,
-            b_major_mode,
-            self.acc_dtype,
-            self.cta_group,
-            self.mma_tiler[:2]
+            a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
         )
 
         self._setup_attributes(tiled_mma, a_dtype, b_dtype)
@@ -701,20 +558,14 @@ def __call__(
             raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}")
 
         self.epi_tile = self.mma_tiler[:2]
-        
+
         # Swizzle o [(tileM, tileK), loopM, loopK, stage]
         a_smem_layout_staged = sm100_utils.make_smem_layout_a(
-            tiled_mma,
-            self.mma_tiler,
-            a_dtype,
-            self.num_a_stage
+            tiled_mma, self.mma_tiler, a_dtype, self.num_a_stage
         )
         # Swizzle o [(tileN, tileK), loopN, loopK, stage]
         b_smem_layout_staged = sm100_utils.make_smem_layout_b(
-            tiled_mma,
-            self.mma_tiler,
-            b_dtype,
-            self.num_b_stage
+            tiled_mma, self.mma_tiler, b_dtype, self.num_b_stage
         )
 
         # TMA loading
@@ -722,32 +573,26 @@ def __call__(
         tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
 
         # Swizzle o [(tileM, tileK), loopM, loopK]
-        a_smem_layout = cute.select(
-            a_smem_layout_staged,
-            mode=[0, 1, 2]
-        )
+        a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
         # create tma copy atom for hidden,
         # and the cooresponding tma descriptor tensor
         tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A(
             tma_load_op,
-            hidden, # gmem_tensor
-            a_smem_layout, # SMEM layout
-            self.mma_tiler, # MMA tiler
-            tiled_mma, # TiledMMA
-            self.cluster_layout_vmnk.shape # cluster_shape_vmnk
+            hidden,  # gmem_tensor
+            a_smem_layout,  # SMEM layout
+            self.mma_tiler,  # MMA tiler
+            tiled_mma,  # TiledMMA
+            self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
         )
         # Swizzle o [(tileN, tileK), loopN, loopK]
-        b_smem_layout = cute.select(
-            b_smem_layout_staged,
-            mode=[0, 1, 2]
-        )
+        b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
         tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B(
             tma_load_op,
-            weight, # gmem_tensor
-            b_smem_layout, # SMEM layout
-            self.mma_tiler, # MMA tiler
-            tiled_mma, # TiledMMA
-            self.cluster_layout_vmnk.shape # cluster_shape_vmnk
+            weight,  # gmem_tensor
+            b_smem_layout,  # SMEM layout
+            self.mma_tiler,  # MMA tiler
+            tiled_mma,  # TiledMMA
+            self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
         )
         a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
         b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
@@ -755,8 +600,13 @@ def __call__(
         self.tma_copy_b_bytes = b_copy_size
 
         assert self.num_a_stage == self.num_b_stage
+
         @cute.struct
         class SharedStorage:
+            """
+            The shared storage for the forward kernel.
+            """
+
             # pipeline barriers, 2 = producer + consumer
             load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2]
             mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
@@ -772,6 +622,7 @@ class SharedStorage:
                 cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
                 self.buffer_align_bytes,
             ]
+
         self.shared_storage = SharedStorage
 
         # launch kernel
@@ -798,95 +649,3 @@ class SharedStorage:
             stream=stream,
         )
         return None
-
-
-if __name__ == "__main__":
-    rank = 0
-
-    vocab_per_split = 512 * 6
-    fwd_mainloop = FwdMainLoop(
-        vocab_per_split=vocab_per_split
-    ) # use default arguments
-
-    torch.manual_seed(1111)
-
-    num_tokens = 13092
-    hidden_size = 4096
-    vocab_size = 152064
-    # num_tokens = 4
-    # hidden_size = 64
-    # vocab_size = 512
-    dtype = torch.bfloat16
-    ignore_index = -100
-
-    hidden = (
-        torch.empty((num_tokens, hidden_size), dtype=dtype, device="cuda")
-        .uniform_(-0.5, 0.5)
-    )
-    weight = (
-        torch.empty((vocab_size, hidden_size), dtype=dtype, device="cuda")
-        .uniform_(-0.5, 0.5)
-    )
-    # hidden = torch.ones((num_tokens, hidden_size), dtype=dtype, device="cuda")
-    # weight = torch.ones((vocab_size, hidden_size), dtype=dtype, device="cuda")
-    labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")
-
-    # pad 1 ignore_index to the right
-    padded_labels = torch.nn.functional.pad(
-        labels, (0, 1), value=ignore_index
-    )
-    # remove first element
-    labels = padded_labels[..., 1:].contiguous()
-
-    # allocate output tensor
-    logprobs = torch.empty((num_tokens), dtype=torch.float32, device="cuda")
-    
-    # allocate intermediate tensors
-    num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
-    _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
-    _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
-
-
-    # compile kernel
-    _hidden = from_dlpack(hidden, assumed_align=16).mark_compact_shape_dynamic(mode=0, divisibility=1)
-    _weight = from_dlpack(weight, assumed_align=16)
-    _labels = from_dlpack(labels, assumed_align=8).mark_compact_shape_dynamic(mode=0)
-    _logprobs = from_dlpack(logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
-    _max_ = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(mode=0)
-    _accu_ = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(mode=0)
-    stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-    compiled = cute.compile(fwd_mainloop, 
-        _hidden, _weight, _labels, _logprobs, 
-        _max_, _accu_, 
-        ignore_index,
-        rank,
-        stream)
-    
-    # launch kernel
-    start, stop = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
-
-    with torch.cuda.nvtx.range("FwdMainLoop"):
-        start.record(stream=torch.cuda.current_stream())
-        compiled(_hidden, _weight, _labels, _logprobs, _max_, _accu_, ignore_index, rank, stream)
-        stop.record(stream=torch.cuda.current_stream())
-
-    torch.cuda.synchronize()
-
-    elapsed_time = start.elapsed_time(stop)
-
-    gemm = torch.matmul(hidden.to(torch.float32), weight.T.to(torch.float32))
-    # print(gemm)
-
-    # print(_max)
-    # print(_accu)
-    # print(logprobs)
-
-    cut_max, _ = torch.max(_max, dim=1)
-    print(cut_max)
-    # for i in range(cut_max.shape[0]):
-    #     print(i, cut_max[i])
-
-    torch_max, _ = torch.max(gemm, dim=1)
-    print(torch_max)
-
-    print(f"Success, Elapsed time: {elapsed_time:.4f} ms")
\ No newline at end of file
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
index d7f45d152c2..e025cc046f4 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
@@ -1,8 +1,11 @@
-import triton
-import triton.language as tl
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import triton  # type: ignore
+import triton.language as tl  # type: ignore
 
 # NOTE: tl.pointer_type() is not available in Triton 3.3.0
 
+
 @triton.autotune(
     configs=[
         triton.Config({"BLOCK_SIZE_M": 1024}, num_stages=3, num_warps=32),
@@ -14,9 +17,9 @@
 def get_num_valid_tokens(
     num_tokens: tl.int64,
     ignore_index: tl.int64,
-    labels_ptr,#: tl.pointer_type(tl.int64),
+    labels_ptr,  #: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
     BLOCK_SIZE_M: tl.constexpr,
 ):
     """
@@ -29,9 +32,7 @@ def get_num_valid_tokens(
         offs_am = m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
 
         labels = tl.load(
-            labels_ptr + offs_am * stride_labels,
-            mask=offs_am < num_tokens,
-            other=ignore_index
+            labels_ptr + offs_am * stride_labels, mask=offs_am < num_tokens, other=ignore_index
         )
 
         valid_labels_mask = labels != ignore_index
@@ -40,32 +41,30 @@ def get_num_valid_tokens(
 
 
 @triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})
-    ],
-    key=["num_tokens", "num_splits"]
+    configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})],
+    key=["num_tokens", "num_splits"],
 )
 @triton.jit
 def forward_dp_epilogue(
     num_tokens: tl.int64,
-    num_splits: tl.int64, # TODO: maybe this could be a constexpr
+    num_splits: tl.int64,  # TODO: maybe this could be a constexpr
     ignore_index: tl.int64,
-    labels_ptr,#: tl.pointer_type(tl.int64),
+    labels_ptr,  #: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
-    max_ptr,#: tl.pointer_type(tl.float32),
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
+    max_ptr,  #: tl.pointer_type(tl.float32),
     stride_max_m: tl.int64,
     stride_max_n: tl.int64,
-    accu_ptr,#: tl.pointer_type(tl.float32),
+    accu_ptr,  #: tl.pointer_type(tl.float32),
     stride_accu_m: tl.int64,
     stride_accu_n: tl.int64,
-    global_max_ptr,#: tl.pointer_type(tl.float32),
+    global_max_ptr,  #: tl.pointer_type(tl.float32),
     stride_global_max: tl.int64,
-    global_accu_ptr,#: tl.pointer_type(tl.float32),
+    global_accu_ptr,  #: tl.pointer_type(tl.float32),
     stride_global_accu: tl.int64,
-    global_logprobs_ptr,#: tl.pointer_type(tl.float32),
+    global_logprobs_ptr,  #: tl.pointer_type(tl.float32),
     stride_global_logprobs: tl.int64,
-    global_logprobs_scalar_ptr,#: tl.pointer_type(tl.float32),
+    global_logprobs_scalar_ptr,  #: tl.pointer_type(tl.float32),
     REDUCTION: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -103,78 +102,52 @@ def forward_dp_epilogue(
         global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
 
     # store maximum
-    tl.store(
-        global_max_ptr + offs_m * stride_global_max,
-        global_max,
-        mask=offs_m < num_tokens,
-    )
+    tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens)
     # store accumulate
-    tl.store(
-        global_accu_ptr + offs_m * stride_global_accu,
-        global_accu,
-        mask=offs_m < num_tokens,
-    )
+    tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens)
     # update logprobs
     labels = tl.load(
-        labels_ptr + offs_m * stride_labels,
-        mask=offs_m < num_tokens,
-        other=ignore_index,
+        labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index
     )
     global_logprobs_ptrs = global_logprobs_ptr + offs_m * stride_global_logprobs
-    global_logprobs = tl.load(
-        global_logprobs_ptrs,
-        mask=offs_m < num_tokens,
-    )
+    global_logprobs = tl.load(global_logprobs_ptrs, mask=offs_m < num_tokens)
     global_logprobs = global_max + tl.log(global_accu) - global_logprobs
     label_mask = labels != ignore_index
     global_logprobs = tl.where(label_mask, global_logprobs, 0.0)
 
-    if REDUCTION == 0: # no-reduction
-        tl.store(
-            global_logprobs_ptrs,
-            global_logprobs,
-            mask=offs_m < num_tokens,
-        )
-    elif REDUCTION == 1: # sum
+    if REDUCTION == 0:  # no-reduction
+        tl.store(global_logprobs_ptrs, global_logprobs, mask=offs_m < num_tokens)
+    elif REDUCTION == 1:  # sum
         global_logprobs_scalar = tl.sum(global_logprobs, axis=0)
-        tl.atomic_add(
-            global_logprobs_scalar_ptr,
-            global_logprobs_scalar
-        )
-    elif REDUCTION == 2: # mean
+        tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar)
+    elif REDUCTION == 2:  # mean
         num_valid_tokens = tl.load(num_valid_tokens_ptr)
         global_logprobs_scalar = tl.fdiv(
-            tl.sum(global_logprobs, axis=0),
-            num_valid_tokens.to(tl.float32),
-        )
-        tl.atomic_add(
-            global_logprobs_scalar_ptr,
-            global_logprobs_scalar
+            tl.sum(global_logprobs, axis=0), num_valid_tokens.to(tl.float32)
         )
+        tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar)
 
 
 @triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64}),
-    ],
-    key=["num_tokens", "num_splits"]
+    configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})],
+    key=["num_tokens", "num_splits"],
 )
 @triton.jit
 def forward_tp_epilogue(
     num_tokens: tl.int64,
     num_splits: tl.int64,
-    reduced_max_ptr,#: tl.pointer_type(tl.float32),
+    reduced_max_ptr,  #: tl.pointer_type(tl.float32),
     stride_reduced_max_m: tl.int64,
     stride_reduced_max_n: tl.int64,
-    original_max_ptr,#: tl.pointer_type(tl.float32),
+    original_max_ptr,  #: tl.pointer_type(tl.float32),
     stride_original_max_m: tl.int64,
     stride_original_max_n: tl.int64,
-    accu_ptr,#: tl.pointer_type(tl.float32),
+    accu_ptr,  #: tl.pointer_type(tl.float32),
     stride_accu_m: tl.int64,
     stride_accu_n: tl.int64,
-    global_max_ptr,#: tl.pointer_type(tl.float32),
+    global_max_ptr,  #: tl.pointer_type(tl.float32),
     stride_global_max: tl.int64,
-    global_accu_ptr,#: tl.pointer_type(tl.float32),
+    global_accu_ptr,  #: tl.pointer_type(tl.float32),
     stride_global_accu: tl.int64,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
@@ -193,12 +166,16 @@ def forward_tp_epilogue(
         offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
 
         _reduced_max = tl.load(
-            reduced_max_ptr + offs_m[:, None] * stride_reduced_max_m + offs_n[None, :] * stride_reduced_max_n,
+            reduced_max_ptr
+            + offs_m[:, None] * stride_reduced_max_m
+            + offs_n[None, :] * stride_reduced_max_n,
             mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
             other=0.0,
         )
         _original_max = tl.load(
-            original_max_ptr + offs_m[:, None] * stride_original_max_m + offs_n[None, :] * stride_original_max_n,
+            original_max_ptr
+            + offs_m[:, None] * stride_original_max_m
+            + offs_n[None, :] * stride_original_max_n,
             mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
             other=0.0,
         )
@@ -219,38 +196,25 @@ def forward_tp_epilogue(
         global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
 
     # store
-    tl.store(
-        global_max_ptr + offs_m * stride_global_max,
-        global_max,
-        mask=offs_m < num_tokens,
-    )
-    tl.store(
-        global_accu_ptr + offs_m * stride_global_accu,
-        global_accu,
-        mask=offs_m < num_tokens
-    )
+    tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens)
+    tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens)
 
 
-@triton.autotune(
-    configs=[
-        triton.Config({"BLOCK_SIZE_M": 16})
-    ],
-    key=["num_tokens"]
-)
+@triton.autotune(configs=[triton.Config({"BLOCK_SIZE_M": 16})], key=["num_tokens"])
 @triton.jit
 def forward_tp_epilogue_update_logprobs(
     num_tokens: tl.int64,
     ignore_index: tl.int64,
-    num_valid_tokens_ptr,#: tl.pointer_type(tl.int64),
-    labels_ptr,#: tl.pointer_type(tl.int64),
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
+    labels_ptr,  #: tl.pointer_type(tl.int64),
     stride_labels: tl.int64,
-    logprobs_ptr,#: tl.pointer_type(tl.float32),
+    logprobs_ptr,  #: tl.pointer_type(tl.float32),
     stride_logprobs: tl.int64,
-    maximum_ptr,#: tl.pointer_type(tl.float32),
+    maximum_ptr,  #: tl.pointer_type(tl.float32),
     stride_maximum: tl.int64,
-    accumulate_ptr,#: tl.pointer_type(tl.float32),
+    accumulate_ptr,  #: tl.pointer_type(tl.float32),
     stride_accumulate: tl.int64,
-    logprobs_scalar_ptr,#: tl.pointer_type(tl.float32),
+    logprobs_scalar_ptr,  #: tl.pointer_type(tl.float32),
     REDUCTION: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
 ):
@@ -261,45 +225,24 @@ def forward_tp_epilogue_update_logprobs(
 
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
 
-    logprobs = tl.load(
-        logprobs_ptr + offs_m * stride_logprobs,
-        mask=offs_m < num_tokens,
-    )
-    maximum = tl.load(
-        maximum_ptr + offs_m * stride_maximum,
-        mask=offs_m < num_tokens,
-    )
-    accumulate = tl.load(
-        accumulate_ptr + offs_m * stride_accumulate,
-        mask=offs_m < num_tokens,
-    )
+    logprobs = tl.load(logprobs_ptr + offs_m * stride_logprobs, mask=offs_m < num_tokens)
+    maximum = tl.load(maximum_ptr + offs_m * stride_maximum, mask=offs_m < num_tokens)
+    accumulate = tl.load(accumulate_ptr + offs_m * stride_accumulate, mask=offs_m < num_tokens)
 
     labels = tl.load(
-        labels_ptr + offs_m * stride_labels,
-        mask=offs_m < num_tokens,
-        other=ignore_index,
+        labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index
     )
     label_mask = labels != ignore_index
 
     logprobs = maximum + tl.log(accumulate) - logprobs
     logprobs = tl.where(label_mask, logprobs, 0.0)
 
-    if REDUCTION == 0: # no-reduction
-        tl.store(
-            logprobs_ptr + offs_m * stride_logprobs,
-            logprobs,
-            mask=offs_m < num_tokens,
-        )
-    elif REDUCTION == 1: # sum
+    if REDUCTION == 0:  # no-reduction
+        tl.store(logprobs_ptr + offs_m * stride_logprobs, logprobs, mask=offs_m < num_tokens)
+    elif REDUCTION == 1:  # sum
         logprobs_scalar = tl.sum(logprobs, axis=0)
-        tl.atomic_add(
-            logprobs_scalar_ptr,
-            logprobs_scalar
-        )
-    elif REDUCTION == 2: # mean
+        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
+    elif REDUCTION == 2:  # mean
         num_valid_tokens = tl.load(num_valid_tokens_ptr)
-        logprobs_scalar = tl.fdiv(
-            tl.sum(logprobs, axis=0),
-            num_valid_tokens.to(tl.float32),
-        )
-        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
\ No newline at end of file
+        logprobs_scalar = tl.fdiv(tl.sum(logprobs, axis=0), num_valid_tokens.to(tl.float32))
+        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
diff --git a/megatron/core/fusions/linear_cross_entropy/utils.py b/megatron/core/fusions/linear_cross_entropy/utils.py
index 642a6b3b230..9a62b9826cb 100644
--- a/megatron/core/fusions/linear_cross_entropy/utils.py
+++ b/megatron/core/fusions/linear_cross_entropy/utils.py
@@ -1,16 +1,20 @@
-import typing
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 from dataclasses import dataclass
 
+
 @dataclass
 class EntropyReductionEnum:
     """
     Enum for the reduction method of cross entropy.
     """
+
     kNone = 0
     kSum = 1
     kMean = 2
 
-def str_to_reduction_enum(reduction: str) -> EntropyReductionEnum:
+
+def str_to_reduction_enum(reduction: str) -> int:
     """
     str -> EntropyReductionEnum
     """
@@ -25,8 +29,13 @@ def str_to_reduction_enum(reduction: str) -> EntropyReductionEnum:
         raise ValueError(f"Invalid reduction: {reduction}")
     return _enum
 
+
 @dataclass
 class BackwardMethodEnum:
+    """
+    Enum for the backward method of linear cross entropy.
+    """
+
     # two separate kernels for d_hidden and d_weight, respectively
     kTwoKernels = 0
     # calculate partial d_logits along its N dimension
diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index 15352075661..b7013be89f0 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -60,7 +60,7 @@ def __init__(
             "If you don't need embd_group, you need to explicitly set it to None."
         )
         self.embd_group = pg_collection.embd
-        self.vp_stage = None
+        self.vp_stage: Optional[int] = None
         self.vp_size = self.config.virtual_pipeline_model_parallel_size
 
     def _is_in_embd_group(self):
@@ -134,8 +134,8 @@ def compute_language_model_loss_without_logits(
         sequence_parallel_enabled: bool = False,
         column_parallel_linear: torch.nn.Module = None,
         col_linear_kwargs: Dict[str, Any] = {},
-        reduction: Optional[str] = "none",
-        ignore_index: Optional[int] = -100,
+        reduction: str = "none",
+        ignore_index: int = -100,
     ) -> Tuple[Tensor, Optional[Tensor]]:
         """Computes the language model logits and loss (Cross entropy across vocabulary)
 
@@ -159,6 +159,9 @@ def compute_language_model_loss_without_logits(
             assert (
                 weight is not None
             ), "weight cannot be None when using fused linear cross entropy."
+            assert (
+                labels is not None
+            ), "labels cannot be None when using fused linear cross entropy."
             # [b s] => [s b]
             labels = labels.transpose(0, 1).contiguous()
             loss = linear_cross_entropy(
@@ -327,7 +330,7 @@ def shared_embedding_or_output_weight(self) -> Tensor:
     def sharded_state_dict(
         self,
         prefix: str = '',
-        sharded_offsets: Tuple[Tuple[int, int, int]] = (),
+        sharded_offsets: Tuple[Tuple[int, int, int], ...] = (),
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         """Sharded state dict implementation that handles the output layer weights tying.
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 5e3950d0003..0bb144e408d 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from collections import OrderedDict
-from typing import Dict, Literal, Optional
+from typing import Any, Dict, List, Literal, Optional, Tuple
 
 import torch
 from torch import Tensor
@@ -118,8 +118,8 @@ def __init__(
         self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
         self.parallel_output = parallel_output
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-        self.vp_stage = vp_stage
         self.disable_param_offloading = True
+        self.vp_stage: Optional[int] = vp_stage
 
         if hasattr(self.config, 'position_embedding_type'):
             self.position_embedding_type = self.config.position_embedding_type
@@ -199,7 +199,7 @@ def __init__(
             ), "mrope require mrope_section setting, but we got None from TransformerConfig"
 
         # Cache for RoPE tensors which do not change between iterations.
-        self.rotary_pos_emb_cache = {}
+        self.rotary_pos_emb_cache: Dict[int, Tuple[Tensor, Tensor]] = {}
 
         # Transformer.
         self.decoder = TransformerBlock(
@@ -219,6 +219,8 @@ def __init__(
         # Output
         if self.post_process:
 
+            self.embedding_activation_buffer: Optional[List[Tensor]] = None
+            self.grad_output_buffer: Optional[List[Tensor]] = None
             if self.config.defer_embedding_wgrad_compute:
                 # The embedding activation buffer preserves a reference to the input activations
                 # of the final embedding projection layer GEMM. It will hold the activations for
@@ -395,7 +397,7 @@ def _preprocess(
         if in_inference_mode and not has_config_logger_enabled(self.config):
             decoder_input = WrappedTensor(decoder_input)
 
-        preproc_output = (
+        preproc_output: Tuple[Any, ...] = (
             decoder_input,
             rotary_pos_emb,
             rotary_pos_cos,
@@ -439,7 +441,7 @@ def forward(
         labels: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
-        extra_block_kwargs: dict = None,
+        extra_block_kwargs: Optional[Dict[str, Any]] = None,
         runtime_gather_output: Optional[bool] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
@@ -709,7 +711,7 @@ def build_schedule_plan(
         labels: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
-        extra_block_kwargs: dict = None,
+        extra_block_kwargs: Optional[Dict[str, Any]] = None,
         runtime_gather_output: Optional[bool] = None,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 98d918ce448..eab86d6d532 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -61,7 +61,7 @@ def __init__(
         pre_process: bool = True,
         hybrid_attention_ratio: float = 0.0,
         hybrid_mlp_ratio: float = 0.0,
-        hybrid_override_pattern: str = None,
+        hybrid_override_pattern: Optional[str] = None,
         post_process: bool = True,
         fp16_lm_cross_entropy: bool = False,
         parallel_output: bool = True,
diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py
index 994998337d8..d80eaf13f5a 100644
--- a/tests/unit_tests/a2a_overlap/utils.py
+++ b/tests/unit_tests/a2a_overlap/utils.py
@@ -222,8 +222,8 @@ def get_test_config(num_layers=1, num_moe_experts=8, extra_kwargs={}, moe_groupe
 
 def get_valid_token_dispatcher_types():
     try:
-        from deep_ep import Buffer
-        from deep_ep.utils import EventHandle, EventOverlap
+        from deep_ep import Buffer  # type: ignore
+        from deep_ep.utils import EventHandle, EventOverlap  # type: ignore
 
         return ["alltoall", "flex"]
     except ImportError:
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index 130a2bb5a71..a36b8cfb4e0 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -1,15 +1,19 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
 import contextlib
+import os
+import typing
 from contextlib import ExitStack
 
 import numpy as np
 import pytest
 import torch
+import torch.distributed as dist
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
-import torch.distributed as dist
 
 import megatron.core.parallel_state as ps
+from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
 from megatron.core.models.gpt.gpt_layer_specs import (
     get_gpt_decoder_block_spec,
     get_gpt_mtp_block_spec,
@@ -23,10 +27,6 @@
 )
 from tests.unit_tests.test_utilities import Utils
 
-from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
-
-import os
-import typing
 
 class MockDataset(Dataset):
     """
@@ -138,8 +138,8 @@ def init_gpt_dataloader(
 
 
 @pytest.mark.skipif(
-    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),# or True,
-    reason="Requires torchrun with multiple GPUs"
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
+    reason="Requires torchrun with multiple GPUs",
 )
 class TestFusedLinearCrossEntropyOnGptModel:
     @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags())
@@ -198,8 +198,7 @@ def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
 
 
 @pytest.mark.skipif(
-    "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1",
-    reason="Requires single GPU"
+    "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1", reason="Requires single GPU"
 )
 class TestFusedLinearCrossEntropyDataParallel:
     def cleanup(self):
@@ -216,7 +215,7 @@ def torch_linear_cross_entropy(
         weight: torch.Tensor,
         labels: torch.Tensor,
         reduction: str,
-        ignore_index: int
+        ignore_index: int,
     ):
         # NOTE: need to convert to fp32 to fp32 accumulation,
         # thus assure accuracy
@@ -262,36 +261,28 @@ def test_kernel_launch(self):
         for num_token in num_tokens:
             hidden = torch.randn(num_token, dim, dtype=dtype, device="cuda").requires_grad_()
             labels = torch.randint(0, vocab_size, (num_token,), dtype=torch.long, device="cuda")
-            
-            logprobs = linear_cross_entropy(hidden, weight, labels, reduction=reduction, ignore_index=ignore_index)
+
+            logprobs = linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
             assert not torch.isnan(logprobs).any()
 
             gLogprobs = torch.randn_like(logprobs)
             (d_hidden, d_weight) = torch.autograd.grad(
-                (logprobs,),
-                (hidden, weight),
-                (gLogprobs,),
-                retain_graph=False
+                (logprobs,), (hidden, weight), (gLogprobs,), retain_graph=False
             )
             assert not torch.isnan(d_hidden).any()
             assert not torch.isnan(d_weight).any()
 
-
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("problem", get_problems())
     @pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
     @pytest.mark.parametrize("ignore_index", get_ignore_index())
-    def test_correctness(
-        self,
-        dtype,
-        problem,
-        reduction,
-        ignore_index
-    ):
+    def test_correctness(self, dtype, problem, reduction, ignore_index):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
-        
+
         hidden = (
             torch.empty(hidden_shape, dtype=dtype, device="cuda")
             .uniform_(-0.1, 0.1)
@@ -303,65 +294,40 @@ def test_correctness(
             .requires_grad_()
         )
         labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-        if ignore_index >=0 and ignore_index < vocabsize:
+        if ignore_index >= 0 and ignore_index < vocabsize:
             pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
             labels = pad_labels[..., 1:].contiguous()
 
         # forward
-        torch_logprobs = self.torch_linear_cross_entropy(hidden, weight, labels, 
-            reduction=reduction, ignore_index=ignore_index)
-
-        custom_logprobs = linear_cross_entropy(hidden, weight, labels, 
-            reduction=reduction, ignore_index=ignore_index)
+        torch_logprobs = self.torch_linear_cross_entropy(
+            hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+        )
 
-        torch.testing.assert_close(
-            torch_logprobs,
-            custom_logprobs
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
         )
 
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
+
         # backward
-        g_logprobs = (
-            torch.empty_like(torch_logprobs)
-            .uniform_(-0.1, 0.1)
-        )
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
 
         (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
-            (torch_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
 
         (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
-            (custom_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False)
-
-        torch.testing.assert_close(
-            d_torch_hidden,
-            d_custom_hidden,
-            atol=1e-3,
-            rtol=1e-3
-        )
-        torch.testing.assert_close(
-            d_torch_weight,
-            d_custom_weight,
-            atol=1e-3,
-            rtol=1e-3
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
 
+        torch.testing.assert_close(d_torch_hidden, d_custom_hidden, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_torch_weight, d_custom_weight, atol=1e-3, rtol=1e-3)
+
     @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
     @pytest.mark.parametrize("ignore_index", [-100])
-    def test_performance(
-        self,
-        problem,
-        dtype,
-        reduction,
-        ignore_index
-    ):
+    def test_performance(self, problem, dtype, reduction, ignore_index):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
@@ -387,66 +353,45 @@ def test_performance(
                 .requires_grad_()
             )
             labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-            if ignore_index >=0 and ignore_index < vocabsize:
+            if ignore_index >= 0 and ignore_index < vocabsize:
                 pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
                 labels = pad_labels[..., 1:].contiguous()
 
             # -------- forward -------- #
             start_event.record()
             torch_logprobs = self.torch_linear_cross_entropy(
-                hidden, weight, labels,
-                reduction=reduction, 
-                ignore_index=ignore_index
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
             )
             end_event.record()
             torch.cuda.synchronize()
-            torch_fwd_latency.append(
-                start_event.elapsed_time(end_event)
-            )
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
 
             start_event.record()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
-                reduction=reduction, 
-                ignore_index=ignore_index
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
             )
             end_event.record()
             torch.cuda.synchronize()
-            custom_fwd_latency.append(
-                start_event.elapsed_time(end_event)
-            )
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
 
             # -------- backward -------- #
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
 
             start_event.record()
             (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
-            torch_bwd_latency.append(
-                start_event.elapsed_time(end_event)
-            )
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
 
             start_event.record()
             (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
-            custom_bwd_latency.append(
-                start_event.elapsed_time(end_event)
-            )
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
 
         # --- remove first latency due to warmup --- #
         torch_fwd_latency = torch_fwd_latency[1:]
@@ -456,22 +401,24 @@ def test_performance(
 
         print()
         print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:")
-        print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
-        print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
-        print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
-        print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
+        print(
+            f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+        )
 
     @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
     @pytest.mark.parametrize("ignore_index", [-100])
-    def test_storage(
-        self,
-        problem,
-        dtype,
-        reduction,
-        ignore_index
-    ):
+    def test_storage(self, problem, dtype, reduction, ignore_index):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
@@ -490,30 +437,22 @@ def torch_storage():
                 .requires_grad_()
             )
             labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-            if ignore_index >=0 and ignore_index < vocabsize:
+            if ignore_index >= 0 and ignore_index < vocabsize:
                 pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
                 labels = pad_labels[..., 1:].contiguous()
 
             torch.cuda.reset_peak_memory_stats()
             torch_logprobs = self.torch_linear_cross_entropy(
-                hidden, weight, labels,
-                reduction=reduction, 
-                ignore_index=ignore_index
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
             )
             torch.cuda.synchronize()
             torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             print(f"[INFO]: Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
 
             torch.cuda.reset_peak_memory_stats()
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
             (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             torch_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
@@ -531,36 +470,27 @@ def custom_storage():
                 .requires_grad_()
             )
             labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-            if ignore_index >=0 and ignore_index < vocabsize:
+            if ignore_index >= 0 and ignore_index < vocabsize:
                 pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
                 labels = pad_labels[..., 1:].contiguous()
 
             torch.cuda.reset_peak_memory_stats()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
-                reduction=reduction, 
-                ignore_index=ignore_index
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
             )
             torch.cuda.synchronize()
             custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             print(f"[INFO]: Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
 
             torch.cuda.reset_peak_memory_stats()
-            g_logprobs = (
-                torch.empty_like(custom_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
             (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             custom_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             print(f"[INFO]: Custom Backward pass peak memory: {custom_backward_max_memory:.2f} MB")
 
-        
         self.cleanup()
         torch_storage()
         self.cleanup()
@@ -568,8 +498,8 @@ def custom_storage():
 
 
 @pytest.mark.skipif(
-    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),# or True,
-    reason="Requires torchrun with multiple GPUs"
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
+    reason="Requires torchrun with multiple GPUs",
 )
 class TestFusedLinearCrossEntropyTensorParallel:
     @classmethod
@@ -581,14 +511,14 @@ def setup_class(cls):
                 backend="nccl",
                 init_method="env://",
                 world_size=int(os.environ["WORLD_SIZE"]),
-                rank=int(os.environ["RANK"])
+                rank=int(os.environ["RANK"]),
             )
             cls.must_teardown = True
         cls.tp_group = dist.group.WORLD
 
         cls.tp_rank = dist.get_rank(cls.tp_group)
         cls.tp_world_size = dist.get_world_size(cls.tp_group)
-        cls.is_chief = (cls.tp_rank == 0)
+        cls.is_chief = cls.tp_rank == 0
         device = torch.device(f"cuda:{cls.tp_rank}")
         torch.cuda.set_device(device)
         print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
@@ -615,9 +545,7 @@ def torch_linear_cross_entropy_single_gpu(
     ):
         logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
         logprobs = torch.nn.functional.cross_entropy(
-            logits.view(-1, logits.shape[-1]),
-            labels.view(-1),
-            reduction=reduction,
+            logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction
         )
         return logprobs.to(torch.float32)
 
@@ -639,7 +567,7 @@ def forward(
             whole_logits = torch.empty(
                 (logits.shape[0], logits.shape[-1] * tp_world_size),
                 dtype=logits.dtype,
-                device=logits.device
+                device=logits.device,
             )
             whole_logits_ref = [
                 whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
@@ -648,9 +576,7 @@ def forward(
             dist.all_gather(whole_logits_ref, logits, group=tp_group)
 
             logprobs = torch.nn.functional.cross_entropy(
-                whole_logits.view(-1, whole_logits.shape[-1]),
-                labels.view(-1),
-                reduction=reduction,
+                whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction
             )
 
             # If we don't preserve whole_logits,
@@ -664,10 +590,7 @@ def forward(
             return logprobs.to(torch.float32)
 
         @staticmethod
-        def backward(
-            ctx,
-            g_logprobs: torch.Tensor,
-        ):
+        def backward(ctx, g_logprobs: torch.Tensor):
             hidden, weight, labels = ctx.saved_tensors
             tp_group = ctx.tp_group
             reduction = ctx.reduction
@@ -677,15 +600,9 @@ def backward(
             num_tokens, dim = hidden.shape
 
             if reduction == "mean":
-                _g_logprobs = torch.broadcast_to(
-                    g_logprobs / num_tokens,
-                    (num_tokens,)
-                )
+                _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,))
             elif reduction == "sum":
-                _g_logprobs = torch.broadcast_to(
-                    g_logprobs,
-                    (num_tokens,)
-                )
+                _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,))
             else:
                 _g_logprobs = g_logprobs
 
@@ -694,7 +611,7 @@ def backward(
             whole_logits = torch.empty(
                 (logits.shape[0], logits.shape[-1] * tp_world_size),
                 dtype=logits.dtype,
-                device=logits.device
+                device=logits.device,
             )
             whole_logits_ref = [
                 whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
@@ -715,23 +632,14 @@ def backward(
             local_d_hidden = local_d_logits @ weight
             local_d_weight = local_d_logits.T @ hidden
 
-            dist.all_reduce(
-                local_d_hidden,
-                op=dist.ReduceOp.SUM,
-                group=tp_group
-            )
+            dist.all_reduce(local_d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
 
             return local_d_hidden, local_d_weight, None, None, None
 
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
     @pytest.mark.parametrize("problem", [(4096, 129280, 8192)])
-    def test_torch_tp_vs_single_gpu(
-        self,
-        dtype,
-        reduction,
-        problem,
-    ):
+    def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
 
         hidden = (
@@ -752,72 +660,41 @@ def test_torch_tp_vs_single_gpu(
 
         # single GPU
         whole_weight = torch.empty(
-            (vocabsize * self.tp_world_size, dim),
-            dtype=dtype,
-            device="cuda"
+            (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda"
         )
         whole_weight_view = [
-            whole_weight[i * vocabsize : (i + 1) * vocabsize, :]
-            for i in range(self.tp_world_size)
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size)
         ]
-        dist.all_gather(
-            whole_weight_view, 
-            weight,
-            group=self.tp_group
-        )
+        dist.all_gather(whole_weight_view, weight, group=self.tp_group)
         whole_weight = whole_weight.clone().requires_grad_()
         logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
-            hidden, whole_weight, labels,
-            reduction=reduction,
+            hidden, whole_weight, labels, reduction=reduction
         )
 
         # TP
         logprobs_tp = self.TorchLinearCrossEntropy.apply(
-            hidden, weight, labels,
-            self.tp_group,
-            reduction,
-        )
-        torch.testing.assert_close(
-            logprobs_single_gpu,
-            logprobs_tp,
+            hidden, weight, labels, self.tp_group, reduction
         )
+        torch.testing.assert_close(logprobs_single_gpu, logprobs_tp)
 
         # ------------ backward pass ------------ #
-        g_logprobs = (
-            torch.empty_like(logprobs_single_gpu)
-            .uniform_(-0.1, 0.1)
-        )
+        g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1)
         dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
         # single GPU
         (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
-            (logprobs_single_gpu,),
-            (hidden, whole_weight),
-            (g_logprobs,),
-            retain_graph=False
+            (logprobs_single_gpu,), (hidden, whole_weight), (g_logprobs,), retain_graph=False
         )
 
         # TP
         (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
-            (logprobs_tp,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
-        torch.testing.assert_close(
-            d_hidden_single_gpu,
-            d_hidden_tp,
-            atol=1e-3,
-            rtol=1e-3,
-        )
-        local_d_weight_single_gpu = d_weight_single_gpu[self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :]
-        torch.testing.assert_close(
-            local_d_weight_single_gpu,
-            d_weight_tp,
-            atol=1e-3,
-            rtol=1e-3,
-        )
-
+        torch.testing.assert_close(d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3)
+        local_d_weight_single_gpu = d_weight_single_gpu[
+            self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3)
 
     @staticmethod
     def get_problems():
@@ -833,12 +710,7 @@ def get_problems():
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
     @pytest.mark.parametrize("problem", get_problems())
-    def test_correctness(
-        self,
-        dtype,
-        reduction,
-        problem,
-    ):
+    def test_correctness(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
@@ -855,69 +727,37 @@ def test_correctness(
         )
         labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
 
-
         # ------ forward pass ------ #
         dist.broadcast(hidden, src=0, group=self.tp_group)
         dist.broadcast(labels, src=0, group=self.tp_group)
 
         torch_logprobs = self.TorchLinearCrossEntropy.apply(
-            hidden.view(-1, dim), weight, labels,
-            self.tp_group,
-            reduction,
+            hidden.view(-1, dim), weight, labels, self.tp_group, reduction
         )
 
         custom_logprobs = linear_cross_entropy(
-            hidden, weight, labels,
-            tp_group=self.tp_group,
-            reduction=reduction,
+            hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
         )
 
-        torch.testing.assert_close(
-            torch_logprobs,
-            custom_logprobs,
-        )
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
 
         # ------- backward pass ------- #
-        g_logprobs = (
-            torch.empty_like(torch_logprobs)
-            .uniform_(-0.1, 0.1)
-        )
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
         dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
         (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-            (torch_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
         (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-            (custom_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
-        )
-        torch.testing.assert_close(
-            d_hidden_torch,
-            d_hidden_custom,
-            atol=1e-3,
-            rtol=1e-3,
-        )
-        torch.testing.assert_close(
-            d_weight_torch,
-            d_weight_custom,
-            atol=1e-4,
-            rtol=1e-4,
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
+        torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-4, rtol=1e-4)
 
     @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
-    def test_performance(
-        self,
-        problem,
-        dtype,
-        reduction
-    ):
+    def test_performance(self, problem, dtype, reduction):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
@@ -950,9 +790,7 @@ def test_performance(
 
             start_event.record()
             torch_logprobs = self.TorchLinearCrossEntropy.apply(
-                hidden.view(-1, dim), weight, labels,
-                self.tp_group,
-                reduction,
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -960,27 +798,19 @@ def test_performance(
 
             start_event.record()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
-                tp_group=self.tp_group,
-                reduction=reduction,
+                hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
             )
             end_event.record()
             torch.cuda.synchronize()
             custom_fwd_latency.append(start_event.elapsed_time(end_event))
 
             # ------- backward pass ------- #
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             start_event.record()
             (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -988,10 +818,7 @@ def test_performance(
 
             start_event.record()
             (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -1005,29 +832,35 @@ def test_performance(
 
         if self.is_chief:
             print()
-            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:")
-            print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
-            print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
-            print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
-            print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
-
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:"
+            )
+            print(
+                f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+            )
 
     @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
-    def test_storage(
-        self,
-        problem,
-        dtype,
-        reduction
-    ):
+    def test_storage(self, problem, dtype, reduction):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
         labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
 
         if self.is_chief:
             print()
-            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:")
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:"
+            )
 
         def torch_storage():
             hidden = (
@@ -1047,32 +880,28 @@ def torch_storage():
 
             torch.cuda.reset_peak_memory_stats()
             torch_logprobs = self.TorchLinearCrossEntropy.apply(
-                hidden.view(-1, dim), weight, labels,
-                self.tp_group,
-                reduction,
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
             )
             torch.cuda.synchronize()
             torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB"
+                )
 
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB"
+                )
 
         def custom_storage():
             hidden = (
@@ -1086,38 +915,34 @@ def custom_storage():
                 .requires_grad_()
             )
             labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-            
+
             dist.broadcast(hidden, src=0, group=self.tp_group)
             dist.broadcast(labels, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
-                tp_group=self.tp_group,
-                reduction=reduction,
+                hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
             )
             torch.cuda.synchronize()
             custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB"
+                )
 
-            g_logprobs = (
-                torch.empty_like(custom_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB"
+                )
 
         self.cleanup()
         torch_storage()
@@ -1125,10 +950,9 @@ def custom_storage():
         custom_storage()
 
 
-
 @pytest.mark.skipif(
     "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
-    reason="Requires torchrun with multiple GPUs"
+    reason="Requires torchrun with multiple GPUs",
 )
 class TestFusedLinearCrossEntropySequenceParallel:
     @classmethod
@@ -1140,14 +964,14 @@ def setup_class(cls):
                 backend="nccl",
                 init_method="env://",
                 world_size=int(os.environ["WORLD_SIZE"]),
-                rank=int(os.environ["RANK"])
+                rank=int(os.environ["RANK"]),
             )
             cls.must_teardown = True
         cls.tp_group = dist.group.WORLD
 
         cls.tp_rank = dist.get_rank(cls.tp_group)
         cls.tp_world_size = dist.get_world_size(cls.tp_group)
-        cls.is_chief = (cls.tp_rank == 0)
+        cls.is_chief = cls.tp_rank == 0
         device = torch.device(f"cuda:{cls.tp_rank}")
         torch.cuda.set_device(device)
         print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
@@ -1160,6 +984,7 @@ def teardown_class(cls):
     @staticmethod
     def timed_barrier(timeout_s=10):
         import time
+
         work = torch.distributed.barrier(async_op=True)
         t0 = time.time()
         while not work.is_completed():
@@ -1185,9 +1010,7 @@ def torch_linear_cross_entropy_single_gpu(
     ):
         logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
         logprobs = torch.nn.functional.cross_entropy(
-            logits.view(-1, logits.shape[-1]),
-            labels.view(-1),
-            reduction=reduction,
+            logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction
         )
         return logprobs.to(torch.float32)
 
@@ -1207,20 +1030,16 @@ def forward(
             whole_hidden = torch.empty(
                 (hidden.shape[0] * tp_world_size, hidden.shape[-1]),
                 dtype=hidden.dtype,
-                device=hidden.device
-            )
-            dist.all_gather_into_tensor(
-                whole_hidden,
-                hidden,
-                group=tp_group
+                device=hidden.device,
             )
+            dist.all_gather_into_tensor(whole_hidden, hidden, group=tp_group)
 
             logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32)
 
             whole_logits = torch.empty(
                 (logits.shape[0], logits.shape[-1] * tp_world_size),
                 dtype=logits.dtype,
-                device=logits.device
+                device=logits.device,
             )
             whole_logits_ref = [
                 whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
@@ -1229,9 +1048,7 @@ def forward(
             dist.all_gather(whole_logits_ref, logits, group=tp_group)
 
             logprobs = torch.nn.functional.cross_entropy(
-                whole_logits.view(-1, whole_logits.shape[-1]),
-                labels.view(-1),
-                reduction=reduction,
+                whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction
             )
 
             # If we don't preserve whole_logits,
@@ -1245,10 +1062,7 @@ def forward(
             return logprobs.to(torch.float32)
 
         @staticmethod
-        def backward(
-            ctx,
-            g_logprobs: torch.Tensor,
-        ):
+        def backward(ctx, g_logprobs: torch.Tensor):
             whole_hidden, weight, labels = ctx.saved_tensors
             tp_group = ctx.tp_group
             reduction = ctx.reduction
@@ -1258,15 +1072,9 @@ def backward(
             num_tokens, dim = whole_hidden.shape
 
             if reduction == "mean":
-                _g_logprobs = torch.broadcast_to(
-                    g_logprobs / num_tokens,
-                    (num_tokens,)
-                )
+                _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,))
             elif reduction == "sum":
-                _g_logprobs = torch.broadcast_to(
-                    g_logprobs,
-                    (num_tokens,)
-                )
+                _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,))
             else:
                 _g_logprobs = g_logprobs
 
@@ -1275,7 +1083,7 @@ def backward(
             whole_logits = torch.empty(
                 (logits.shape[0], logits.shape[-1] * tp_world_size),
                 dtype=logits.dtype,
-                device=logits.device
+                device=logits.device,
             )
             whole_logits_ref = [
                 whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
@@ -1307,27 +1115,17 @@ def backward(
             # local_d_hidden = local_d_hidden[tp_rank * local_num_tokens : (tp_rank + 1) * local_num_tokens, :]
 
             local_d_hidden = torch.empty(
-                (local_num_tokens, dim),
-                dtype=weight.dtype,
-                device=weight.device
+                (local_num_tokens, dim), dtype=weight.dtype, device=weight.device
             )
             dist.reduce_scatter_tensor(
-                local_d_hidden,
-                d_hidden,
-                op=dist.ReduceOp.SUM,
-                group=tp_group
+                local_d_hidden, d_hidden, op=dist.ReduceOp.SUM, group=tp_group
             )
             return local_d_hidden, local_d_weight, None, None, None
 
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
     @pytest.mark.parametrize("problem", [(256, 12928, 8192)])
-    def test_torch_tp_vs_single_gpu(
-        self,
-        dtype,
-        reduction,
-        problem,
-    ):
+    def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
 
         hidden = (
@@ -1340,93 +1138,60 @@ def test_torch_tp_vs_single_gpu(
             .uniform_(-0.1, 0.1)
             .requires_grad_()
         )
-        labels = torch.randint(0, vocabsize, (num_tokens * self.tp_world_size,), 
-                                dtype=torch.long, device="cuda")
+        labels = torch.randint(
+            0, vocabsize, (num_tokens * self.tp_world_size,), dtype=torch.long, device="cuda"
+        )
 
         # ------------ forward pass ------------ #
         dist.broadcast(labels, src=0, group=self.tp_group)
 
         # single GPU
         whole_hidden = torch.empty(
-            (num_tokens * self.tp_world_size, dim),
-            dtype=dtype,
-            device="cuda"
-        )
-        dist.all_gather_into_tensor(
-            whole_hidden,
-            hidden,
-            group=self.tp_group
+            (num_tokens * self.tp_world_size, dim), dtype=dtype, device="cuda"
         )
+        dist.all_gather_into_tensor(whole_hidden, hidden, group=self.tp_group)
         whole_hidden = whole_hidden.clone().requires_grad_()
 
         whole_weight = torch.empty(
-            (vocabsize * self.tp_world_size, dim),
-            dtype=dtype,
-            device="cuda"
+            (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda"
         )
         whole_weight_view = [
-            whole_weight[i * vocabsize : (i + 1) * vocabsize, :]
-            for i in range(self.tp_world_size)
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size)
         ]
-        dist.all_gather(
-            whole_weight_view, 
-            weight,
-            group=self.tp_group
-        )
+        dist.all_gather(whole_weight_view, weight, group=self.tp_group)
         whole_weight = whole_weight.clone().requires_grad_()
         logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
-            whole_hidden, whole_weight, labels,
-            reduction=reduction,
+            whole_hidden, whole_weight, labels, reduction=reduction
         )
 
         # TP
         logprobs_tp = self.TorchLinearCrossEntropy.apply(
-            hidden, weight, labels,
-            self.tp_group,
-            reduction,
-        )
-        torch.testing.assert_close(
-            logprobs_single_gpu,
-            logprobs_tp,
+            hidden, weight, labels, self.tp_group, reduction
         )
+        torch.testing.assert_close(logprobs_single_gpu, logprobs_tp)
 
         # ------------ backward pass ------------ #
-        g_logprobs = (
-            torch.empty_like(logprobs_single_gpu)
-            .uniform_(-0.1, 0.1)
-        )
+        g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1)
         dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
         # single GPU
         (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
-            (logprobs_single_gpu,),
-            (whole_hidden, whole_weight),
-            (g_logprobs,),
-            retain_graph=False
+            (logprobs_single_gpu,), (whole_hidden, whole_weight), (g_logprobs,), retain_graph=False
         )
 
         # TP
         (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
-            (logprobs_tp,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
 
-        local_d_hidden_single_gpu = d_hidden_single_gpu[self.tp_rank * hidden.shape[0] : (self.tp_rank + 1) * hidden.shape[0], :]
-        torch.testing.assert_close(
-            local_d_hidden_single_gpu,
-            d_hidden_tp,
-            atol=1e-3,
-            rtol=1e-3,
-        )
-        local_d_weight_single_gpu = d_weight_single_gpu[self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :]
-        torch.testing.assert_close(
-            local_d_weight_single_gpu,
-            d_weight_tp,
-            atol=1e-3,
-            rtol=1e-3,
-        )
+        local_d_hidden_single_gpu = d_hidden_single_gpu[
+            self.tp_rank * hidden.shape[0] : (self.tp_rank + 1) * hidden.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3)
+        local_d_weight_single_gpu = d_weight_single_gpu[
+            self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3)
 
         self.cleanup()
 
@@ -1444,15 +1209,14 @@ def get_problems():
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
     @pytest.mark.parametrize("problem", get_problems())
-    def test_correctness(
-        self,
-        dtype,
-        reduction,
-        problem,
-    ):
+    def test_correctness(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
-        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
 
         hidden = (
             torch.empty(hidden_shape, dtype=dtype, device="cuda")
@@ -1470,56 +1234,34 @@ def test_correctness(
         dist.broadcast(labels, src=0, group=self.tp_group)
 
         torch_logprobs = self.TorchLinearCrossEntropy.apply(
-            hidden.view(-1, dim), weight, labels,
-            self.tp_group,
-            reduction,
+            hidden.view(-1, dim), weight, labels, self.tp_group, reduction
         )
 
         custom_logprobs = linear_cross_entropy(
-            hidden, weight, labels,
+            hidden,
+            weight,
+            labels,
             tp_group=self.tp_group,
             reduction=reduction,
             sequence_parallel=True,
         )
 
-        torch.testing.assert_close(
-            torch_logprobs,
-            custom_logprobs,
-        )
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
 
         # ------- backward pass ------- #
-        g_logprobs = (
-            torch.empty_like(torch_logprobs)
-            .uniform_(-0.1, 0.1)
-        )
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
         dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
         (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-            (torch_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
         (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-            (custom_logprobs,),
-            (hidden, weight),
-            (g_logprobs,),
-            retain_graph=False
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
         )
 
         # in case one GPU failed, and leading to hang
-        torch.testing.assert_close(
-            d_hidden_torch,
-            d_hidden_custom,
-            atol=1e-3,
-            rtol=1e-3,
-        )
-        torch.testing.assert_close(
-            d_weight_torch,
-            d_weight_custom,
-            atol=1e-3,
-            rtol=1e-3,
-        )
+        torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-3, rtol=1e-3)
         self.timed_barrier()
 
         self.cleanup()
@@ -1527,15 +1269,14 @@ def test_correctness(
     @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
-    def test_performance(
-        self,
-        problem,
-        dtype,
-        reduction
-    ):
+    def test_performance(self, problem, dtype, reduction):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
-        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
 
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
@@ -1564,9 +1305,7 @@ def test_performance(
 
             start_event.record()
             torch_logprobs = self.TorchLinearCrossEntropy.apply(
-                hidden.view(-1, dim), weight, labels,
-                self.tp_group,
-                reduction,
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -1574,7 +1313,9 @@ def test_performance(
 
             start_event.record()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
+                hidden,
+                weight,
+                labels,
                 tp_group=self.tp_group,
                 reduction=reduction,
                 sequence_parallel=True,
@@ -1584,18 +1325,12 @@ def test_performance(
             custom_fwd_latency.append(start_event.elapsed_time(end_event))
 
             # ------- backward pass ------- #
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             start_event.record()
             (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -1603,10 +1338,7 @@ def test_performance(
 
             start_event.record()
             (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             end_event.record()
             torch.cuda.synchronize()
@@ -1620,29 +1352,39 @@ def test_performance(
 
         if self.is_chief:
             print()
-            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:")
-            print(f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms")
-            print(f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms")
-            print(f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms")
-            print(f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms")
-
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:"
+            )
+            print(
+                f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+            )
 
     @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
     @pytest.mark.parametrize("dtype", [torch.bfloat16])
     @pytest.mark.parametrize("reduction", ["mean"])
-    def test_storage(
-        self,
-        problem,
-        dtype,
-        reduction
-    ):
+    def test_storage(self, problem, dtype, reduction):
         num_tokens, vocabsize, dim = problem
         hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
-        labels_shape = (num_tokens * self.tp_world_size,) if isinstance(num_tokens, int) else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
 
         if self.is_chief:
             print()
-            print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:")
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:"
+            )
 
         def torch_storage():
             hidden = (
@@ -1662,32 +1404,28 @@ def torch_storage():
 
             torch.cuda.reset_peak_memory_stats()
             torch_logprobs = self.TorchLinearCrossEntropy.apply(
-                hidden.view(-1, dim), weight, labels,
-                self.tp_group,
-                reduction,
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
             )
             torch.cuda.synchronize()
             torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB"
+                )
 
-            g_logprobs = (
-                torch.empty_like(torch_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
-                (torch_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB"
+                )
 
         def custom_storage():
             hidden = (
@@ -1701,13 +1439,15 @@ def custom_storage():
                 .requires_grad_()
             )
             labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
-            
+
             dist.broadcast(hidden, src=0, group=self.tp_group)
             dist.broadcast(labels, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             custom_logprobs = linear_cross_entropy(
-                hidden, weight, labels,
+                hidden,
+                weight,
+                labels,
                 tp_group=self.tp_group,
                 reduction=reduction,
                 sequence_parallel=True,
@@ -1715,27 +1455,25 @@ def custom_storage():
             torch.cuda.synchronize()
             custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB"
+                )
 
-            g_logprobs = (
-                torch.empty_like(custom_logprobs)
-                .uniform_(-0.1, 0.1)
-            )
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
             dist.broadcast(g_logprobs, src=0, group=self.tp_group)
 
             torch.cuda.reset_peak_memory_stats()
             (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
-                (custom_logprobs,),
-                (hidden, weight),
-                (g_logprobs,),
-                retain_graph=False
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
             )
             torch.cuda.synchronize()
             custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
             if self.is_chief:
-                print(f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB")
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB"
+                )
 
         self.cleanup()
         torch_storage()
         self.cleanup()
-        custom_storage()
\ No newline at end of file
+        custom_storage()

From f6538389d44d4feca92de73184dafb451df68606 Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Wed, 12 Nov 2025 11:43:53 +0800
Subject: [PATCH 08/17] Remove redundant logits calculations in gpt_model (#9)

* Remove redundant logits calculations in gpt_model

* Merge the linear-cross-entropy-fusion flag and the cross-entropy-fusion flag
---
 .../core/models/common/language_module/language_module.py   | 5 ++++-
 megatron/core/models/gpt/gpt_model.py                       | 2 +-
 megatron/training/arguments.py                              | 6 +-----
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index b7013be89f0..2144bd8a997 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -155,7 +155,10 @@ def compute_language_model_loss_without_logits(
         Returns:
             Tensor: Loss tensor of dimensions [batch size, sequence_length].
         """
-        if self.config.linear_cross_entropy_fusion:
+        if (
+            self.config.cross_entropy_loss_fusion
+            and self.config.cross_entropy_fusion_impl == 'linear'
+        ):
             assert (
                 weight is not None
             ), "weight cannot be None when using fused linear cross entropy."
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 0bb144e408d..a69a2250bce 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -635,7 +635,7 @@ def _postprocess(
                     hidden_states.squeeze(1).unsqueeze(0)
                 ).unsqueeze(1)
 
-        if has_config_logger_enabled(self.config) or labels is not None:
+        if has_config_logger_enabled(self.config) or labels is None:
             logits, _ = self.output_layer(
                 hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
             )
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index ad34c3e5e0a..21849d3dd94 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -2254,10 +2254,6 @@ def _add_training_args(parser):
                        dest='bias_swiglu_fusion')
     group.add_argument('--use-fused-weighted-squared-relu', action='store_true',
                        help='Use fused weighted squared relu when using MoE.')
-    group.add_argument('--linear-cross-entropy-fusion', action='store_true',
-                       help='Enable fusion of linear layer and cross entropy '
-                       'loss calculation.',
-                       dest='linear_cross_entropy_fusion')
     group.add_argument('--no-bias-dropout-fusion', action='store_false',
                        help='Disable bias and dropout fusion.',
                        dest='bias_dropout_fusion')
@@ -2273,7 +2269,7 @@ def _add_training_args(parser):
                        help='Enabled fusion of cross entropy loss calculation.',
                        dest='cross_entropy_loss_fusion')
     group.add_argument('--cross-entropy-fusion-impl', type=str, default='native',
-                       choices=['native', 'te'],
+                       choices=['native', 'te', 'linear'],
                        help='Implementation of cross entropy loss calculation.')
     group.add_argument('--use-flash-attn', action='store_true',
                        help='use FlashAttention implementation of attention. '

From 66d43ff031a964169b2868a83e3e59f5dd7d6231 Mon Sep 17 00:00:00 2001
From: Jianbing <jianbingd@nvidia.com>
Date: Wed, 12 Nov 2025 15:02:12 +0800
Subject: [PATCH 09/17] fixed some styling issue (#10)

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     |  4 ++--
 .../linear_cross_entropy/blackwell/entry.py   | 23 +++++++++++--------
 .../fusions/linear_cross_entropy/utils.py     | 11 ++++-----
 .../common/language_module/language_module.py |  4 ++--
 4 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index 74d38da8243..720bd1478e7 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -63,7 +63,7 @@ def forward(
         weight: torch.Tensor,
         labels: torch.Tensor,
         tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-        reduction: str = "mean",
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
         ignore_index: int = -100,
         sequence_parallel: bool = False,
     ) -> torch.Tensor:
@@ -216,7 +216,7 @@ def linear_cross_entropy(
     weight: torch.Tensor,
     labels: torch.Tensor,
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-    reduction: str = "mean",
+    reduction: typing.Literal["none", "sum", "mean"] = "mean",
     ignore_index: int = -100,
     sequence_parallel: bool = False,
 ) -> torch.Tensor:
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index 786f0fd9b3b..e156735ded2 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
 import typing
+from dataclasses import dataclass, field
 
 import cuda.bindings.driver as cuda  # type: ignore
 import cutlass
@@ -18,23 +19,25 @@
 from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
 
 
+@dataclass
 class FwdConfig:
     """
     The configuration for the forward pass.
     """
 
-    _dedicated_stream: torch.cuda.Stream = None
-    _dedicated_events: typing.List[torch.cuda.Event] = list()
-    _initialized: bool = False
-    _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = dict()
+    _dedicated_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream)
+    _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
+    _initialized: bool = field(default=False)
+    _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
 
 
+@dataclass
 class BwdConfig:
     """
     The configuration for the backward pass.
     """
 
-    _bwd_kernel: typing.Dict[str, cute.kernel] = dict()
+    _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
 
 
 _fwd_config = FwdConfig()
@@ -46,7 +49,7 @@ def forward(
     weight: torch.Tensor,
     labels: torch.Tensor,
     tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-    reduction: str = "mean",
+    reduction: typing.Literal["none", "sum", "mean"] = "mean",
     ignore_index: int = -100,
     sequence_parallel: bool = False,
 ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor]:
@@ -201,7 +204,7 @@ def grid(meta):
             _logprobs,
             _logprobs.stride(0),
             logprobs,
-            triton.language.constexpr(REDUCTION),
+            triton.language.constexpr(REDUCTION.value),
         )
     else:
         _max_backup = _max.clone()
@@ -251,7 +254,7 @@ def grid(meta):
             accumulate,
             accumulate.stride(0),
             logprobs,
-            REDUCTION,
+            REDUCTION.value,
         )
 
     return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
@@ -265,7 +268,7 @@ def backward(
     maximum: torch.Tensor,
     accu: torch.Tensor,
     num_valid_tokens: torch.Tensor,
-    reduction: str = "mean",
+    reduction: typing.Literal["none", "sum", "mean"] = "mean",
     ignore_index: int = -100,
     tp_group: typing.Optional[dist.ProcessGroup] = None,
     tp_rank: int = 0,
@@ -334,7 +337,7 @@ def backward(
         key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
         if _bwd_config._bwd_kernel.get(key) is None:
             bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
-                reduction=REDUCTION, vocab_per_split=vocab_per_split
+                reduction=REDUCTION.value, vocab_per_split=vocab_per_split
             )
             bwd_kernel_compiled = cute.compile(
                 bwd_kernel,
diff --git a/megatron/core/fusions/linear_cross_entropy/utils.py b/megatron/core/fusions/linear_cross_entropy/utils.py
index 9a62b9826cb..d077d64ab17 100644
--- a/megatron/core/fusions/linear_cross_entropy/utils.py
+++ b/megatron/core/fusions/linear_cross_entropy/utils.py
@@ -1,10 +1,10 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-from dataclasses import dataclass
+import typing
+from enum import Enum
 
 
-@dataclass
-class EntropyReductionEnum:
+class EntropyReductionEnum(Enum):
     """
     Enum for the reduction method of cross entropy.
     """
@@ -14,7 +14,7 @@ class EntropyReductionEnum:
     kMean = 2
 
 
-def str_to_reduction_enum(reduction: str) -> int:
+def str_to_reduction_enum(reduction: typing.Literal["none", "sum", "mean"]) -> EntropyReductionEnum:
     """
     str -> EntropyReductionEnum
     """
@@ -30,8 +30,7 @@ def str_to_reduction_enum(reduction: str) -> int:
     return _enum
 
 
-@dataclass
-class BackwardMethodEnum:
+class BackwardMethodEnum(Enum):
     """
     Enum for the backward method of linear cross entropy.
     """
diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index 2144bd8a997..acd81a459bb 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 import logging
 import os
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Literal, Optional, Tuple
 
 import torch
 from torch import Tensor
@@ -134,7 +134,7 @@ def compute_language_model_loss_without_logits(
         sequence_parallel_enabled: bool = False,
         column_parallel_linear: torch.nn.Module = None,
         col_linear_kwargs: Dict[str, Any] = {},
-        reduction: str = "none",
+        reduction: Literal["none", "sum", "mean"] = "none",
         ignore_index: int = -100,
     ) -> Tuple[Tensor, Optional[Tensor]]:
         """Computes the language model logits and loss (Cross entropy across vocabulary)

From c1548f883c68392c60267f72906abbaa79d4c750 Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Fri, 14 Nov 2025 09:28:02 +0800
Subject: [PATCH 10/17] rename compute_output_layer_and_language_model_loss
 (#12)

* rename compute_output_layer_and_language_model_loss

* remove used option fused_linear_cross_entropy in transformer_config
---
 .../core/models/common/language_module/language_module.py     | 2 +-
 megatron/core/models/gpt/gpt_model.py                         | 4 ++--
 megatron/core/models/mamba/mamba_model.py                     | 2 +-
 megatron/core/transformer/transformer_config.py               | 3 ---
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index acd81a459bb..c557b3a94e7 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -126,7 +126,7 @@ def check_and_set_env_variable(
             check_and_set_env_variable("NVTE_FUSED_ATTN", 1, AttnBackend.auto)
             check_and_set_env_variable("NVTE_UNFUSED_ATTN", 1, AttnBackend.auto)
 
-    def compute_language_model_loss_without_logits(
+    def compute_output_layer_and_language_model_loss(
         self,
         hidden: Tensor,
         labels: Optional[Tensor],
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index a69a2250bce..fd1698e3578 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -577,7 +577,7 @@ def _postprocess(
                 )
 
                 # Compute mtp loss without storing logits to save memory.
-                mtp_loss = self.compute_language_model_loss_without_logits(
+                mtp_loss = self.compute_output_layer_and_language_model_loss(
                     hidden_states_list[mtp_layer_number + 1],
                     labels=mtp_labels,
                     weight=self.shared_embedding_or_output_weight(),
@@ -667,7 +667,7 @@ def _postprocess(
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
-        loss = self.compute_language_model_loss_without_logits(
+        loss = self.compute_output_layer_and_language_model_loss(
             hidden_states,
             labels=labels,
             weight=self.shared_embedding_or_output_weight(),
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index eab86d6d532..a10315e8203 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -254,7 +254,7 @@ def forward(
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
-        loss = self.compute_language_model_loss_without_logits(
+        loss = self.compute_output_layer_and_language_model_loss(
             hidden_states,
             labels,
             weight=self.shared_embedding_or_output_weight(),
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index 55de1e07181..aab137b6430 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -327,9 +327,6 @@ class TransformerConfig(ModelParallelConfig):
     fused_single_qkv_rope: bool = False
     """If set, avoid splitting QKV before ROPE forward and avoid concatenating ROPE dgrads."""
 
-    linear_cross_entropy_fusion: bool = False
-    """If True, fuses the linear layer and cross entropy loss calculation."""
-
     ####################
     # activation recomputation
     ####################

From 500326452fd426390940ca85e523a8697be8771b Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Fri, 21 Nov 2025 16:54:49 +0800
Subject: [PATCH 11/17] remove unrelated change (#13)

---
 megatron/core/models/mamba/mamba_model.py |  9 ---------
 tests/unit_tests/a2a_overlap/utils.py     | 13 +++----------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 4357ef9e9a7..a10315e8203 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -247,15 +247,6 @@ def forward(
         if in_inference_mode and inference_context.materialize_only_last_token_logits:
             hidden_states = hidden_states[-1, :, :].unsqueeze(0)
 
-        # Restore sequence parallel execution to the output layer if necessary.
-        if sequence_parallel_override:
-            assert (
-                in_inference_mode
-                and inference_context.is_dynamic_batching()
-                and inference_context.materialize_only_last_token_logits
-            )
-            self.output_layer.sequence_parallel = True
-
         if labels is None:
             logits, _ = self.output_layer(
                 hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
diff --git a/tests/unit_tests/a2a_overlap/utils.py b/tests/unit_tests/a2a_overlap/utils.py
index d80eaf13f5a..7db4256a849 100644
--- a/tests/unit_tests/a2a_overlap/utils.py
+++ b/tests/unit_tests/a2a_overlap/utils.py
@@ -222,8 +222,8 @@ def get_test_config(num_layers=1, num_moe_experts=8, extra_kwargs={}, moe_groupe
 
 def get_valid_token_dispatcher_types():
     try:
-        from deep_ep import Buffer  # type: ignore
-        from deep_ep.utils import EventHandle, EventOverlap  # type: ignore
+        from deep_ep import Buffer
+        from deep_ep.utils import EventHandle, EventOverlap
 
         return ["alltoall", "flex"]
     except ImportError:
@@ -237,14 +237,7 @@ def get_valid_fp8_flags():
     recipes = []
     valid_flags = []
     if is_te_min_version("2.3.0.dev0"):
-        props = torch.cuda.get_device_properties(torch.cuda.current_device())
-        compute_capability = (props.major, props.minor)
-        if (
-            compute_capability >= (9, 0)
-            and compute_capability < (10, 0)
-            and float(torch.version.cuda) >= 12.9
-        ):
-            recipes.append(Fp8Recipe.blockwise)
+        recipes.append(Fp8Recipe.blockwise)
         recipes.append(Fp8Recipe.tensorwise)
 
     for fp8_type in fp8_types:

From 78c827e7cf4b65add328a76721580ecf3f0f807d Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Thu, 27 Nov 2025 12:23:39 +0800
Subject: [PATCH 12/17] handle non-blackwell arch platform init fail (#14)

---
 .../core/fusions/fused_linear_cross_entropy.py     | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index 720bd1478e7..85308b1c813 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -40,9 +40,11 @@ def __init__(self) -> None:
 
         self._initialized = True
 
-
-_platform = Platform()
-
+try:
+    _platform = Platform()
+except ValueError as e:
+    _unsupported_architecture_error = e
+    _platform = None
 
 class LinearCrossEntropy(torch.autograd.Function):
     """
@@ -152,6 +154,9 @@ def forward(
         # each rank will get distinct local d_hidden and d_weight
         ```
         """
+        if _unsupported_architecture_error:
+            raise _unsupported_architecture_error
+
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
             logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size, global_hidden = (
                 _platform.forward_func(
@@ -182,6 +187,9 @@ def backward(
             dhidden (torch.Tensor): The gradient of the hidden.
             dweight (torch.Tensor): The gradient of the weight.
         """
+        if _unsupported_architecture_error:
+            raise _unsupported_architecture_error
+
         with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
             (global_hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
 

From 011947de8b48c02ad1b508c4f07c398124ef33df Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Thu, 27 Nov 2025 15:01:48 +0800
Subject: [PATCH 13/17] Remove the migration code from the main branch to the
 dev branch (#15)

* Remove the code that synchronizes from the main branch to the dev branch.

* remove unused typing
---
 .../common/language_module/language_module.py      |  6 +++---
 megatron/core/models/gpt/gpt_model.py              | 14 ++++++--------
 megatron/core/models/mamba/mamba_model.py          |  2 +-
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index c557b3a94e7..198b7a06f2f 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -60,7 +60,7 @@ def __init__(
             "If you don't need embd_group, you need to explicitly set it to None."
         )
         self.embd_group = pg_collection.embd
-        self.vp_stage: Optional[int] = None
+        self.vp_stage = None
         self.vp_size = self.config.virtual_pipeline_model_parallel_size
 
     def _is_in_embd_group(self):
@@ -136,7 +136,7 @@ def compute_output_layer_and_language_model_loss(
         col_linear_kwargs: Dict[str, Any] = {},
         reduction: Literal["none", "sum", "mean"] = "none",
         ignore_index: int = -100,
-    ) -> Tuple[Tensor, Optional[Tensor]]:
+    ) -> Tensor:
         """Computes the language model logits and loss (Cross entropy across vocabulary)
 
         Args:
@@ -333,7 +333,7 @@ def shared_embedding_or_output_weight(self) -> Tensor:
     def sharded_state_dict(
         self,
         prefix: str = '',
-        sharded_offsets: Tuple[Tuple[int, int, int], ...] = (),
+        sharded_offsets: Tuple[Tuple[int, int, int]] = (),
         metadata: Optional[dict] = None,
     ) -> ShardedStateDict:
         """Sharded state dict implementation that handles the output layer weights tying.
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index bd1b51b6a12..78069e80f71 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from collections import OrderedDict
-from typing import Any, Dict, List, Literal, Optional, Tuple
+from typing import Dict, Literal, Optional
 
 import torch
 from torch import Tensor
@@ -119,7 +119,7 @@ def __init__(
         self.parallel_output = parallel_output
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
         self.disable_param_offloading = True
-        self.vp_stage: Optional[int] = vp_stage
+        self.vp_stage = vp_stage
 
         if hasattr(self.config, 'position_embedding_type'):
             self.position_embedding_type = self.config.position_embedding_type
@@ -199,7 +199,7 @@ def __init__(
             ), "mrope require mrope_section setting, but we got None from TransformerConfig"
 
         # Cache for RoPE tensors which do not change between iterations.
-        self.rotary_pos_emb_cache: Dict[int, Tuple[Tensor, Tensor]] = {}
+        self.rotary_pos_emb_cache = {}
 
         # Transformer.
         self.decoder = TransformerBlock(
@@ -219,8 +219,6 @@ def __init__(
         # Output
         if self.post_process:
 
-            self.embedding_activation_buffer: Optional[List[Tensor]] = None
-            self.grad_output_buffer: Optional[List[Tensor]] = None
             if self.config.defer_embedding_wgrad_compute:
                 # The embedding activation buffer preserves a reference to the input activations
                 # of the final embedding projection layer GEMM. It will hold the activations for
@@ -397,7 +395,7 @@ def _preprocess(
         if in_inference_mode and not has_config_logger_enabled(self.config):
             decoder_input = WrappedTensor(decoder_input)
 
-        preproc_output: Tuple[Any, ...] = (
+        preproc_output = (
             decoder_input,
             rotary_pos_emb,
             rotary_pos_cos,
@@ -441,7 +439,7 @@ def forward(
         labels: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
-        extra_block_kwargs: Optional[Dict[str, Any]] = None,
+        extra_block_kwargs: dict = None,
         runtime_gather_output: Optional[bool] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
@@ -709,7 +707,7 @@ def build_schedule_plan(
         labels: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
-        extra_block_kwargs: Optional[Dict[str, Any]] = None,
+        extra_block_kwargs: dict = None,
         runtime_gather_output: Optional[bool] = None,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 4f306567565..7138cfad7d6 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -62,7 +62,7 @@ def __init__(
         pre_process: bool = True,
         hybrid_attention_ratio: float = 0.0,
         hybrid_mlp_ratio: float = 0.0,
-        hybrid_override_pattern: Optional[str] = None,
+        hybrid_override_pattern: str = None,
         post_process: bool = True,
         fp16_lm_cross_entropy: bool = False,
         parallel_output: bool = True,

From bfee2a383be9eac49d3c100a61118f48180983de Mon Sep 17 00:00:00 2001
From: Jianbing <jianbingd@nvidia.com>
Date: Thu, 27 Nov 2025 18:27:52 +0800
Subject: [PATCH 14/17] Fix review (#16)

* lazy init for global objects

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

* fix distribute init for tp and sp

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

* add __init__

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>

---------

Signed-off-by: Jianbing Dong <jianbingd@nvidia.com>
---
 .../fusions/fused_linear_cross_entropy.py     |  23 ++-
 .../fusions/linear_cross_entropy/__init__.py  |   0
 .../blackwell/__init__.py                     |   0
 .../linear_cross_entropy/blackwell/entry.py   |  63 ++++----
 .../test_fused_linear_cross_entropy.py        | 134 +++++++++++-------
 5 files changed, 129 insertions(+), 91 deletions(-)
 create mode 100644 megatron/core/fusions/linear_cross_entropy/__init__.py
 create mode 100644 megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py

diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index 85308b1c813..ca87eb09a8a 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -6,6 +6,7 @@
 """
 
 import typing
+from functools import lru_cache
 
 import torch
 
@@ -40,11 +41,13 @@ def __init__(self) -> None:
 
         self._initialized = True
 
-try:
-    _platform = Platform()
-except ValueError as e:
-    _unsupported_architecture_error = e
-    _platform = None
+@lru_cache(maxsize=1)
+def _get_platform() -> Platform:
+    """
+    Helper function to lazy initialize the platform.
+    """
+    return Platform()
+
 
 class LinearCrossEntropy(torch.autograd.Function):
     """
@@ -154,12 +157,9 @@ def forward(
         # each rank will get distinct local d_hidden and d_weight
         ```
         """
-        if _unsupported_architecture_error:
-            raise _unsupported_architecture_error
-
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
             logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size, global_hidden = (
-                _platform.forward_func(
+                _get_platform().forward_func(
                     hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel
                 )
             )
@@ -187,9 +187,6 @@ def backward(
             dhidden (torch.Tensor): The gradient of the hidden.
             dweight (torch.Tensor): The gradient of the weight.
         """
-        if _unsupported_architecture_error:
-            raise _unsupported_architecture_error
-
         with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
             (global_hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
 
@@ -200,7 +197,7 @@ def backward(
             tp_world_size = ctx.tp_world_size
             sequence_parallel = ctx.sequence_parallel
 
-            d_hidden, d_weight = _platform.backward_func(
+            d_hidden, d_weight = _get_platform().backward_func(
                 dlogprobs,
                 global_hidden,
                 weight,
diff --git a/megatron/core/fusions/linear_cross_entropy/__init__.py b/megatron/core/fusions/linear_cross_entropy/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index e156735ded2..014c574a635 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -2,6 +2,8 @@
 
 import typing
 from dataclasses import dataclass, field
+from functools import lru_cache
+import os
 
 import cuda.bindings.driver as cuda  # type: ignore
 import cutlass
@@ -29,6 +31,7 @@ class FwdConfig:
     _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
     _initialized: bool = field(default=False)
     _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+    _vocab_per_split: int = field(default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6)))
 
 
 @dataclass
@@ -38,11 +41,23 @@ class BwdConfig:
     """
 
     _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+    _vocab_per_split: int = field(default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6)))
+    _backward_method: utils.BackwardMethodEnum = field(default=utils.BackwardMethodEnum.kDlogitsSplitN)
 
 
-_fwd_config = FwdConfig()
-_bwd_config = BwdConfig()
+@lru_cache(maxsize=1)
+def _get_fwd_config() -> FwdConfig:
+    """
+    Helper function to lazy initialize the forward configuration.
+    """
+    return FwdConfig()
 
+@lru_cache(maxsize=1)
+def _get_bwd_config() -> BwdConfig:
+    """
+    Helper function to lazy initialize the backward configuration.
+    """
+    return BwdConfig()
 
 def forward(
     hidden: torch.Tensor,
@@ -91,11 +106,10 @@ def forward(
     num_tokens, dim = hidden_view.shape
     vocab_size, _ = weight.shape
 
-    global _fwd_config
-    if not _fwd_config._initialized:
-        _fwd_config._dedicated_stream = torch.cuda.Stream(hidden.device)
-        _fwd_config._dedicated_events = [torch.cuda.Event() for _ in range(2)]
-        _fwd_config._initialized = True
+    if not _get_fwd_config()._initialized:
+        _get_fwd_config()._dedicated_stream = torch.cuda.Stream(hidden.device)
+        _get_fwd_config()._dedicated_events = [torch.cuda.Event() for _ in range(2)]
+        _get_fwd_config()._initialized = True
 
     REDUCTION = utils.str_to_reduction_enum(reduction)
     # declare logprobs
@@ -114,8 +128,7 @@ def forward(
     )
     # declare intermediate tensors
     # NOTE: this is a parameter for tuning
-    vocab_per_split = 512 * 6
-    num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+    num_splits = (vocab_size + _get_fwd_config()._vocab_per_split - 1) // _get_fwd_config()._vocab_per_split
     _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
     _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
     if REDUCTION == utils.EntropyReductionEnum.kNone:
@@ -150,8 +163,8 @@ def forward(
     # VocabSize and Dim are fixed for a given model,
     # only the number of tokens can vary
     key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
-    if _fwd_config._fwd_mainloop_kernels.get(key) is None:
-        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=vocab_per_split)
+    if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None:
+        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=_get_fwd_config()._vocab_per_split)
         fwd_mainloop_compiled_kernel = cute.compile(
             fwd_mainloop_kernel,
             hidden_packed,
@@ -164,9 +177,9 @@ def forward(
             tp_rank,
             cuda_stream,
         )
-        _fwd_config._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
+        _get_fwd_config()._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
     else:
-        fwd_mainloop_compiled_kernel = _fwd_config._fwd_mainloop_kernels[key]
+        fwd_mainloop_compiled_kernel = _get_fwd_config()._fwd_mainloop_kernels[key]
     fwd_mainloop_compiled_kernel(
         hidden_packed,
         weight_packed,
@@ -210,11 +223,11 @@ def grid(meta):
         _max_backup = _max.clone()
         dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
 
-        torch.cuda.current_stream().record_event(_fwd_config._dedicated_events[0])
-        with torch.cuda.stream(_fwd_config._dedicated_stream):
-            _fwd_config._dedicated_stream.wait_event(_fwd_config._dedicated_events[0])
+        torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0])
+        with torch.cuda.stream(_get_fwd_config()._dedicated_stream):
+            _get_fwd_config()._dedicated_stream.wait_event(_get_fwd_config()._dedicated_events[0])
             dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
-            _fwd_config._dedicated_stream.record_event(_fwd_config._dedicated_events[1])
+            _get_fwd_config()._dedicated_stream.record_event(_get_fwd_config()._dedicated_events[1])
 
         def grid(meta):
             return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
@@ -240,7 +253,7 @@ def grid(meta):
         dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
 
         # update logprobs
-        torch.cuda.current_stream().wait_event(_fwd_config._dedicated_events[1])
+        torch.cuda.current_stream().wait_event(_get_fwd_config()._dedicated_events[1])
         triton_kernels.forward_tp_epilogue_update_logprobs[grid](
             num_tokens,
             ignore_index,
@@ -304,9 +317,9 @@ def backward(
     assert d_hidden.is_contiguous() and d_weight.is_contiguous()
 
     # FIXME: implement different backward methods
-    _backward = utils.BackwardMethodEnum.kDlogitsSplitN
-    if _backward == utils.BackwardMethodEnum.kDlogitsSplitN:
-        vocab_per_split = 512 * 6
+    _backward_method = _get_bwd_config()._backward_method
+    if _backward_method == utils.BackwardMethodEnum.kDlogitsSplitN:
+        vocab_per_split = _get_bwd_config()._vocab_per_split
         num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
 
         _d_logits = torch.empty(
@@ -335,7 +348,7 @@ def backward(
         stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
         key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
-        if _bwd_config._bwd_kernel.get(key) is None:
+        if _get_bwd_config()._bwd_kernel.get(key) is None:
             bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
                 reduction=REDUCTION.value, vocab_per_split=vocab_per_split
             )
@@ -354,9 +367,9 @@ def backward(
                 tp_rank,
                 stream,
             )
-            _bwd_config._bwd_kernel[key] = bwd_kernel_compiled
+            _get_bwd_config()._bwd_kernel[key] = bwd_kernel_compiled
         else:
-            bwd_kernel_compiled = _bwd_config._bwd_kernel.get(key)
+            bwd_kernel_compiled = _get_bwd_config()._bwd_kernel.get(key)
 
         for split_idx in range(num_splits):
             bwd_kernel_compiled(
@@ -395,7 +408,7 @@ def backward(
                 out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
             )
     else:
-        raise NotImplementedError(f"Unsupported backward method: {_backward}")
+        raise NotImplementedError(f"Unsupported backward method: {_backward_method}")
 
     if in_tp_mode:
         dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index a36b8cfb4e0..66370271de9 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -4,6 +4,7 @@
 import os
 import typing
 from contextlib import ExitStack
+from dataclasses import dataclass
 
 import numpy as np
 import pytest
@@ -28,6 +29,62 @@
 from tests.unit_tests.test_utilities import Utils
 
 
+# 1. Define a standardized context to hold your distributed info
+@dataclass
+class DistContext:
+    rank: int
+    world_size: int
+    group: dist.ProcessGroup
+    is_chief: bool
+
+# 2. Create a module-scoped fixture
+# This runs ONE time per file, no matter how many test classes you have.
+@pytest.fixture(scope="module")
+def distributed_context():
+    # --- PRE-CHECK ---
+    if "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2:
+        pytest.skip("Requires torchrun with multiple GPUs (WORLD_SIZE >= 2)")
+
+    # --- SETUP ---
+    is_external_init = dist.is_initialized()
+    
+    if not is_external_init:
+        # Initialize only if not already done (e.g., by another test runner)
+        dist.init_process_group(
+            backend="nccl",
+            init_method="env://",
+            world_size=int(os.environ["WORLD_SIZE"]),
+            rank=int(os.environ["RANK"]),
+        )
+
+    # Set device immediately to avoid cross-device pollution
+    local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"]))
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+
+    # Gather context data
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    group = dist.group.WORLD
+    
+    print(f"[INFO]: Initialized Rank: {rank} / {world_size}")
+
+    context = DistContext(
+        rank=rank,
+        world_size=world_size,
+        group=group,
+        is_chief=(rank == 0)
+    )
+
+    # Yield control to the tests
+    yield context
+
+    # --- TEARDOWN ---
+    # Only destroy if we were the ones who initialized it
+    if not is_external_init:
+        dist.destroy_process_group()
+
+
 class MockDataset(Dataset):
     """
     Mock dataset for torchtitan GPT training tests
@@ -136,9 +193,9 @@ def init_gpt_dataloader(
     dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
     return dataloader
 
-
+# skip it for good
 @pytest.mark.skipif(
-    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2) or True,
     reason="Requires torchrun with multiple GPUs",
 )
 class TestFusedLinearCrossEntropyOnGptModel:
@@ -501,32 +558,18 @@ def custom_storage():
     ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
     reason="Requires torchrun with multiple GPUs",
 )
+@pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropyTensorParallel:
-    @classmethod
-    def setup_class(cls):
-        if dist.is_initialized():
-            cls.must_teardown = False
-        else:
-            dist.init_process_group(
-                backend="nccl",
-                init_method="env://",
-                world_size=int(os.environ["WORLD_SIZE"]),
-                rank=int(os.environ["RANK"]),
-            )
-            cls.must_teardown = True
-        cls.tp_group = dist.group.WORLD
-
-        cls.tp_rank = dist.get_rank(cls.tp_group)
-        cls.tp_world_size = dist.get_world_size(cls.tp_group)
-        cls.is_chief = cls.tp_rank == 0
-        device = torch.device(f"cuda:{cls.tp_rank}")
-        torch.cuda.set_device(device)
-        print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
-
-    @classmethod
-    def teardown_class(cls):
-        if cls.must_teardown:
-            dist.destroy_process_group()
+    @pytest.fixture(autouse=True)
+    def setup_attrs(self, distributed_context):
+        """
+        Setup attributes for the test class.
+        """
+        self.tp_group = distributed_context.group
+        self.tp_rank = distributed_context.rank
+        self.tp_world_size = distributed_context.world_size
+        self.is_chief = distributed_context.is_chief
+    
 
     def cleanup(self):
         torch.cuda.empty_cache()
@@ -954,32 +997,17 @@ def custom_storage():
     "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
     reason="Requires torchrun with multiple GPUs",
 )
+@pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropySequenceParallel:
-    @classmethod
-    def setup_class(cls):
-        if dist.is_initialized():
-            cls.must_teardown = False
-        else:
-            dist.init_process_group(
-                backend="nccl",
-                init_method="env://",
-                world_size=int(os.environ["WORLD_SIZE"]),
-                rank=int(os.environ["RANK"]),
-            )
-            cls.must_teardown = True
-        cls.tp_group = dist.group.WORLD
-
-        cls.tp_rank = dist.get_rank(cls.tp_group)
-        cls.tp_world_size = dist.get_world_size(cls.tp_group)
-        cls.is_chief = cls.tp_rank == 0
-        device = torch.device(f"cuda:{cls.tp_rank}")
-        torch.cuda.set_device(device)
-        print(f"[INFO]: TP rank: {cls.tp_rank}, TP world size: {cls.tp_world_size}")
-
-    @classmethod
-    def teardown_class(cls):
-        if cls.must_teardown:
-            dist.destroy_process_group()
+    @pytest.fixture(autouse=True)
+    def setup_attrs(self, distributed_context):
+        """
+        Setup attributes for the test class.
+        """
+        self.tp_group = distributed_context.group
+        self.tp_rank = distributed_context.rank
+        self.tp_world_size = distributed_context.world_size
+        self.is_chief = distributed_context.is_chief
 
     @staticmethod
     def timed_barrier(timeout_s=10):
@@ -1125,7 +1153,7 @@ def backward(ctx, g_logprobs: torch.Tensor):
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
     @pytest.mark.parametrize("problem", [(256, 12928, 8192)])
-    def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem):
+    def test_torch_sp_vs_single_gpu(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
 
         hidden = (

From 1b603b9a41c731454b7ab7bc9a99318ff41f1e1a Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Tue, 2 Dec 2025 10:11:25 +0800
Subject: [PATCH 15/17] Fix for CI (#17)

* fix platform fail in test env

* fix import error in no CUDA & CUTE test env

* Revert "fix import error in no CUDA & CUTE test env"

This reverts commit 0b8010b30fcc0795b917b5b177a61ec7e906fe40.

* safe_imports check skip blackwell

* try clean up

* reduce fused_linear_cross_entopy UT problem size for OOM issue

* skip UT when device arch not 10

* fix mamba logits compute order
---
 .gitlab/scripts/check_imports.py                   |  1 +
 .../core/fusions/fused_linear_cross_entropy.py     |  1 +
 megatron/core/models/gpt/gpt_model.py              |  2 +-
 megatron/core/models/mamba/mamba_model.py          |  8 +++++---
 .../fusions/test_fused_linear_cross_entropy.py     | 14 +++++++++++++-
 5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/.gitlab/scripts/check_imports.py b/.gitlab/scripts/check_imports.py
index f46987d8d87..9d82b661681 100644
--- a/.gitlab/scripts/check_imports.py
+++ b/.gitlab/scripts/check_imports.py
@@ -49,6 +49,7 @@ def __init__(self, package_name: str = "megatron.core", verbose: bool = False):
             ".git",
             "test_",
             "_test",
+            "blackwell",
         }
 
         # Add current directory to Python path if not already there
diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index ca87eb09a8a..3bb3b5c14f1 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -41,6 +41,7 @@ def __init__(self) -> None:
 
         self._initialized = True
 
+
 @lru_cache(maxsize=1)
 def _get_platform() -> Platform:
     """
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 78069e80f71..b46ea83a4d4 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -118,8 +118,8 @@ def __init__(
         self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
         self.parallel_output = parallel_output
         self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-        self.disable_param_offloading = True
         self.vp_stage = vp_stage
+        self.disable_param_offloading = True
 
         if hasattr(self.config, 'position_embedding_type'):
             self.position_embedding_type = self.config.position_embedding_type
diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py
index 7138cfad7d6..e4074eda806 100644
--- a/megatron/core/models/mamba/mamba_model.py
+++ b/megatron/core/models/mamba/mamba_model.py
@@ -267,6 +267,11 @@ def forward(
                     hidden_states.squeeze(1).unsqueeze(0)
                 ).unsqueeze(1)
 
+        if labels is None:
+            logits, _ = self.output_layer(
+                hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
+            )
+
         # Restore sequence parallel execution to the output layer if necessary.
         if sequence_parallel_override:
             assert (
@@ -277,9 +282,6 @@ def forward(
             self.output_layer.sequence_parallel = True
 
         if labels is None:
-            logits, _ = self.output_layer(
-                hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
-            )
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index 66370271de9..873505fe51c 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -20,6 +20,7 @@
     get_gpt_mtp_block_spec,
 )
 from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.training.utils import get_device_arch_version
 from tests.unit_tests.a2a_overlap.utils import (
     deterministic_mode,
     get_test_config,
@@ -257,6 +258,9 @@ def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
 @pytest.mark.skipif(
     "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1", reason="Requires single GPU"
 )
+@pytest.mark.skipif(
+    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
+)
 class TestFusedLinearCrossEntropyDataParallel:
     def cleanup(self):
         torch.cuda.empty_cache()
@@ -558,6 +562,9 @@ def custom_storage():
     ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
     reason="Requires torchrun with multiple GPUs",
 )
+@pytest.mark.skipif(
+    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
+)
 @pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropyTensorParallel:
     @pytest.fixture(autouse=True)
@@ -684,6 +691,7 @@ def backward(ctx, g_logprobs: torch.Tensor):
     @pytest.mark.parametrize("problem", [(4096, 129280, 8192)])
     def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
+        vocabsize = vocabsize // self.tp_world_size
 
         hidden = (
             torch.empty((num_tokens, dim), dtype=dtype, device="cuda")
@@ -997,6 +1005,9 @@ def custom_storage():
     "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
     reason="Requires torchrun with multiple GPUs",
 )
+@pytest.mark.skipif(
+    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
+)
 @pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropySequenceParallel:
     @pytest.fixture(autouse=True)
@@ -1152,9 +1163,10 @@ def backward(ctx, g_logprobs: torch.Tensor):
 
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
     @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
-    @pytest.mark.parametrize("problem", [(256, 12928, 8192)])
+    @pytest.mark.parametrize("problem", [(256, 129280, 8192)])
     def test_torch_sp_vs_single_gpu(self, dtype, reduction, problem):
         num_tokens, vocabsize, dim = problem
+        vocabsize = vocabsize // self.tp_world_size
 
         hidden = (
             torch.empty((num_tokens, dim), dtype=dtype, device="cuda")

From fb2ee78a1cddf716acb3545156bb42781a94641f Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Tue, 2 Dec 2025 15:48:14 +0800
Subject: [PATCH 16/17] Fix CI (#18)

* fix platform fail in test env

* fix import error in no CUDA & CUTE test env

* Revert "fix import error in no CUDA & CUTE test env"

This reverts commit 0b8010b30fcc0795b917b5b177a61ec7e906fe40.

* safe_imports check skip blackwell

* try clean up

* reduce fused_linear_cross_entopy UT problem size for OOM issue

* skip UT when device arch not 10

* fix mamba logits compute order

* 1. Add Copyright for init.py
2. Allow files under Blackwell to bypass import checks.
---
 .gitlab/scripts/check_imports.py              |    1 -
 .../fusions/linear_cross_entropy/__init__.py  |    1 +
 .../blackwell/__init__.py                     |    1 +
 .../blackwell/bwd_partial_dlogits.py          | 1181 ++++++++--------
 .../linear_cross_entropy/blackwell/entry.py   |  789 +++++------
 .../blackwell/fwd_mainloop.py                 | 1241 +++++++++--------
 6 files changed, 1612 insertions(+), 1602 deletions(-)

diff --git a/.gitlab/scripts/check_imports.py b/.gitlab/scripts/check_imports.py
index 9d82b661681..f46987d8d87 100644
--- a/.gitlab/scripts/check_imports.py
+++ b/.gitlab/scripts/check_imports.py
@@ -49,7 +49,6 @@ def __init__(self, package_name: str = "megatron.core", verbose: bool = False):
             ".git",
             "test_",
             "_test",
-            "blackwell",
         }
 
         # Add current directory to Python path if not already there
diff --git a/megatron/core/fusions/linear_cross_entropy/__init__.py b/megatron/core/fusions/linear_cross_entropy/__init__.py
index e69de29bb2d..b9a9591fa69 100644
--- a/megatron/core/fusions/linear_cross_entropy/__init__.py
+++ b/megatron/core/fusions/linear_cross_entropy/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
index e69de29bb2d..b9a9591fa69 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
index 8a6e03601bf..17ad627322e 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -1,635 +1,638 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-from typing import Optional, Tuple, Type
-
-import cuda.bindings.driver as cuda  # type: ignore
-import cutlass
-import cutlass.cute as cute
-import cutlass.pipeline as pipeline  # type: ignore
-import cutlass.utils as utils  # type: ignore
-import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
-from cutlass.cute.nvgpu import cpasync, tcgen05
-
-SM100_TMEM_CAPACITY_COLUMNS: int = 512
-
-
-def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
-    """
-    Create a thread cooperative group.
-    """
-    return pipeline.CooperativeGroup(
-        pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size
-    )
-
-
-class BwdPartialDlogits:
-    """
-    This class implements the backward kernel for partial d_logits.
-    """
-
-    def __init__(
-        self,
-        reduction: int,
-        acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
-        use_2cta_instrs: bool = False,
-        mma_tiler_mn: Tuple[int, int] = (128, 256),
-        vocab_per_split: int = 512,
-    ):
-        self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
-        self.acc_dtype = acc_dtype
-        self.use_2cta_instrs = use_2cta_instrs
-        self.mma_tiler = (*mma_tiler_mn, 1)
-        self.vocab_per_split = vocab_per_split
-
-        self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
-        self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
-
-        self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
-
-        self.threads_per_warp: int = 32
-
-        self.epi_warp_ids = (0, 1, 2, 3)
-        self.load_warp_ids = 4
-        self.mma_warp_ids = 5
-        self.empty_warp_ids = (6, 7)
-
-        self.threads_per_cta: int = self.threads_per_warp * len(
-            (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
-        )
-        self.cta_sync_barrier = pipeline.NamedBarrier(
-            barrier_id=1, num_threads=self.threads_per_cta
-        )
+try:
+    from typing import Optional, Tuple, Type
 
-        self.buffer_align_bytes: int = 1024
-        self.num_regs_other: int = 32
-        self.num_regs_epi: int = 192
-
-    def _compute_grid(
-        self,
-        problem_mnk: Tuple[int, int, int],
-        cluster_shape_mn: Tuple[int, int],
-        cta_tiler: Tuple[int, int, int],
-    ) -> Tuple[int, int, int]:
-        cluster_shape_mnk = (*cluster_shape_mn, 1)
-
-        grid = cute.round_up(
-            (
-                cute.ceil_div(problem_mnk[0], cta_tiler[0]),
-                cute.ceil_div(self.vocab_per_split, cta_tiler[1]),
-                1,
-            ),
-            cluster_shape_mnk,
-        )
-        return grid
-
-    def _compute_stages(
-        self,
-        tiled_mma: cute.TiledMma,
-        mma_tiler: Tuple[int, int, int],
-        a_dtype: Type[cutlass.Numeric],
-        b_dtype: Type[cutlass.Numeric],
-    ):
-        num_acc_stage = 1
-        num_ab_stage = 4
-        num_epi_stage_per_tile = 4
-        return num_acc_stage, num_ab_stage, num_epi_stage_per_tile
-
-    def _setup_attributes(
-        self,
-        tiled_mma: cute.TiledMma,
-        a_dtype: Type[cutlass.Numeric],
-        b_dtype: Type[cutlass.Numeric],
-    ):
-        self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
-        self.cluster_layout_vmnk = cute.tiled_divide(
-            cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
-        )
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import cutlass.pipeline as pipeline  # type: ignore
+    import cutlass.utils as utils  # type: ignore
+    import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
+    from cutlass.cute.nvgpu import cpasync, tcgen05
 
-        mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
-        # it requires k-mode to be 128B aligned
-        mma_inst_tile_k: int = 4
-        self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
+    SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
-        self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = self._compute_stages(
-            tiled_mma, self.mma_tiler, a_dtype, b_dtype
-        )
-        self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
-        assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
 
-        self.cta_tile_shape_mnk = (
-            self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
-            self.mma_tiler[1],
-            self.mma_tiler[2],
-        )
-
-    @cute.kernel
-    def kernel(
-        self,
-        split_idx: cutlass.Int32,
-        tiled_mma: cute.TiledMma,
-        tma_atom_a: cute.CopyAtom,
-        mA: cute.Tensor,
-        tma_atom_b: cute.CopyAtom,
-        mB: cute.Tensor,
-        mLabels: cute.Tensor,
-        mDlogprobs: cute.Tensor,
-        mMaximum: cute.Tensor,
-        mAccu: cute.Tensor,
-        mDlogits_partial: cute.Tensor,
-        scalarNumValidTokens: cute.Pointer,
-        ignore_index: cutlass.Int64,
-        a_smem_layout_staged: cute.ComposedLayout,
-        b_smem_layout_staged: cute.ComposedLayout,
-        cluster_layout_vmnk: cute.Layout,
-        problem_mnk: Tuple[int, int, int],
-        rank: cutlass.Int32,
-    ) -> None:
+    def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
         """
-        The backward kernel for partial d_logits.
+        Create a thread cooperative group.
         """
-        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
-        tidx, _, _ = cute.arch.thread_idx()
-        bidx, bidy, _ = cute.arch.block_idx()
-        # FIXME: block swizzling applied here
-        pidm, pidn = bidx, bidy
-
-        # FIXME: if 2 CTAs, modify here
-        cta_rank_in_cluster = 0
-        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
-
-        # prefetch tma descriptors
-        if warp_idx == self.load_warp_ids:
-            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
-            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
-
-        smem = utils.SmemAllocator()
-        storage = smem.allocate(self.shared_storage)
-
-        ab_pipeline = pipeline.PipelineTmaUmma.create(
-            num_stages=self.num_ab_stage,
-            producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
-            consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
-            tx_count=self.tma_copy_ab_bytes,
-            barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
-        )
-        ab_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer, self.num_ab_stage
-        )
-        ab_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer, self.num_ab_stage
-        )
-
-        mma_pipeline = pipeline.PipelineUmmaAsync.create(
-            num_stages=self.num_acc_stage,
-            producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
-            consumer_group=make_thread_cooperative_group(
-                self.threads_per_warp * len(self.epi_warp_ids)
-            ),
-            barrier_storage=storage.mma_mbar_ptr.data_ptr(),
-        )
-        mma_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer, self.num_acc_stage
-        )
-        mma_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer, self.num_acc_stage
+        return pipeline.CooperativeGroup(
+            pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size
         )
 
-        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
-        if warp_idx == self.empty_warp_ids[0]:
-            with cute.arch.elect_one():
-                cute.arch.mbarrier_init(
-                    tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
-                )
-                cute.arch.mbarrier_init_fence()
-
-        # -------- tensor partition ------------ #
-        # swizzle o [(tileM, tileK), loopM, loopK, stage]
-        sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
-        # swizzle o [(tileN, tileK), loopN, loopK, stage]
-        sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
-
-        # FIXME: if 2 CTAs, modify here
-        thr_mma = tiled_mma.get_slice(0)
-        # [MMA, loopM, loopK, stage]
-        tCsA = thr_mma.make_fragment_A(sA)
-        # [MMA, loopN, loopK, stage]
-        tCsB = thr_mma.make_fragment_B(sB)
-
-        # [tileM, tileK, loopK]
-        gA = cute.local_tile(
-            mA, (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]), (pidm, None)
-        )
-        # [vocab_per_split, dim]
-        mB_n = cute.local_tile(
-            mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (split_idx, 0)
-        )
-        # [tileN, tileK, loopK]
-        gB = cute.local_tile(
-            mB_n, (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]), (pidn, None)
-        )
 
-        a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
-        # just to make sure SMEM and GMEM tensor has the same size in the first rank
-        tCgA = thr_mma.partition_A(gA)
-        tCgB = thr_mma.partition_B(gB)
-        # [CPY, stage] & [CPY, loopK]
-        tTMAsA, tTMAgA = cpasync.tma_partition(
-            tma_atom_a,
-            block_in_cluster_coord_vmnk[2],  # cta_coord,
-            a_cta_layout,
-            cute.group_modes(sA, 0, 3),
-            cute.group_modes(tCgA, 0, 3),
-        )
-        b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
-        # [CPY, stage] & [CPY, loopK]
-        tTMAsB, tTMAgB = cpasync.tma_partition(
-            tma_atom_b,
-            block_in_cluster_coord_vmnk[1],  # cta_coord
-            b_cta_layout,
-            cute.group_modes(sB, 0, 3),
-            cute.group_modes(tCgB, 0, 3),
-        )
+    class BwdPartialDlogits:
+        """
+        This class implements the backward kernel for partial d_logits.
+        """
 
-        # ------ Allocate TMEM ------ #
-        tmem_holding_buf = storage.tmem_holding_buf
-        if warp_idx == self.empty_warp_ids[0]:
-            cute.arch.alloc_tmem(
-                self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
+        def __init__(
+            self,
+            reduction: int,
+            acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+            use_2cta_instrs: bool = False,
+            mma_tiler_mn: Tuple[int, int] = (128, 256),
+            vocab_per_split: int = 512,
+        ):
+            self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
+            self.acc_dtype = acc_dtype
+            self.use_2cta_instrs = use_2cta_instrs
+            self.mma_tiler = (*mma_tiler_mn, 1)
+            self.vocab_per_split = vocab_per_split
+
+            self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+            self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+            self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+            self.threads_per_warp: int = 32
+
+            self.epi_warp_ids = (0, 1, 2, 3)
+            self.load_warp_ids = 4
+            self.mma_warp_ids = 5
+            self.empty_warp_ids = (6, 7)
+
+            self.threads_per_cta: int = self.threads_per_warp * len(
+                (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
+            )
+            self.cta_sync_barrier = pipeline.NamedBarrier(
+                barrier_id=1, num_threads=self.threads_per_cta
             )
-        self.cta_sync_barrier.arrive_and_wait()
-        tmem_ptr = cute.arch.retrieve_tmem_ptr(
-            self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
-        )
 
-        tmem_shape = (128, self.tmem_alloc_cols)
-        acc_shape = thr_mma.partition_shape_C(tmem_shape)
-        tCtC_fake = thr_mma.make_fragment_C(acc_shape)
-        # [(tileM, tileN), loopM, loopN]
-        tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
-
-        # ------ Empty ------ #
-        if warp_idx in self.empty_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-        # ------ Load ------ #
-        if warp_idx == self.load_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-            for k in cutlass.range(cute.size(gA, mode=[2])):
-                ab_pipeline.producer_acquire(ab_producer_state)
-                cute.copy(
-                    tma_atom_a,
-                    tTMAgA[(None, k)],
-                    tTMAsA[(None, ab_producer_state.index)],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                )
-                cute.copy(
-                    tma_atom_b,
-                    tTMAgB[(None, k)],
-                    tTMAsB[(None, ab_producer_state.index)],
-                    tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                )
-                ab_pipeline.producer_commit(ab_producer_state)
-                ab_producer_state.advance()
-
-        # ------ MMA ------ #
-        if warp_idx == self.mma_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-            tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
-            mma_pipeline.producer_acquire(mma_producer_state)
-
-            for k in cutlass.range(cute.size(gA, mode=[2])):
-                ab_pipeline.consumer_wait(ab_consumer_state)
-
-                for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
-                    cute.gemm(
-                        tiled_mma,
-                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
-                        tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
-                        tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
-                        cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
-                    )
-                    tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+            self.buffer_align_bytes: int = 1024
+            self.num_regs_other: int = 32
+            self.num_regs_epi: int = 192
+
+        def _compute_grid(
+            self,
+            problem_mnk: Tuple[int, int, int],
+            cluster_shape_mn: Tuple[int, int],
+            cta_tiler: Tuple[int, int, int],
+        ) -> Tuple[int, int, int]:
+            cluster_shape_mnk = (*cluster_shape_mn, 1)
+
+            grid = cute.round_up(
+                (
+                    cute.ceil_div(problem_mnk[0], cta_tiler[0]),
+                    cute.ceil_div(self.vocab_per_split, cta_tiler[1]),
+                    1,
+                ),
+                cluster_shape_mnk,
+            )
+            return grid
+
+        def _compute_stages(
+            self,
+            tiled_mma: cute.TiledMma,
+            mma_tiler: Tuple[int, int, int],
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            num_acc_stage = 1
+            num_ab_stage = 4
+            num_epi_stage_per_tile = 4
+            return num_acc_stage, num_ab_stage, num_epi_stage_per_tile
+
+        def _setup_attributes(
+            self,
+            tiled_mma: cute.TiledMma,
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+            self.cluster_layout_vmnk = cute.tiled_divide(
+                cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
+            )
 
-                ab_pipeline.consumer_release(ab_consumer_state)
-                ab_consumer_state.advance()
+            mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+            # it requires k-mode to be 128B aligned
+            mma_inst_tile_k: int = 4
+            self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
 
-            mma_pipeline.producer_commit(mma_producer_state)
-            mma_producer_state.advance()
+            self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = self._compute_stages(
+                tiled_mma, self.mma_tiler, a_dtype, b_dtype
+            )
+            self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+            assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
 
-        # ------ EPI ------ #
-        if warp_idx in self.epi_warp_ids:
-            cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+            self.cta_tile_shape_mnk = (
+                self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+                self.mma_tiler[1],
+                self.mma_tiler[2],
+            )
 
-            copy_atom_t2r = sm100_utils.get_tmem_load_op(
-                self.cta_tile_shape_mnk,
-                utils.LayoutEnum.ROW_MAJOR,
-                self.acc_dtype,
-                self.acc_dtype,
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
-                self.use_2cta_instrs,
+        @cute.kernel
+        def kernel(
+            self,
+            split_idx: cutlass.Int32,
+            tiled_mma: cute.TiledMma,
+            tma_atom_a: cute.CopyAtom,
+            mA: cute.Tensor,
+            tma_atom_b: cute.CopyAtom,
+            mB: cute.Tensor,
+            mLabels: cute.Tensor,
+            mDlogprobs: cute.Tensor,
+            mMaximum: cute.Tensor,
+            mAccu: cute.Tensor,
+            mDlogits_partial: cute.Tensor,
+            scalarNumValidTokens: cute.Pointer,
+            ignore_index: cutlass.Int64,
+            a_smem_layout_staged: cute.ComposedLayout,
+            b_smem_layout_staged: cute.ComposedLayout,
+            cluster_layout_vmnk: cute.Layout,
+            problem_mnk: Tuple[int, int, int],
+            rank: cutlass.Int32,
+        ) -> None:
+            """
+            The backward kernel for partial d_logits.
+            """
+            warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+            tidx, _, _ = cute.arch.thread_idx()
+            bidx, bidy, _ = cute.arch.block_idx()
+            # FIXME: block swizzling applied here
+            pidm, pidn = bidx, bidy
+
+            # FIXME: if 2 CTAs, modify here
+            cta_rank_in_cluster = 0
+            block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
+
+            # prefetch tma descriptors
+            if warp_idx == self.load_warp_ids:
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+            smem = utils.SmemAllocator()
+            storage = smem.allocate(self.shared_storage)
+
+            ab_pipeline = pipeline.PipelineTmaUmma.create(
+                num_stages=self.num_ab_stage,
+                producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+                consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                tx_count=self.tma_copy_ab_bytes,
+                barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
             )
-            # [tileM, subTileN, loopM, CntSubTileN, loopN]
-            tAcc_epi = cute.flat_divide(
-                tCtC[((None, None), 0, None)],
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+            ab_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_ab_stage
             )
-            tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
-            thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
-            tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
-            tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
-
-            # predicates
-            cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
-            tCcAcc = thr_mma.partition_C(cAcc)
-            tCcAcc_epi = cute.flat_divide(
-                tCcAcc[((None, None), 0, None)],
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+            ab_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_ab_stage
             )
-            tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
-            tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
-            tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
 
-            copy_atom_g2r_int64 = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(), mLabels.element_type
-            )
-            copy_atom_g2r_fp32 = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(), mDlogprobs.element_type
+            mma_pipeline = pipeline.PipelineUmmaAsync.create(
+                num_stages=self.num_acc_stage,
+                producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                consumer_group=make_thread_cooperative_group(
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                ),
+                barrier_storage=storage.mma_mbar_ptr.data_ptr(),
             )
-            epilogue_thread_layout = cute.make_layout((128, 1), stride=(1, 1))
-            tiled_copy_g2r_int64 = cute.make_tiled_copy_tv(
-                copy_atom_g2r_int64, epilogue_thread_layout, cute.make_layout((1, 1))
+            mma_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_acc_stage
             )
-            tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv(
-                copy_atom_g2r_fp32, epilogue_thread_layout, cute.make_layout((1, 1))
+            mma_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_acc_stage
             )
-            thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx)
-            thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx)
-
-            # [tileM]
-            gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
-            gMaximum = cute.local_tile(mMaximum, (self.epi_tile[0],), (pidm,))
-            gAccu = cute.local_tile(mAccu, (self.epi_tile[0],), (pidm,))
-
-            # slice along M direction
-            tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)]
-            # [(1, 1), 1]
-            tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean)
-            # to align shape with gMax and gAccu
-            tMCAcc_mask = cute.append_ones(tMCAcc_mask)
-            tMCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0]))
-            # [(1, 1), 1, 1]
-            tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels))
-            tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type)
-            cute.copy(tiled_copy_g2r_int64, tMgLabels, tMrLabels, pred=tMCAcc_mask)
-            tMgMaximum = thr_copy_g2r_fp32.partition_S(cute.append_ones(gMaximum))
-            tMrMaximum = cute.make_fragment(tMgMaximum.layout, tMgMaximum.element_type)
-            cute.copy(tiled_copy_g2r_fp32, tMgMaximum, tMrMaximum, pred=tMCAcc_mask)
-            tMgAccu = thr_copy_g2r_fp32.partition_S(cute.append_ones(gAccu))
-            tMrAccu = cute.make_fragment(tMgAccu.layout, tMgAccu.element_type)
-            cute.copy(tiled_copy_g2r_fp32, tMgAccu, tMrAccu, pred=tMCAcc_mask)
-
-            tMrDlogprobs = cute.make_fragment(tMgAccu.layout, mDlogprobs.element_type)
-            if cutlass.const_expr(self.REDUCTION == 2):
-                # mean reduction
-                num_valid_tokens = cute.make_tensor(scalarNumValidTokens, layout=(1,))
-                tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32)
-            elif cutlass.const_expr(self.REDUCTION == 1):
-                # sum reduction
-                tMrDlogprobs[0] = mDlogprobs[0]
-            else:
-                # no reduction
-                gDlogprobs = cute.local_tile(mDlogprobs, (self.epi_tile[0],), (pidm,))
-                tMgDlogprobs = thr_copy_g2r_fp32.partition_S(cute.append_ones(gDlogprobs))
-                cute.copy(tiled_copy_g2r_fp32, tMgDlogprobs, tMrDlogprobs, pred=tMCAcc_mask)
-
-            tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0])
-            tMrDlogprobs[0] *= tMrLabels[0] != ignore_index
-            tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0]
-
-            # ------ Partial output ------ #
-            # [tileM, tileN]
-            gDlogits_partial = cute.local_tile(
-                mDlogits_partial, (self.epi_tile[0], self.epi_tile[1]), (pidm, pidn)
+
+            tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+            if warp_idx == self.empty_warp_ids[0]:
+                with cute.arch.elect_one():
+                    cute.arch.mbarrier_init(
+                        tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
+                    )
+                    cute.arch.mbarrier_init_fence()
+
+            # -------- tensor partition ------------ #
+            # swizzle o [(tileM, tileK), loopM, loopK, stage]
+            sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
+            # swizzle o [(tileN, tileK), loopN, loopK, stage]
+            sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+
+            # FIXME: if 2 CTAs, modify here
+            thr_mma = tiled_mma.get_slice(0)
+            # [MMA, loopM, loopK, stage]
+            tCsA = thr_mma.make_fragment_A(sA)
+            # [MMA, loopN, loopK, stage]
+            tCsB = thr_mma.make_fragment_B(sB)
+
+            # [tileM, tileK, loopK]
+            gA = cute.local_tile(
+                mA, (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]), (pidm, None)
             )
-            # blackwell supports STG.256
-            copy_atom_r2g = cute.make_copy_atom(
-                cute.nvgpu.CopyUniversalOp(), gDlogits_partial.element_type, num_bits_per_copy=256
+            # [vocab_per_split, dim]
+            mB_n = cute.local_tile(
+                mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (split_idx, 0)
             )
-            tiled_copy_r2g = cute.make_tiled_copy_tv(
-                copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv
+            # [tileN, tileK, loopK]
+            gB = cute.local_tile(
+                mB_n, (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]), (pidn, None)
             )
-            thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
-
-            # [CPY, loopM, loopN]
-            tR2GCAcc = thr_copy_r2g.partition_S(cAcc)
-            tR2GCAcc_pred = cute.make_fragment(tR2GCAcc.shape, cutlass.Boolean)
-            for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])):
-                for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
-                    for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
-                        tR2GCAcc_pred[elem, row, col] = cute.elem_less(
-                            pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0], problem_mnk[0]
-                        ) and cute.elem_less(
-                            split_idx * self.vocab_per_split
-                            + pidn * self.epi_tile[1]
-                            + tR2GCAcc[elem, row, col][1],
-                            problem_mnk[1],
-                        )
-
-            tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial)
-
-            # for type conversion
-            dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type)
-            dLogits_half = cute.tiled_divide(dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1))
-            dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
 
-            mma_pipeline.consumer_wait(mma_consumer_state)
-
-            block_vocab_left_idx: cutlass.Int64 = (
-                split_idx * self.vocab_per_split + pidn * self.epi_tile[1]
-            )
-            block_vocab_right_idx: cutlass.Int64 = min(
-                split_idx * self.vocab_per_split + (pidn + 1) * self.epi_tile[1],
-                min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]),
+            a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
+            # just to make sure SMEM and GMEM tensor has the same size in the first rank
+            tCgA = thr_mma.partition_A(gA)
+            tCgB = thr_mma.partition_B(gB)
+            # [CPY, stage] & [CPY, loopK]
+            tTMAsA, tTMAgA = cpasync.tma_partition(
+                tma_atom_a,
+                block_in_cluster_coord_vmnk[2],  # cta_coord,
+                a_cta_layout,
+                cute.group_modes(sA, 0, 3),
+                cute.group_modes(tCgA, 0, 3),
             )
-            num_n_subtiles: cutlass.Int64 = cute.ceil_div(
-                (block_vocab_right_idx - block_vocab_left_idx), cute.size(tTMEM_load_rAcc, mode=[0])
+            b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
+            # [CPY, stage] & [CPY, loopK]
+            tTMAsB, tTMAgB = cpasync.tma_partition(
+                tma_atom_b,
+                block_in_cluster_coord_vmnk[1],  # cta_coord
+                b_cta_layout,
+                cute.group_modes(sB, 0, 3),
+                cute.group_modes(tCgB, 0, 3),
             )
-            for n_subtile in cutlass.range(num_n_subtiles):
-                cute.copy(
-                    tiled_copy_t2r,
-                    tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
-                    tTMEM_load_rAcc,
+
+            # ------ Allocate TMEM ------ #
+            tmem_holding_buf = storage.tmem_holding_buf
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.alloc_tmem(
+                    self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
                 )
+            self.cta_sync_barrier.arrive_and_wait()
+            tmem_ptr = cute.arch.retrieve_tmem_ptr(
+                self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
+            )
+
+            tmem_shape = (128, self.tmem_alloc_cols)
+            acc_shape = thr_mma.partition_shape_C(tmem_shape)
+            tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+            # [(tileM, tileN), loopM, loopN]
+            tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
 
-                for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
-                    # exp_logits
-                    tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
+            # ------ Empty ------ #
+            if warp_idx in self.empty_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
 
-                    position: cutlass.Int64 = (
-                        rank * problem_mnk[1]
-                        + split_idx * self.vocab_per_split
-                        + pidn * self.epi_tile[1]
-                        + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
-                        + idx
+            # ------ Load ------ #
+            if warp_idx == self.load_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.producer_acquire(ab_producer_state)
+                    cute.copy(
+                        tma_atom_a,
+                        tTMAgA[(None, k)],
+                        tTMAsA[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                     )
-                    mask: cutlass.Boolean = (
-                        position == tMrLabels[0] and tMrLabels[0] != ignore_index
+                    cute.copy(
+                        tma_atom_b,
+                        tTMAgB[(None, k)],
+                        tTMAsB[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                     )
-                    # d_logits
-                    tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits
-                    tTMEM_load_rAcc[idx] += mask * -tMrDlogprobs[0]
-                    dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type)
+                    ab_pipeline.producer_commit(ab_producer_state)
+                    ab_producer_state.advance()
+
+            # ------ MMA ------ #
+            if warp_idx == self.mma_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+                mma_pipeline.producer_acquire(mma_producer_state)
+
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.consumer_wait(ab_consumer_state)
+
+                    for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                        cute.gemm(
+                            tiled_mma,
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                            tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                            tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                        )
+                        tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                    ab_pipeline.consumer_release(ab_consumer_state)
+                    ab_consumer_state.advance()
+
+                mma_pipeline.producer_commit(mma_producer_state)
+                mma_producer_state.advance()
+
+            # ------ EPI ------ #
+            if warp_idx in self.epi_warp_ids:
+                cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+                copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                    self.cta_tile_shape_mnk,
+                    utils.LayoutEnum.ROW_MAJOR,
+                    self.acc_dtype,
+                    self.acc_dtype,
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                    self.use_2cta_instrs,
+                )
+                # [tileM, subTileN, loopM, CntSubTileN, loopN]
+                tAcc_epi = cute.flat_divide(
+                    tCtC[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
+                thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+                tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+                tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
+
+                # predicates
+                cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+                tCcAcc = thr_mma.partition_C(cAcc)
+                tCcAcc_epi = cute.flat_divide(
+                    tCcAcc[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+                tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
+                tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
 
-                for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True):
-                    copy_id = n_subtile * cute.size(dLogits_half, mode=[1]) + idx
+                copy_atom_g2r_int64 = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mLabels.element_type
+                )
+                copy_atom_g2r_fp32 = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mDlogprobs.element_type
+                )
+                epilogue_thread_layout = cute.make_layout((128, 1), stride=(1, 1))
+                tiled_copy_g2r_int64 = cute.make_tiled_copy_tv(
+                    copy_atom_g2r_int64, epilogue_thread_layout, cute.make_layout((1, 1))
+                )
+                tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv(
+                    copy_atom_g2r_fp32, epilogue_thread_layout, cute.make_layout((1, 1))
+                )
+                thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx)
+                thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx)
+
+                # [tileM]
+                gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
+                gMaximum = cute.local_tile(mMaximum, (self.epi_tile[0],), (pidm,))
+                gAccu = cute.local_tile(mAccu, (self.epi_tile[0],), (pidm,))
+
+                # slice along M direction
+                tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)]
+                # [(1, 1), 1]
+                tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean)
+                # to align shape with gMax and gAccu
+                tMCAcc_mask = cute.append_ones(tMCAcc_mask)
+                tMCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0]))
+                # [(1, 1), 1, 1]
+                tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels))
+                tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type)
+                cute.copy(tiled_copy_g2r_int64, tMgLabels, tMrLabels, pred=tMCAcc_mask)
+                tMgMaximum = thr_copy_g2r_fp32.partition_S(cute.append_ones(gMaximum))
+                tMrMaximum = cute.make_fragment(tMgMaximum.layout, tMgMaximum.element_type)
+                cute.copy(tiled_copy_g2r_fp32, tMgMaximum, tMrMaximum, pred=tMCAcc_mask)
+                tMgAccu = thr_copy_g2r_fp32.partition_S(cute.append_ones(gAccu))
+                tMrAccu = cute.make_fragment(tMgAccu.layout, tMgAccu.element_type)
+                cute.copy(tiled_copy_g2r_fp32, tMgAccu, tMrAccu, pred=tMCAcc_mask)
+
+                tMrDlogprobs = cute.make_fragment(tMgAccu.layout, mDlogprobs.element_type)
+                if cutlass.const_expr(self.REDUCTION == 2):
+                    # mean reduction
+                    num_valid_tokens = cute.make_tensor(scalarNumValidTokens, layout=(1,))
+                    tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32)
+                elif cutlass.const_expr(self.REDUCTION == 1):
+                    # sum reduction
+                    tMrDlogprobs[0] = mDlogprobs[0]
+                else:
+                    # no reduction
+                    gDlogprobs = cute.local_tile(mDlogprobs, (self.epi_tile[0],), (pidm,))
+                    tMgDlogprobs = thr_copy_g2r_fp32.partition_S(cute.append_ones(gDlogprobs))
+                    cute.copy(tiled_copy_g2r_fp32, tMgDlogprobs, tMrDlogprobs, pred=tMCAcc_mask)
+
+                tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0])
+                tMrDlogprobs[0] *= tMrLabels[0] != ignore_index
+                tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0]
+
+                # ------ Partial output ------ #
+                # [tileM, tileN]
+                gDlogits_partial = cute.local_tile(
+                    mDlogits_partial, (self.epi_tile[0], self.epi_tile[1]), (pidm, pidn)
+                )
+                # blackwell supports STG.256
+                copy_atom_r2g = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), gDlogits_partial.element_type, num_bits_per_copy=256
+                )
+                tiled_copy_r2g = cute.make_tiled_copy_tv(
+                    copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv
+                )
+                thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+
+                # [CPY, loopM, loopN]
+                tR2GCAcc = thr_copy_r2g.partition_S(cAcc)
+                tR2GCAcc_pred = cute.make_fragment(tR2GCAcc.shape, cutlass.Boolean)
+                for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])):
+                    for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
+                        for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
+                            tR2GCAcc_pred[elem, row, col] = cute.elem_less(
+                                pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0], problem_mnk[0]
+                            ) and cute.elem_less(
+                                split_idx * self.vocab_per_split
+                                + pidn * self.epi_tile[1]
+                                + tR2GCAcc[elem, row, col][1],
+                                problem_mnk[1],
+                            )
+
+                tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial)
+
+                # for type conversion
+                dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type)
+                dLogits_half = cute.tiled_divide(dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1))
+                dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
+
+                mma_pipeline.consumer_wait(mma_consumer_state)
+
+                block_vocab_left_idx: cutlass.Int64 = (
+                    split_idx * self.vocab_per_split + pidn * self.epi_tile[1]
+                )
+                block_vocab_right_idx: cutlass.Int64 = min(
+                    split_idx * self.vocab_per_split + (pidn + 1) * self.epi_tile[1],
+                    min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]),
+                )
+                num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                    (block_vocab_right_idx - block_vocab_left_idx), cute.size(tTMEM_load_rAcc, mode=[0])
+                )
+                for n_subtile in cutlass.range(num_n_subtiles):
                     cute.copy(
-                        tiled_copy_r2g,
-                        dLogits_half[(None, idx, None)],
-                        tR2GgDlogits[(None, None, copy_id)],
-                        pred=tR2GCAcc_pred[((0, None), None, copy_id)],
+                        tiled_copy_t2r,
+                        tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                        tTMEM_load_rAcc,
                     )
 
-            mma_pipeline.consumer_release(mma_consumer_state)
-            mma_consumer_state.advance()
-
-        # ------ Deallocate TMEM ------ #
-        self.cta_sync_barrier.arrive_and_wait()
-        if warp_idx == self.empty_warp_ids[0]:
-            cute.arch.relinquish_tmem_alloc_permit()
-            cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
-
-    @cute.jit
-    def __call__(
-        self,
-        split_idx: cutlass.Int32,
-        hidden: cute.Tensor,
-        weight: cute.Tensor,
-        labels: cute.Tensor,
-        dlogprobs: cute.Tensor,
-        maximum: cute.Tensor,
-        accu: cute.Tensor,
-        dlogits_partial: cute.Tensor,
-        scalarNumValidTokens: cute.Pointer,
-        ignore_index: cutlass.Int64,
-        rank: cutlass.Int32,
-        stream: cuda.CUstream,
-    ) -> None:
-        a_dtype: Type[cutlass.Numeric] = hidden.element_type
-        b_dtype: Type[cutlass.Numeric] = weight.element_type
-
-        if cutlass.const_expr(hidden.element_type != weight.element_type):
-            raise RuntimeError(
-                f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
-            )
-        if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
-            raise RuntimeError("hidden can only be FP16 or BF16")
-        if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
-            raise RuntimeError("K dimension doesn't match")
-
-        problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
-        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
-            raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
-        if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0):
-            raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}")
-
-        grid = self._compute_grid(
-            problem_mnk=problem_mnk,
-            cluster_shape_mn=self.cluster_shape_mn,
-            cta_tiler=self.mma_tiler,
-        )
+                    for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
+                        # exp_logits
+                        tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
 
-        a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
-        b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+                        position: cutlass.Int64 = (
+                            rank * problem_mnk[1]
+                            + split_idx * self.vocab_per_split
+                            + pidn * self.epi_tile[1]
+                            + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                            + idx
+                        )
+                        mask: cutlass.Boolean = (
+                            position == tMrLabels[0] and tMrLabels[0] != ignore_index
+                        )
+                        # d_logits
+                        tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits
+                        tTMEM_load_rAcc[idx] += mask * -tMrDlogprobs[0]
+                        dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type)
+
+                    for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True):
+                        copy_id = n_subtile * cute.size(dLogits_half, mode=[1]) + idx
+                        cute.copy(
+                            tiled_copy_r2g,
+                            dLogits_half[(None, idx, None)],
+                            tR2GgDlogits[(None, None, copy_id)],
+                            pred=tR2GCAcc_pred[((0, None), None, copy_id)],
+                        )
 
-        tiled_mma = sm100_utils.make_trivial_tiled_mma(
-            a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
-        )
-        self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+                mma_pipeline.consumer_release(mma_consumer_state)
+                mma_consumer_state.advance()
+
+            # ------ Deallocate TMEM ------ #
+            self.cta_sync_barrier.arrive_and_wait()
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.relinquish_tmem_alloc_permit()
+                cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
+
+        @cute.jit
+        def __call__(
+            self,
+            split_idx: cutlass.Int32,
+            hidden: cute.Tensor,
+            weight: cute.Tensor,
+            labels: cute.Tensor,
+            dlogprobs: cute.Tensor,
+            maximum: cute.Tensor,
+            accu: cute.Tensor,
+            dlogits_partial: cute.Tensor,
+            scalarNumValidTokens: cute.Pointer,
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+            stream: cuda.CUstream,
+        ) -> None:
+            a_dtype: Type[cutlass.Numeric] = hidden.element_type
+            b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+            if cutlass.const_expr(hidden.element_type != weight.element_type):
+                raise RuntimeError(
+                    f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+                )
+            if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+                raise RuntimeError("hidden can only be FP16 or BF16")
+            if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+                raise RuntimeError("K dimension doesn't match")
+
+            problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+                raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+            if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0):
+                raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}")
+
+            grid = self._compute_grid(
+                problem_mnk=problem_mnk,
+                cluster_shape_mn=self.cluster_shape_mn,
+                cta_tiler=self.mma_tiler,
+            )
 
-        self.epi_tile = self.cta_tile_shape_mnk[:2]
+            a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+            b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
 
-        # Swizzle o [(tileM, tileK), loopM, loopK, stage]
-        a_smem_layout_staged = sm100_utils.make_smem_layout_a(
-            tiled_mma, self.mma_tiler, a_dtype, self.num_ab_stage
-        )
-        # Swizzle o [(tileN, tileK), loopN, loopK, stage]
-        b_smem_layout_staged = sm100_utils.make_smem_layout_b(
-            tiled_mma, self.mma_tiler, b_dtype, self.num_ab_stage
-        )
-        tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
-        tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
-
-        # Swizzle o [(tileM, tileK), loopM, loopK]
-        a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
-        tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
-            tma_load_op,
-            hidden,
-            a_smem_layout,
-            self.mma_tiler,
-            tiled_mma,
-            self.cluster_layout_vmnk.shape,
-        )
-        # Swizzle o [(tileN, tileK), loopN, loopK]
-        b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
-        tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
-            tma_load_op,
-            weight,
-            b_smem_layout,
-            self.mma_tiler,
-            tiled_mma,
-            self.cluster_layout_vmnk.shape,
-        )
-        a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
-        b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
-        self.tma_copy_ab_bytes = a_copy_size + b_copy_size
+            tiled_mma = sm100_utils.make_trivial_tiled_mma(
+                a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
+            )
+            self._setup_attributes(tiled_mma, a_dtype, b_dtype)
 
-        @cute.struct
-        class SharedStorage:
-            """
-            The shared storage for the backward kernel.
-            """
+            self.epi_tile = self.cta_tile_shape_mnk[:2]
 
-            load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2]
-            mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
-
-            tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
-            tmem_holding_buf: cutlass.Int32
-
-            sA: cute.struct.Align[
-                cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
-                self.buffer_align_bytes,
-            ]
-            sB: cute.struct.Align[
-                cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
-                self.buffer_align_bytes,
-            ]
-
-        self.shared_storage = SharedStorage
-
-        self.kernel(
-            split_idx,
-            tiled_mma,
-            tma_atom_a,
-            tma_tensor_a,
-            tma_atom_b,
-            tma_tensor_b,
-            labels,
-            dlogprobs,
-            maximum,
-            accu,
-            dlogits_partial,
-            scalarNumValidTokens,
-            ignore_index,
-            a_smem_layout_staged,
-            b_smem_layout_staged,
-            self.cluster_layout_vmnk,
-            problem_mnk,
-            rank,
-        ).launch(
-            grid=grid,
-            block=[self.threads_per_cta, 1, 1],
-            cluster=self.cluster_shape_mnk,
-            stream=stream,
-        )
+            # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+            a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+                tiled_mma, self.mma_tiler, a_dtype, self.num_ab_stage
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+            b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+                tiled_mma, self.mma_tiler, b_dtype, self.num_ab_stage
+            )
+            tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+            tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+            # Swizzle o [(tileM, tileK), loopM, loopK]
+            a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+                tma_load_op,
+                hidden,
+                a_smem_layout,
+                self.mma_tiler,
+                tiled_mma,
+                self.cluster_layout_vmnk.shape,
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK]
+            b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
+                tma_load_op,
+                weight,
+                b_smem_layout,
+                self.mma_tiler,
+                tiled_mma,
+                self.cluster_layout_vmnk.shape,
+            )
+            a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+            b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+            self.tma_copy_ab_bytes = a_copy_size + b_copy_size
+
+            @cute.struct
+            class SharedStorage:
+                """
+                The shared storage for the backward kernel.
+                """
+
+                load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2]
+                mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+
+                tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+                tmem_holding_buf: cutlass.Int32
+
+                sA: cute.struct.Align[
+                    cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+                sB: cute.struct.Align[
+                    cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+
+            self.shared_storage = SharedStorage
+
+            self.kernel(
+                split_idx,
+                tiled_mma,
+                tma_atom_a,
+                tma_tensor_a,
+                tma_atom_b,
+                tma_tensor_b,
+                labels,
+                dlogprobs,
+                maximum,
+                accu,
+                dlogits_partial,
+                scalarNumValidTokens,
+                ignore_index,
+                a_smem_layout_staged,
+                b_smem_layout_staged,
+                self.cluster_layout_vmnk,
+                problem_mnk,
+                rank,
+            ).launch(
+                grid=grid,
+                block=[self.threads_per_cta, 1, 1],
+                cluster=self.cluster_shape_mnk,
+                stream=stream,
+            )
+except ImportError:
+    pass
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index 014c574a635..7ca2e5c91fb 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -1,172 +1,187 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-import typing
-from dataclasses import dataclass, field
-from functools import lru_cache
-import os
-
-import cuda.bindings.driver as cuda  # type: ignore
-import cutlass
-import cutlass.cute as cute
-import torch
-import torch.distributed as dist
-import triton  # type: ignore
-from cutlass.cute.runtime import from_dlpack
-
-import megatron.core.fusions.linear_cross_entropy.utils as utils
-from megatron.core.fusions.linear_cross_entropy.blackwell import (
-    bwd_partial_dlogits as bwd_partial_dlogits,
-)
-from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop
-from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
-
-
-@dataclass
-class FwdConfig:
-    """
-    The configuration for the forward pass.
-    """
-
-    _dedicated_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream)
-    _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
-    _initialized: bool = field(default=False)
-    _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
-    _vocab_per_split: int = field(default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6)))
-
-
-@dataclass
-class BwdConfig:
-    """
-    The configuration for the backward pass.
-    """
-
-    _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
-    _vocab_per_split: int = field(default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6)))
-    _backward_method: utils.BackwardMethodEnum = field(default=utils.BackwardMethodEnum.kDlogitsSplitN)
-
-
-@lru_cache(maxsize=1)
-def _get_fwd_config() -> FwdConfig:
-    """
-    Helper function to lazy initialize the forward configuration.
-    """
-    return FwdConfig()
-
-@lru_cache(maxsize=1)
-def _get_bwd_config() -> BwdConfig:
-    """
-    Helper function to lazy initialize the backward configuration.
-    """
-    return BwdConfig()
-
-def forward(
-    hidden: torch.Tensor,
-    weight: torch.Tensor,
-    labels: torch.Tensor,
-    tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
-    reduction: typing.Literal["none", "sum", "mean"] = "mean",
-    ignore_index: int = -100,
-    sequence_parallel: bool = False,
-) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor]:
-    """
-    forward host function
-    """
-    tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
-    tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
-    in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
-
-    assert hidden.is_cuda and weight.is_cuda and labels.is_cuda
-    assert weight.device == hidden.device and labels.device == hidden.device
-
-    # hidden could be [batch, seqlen, dim] or [seqlen, batch, dim] or [tokens, dim]
-    assert hidden.dim() == 2 or hidden.dim() == 3
-    # weight must be [vocab_size, dim]
-    assert weight.dim() == 2
-    # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
-    assert (hidden.dim() == 2 and labels.dim() == 1) or (hidden.dim() == 3 and labels.dim() == 2)
-    assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
-
-    hidden_view = hidden.view(-1, hidden.shape[-1])
-    labels_view = labels.view(-1)
-
-    assert (sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]) or (
-        not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]
+try:
+    import typing
+    from dataclasses import dataclass, field
+    from functools import lru_cache
+    import os
+
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import torch
+    import torch.distributed as dist
+    import triton  # type: ignore
+    from cutlass.cute.runtime import from_dlpack
+
+    import megatron.core.fusions.linear_cross_entropy.utils as utils
+    from megatron.core.fusions.linear_cross_entropy.blackwell import (
+        bwd_partial_dlogits as bwd_partial_dlogits,
     )
-    assert hidden_view.shape[1] == weight.shape[1]
-
-    global_hidden = hidden
-    if in_tp_mode and sequence_parallel:
-        partial_hidden_shape = hidden.shape
-        global_hidden_shape = (partial_hidden_shape[0] * tp_world_size, *partial_hidden_shape[1:])
-        global_hidden = torch.empty(global_hidden_shape, dtype=hidden.dtype, device=hidden.device)
-        dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group)
-        assert global_hidden.is_contiguous()
-        hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
-
-    num_tokens, dim = hidden_view.shape
-    vocab_size, _ = weight.shape
-
-    if not _get_fwd_config()._initialized:
-        _get_fwd_config()._dedicated_stream = torch.cuda.Stream(hidden.device)
-        _get_fwd_config()._dedicated_events = [torch.cuda.Event() for _ in range(2)]
-        _get_fwd_config()._initialized = True
-
-    REDUCTION = utils.str_to_reduction_enum(reduction)
-    # declare logprobs
-    if REDUCTION == utils.EntropyReductionEnum.kNone:
-        logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
-        if in_tp_mode:
-            logprobs.zero_()
-    else:
-        logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32)
-    # declare auxiliary tensors
-    maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
-    accumulate = torch.empty_like(maximum, dtype=torch.float32)
-    num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
-    assert (
-        maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
-    )
-    # declare intermediate tensors
-    # NOTE: this is a parameter for tuning
-    num_splits = (vocab_size + _get_fwd_config()._vocab_per_split - 1) // _get_fwd_config()._vocab_per_split
-    _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
-    _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
-    if REDUCTION == utils.EntropyReductionEnum.kNone:
-        _logprobs = logprobs
-    else:
-        _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
-        if in_tp_mode:
-            _logprobs.zero_()
-    assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
+    from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop
+    from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
+
+
+    @dataclass
+    class FwdConfig:
+        """
+        The configuration for the forward pass.
+        """
+
+        _dedicated_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream)
+        _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
+        _initialized: bool = field(default=False)
+        _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+        _vocab_per_split: int = field(default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6)))
+
+
+    @dataclass
+    class BwdConfig:
+        """
+        The configuration for the backward pass.
+        """
+
+        _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+        _vocab_per_split: int = field(default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6)))
+        _backward_method: utils.BackwardMethodEnum = field(default=utils.BackwardMethodEnum.kDlogitsSplitN)
+
+
+    @lru_cache(maxsize=1)
+    def _get_fwd_config() -> FwdConfig:
+        """
+        Helper function to lazy initialize the forward configuration.
+        """
+        return FwdConfig()
+
+    @lru_cache(maxsize=1)
+    def _get_bwd_config() -> BwdConfig:
+        """
+        Helper function to lazy initialize the backward configuration.
+        """
+        return BwdConfig()
+
+    def forward(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
+        ignore_index: int = -100,
+        sequence_parallel: bool = False,
+    ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor]:
+        """
+        forward host function
+        """
+        tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+        tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+        in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
+
+        assert hidden.is_cuda and weight.is_cuda and labels.is_cuda
+        assert weight.device == hidden.device and labels.device == hidden.device
+
+        # hidden could be [batch, seqlen, dim] or [seqlen, batch, dim] or [tokens, dim]
+        assert hidden.dim() == 2 or hidden.dim() == 3
+        # weight must be [vocab_size, dim]
+        assert weight.dim() == 2
+        # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
+        assert (hidden.dim() == 2 and labels.dim() == 1) or (hidden.dim() == 3 and labels.dim() == 2)
+        assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
+
+        hidden_view = hidden.view(-1, hidden.shape[-1])
+        labels_view = labels.view(-1)
+
+        assert (sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]) or (
+            not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]
+        )
+        assert hidden_view.shape[1] == weight.shape[1]
+
+        global_hidden = hidden
+        if in_tp_mode and sequence_parallel:
+            partial_hidden_shape = hidden.shape
+            global_hidden_shape = (partial_hidden_shape[0] * tp_world_size, *partial_hidden_shape[1:])
+            global_hidden = torch.empty(global_hidden_shape, dtype=hidden.dtype, device=hidden.device)
+            dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group)
+            assert global_hidden.is_contiguous()
+            hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
+
+        num_tokens, dim = hidden_view.shape
+        vocab_size, _ = weight.shape
+
+        if not _get_fwd_config()._initialized:
+            _get_fwd_config()._dedicated_stream = torch.cuda.Stream(hidden.device)
+            _get_fwd_config()._dedicated_events = [torch.cuda.Event() for _ in range(2)]
+            _get_fwd_config()._initialized = True
+
+        REDUCTION = utils.str_to_reduction_enum(reduction)
+        # declare logprobs
+        if REDUCTION == utils.EntropyReductionEnum.kNone:
+            logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+            if in_tp_mode:
+                logprobs.zero_()
+        else:
+            logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32)
+        # declare auxiliary tensors
+        maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+        accumulate = torch.empty_like(maximum, dtype=torch.float32)
+        num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
+        assert (
+            maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
+        )
+        # declare intermediate tensors
+        # NOTE: this is a parameter for tuning
+        num_splits = (vocab_size + _get_fwd_config()._vocab_per_split - 1) // _get_fwd_config()._vocab_per_split
+        _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+        _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+        if REDUCTION == utils.EntropyReductionEnum.kNone:
+            _logprobs = logprobs
+        else:
+            _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+            if in_tp_mode:
+                _logprobs.zero_()
+        assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
 
-    triton_kernels.get_num_valid_tokens[(1,)](
-        num_tokens, ignore_index, labels_view, labels_view.stride(0), num_valid_tokens
-    )
+        triton_kernels.get_num_valid_tokens[(1,)](
+            num_tokens, ignore_index, labels_view, labels_view.stride(0), num_valid_tokens
+        )
 
-    # need to compile the kernel for the first time
-    hidden_packed = from_dlpack(hidden_view.detach(), assumed_align=16).mark_compact_shape_dynamic(
-        mode=0
-    )
-    weight_packed = from_dlpack(weight.detach(), assumed_align=16)
-    labels_packed = from_dlpack(labels_view.detach(), assumed_align=8).mark_compact_shape_dynamic(
-        mode=0
-    )
-    logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
-    _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(
-        mode=0, stride_order=(0, 1)
-    )
-    _accu_packed = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(
-        mode=0, stride_order=(0, 1)
-    )
-    cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
-
-    # VocabSize and Dim are fixed for a given model,
-    # only the number of tokens can vary
-    key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
-    if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None:
-        fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=_get_fwd_config()._vocab_per_split)
-        fwd_mainloop_compiled_kernel = cute.compile(
-            fwd_mainloop_kernel,
+        # need to compile the kernel for the first time
+        hidden_packed = from_dlpack(hidden_view.detach(), assumed_align=16).mark_compact_shape_dynamic(
+            mode=0
+        )
+        weight_packed = from_dlpack(weight.detach(), assumed_align=16)
+        labels_packed = from_dlpack(labels_view.detach(), assumed_align=8).mark_compact_shape_dynamic(
+            mode=0
+        )
+        logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
+        _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(
+            mode=0, stride_order=(0, 1)
+        )
+        _accu_packed = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(
+            mode=0, stride_order=(0, 1)
+        )
+        cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+        # VocabSize and Dim are fixed for a given model,
+        # only the number of tokens can vary
+        key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
+        if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None:
+            fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=_get_fwd_config()._vocab_per_split)
+            fwd_mainloop_compiled_kernel = cute.compile(
+                fwd_mainloop_kernel,
+                hidden_packed,
+                weight_packed,
+                labels_packed,
+                logprobs_packed,
+                _max_packed,
+                _accu_packed,
+                ignore_index,
+                tp_rank,
+                cuda_stream,
+            )
+            _get_fwd_config()._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
+        else:
+            fwd_mainloop_compiled_kernel = _get_fwd_config()._fwd_mainloop_kernels[key]
+        fwd_mainloop_compiled_kernel(
             hidden_packed,
             weight_packed,
             labels_packed,
@@ -177,250 +192,238 @@ def forward(
             tp_rank,
             cuda_stream,
         )
-        _get_fwd_config()._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
-    else:
-        fwd_mainloop_compiled_kernel = _get_fwd_config()._fwd_mainloop_kernels[key]
-    fwd_mainloop_compiled_kernel(
-        hidden_packed,
-        weight_packed,
-        labels_packed,
-        logprobs_packed,
-        _max_packed,
-        _accu_packed,
-        ignore_index,
-        tp_rank,
-        cuda_stream,
-    )
 
-    if not in_tp_mode:
+        if not in_tp_mode:
 
-        def grid(meta):
-            return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+            def grid(meta):
+                return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
 
-        triton_kernels.forward_dp_epilogue[grid](
-            num_tokens,
-            num_splits,
-            ignore_index,
-            labels_view,
-            labels_view.stride(0),
-            num_valid_tokens,
-            _max,
-            _max.stride(0),
-            _max.stride(1),
-            _accu,
-            _accu.stride(0),
-            _accu.stride(1),
-            maximum,
-            maximum.stride(0),
-            accumulate,
-            maximum.stride(0),
-            _logprobs,
-            _logprobs.stride(0),
-            logprobs,
-            triton.language.constexpr(REDUCTION.value),
-        )
-    else:
-        _max_backup = _max.clone()
-        dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
-
-        torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0])
-        with torch.cuda.stream(_get_fwd_config()._dedicated_stream):
-            _get_fwd_config()._dedicated_stream.wait_event(_get_fwd_config()._dedicated_events[0])
-            dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
-            _get_fwd_config()._dedicated_stream.record_event(_get_fwd_config()._dedicated_events[1])
-
-        def grid(meta):
-            return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
-
-        triton_kernels.forward_tp_epilogue[grid](
-            num_tokens,
-            num_splits,
-            _max,
-            _max.stride(0),
-            _max.stride(1),
-            _max_backup,
-            _max_backup.stride(0),
-            _max_backup.stride(1),
-            _accu,
-            _accu.stride(0),
-            _accu.stride(1),
-            maximum,
-            maximum.stride(0),
-            accumulate,
-            maximum.stride(0),
-        )
-        # reduce accumulate
-        dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
-
-        # update logprobs
-        torch.cuda.current_stream().wait_event(_get_fwd_config()._dedicated_events[1])
-        triton_kernels.forward_tp_epilogue_update_logprobs[grid](
-            num_tokens,
-            ignore_index,
-            num_valid_tokens,
-            labels_view,
-            labels_view.stride(0),
-            _logprobs,
-            _logprobs.stride(0),
-            maximum,
-            maximum.stride(0),
-            accumulate,
-            accumulate.stride(0),
-            logprobs,
-            REDUCTION.value,
-        )
+            triton_kernels.forward_dp_epilogue[grid](
+                num_tokens,
+                num_splits,
+                ignore_index,
+                labels_view,
+                labels_view.stride(0),
+                num_valid_tokens,
+                _max,
+                _max.stride(0),
+                _max.stride(1),
+                _accu,
+                _accu.stride(0),
+                _accu.stride(1),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                maximum.stride(0),
+                _logprobs,
+                _logprobs.stride(0),
+                logprobs,
+                triton.language.constexpr(REDUCTION.value),
+            )
+        else:
+            _max_backup = _max.clone()
+            dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+            torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0])
+            with torch.cuda.stream(_get_fwd_config()._dedicated_stream):
+                _get_fwd_config()._dedicated_stream.wait_event(_get_fwd_config()._dedicated_events[0])
+                dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
+                _get_fwd_config()._dedicated_stream.record_event(_get_fwd_config()._dedicated_events[1])
+
+            def grid(meta):
+                return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+
+            triton_kernels.forward_tp_epilogue[grid](
+                num_tokens,
+                num_splits,
+                _max,
+                _max.stride(0),
+                _max.stride(1),
+                _max_backup,
+                _max_backup.stride(0),
+                _max_backup.stride(1),
+                _accu,
+                _accu.stride(0),
+                _accu.stride(1),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                maximum.stride(0),
+            )
+            # reduce accumulate
+            dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
 
-    return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
-
-
-def backward(
-    dlogprobs: torch.Tensor,
-    global_hidden: torch.Tensor,
-    weight: torch.Tensor,
-    labels: torch.Tensor,
-    maximum: torch.Tensor,
-    accu: torch.Tensor,
-    num_valid_tokens: torch.Tensor,
-    reduction: typing.Literal["none", "sum", "mean"] = "mean",
-    ignore_index: int = -100,
-    tp_group: typing.Optional[dist.ProcessGroup] = None,
-    tp_rank: int = 0,
-    tp_world_size: int = 1,
-    sequence_parallel: bool = False,
-) -> typing.Tuple[torch.Tensor, torch.Tensor]:
-    """
-    backward host function
-    """
-    in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
-
-    hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
-    labels_view = labels.view(-1)
-
-    num_tokens, dim = hidden_view.shape
-    vocab_size, _ = weight.shape
-
-    REDUCTION = utils.str_to_reduction_enum(reduction)
-    dlogprobs_view = dlogprobs.view(-1)
-    assert (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)) or (
-        REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0
-    )
-    assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
+            # update logprobs
+            torch.cuda.current_stream().wait_event(_get_fwd_config()._dedicated_events[1])
+            triton_kernels.forward_tp_epilogue_update_logprobs[grid](
+                num_tokens,
+                ignore_index,
+                num_valid_tokens,
+                labels_view,
+                labels_view.stride(0),
+                _logprobs,
+                _logprobs.stride(0),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                accumulate.stride(0),
+                logprobs,
+                REDUCTION.value,
+            )
 
-    assert (
-        num_valid_tokens.dim() == 0
-        and num_valid_tokens.is_cuda
-        and num_valid_tokens.dtype == torch.int64
-    )
+        return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
+
+
+    def backward(
+        dlogprobs: torch.Tensor,
+        global_hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        maximum: torch.Tensor,
+        accu: torch.Tensor,
+        num_valid_tokens: torch.Tensor,
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
+        ignore_index: int = -100,
+        tp_group: typing.Optional[dist.ProcessGroup] = None,
+        tp_rank: int = 0,
+        tp_world_size: int = 1,
+        sequence_parallel: bool = False,
+    ) -> typing.Tuple[torch.Tensor, torch.Tensor]:
+        """
+        backward host function
+        """
+        in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
 
-    d_hidden = torch.empty_like(global_hidden)
-    d_weight = torch.empty_like(weight)
-    assert d_hidden.is_contiguous() and d_weight.is_contiguous()
+        hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
+        labels_view = labels.view(-1)
 
-    # FIXME: implement different backward methods
-    _backward_method = _get_bwd_config()._backward_method
-    if _backward_method == utils.BackwardMethodEnum.kDlogitsSplitN:
-        vocab_per_split = _get_bwd_config()._vocab_per_split
-        num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+        num_tokens, dim = hidden_view.shape
+        vocab_size, _ = weight.shape
 
-        _d_logits = torch.empty(
-            (num_tokens, vocab_per_split), device=global_hidden.device, dtype=global_hidden.dtype
+        REDUCTION = utils.str_to_reduction_enum(reduction)
+        dlogprobs_view = dlogprobs.view(-1)
+        assert (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)) or (
+            REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0
         )
+        assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
 
-        hidden_packed = from_dlpack(
-            hidden_view.detach(), assumed_align=16
-        ).mark_compact_shape_dynamic(mode=0)
-        weight_packed = from_dlpack(weight.detach(), assumed_align=16)
-        labels_packed = from_dlpack(
-            labels_view.detach(), assumed_align=8
-        ).mark_compact_shape_dynamic(mode=0)
-        dlogprobs_packed = from_dlpack(
-            dlogprobs_view.detach(), assumed_align=8
-        ).mark_compact_shape_dynamic(mode=0)
-        maximum_packed = from_dlpack(maximum.detach(), assumed_align=8).mark_compact_shape_dynamic(
-            mode=0
-        )
-        accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(mode=0)
-        dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(mode=0)
-        scalarNumValidTokens_packed = cute.runtime.make_ptr(
-            cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8
+        assert (
+            num_valid_tokens.dim() == 0
+            and num_valid_tokens.is_cuda
+            and num_valid_tokens.dtype == torch.int64
         )
 
-        stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+        d_hidden = torch.empty_like(global_hidden)
+        d_weight = torch.empty_like(weight)
+        assert d_hidden.is_contiguous() and d_weight.is_contiguous()
 
-        key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
-        if _get_bwd_config()._bwd_kernel.get(key) is None:
-            bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
-                reduction=REDUCTION.value, vocab_per_split=vocab_per_split
-            )
-            bwd_kernel_compiled = cute.compile(
-                bwd_kernel,
-                0,  # split_idx
-                hidden_packed,
-                weight_packed,
-                labels_packed,
-                dlogprobs_packed,
-                maximum_packed,
-                accu_packed,
-                dlogits_packed,
-                scalarNumValidTokens_packed,
-                ignore_index,
-                tp_rank,
-                stream,
-            )
-            _get_bwd_config()._bwd_kernel[key] = bwd_kernel_compiled
-        else:
-            bwd_kernel_compiled = _get_bwd_config()._bwd_kernel.get(key)
+        # FIXME: implement different backward methods
+        _backward_method = _get_bwd_config()._backward_method
+        if _backward_method == utils.BackwardMethodEnum.kDlogitsSplitN:
+            vocab_per_split = _get_bwd_config()._vocab_per_split
+            num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
 
-        for split_idx in range(num_splits):
-            bwd_kernel_compiled(
-                split_idx,
-                hidden_packed,
-                weight_packed,
-                labels_packed,
-                dlogprobs_packed,
-                maximum_packed,
-                accu_packed,
-                dlogits_packed,
-                scalarNumValidTokens_packed,
-                ignore_index,
-                tp_rank,
-                stream,
-            )
-            # remove padding areas
-            # cublas can handle non-contiguous tensors
-            # therefore, we do not need to contiguous the tensor
-            vocab_right_bound = (
-                min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split
-            )
-            valid_d_logits = _d_logits[:, :vocab_right_bound]
-
-            torch.addmm(
-                input=d_hidden.view(-1, dim),
-                mat1=valid_d_logits,
-                mat2=weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
-                beta=(split_idx != 0),
-                alpha=1.0,
-                out=d_hidden.view(-1, dim),
+            _d_logits = torch.empty(
+                (num_tokens, vocab_per_split), device=global_hidden.device, dtype=global_hidden.dtype
             )
-            torch.matmul(
-                valid_d_logits.T,
-                hidden_view,
-                out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+
+            hidden_packed = from_dlpack(
+                hidden_view.detach(), assumed_align=16
+            ).mark_compact_shape_dynamic(mode=0)
+            weight_packed = from_dlpack(weight.detach(), assumed_align=16)
+            labels_packed = from_dlpack(
+                labels_view.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            dlogprobs_packed = from_dlpack(
+                dlogprobs_view.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            maximum_packed = from_dlpack(maximum.detach(), assumed_align=8).mark_compact_shape_dynamic(
+                mode=0
             )
-    else:
-        raise NotImplementedError(f"Unsupported backward method: {_backward_method}")
-
-    if in_tp_mode:
-        dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
-        if sequence_parallel:
-            partial_hidden_shape = (
-                global_hidden.shape[0] // tp_world_size,
-                *global_hidden.shape[1:],
+            accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(mode=0)
+            dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(mode=0)
+            scalarNumValidTokens_packed = cute.runtime.make_ptr(
+                cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8
             )
-            partial_num_tokens = num_tokens // tp_world_size
-            d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[
-                tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :
-            ]
-            d_hidden = d_hidden.view(partial_hidden_shape).clone()
 
-    return d_hidden, d_weight
+            stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+            key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
+            if _get_bwd_config()._bwd_kernel.get(key) is None:
+                bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
+                    reduction=REDUCTION.value, vocab_per_split=vocab_per_split
+                )
+                bwd_kernel_compiled = cute.compile(
+                    bwd_kernel,
+                    0,  # split_idx
+                    hidden_packed,
+                    weight_packed,
+                    labels_packed,
+                    dlogprobs_packed,
+                    maximum_packed,
+                    accu_packed,
+                    dlogits_packed,
+                    scalarNumValidTokens_packed,
+                    ignore_index,
+                    tp_rank,
+                    stream,
+                )
+                _get_bwd_config()._bwd_kernel[key] = bwd_kernel_compiled
+            else:
+                bwd_kernel_compiled = _get_bwd_config()._bwd_kernel.get(key)
+
+            for split_idx in range(num_splits):
+                bwd_kernel_compiled(
+                    split_idx,
+                    hidden_packed,
+                    weight_packed,
+                    labels_packed,
+                    dlogprobs_packed,
+                    maximum_packed,
+                    accu_packed,
+                    dlogits_packed,
+                    scalarNumValidTokens_packed,
+                    ignore_index,
+                    tp_rank,
+                    stream,
+                )
+                # remove padding areas
+                # cublas can handle non-contiguous tensors
+                # therefore, we do not need to contiguous the tensor
+                vocab_right_bound = (
+                    min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split
+                )
+                valid_d_logits = _d_logits[:, :vocab_right_bound]
+
+                torch.addmm(
+                    input=d_hidden.view(-1, dim),
+                    mat1=valid_d_logits,
+                    mat2=weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                    beta=(split_idx != 0),
+                    alpha=1.0,
+                    out=d_hidden.view(-1, dim),
+                )
+                torch.matmul(
+                    valid_d_logits.T,
+                    hidden_view,
+                    out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                )
+        else:
+            raise NotImplementedError(f"Unsupported backward method: {_backward_method}")
+
+        if in_tp_mode:
+            dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
+            if sequence_parallel:
+                partial_hidden_shape = (
+                    global_hidden.shape[0] // tp_world_size,
+                    *global_hidden.shape[1:],
+                )
+                partial_num_tokens = num_tokens // tp_world_size
+                d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[
+                    tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :
+                ]
+                d_hidden = d_hidden.view(partial_hidden_shape).clone()
+
+        return d_hidden, d_weight
+except ImportError:
+    pass
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
index ebb9709822c..da095e3fc64 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
@@ -4,648 +4,651 @@
 Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel
 """
 
-from typing import Tuple, Type
+try:
+    from typing import Tuple, Type
 
-import cuda.bindings.driver as cuda  # type: ignore
-import cutlass
-import cutlass.cute as cute
-import cutlass.pipeline as pipeline  # type: ignore
-import cutlass.utils as utils  # type: ignore
-import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
-from cutlass.cute.nvgpu import cpasync, tcgen05
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import cutlass.pipeline as pipeline  # type: ignore
+    import cutlass.utils as utils  # type: ignore
+    import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
+    from cutlass.cute.nvgpu import cpasync, tcgen05
 
-SM100_TMEM_CAPACITY_COLUMNS: int = 512
+    SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
 
-def make_thread_cooperative_group(size: int):
-    """
-    Create a thread cooperative group.
-    """
-    return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
-
+    def make_thread_cooperative_group(size: int):
+        """
+        Create a thread cooperative group.
+        """
+        return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
 
-class FwdMainLoop:
-    """
-    This class implements the mainloop for forward process.
 
-    Traits stored as attributes.
+    class FwdMainLoop:
+        """
+        This class implements the mainloop for forward process.
 
-    :param acc_dtype:
-    """
+        Traits stored as attributes.
 
-    def __init__(
-        self,
-        acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
-        use_2cta_instrs: bool = False,
-        mma_tiler_mn: Tuple[int, int] = (128, 256),
-        vocab_per_split: int = 512,
-    ):
-        """
-        Configuration including:
-            - MMA instruction settings
-            - Cluster Shape
+        :param acc_dtype:
         """
-        self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
-        self.use_2cta_instrs = use_2cta_instrs
-        # This is the shape covered by tiledMMA, not just single MMA instruction
-        self.mma_tiler = (*mma_tiler_mn, 1)
-        self.cta_tiler = (self.mma_tiler[0], vocab_per_split, self.mma_tiler[2])
-        self.vocab_per_split = vocab_per_split
-
-        self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
-        self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
-
-        self.occupancy = 1
-        # query SMEM capacity
-        self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
-
-        # the maximum columns per MMA is 256, and there is only one GEMM, so we can fully
-        # assign TMEM for that GEMM of different tiles.
-        # so 512 = 2 * 256
-
-        self.threads_per_warp: int = 32
-        # 1 warp for loading, 1 warp for issuing MMA, 1 WG for storing
-        self.epi_warp_ids = (0, 1, 2, 3)
-        self.load_warp_ids = 4
-        self.mma_warp_ids = 5
-        self.empty_warp_ids = (6, 7)
-
-        self.threads_per_cta: int = self.threads_per_warp * len(
-            (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
-        )
-
-        self.cta_sync_barrier = pipeline.NamedBarrier(
-            barrier_id=1, num_threads=self.threads_per_cta
-        )
-        self.tmem_alloc_barrier = pipeline.NamedBarrier(
-            barrier_id=2, num_threads=self.threads_per_cta
-        )
-
-        self.buffer_align_bytes: int = 1024
-        self.num_regs_other: int = 32
-        self.num_regs_epi: int = 192
-
-    def _compute_stages(
-        self,
-        tiled_mma: cute.TiledMma,
-        mma_tiler: Tuple[int, int, int],
-        a_dtype: Type[cutlass.Numeric],
-        b_dtype: Type[cutlass.Numeric],
-    ):
-        a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
-            tiled_mma, mma_tiler, a_dtype, 1  # only single stage
-        )
-        b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(tiled_mma, mma_tiler, b_dtype, 1)
-        a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
-        b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one)
-        num_acc_stage = 2
-        num_a_stage = 4
-        num_b_stage = 4
-        num_epi_stage_per_tile = 4
-
-        return num_acc_stage, num_a_stage, num_b_stage, num_epi_stage_per_tile
-
-    def _setup_attributes(
-        self,
-        tiled_mma: cute.TiledMma,
-        a_dtype: Type[cutlass.Numeric],
-        b_dtype: Type[cutlass.Numeric],
-    ):
-        self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
-        self.cluster_layout_vmnk = cute.tiled_divide(
-            cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
-        )
-
-        # this is fixed for dense MMA, k=16
-        mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
-        # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
-        mma_inst_tile_k: int = 4
-        self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
-
-        self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = (
-            self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
-        )
-        self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
-        assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
-
-        self.cta_tile_shape_mnk = (
-            self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
-            self.mma_tiler[1],
-            self.mma_tiler[2],
-        )
-
-    @cute.kernel
-    def kernel(
-        self,
-        tiled_mma: cute.TiledMma,
-        tma_atom_a: cute.CopyAtom,
-        mA: cute.Tensor,
-        tma_atom_b: cute.CopyAtom,
-        mB: cute.Tensor,
-        mLabels: cute.Tensor,
-        mMax: cute.Tensor,
-        mAccu: cute.Tensor,
-        mLogprobs: cute.Tensor,
-        a_smem_layout_staged: cute.ComposedLayout,
-        b_smem_layout_staged: cute.ComposedLayout,
-        cluster_layout_vmnk: cute.Layout,
-        problem_mnk: Tuple[int, int, int],
-        ignore_index: cutlass.Int64,
-        rank: cutlass.Int32,
-    ):
-        """
-        The forward kernel for the mainloop.
-        """
-        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
-        tidx, _, _ = cute.arch.thread_idx()
-        bidx, bidy, _ = cute.arch.block_idx()
-        # FIXME: block swizzling applied here
-        pidm, pidn = bidx, bidy
-
-        # prefetch tma descriptors
-        if warp_idx == self.load_warp_ids:
-            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
-            cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
-
-        # declare SMEM
-        smem = utils.SmemAllocator()
-        storage = smem.allocate(self.shared_storage)
-
-        ab_pipeline = pipeline.PipelineTmaUmma.create(
-            num_stages=self.num_a_stage,
-            producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
-            consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
-            tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes,
-            barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
-        )
-        ab_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer, self.num_a_stage
-        )
-        ab_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer, self.num_a_stage
-        )
-
-        mma_pipeline = pipeline.PipelineUmmaAsync.create(
-            num_stages=self.num_acc_stage,
-            producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
-            consumer_group=make_thread_cooperative_group(
-                self.threads_per_warp * len(self.epi_warp_ids)
-            ),
-            barrier_storage=storage.mma_mbar_ptr.data_ptr(),
-        )
-        mma_producer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Producer, self.num_acc_stage
-        )
-        mma_consumer_state = pipeline.make_pipeline_state(
-            pipeline.PipelineUserType.Consumer, self.num_acc_stage
-        )
-
-        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
-        if warp_idx == self.empty_warp_ids[0]:
-            with cute.arch.elect_one():
-                cute.arch.mbarrier_init(
-                    tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
-                )
-                cute.arch.mbarrier_init_fence()
-
-        # -------- SMEM partition ------------ #
-        # swizzle o [(tileM, tileK), loopM, loopK, Stage]
-        sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
-        # swizzle o [(tileN, tileK), loopN, loopK, stage]
-        sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
-
-        # FIXME: if 2 CTAs, modify here
-        thr_mma = tiled_mma.get_slice(0)
-        # [MMA, loopM, loopK, stage]
-        tCsA = thr_mma.make_fragment_A(sA)
-        # [MMA, loopN, loopK, stage]
-        tCsB = thr_mma.make_fragment_B(sB)
-
-        # ---------- GMEM partition ----------- #
-        # [tileM, tileK, loopK]
-        gA = cute.local_tile(mA, (self.mma_tiler[0], self.mma_tiler[2]), (pidm, None))
-
-        # [vocab_size_per_split, dim]
-        mB_n = cute.local_tile(
-            mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (pidn, 0)
-        )
-
-        # [tileN, tileK, loopN, loopK]
-        gB = cute.local_tile(mB_n, (self.mma_tiler[1], self.mma_tiler[2]), (None, None))
-
-        # [MMA, tileCntM, tileCntK, loopK]
-        tCgA = thr_mma.partition_A(gA)
-        # [MMA, tileCntN, tileCntK, loopN, loopK]
-        tCgB = thr_mma.partition_B(gB)
-
-        a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
-        # FIXME: if 2 CTAs, modify here
-        cta_rank_in_cluster = 0
-        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
-        tTMAsA, tTMAgA = cpasync.tma_partition(
-            tma_atom_a,
-            block_in_cluster_coord_vmnk[2],  # cta_coord,
-            a_cta_layout,
-            cute.group_modes(sA, 0, 3),  # SMEM tensor
-            cute.group_modes(tCgA, 0, 3),  # GMEM tensor
-        )
-        b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
-        tTMAsB, tTMAgB = cpasync.tma_partition(
-            tma_atom_b,
-            block_in_cluster_coord_vmnk[1],  # cta_coord
-            b_cta_layout,
-            cute.group_modes(sB, 0, 3),
-            cute.group_modes(tCgB, 0, 3),
-        )
-
-        # Allocate TMEM
-        tmem_holding_buf = storage.tmem_holding_buf
-        if warp_idx == self.empty_warp_ids[0]:
-            cute.arch.alloc_tmem(
-                self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
+
+        def __init__(
+            self,
+            acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+            use_2cta_instrs: bool = False,
+            mma_tiler_mn: Tuple[int, int] = (128, 256),
+            vocab_per_split: int = 512,
+        ):
+            """
+            Configuration including:
+                - MMA instruction settings
+                - Cluster Shape
+            """
+            self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
+            self.use_2cta_instrs = use_2cta_instrs
+            # This is the shape covered by tiledMMA, not just single MMA instruction
+            self.mma_tiler = (*mma_tiler_mn, 1)
+            self.cta_tiler = (self.mma_tiler[0], vocab_per_split, self.mma_tiler[2])
+            self.vocab_per_split = vocab_per_split
+
+            self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+            self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+            self.occupancy = 1
+            # query SMEM capacity
+            self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+            # the maximum columns per MMA is 256, and there is only one GEMM, so we can fully
+            # assign TMEM for that GEMM of different tiles.
+            # so 512 = 2 * 256
+
+            self.threads_per_warp: int = 32
+            # 1 warp for loading, 1 warp for issuing MMA, 1 WG for storing
+            self.epi_warp_ids = (0, 1, 2, 3)
+            self.load_warp_ids = 4
+            self.mma_warp_ids = 5
+            self.empty_warp_ids = (6, 7)
+
+            self.threads_per_cta: int = self.threads_per_warp * len(
+                (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
             )
-        self.cta_sync_barrier.arrive_and_wait()
-        tmem_ptr = cute.arch.retrieve_tmem_ptr(
-            self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
-        )
-
-        # [(tileM, tileN), loopM, loopN]
-        tmem_shape = (128, self.tmem_alloc_cols)
-        acc_shape = thr_mma.partition_shape_C(tmem_shape)
-        tCtC_fake = thr_mma.make_fragment_C(acc_shape)
-        tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
-
-        block_vocab_left_idx: cutlass.Int64 = pidn * self.vocab_per_split
-        block_vocab_right_idx: cutlass.Int64 = min(
-            (pidn + 1) * self.vocab_per_split, problem_mnk[1]
-        )
-        num_n_tiles: cutlass.Int64 = cute.ceil_div(
-            (block_vocab_right_idx - block_vocab_left_idx), self.mma_tiler[1]
-        )
-
-        # ///////
-        # empty
-        # ///////
-        if warp_idx in self.empty_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-        # ///////
-        # load
-        # ///////
-        if warp_idx == self.load_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-            for n in cutlass.range(num_n_tiles):
-                for k in cutlass.range(cute.size(gA, mode=[2])):
-                    ab_pipeline.producer_acquire(ab_producer_state)
-                    cute.copy(
-                        tma_atom_a,
-                        tTMAgA[(None, k)],
-                        tTMAsA[(None, ab_producer_state.index)],
-                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
-                    )
-                    cute.copy(
-                        tma_atom_b,
-                        tTMAgB[(None, n, k)],
-                        tTMAsB[(None, ab_producer_state.index)],
-                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+
+            self.cta_sync_barrier = pipeline.NamedBarrier(
+                barrier_id=1, num_threads=self.threads_per_cta
+            )
+            self.tmem_alloc_barrier = pipeline.NamedBarrier(
+                barrier_id=2, num_threads=self.threads_per_cta
+            )
+
+            self.buffer_align_bytes: int = 1024
+            self.num_regs_other: int = 32
+            self.num_regs_epi: int = 192
+
+        def _compute_stages(
+            self,
+            tiled_mma: cute.TiledMma,
+            mma_tiler: Tuple[int, int, int],
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
+                tiled_mma, mma_tiler, a_dtype, 1  # only single stage
+            )
+            b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(tiled_mma, mma_tiler, b_dtype, 1)
+            a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
+            b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one)
+            num_acc_stage = 2
+            num_a_stage = 4
+            num_b_stage = 4
+            num_epi_stage_per_tile = 4
+
+            return num_acc_stage, num_a_stage, num_b_stage, num_epi_stage_per_tile
+
+        def _setup_attributes(
+            self,
+            tiled_mma: cute.TiledMma,
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+            self.cluster_layout_vmnk = cute.tiled_divide(
+                cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
+            )
+
+            # this is fixed for dense MMA, k=16
+            mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+            # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
+            mma_inst_tile_k: int = 4
+            self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
+
+            self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = (
+                self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+            )
+            self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+            assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
+
+            self.cta_tile_shape_mnk = (
+                self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+                self.mma_tiler[1],
+                self.mma_tiler[2],
+            )
+
+        @cute.kernel
+        def kernel(
+            self,
+            tiled_mma: cute.TiledMma,
+            tma_atom_a: cute.CopyAtom,
+            mA: cute.Tensor,
+            tma_atom_b: cute.CopyAtom,
+            mB: cute.Tensor,
+            mLabels: cute.Tensor,
+            mMax: cute.Tensor,
+            mAccu: cute.Tensor,
+            mLogprobs: cute.Tensor,
+            a_smem_layout_staged: cute.ComposedLayout,
+            b_smem_layout_staged: cute.ComposedLayout,
+            cluster_layout_vmnk: cute.Layout,
+            problem_mnk: Tuple[int, int, int],
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+        ):
+            """
+            The forward kernel for the mainloop.
+            """
+            warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+            tidx, _, _ = cute.arch.thread_idx()
+            bidx, bidy, _ = cute.arch.block_idx()
+            # FIXME: block swizzling applied here
+            pidm, pidn = bidx, bidy
+
+            # prefetch tma descriptors
+            if warp_idx == self.load_warp_ids:
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+            # declare SMEM
+            smem = utils.SmemAllocator()
+            storage = smem.allocate(self.shared_storage)
+
+            ab_pipeline = pipeline.PipelineTmaUmma.create(
+                num_stages=self.num_a_stage,
+                producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+                consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes,
+                barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
+            )
+            ab_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_a_stage
+            )
+            ab_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_a_stage
+            )
+
+            mma_pipeline = pipeline.PipelineUmmaAsync.create(
+                num_stages=self.num_acc_stage,
+                producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                consumer_group=make_thread_cooperative_group(
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                ),
+                barrier_storage=storage.mma_mbar_ptr.data_ptr(),
+            )
+            mma_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_acc_stage
+            )
+            mma_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_acc_stage
+            )
+
+            tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+            if warp_idx == self.empty_warp_ids[0]:
+                with cute.arch.elect_one():
+                    cute.arch.mbarrier_init(
+                        tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
                     )
-                    ab_pipeline.producer_commit(ab_producer_state)
-                    ab_producer_state.advance()
-
-        # ///////
-        # mma
-        # ///////
-        if warp_idx == self.mma_warp_ids:
-            cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
-
-            for n in cutlass.range(num_n_tiles):
-                # disable accumulate for the first tile
-                tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
-                mma_pipeline.producer_acquire(mma_producer_state)
-
-                for k in cutlass.range(cute.size(gA, mode=[2])):
-                    ab_pipeline.consumer_wait(ab_consumer_state)
-
-                    for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
-                        cute.gemm(
-                            tiled_mma,
-                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
-                            tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
-                            tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
-                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
-                        )
-                        # enable accumulate for the next tile
-                        tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
-
-                    ab_pipeline.consumer_release(ab_consumer_state)
-                    ab_consumer_state.advance()
-
-                mma_pipeline.producer_commit(mma_producer_state)
-                mma_producer_state.advance()
-
-        # //////////
-        # epilogue
-        # //////////
-        if warp_idx in self.epi_warp_ids:
-            cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
-
-            # epilog TMEM copy and partition
-            copy_atom_t2r = sm100_utils.get_tmem_load_op(
-                self.cta_tile_shape_mnk,
-                utils.LayoutEnum.ROW_MAJOR,  # This is hard-coded
-                self.acc_dtype,
-                self.acc_dtype,
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
-                self.use_2cta_instrs,
+                    cute.arch.mbarrier_init_fence()
+
+            # -------- SMEM partition ------------ #
+            # swizzle o [(tileM, tileK), loopM, loopK, Stage]
+            sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
+            # swizzle o [(tileN, tileK), loopN, loopK, stage]
+            sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+
+            # FIXME: if 2 CTAs, modify here
+            thr_mma = tiled_mma.get_slice(0)
+            # [MMA, loopM, loopK, stage]
+            tCsA = thr_mma.make_fragment_A(sA)
+            # [MMA, loopN, loopK, stage]
+            tCsB = thr_mma.make_fragment_B(sB)
+
+            # ---------- GMEM partition ----------- #
+            # [tileM, tileK, loopK]
+            gA = cute.local_tile(mA, (self.mma_tiler[0], self.mma_tiler[2]), (pidm, None))
+
+            # [vocab_size_per_split, dim]
+            mB_n = cute.local_tile(
+                mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (pidn, 0)
             )
-            # [tileM, subTileN, loopM, CntSubTileN, loopN]
-            tAcc_epi = cute.flat_divide(
-                tCtC[((None, None), 0, None)],
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+
+            # [tileN, tileK, loopN, loopK]
+            gB = cute.local_tile(mB_n, (self.mma_tiler[1], self.mma_tiler[2]), (None, None))
+
+            # [MMA, tileCntM, tileCntK, loopK]
+            tCgA = thr_mma.partition_A(gA)
+            # [MMA, tileCntN, tileCntK, loopN, loopK]
+            tCgB = thr_mma.partition_B(gB)
+
+            a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
+            # FIXME: if 2 CTAs, modify here
+            cta_rank_in_cluster = 0
+            block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
+            tTMAsA, tTMAgA = cpasync.tma_partition(
+                tma_atom_a,
+                block_in_cluster_coord_vmnk[2],  # cta_coord,
+                a_cta_layout,
+                cute.group_modes(sA, 0, 3),  # SMEM tensor
+                cute.group_modes(tCgA, 0, 3),  # GMEM tensor
             )
-            tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
-            thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
-            tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
-            # [(pattern), loopM, loopN, CntTileM, CntTileN]
-            tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
-
-            cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
-            tCcAcc = thr_mma.partition_C(cAcc)
-            # [tileM, subTileN, loopM, CntSubTileN, CntTileN]
-            tCcAcc_epi = cute.flat_divide(
-                tCcAcc[((None, None), 0, None)],
-                (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+            b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
+            tTMAsB, tTMAgB = cpasync.tma_partition(
+                tma_atom_b,
+                block_in_cluster_coord_vmnk[1],  # cta_coord
+                b_cta_layout,
+                cute.group_modes(sB, 0, 3),
+                cute.group_modes(tCgB, 0, 3),
             )
-            tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
-            tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
-
-            # epilogue layouts
-            epilogue_thread_layout = cute.make_layout((128, 1))
-            copy_atom_g2r = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mLabels.element_type)
-            tiled_copy_g2r = cute.make_tiled_copy(copy_atom_g2r, epilogue_thread_layout, (128, 1))
-            thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
-
-            copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32)
-            tiled_copy_r2g = cute.make_tiled_copy(copy_atom_r2g, epilogue_thread_layout, (128, 1))
-            thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
-
-            # auxiliary tensors
-            # [tileM]
-            gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
-
-            tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)]
-            tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean)
-            # [(1, 1), 1]
-            tLabelsCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, problem_mnk[0])
-            # to align shape with gMax and gAccu
-            tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask)
-
-            # [(1, 1), 1, 1]
-            tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
-            tLabelsrLabels = cute.make_fragment(tLabelsgLabels.shape, tLabelsgLabels.element_type)
-            cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask)
-            valid_mask: cutlass.Boolean = (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[
-                0
-            ]
-
-            # [tileM, 1]
-            gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn))
-            # [(CPYM, CPYN), loopM, loopN]
-            tR2GgMax = thr_copy_r2g.partition_D(gMax)
-            tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type)
-            tR2GrMax.fill(-1e30)
-
-            # [tileM, 1]
-            gAccu = cute.local_tile(mAccu, (self.epi_tile[0], 1), (pidm, pidn))
-            # [(CPYM, CPYN), loopM, loopN]
-            tR2GgAccu = thr_copy_r2g.partition_D(gAccu)
-            tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type)
-            tR2GrAccu.fill(0.0)
-
-            # [tileM, 1]
-            gLogprobs = cute.append_ones(cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,)))
-            # [(CPYM, CPYN), loopM, loopN]
-            tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
-            tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
-            tR2GrLogprobs.fill(0.0)
-
-            # [(tileN // num_epi_stage_per_tile, 1), 1, 1]
-            tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
-
-            for n in cutlass.range(num_n_tiles):
-                mma_pipeline.consumer_wait(mma_consumer_state)
-
-                left: cutlass.Int64 = block_vocab_left_idx + n * self.epi_tile[1]
-                right: cutlass.Int64 = min(
-                    (n + 1) * self.epi_tile[1] + block_vocab_left_idx, block_vocab_right_idx
+
+            # Allocate TMEM
+            tmem_holding_buf = storage.tmem_holding_buf
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.alloc_tmem(
+                    self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
                 )
-                num_n_subtiles: cutlass.Int64 = cute.ceil_div(
-                    (right - left), cute.size(tTMEM_load_rAcc, mode=[0])
+            self.cta_sync_barrier.arrive_and_wait()
+            tmem_ptr = cute.arch.retrieve_tmem_ptr(
+                self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
+            )
+
+            # [(tileM, tileN), loopM, loopN]
+            tmem_shape = (128, self.tmem_alloc_cols)
+            acc_shape = thr_mma.partition_shape_C(tmem_shape)
+            tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+            tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
+
+            block_vocab_left_idx: cutlass.Int64 = pidn * self.vocab_per_split
+            block_vocab_right_idx: cutlass.Int64 = min(
+                (pidn + 1) * self.vocab_per_split, problem_mnk[1]
+            )
+            num_n_tiles: cutlass.Int64 = cute.ceil_div(
+                (block_vocab_right_idx - block_vocab_left_idx), self.mma_tiler[1]
+            )
+
+            # ///////
+            # empty
+            # ///////
+            if warp_idx in self.empty_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            # ///////
+            # load
+            # ///////
+            if warp_idx == self.load_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for n in cutlass.range(num_n_tiles):
+                    for k in cutlass.range(cute.size(gA, mode=[2])):
+                        ab_pipeline.producer_acquire(ab_producer_state)
+                        cute.copy(
+                            tma_atom_a,
+                            tTMAgA[(None, k)],
+                            tTMAsA[(None, ab_producer_state.index)],
+                            tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                        )
+                        cute.copy(
+                            tma_atom_b,
+                            tTMAgB[(None, n, k)],
+                            tTMAsB[(None, ab_producer_state.index)],
+                            tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                        )
+                        ab_pipeline.producer_commit(ab_producer_state)
+                        ab_producer_state.advance()
+
+            # ///////
+            # mma
+            # ///////
+            if warp_idx == self.mma_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for n in cutlass.range(num_n_tiles):
+                    # disable accumulate for the first tile
+                    tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+                    mma_pipeline.producer_acquire(mma_producer_state)
+
+                    for k in cutlass.range(cute.size(gA, mode=[2])):
+                        ab_pipeline.consumer_wait(ab_consumer_state)
+
+                        for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                            cute.gemm(
+                                tiled_mma,
+                                cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                                tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                                tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                                cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                            )
+                            # enable accumulate for the next tile
+                            tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                        ab_pipeline.consumer_release(ab_consumer_state)
+                        ab_consumer_state.advance()
+
+                    mma_pipeline.producer_commit(mma_producer_state)
+                    mma_producer_state.advance()
+
+            # //////////
+            # epilogue
+            # //////////
+            if warp_idx in self.epi_warp_ids:
+                cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+                # epilog TMEM copy and partition
+                copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                    self.cta_tile_shape_mnk,
+                    utils.LayoutEnum.ROW_MAJOR,  # This is hard-coded
+                    self.acc_dtype,
+                    self.acc_dtype,
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                    self.use_2cta_instrs,
+                )
+                # [tileM, subTileN, loopM, CntSubTileN, loopN]
+                tAcc_epi = cute.flat_divide(
+                    tCtC[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
                 )
-                for n_subtile in cutlass.range(num_n_subtiles):
-                    cute.copy(
-                        tiled_copy_t2r,
-                        tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
-                        tTMEM_load_rAcc,
+                tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
+                thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+                tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+                # [(pattern), loopM, loopN, CntTileM, CntTileN]
+                tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
+
+                cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+                tCcAcc = thr_mma.partition_C(cAcc)
+                # [tileM, subTileN, loopM, CntSubTileN, CntTileN]
+                tCcAcc_epi = cute.flat_divide(
+                    tCcAcc[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+                tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
+
+                # epilogue layouts
+                epilogue_thread_layout = cute.make_layout((128, 1))
+                copy_atom_g2r = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mLabels.element_type)
+                tiled_copy_g2r = cute.make_tiled_copy(copy_atom_g2r, epilogue_thread_layout, (128, 1))
+                thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
+
+                copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32)
+                tiled_copy_r2g = cute.make_tiled_copy(copy_atom_r2g, epilogue_thread_layout, (128, 1))
+                thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+
+                # auxiliary tensors
+                # [tileM]
+                gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
+
+                tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)]
+                tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean)
+                # [(1, 1), 1]
+                tLabelsCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, problem_mnk[0])
+                # to align shape with gMax and gAccu
+                tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask)
+
+                # [(1, 1), 1, 1]
+                tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
+                tLabelsrLabels = cute.make_fragment(tLabelsgLabels.shape, tLabelsgLabels.element_type)
+                cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask)
+                valid_mask: cutlass.Boolean = (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[
+                    0
+                ]
+
+                # [tileM, 1]
+                gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn))
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgMax = thr_copy_r2g.partition_D(gMax)
+                tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type)
+                tR2GrMax.fill(-1e30)
+
+                # [tileM, 1]
+                gAccu = cute.local_tile(mAccu, (self.epi_tile[0], 1), (pidm, pidn))
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgAccu = thr_copy_r2g.partition_D(gAccu)
+                tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type)
+                tR2GrAccu.fill(0.0)
+
+                # [tileM, 1]
+                gLogprobs = cute.append_ones(cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,)))
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
+                tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
+                tR2GrLogprobs.fill(0.0)
+
+                # [(tileN // num_epi_stage_per_tile, 1), 1, 1]
+                tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
+
+                for n in cutlass.range(num_n_tiles):
+                    mma_pipeline.consumer_wait(mma_consumer_state)
+
+                    left: cutlass.Int64 = block_vocab_left_idx + n * self.epi_tile[1]
+                    right: cutlass.Int64 = min(
+                        (n + 1) * self.epi_tile[1] + block_vocab_left_idx, block_vocab_right_idx
                     )
-
-                    for idx in cutlass.range(
-                        cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
-                    ):
-                        local_position: cutlass.Int64 = (
-                            n * self.epi_tile[1]
-                            + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
-                            + idx
+                    num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                        (right - left), cute.size(tTMEM_load_rAcc, mode=[0])
+                    )
+                    for n_subtile in cutlass.range(num_n_subtiles):
+                        cute.copy(
+                            tiled_copy_t2r,
+                            tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                            tTMEM_load_rAcc,
                         )
-                        if (block_vocab_left_idx + local_position) < block_vocab_right_idx:
-                            _max_old = tR2GrMax[0]
-                            tR2GrMax[0] = cute.arch.fmax(tR2GrMax[0], tTMEM_load_rAcc[idx])
-                            exp_logits = cute.exp(tTMEM_load_rAcc[idx] - tR2GrMax[0])
-                            coeff = cute.exp(_max_old - tR2GrMax[0])
-                            tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
-
-                            position: cutlass.Int64 = (
-                                rank * problem_mnk[1] + pidn * self.vocab_per_split + local_position
+
+                        for idx in cutlass.range(
+                            cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
+                        ):
+                            local_position: cutlass.Int64 = (
+                                n * self.epi_tile[1]
+                                + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                                + idx
                             )
-                            mask: cutlass.Boolean = valid_mask and (position == tLabelsrLabels[0])
-                            tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx]
+                            if (block_vocab_left_idx + local_position) < block_vocab_right_idx:
+                                _max_old = tR2GrMax[0]
+                                tR2GrMax[0] = cute.arch.fmax(tR2GrMax[0], tTMEM_load_rAcc[idx])
+                                exp_logits = cute.exp(tTMEM_load_rAcc[idx] - tR2GrMax[0])
+                                coeff = cute.exp(_max_old - tR2GrMax[0])
+                                tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
+
+                                position: cutlass.Int64 = (
+                                    rank * problem_mnk[1] + pidn * self.vocab_per_split + local_position
+                                )
+                                mask: cutlass.Boolean = valid_mask and (position == tLabelsrLabels[0])
+                                tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx]
+
+                    mma_pipeline.consumer_release(mma_consumer_state)
+                    mma_consumer_state.advance()
+
+                cute.copy(tiled_copy_r2g, tR2GrMax, tR2GgMax, pred=tLabelsCAcc_mask)
+                cute.copy(tiled_copy_r2g, tR2GrAccu, tR2GgAccu, pred=tLabelsCAcc_mask)
+
+                vocab_left_idx: cutlass.Int64 = rank * problem_mnk[1] + pidn * self.vocab_per_split
+                vocab_right_idx: cutlass.Int64 = rank * problem_mnk[1] + min(
+                    (pidn + 1) * self.vocab_per_split, problem_mnk[1]
+                )
+                valid: cutlass.Boolean = (
+                    tLabelsrLabels[0] >= vocab_left_idx and tLabelsrLabels[0] < vocab_right_idx
+                )
+                tLabelsCAcc_mask[0] &= valid
 
-                mma_pipeline.consumer_release(mma_consumer_state)
-                mma_consumer_state.advance()
+                cute.copy(tiled_copy_r2g, tR2GrLogprobs, tR2GgLogprobs, pred=tLabelsCAcc_mask)
 
-            cute.copy(tiled_copy_r2g, tR2GrMax, tR2GgMax, pred=tLabelsCAcc_mask)
-            cute.copy(tiled_copy_r2g, tR2GrAccu, tR2GgAccu, pred=tLabelsCAcc_mask)
+            # Dealloc TMEM
+            self.cta_sync_barrier.arrive_and_wait()
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.relinquish_tmem_alloc_permit()
+                cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
 
-            vocab_left_idx: cutlass.Int64 = rank * problem_mnk[1] + pidn * self.vocab_per_split
-            vocab_right_idx: cutlass.Int64 = rank * problem_mnk[1] + min(
-                (pidn + 1) * self.vocab_per_split, problem_mnk[1]
+        @staticmethod
+        def _compute_grid(
+            problem_mnk: Tuple[int, int, int],
+            cluster_shape_mn: Tuple[int, int],
+            cta_tiler: Tuple[int, int, int],
+            num_splits: int,
+        ) -> Tuple[int, int, int]:
+
+            cluster_shape = (*cluster_shape_mn, 1)
+
+            grid = cute.round_up(
+                (cute.ceil_div(problem_mnk[0], cta_tiler[0]), num_splits, 1), cluster_shape
             )
-            valid: cutlass.Boolean = (
-                tLabelsrLabels[0] >= vocab_left_idx and tLabelsrLabels[0] < vocab_right_idx
+            return grid
+
+        @cute.jit
+        def __call__(
+            self,
+            hidden: cute.Tensor,
+            weight: cute.Tensor,
+            labels: cute.Tensor,
+            _logprobs: cute.Tensor,
+            _max: cute.Tensor,
+            _accu: cute.Tensor,
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+            stream: cuda.CUstream,
+        ) -> None:
+            a_dtype: Type[cutlass.Numeric] = hidden.element_type
+            b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+            if cutlass.const_expr(hidden.element_type != weight.element_type):
+                raise RuntimeError(
+                    f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+                )
+            if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+                raise RuntimeError("hidden can only be FP16 or BF16")
+            if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+                raise RuntimeError("K dimension doesn't match")
+
+            problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+                raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+
+            num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split)
+
+            grid = self._compute_grid(
+                problem_mnk=problem_mnk,
+                cluster_shape_mn=self.cluster_shape_mn,
+                cta_tiler=self.cta_tiler,
+                num_splits=num_splits,
             )
-            tLabelsCAcc_mask[0] &= valid
-
-            cute.copy(tiled_copy_r2g, tR2GrLogprobs, tR2GgLogprobs, pred=tLabelsCAcc_mask)
-
-        # Dealloc TMEM
-        self.cta_sync_barrier.arrive_and_wait()
-        if warp_idx == self.empty_warp_ids[0]:
-            cute.arch.relinquish_tmem_alloc_permit()
-            cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
-
-    @staticmethod
-    def _compute_grid(
-        problem_mnk: Tuple[int, int, int],
-        cluster_shape_mn: Tuple[int, int],
-        cta_tiler: Tuple[int, int, int],
-        num_splits: int,
-    ) -> Tuple[int, int, int]:
-
-        cluster_shape = (*cluster_shape_mn, 1)
-
-        grid = cute.round_up(
-            (cute.ceil_div(problem_mnk[0], cta_tiler[0]), num_splits, 1), cluster_shape
-        )
-        return grid
-
-    @cute.jit
-    def __call__(
-        self,
-        hidden: cute.Tensor,
-        weight: cute.Tensor,
-        labels: cute.Tensor,
-        _logprobs: cute.Tensor,
-        _max: cute.Tensor,
-        _accu: cute.Tensor,
-        ignore_index: cutlass.Int64,
-        rank: cutlass.Int32,
-        stream: cuda.CUstream,
-    ) -> None:
-        a_dtype: Type[cutlass.Numeric] = hidden.element_type
-        b_dtype: Type[cutlass.Numeric] = weight.element_type
-
-        if cutlass.const_expr(hidden.element_type != weight.element_type):
-            raise RuntimeError(
-                f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+            a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+            b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+
+            tiled_mma = sm100_utils.make_trivial_tiled_mma(
+                a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
             )
-        if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
-            raise RuntimeError("hidden can only be FP16 or BF16")
-        if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
-            raise RuntimeError("K dimension doesn't match")
-
-        problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
-        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
-            raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
-
-        num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split)
-
-        grid = self._compute_grid(
-            problem_mnk=problem_mnk,
-            cluster_shape_mn=self.cluster_shape_mn,
-            cta_tiler=self.cta_tiler,
-            num_splits=num_splits,
-        )
-        a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
-        b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
-
-        tiled_mma = sm100_utils.make_trivial_tiled_mma(
-            a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
-        )
-
-        self._setup_attributes(tiled_mma, a_dtype, b_dtype)
-        if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 128 != 0):
-            raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}")
-
-        self.epi_tile = self.mma_tiler[:2]
-
-        # Swizzle o [(tileM, tileK), loopM, loopK, stage]
-        a_smem_layout_staged = sm100_utils.make_smem_layout_a(
-            tiled_mma, self.mma_tiler, a_dtype, self.num_a_stage
-        )
-        # Swizzle o [(tileN, tileK), loopN, loopK, stage]
-        b_smem_layout_staged = sm100_utils.make_smem_layout_b(
-            tiled_mma, self.mma_tiler, b_dtype, self.num_b_stage
-        )
-
-        # TMA loading
-        tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
-        tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
-
-        # Swizzle o [(tileM, tileK), loopM, loopK]
-        a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
-        # create tma copy atom for hidden,
-        # and the cooresponding tma descriptor tensor
-        tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A(
-            tma_load_op,
-            hidden,  # gmem_tensor
-            a_smem_layout,  # SMEM layout
-            self.mma_tiler,  # MMA tiler
-            tiled_mma,  # TiledMMA
-            self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
-        )
-        # Swizzle o [(tileN, tileK), loopN, loopK]
-        b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
-        tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B(
-            tma_load_op,
-            weight,  # gmem_tensor
-            b_smem_layout,  # SMEM layout
-            self.mma_tiler,  # MMA tiler
-            tiled_mma,  # TiledMMA
-            self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
-        )
-        a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
-        b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
-        self.tma_copy_a_bytes = a_copy_size
-        self.tma_copy_b_bytes = b_copy_size
-
-        assert self.num_a_stage == self.num_b_stage
-
-        @cute.struct
-        class SharedStorage:
-            """
-            The shared storage for the forward kernel.
-            """
 
-            # pipeline barriers, 2 = producer + consumer
-            load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2]
-            mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
-            tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
-            # tmem holding buffer
-            tmem_holding_buf: cutlass.Int32
-            # SMEM tensors
-            sA: cute.struct.Align[
-                cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
-                self.buffer_align_bytes,
-            ]
-            sB: cute.struct.Align[
-                cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
-                self.buffer_align_bytes,
-            ]
-
-        self.shared_storage = SharedStorage
-
-        # launch kernel
-        self.kernel(
-            tiled_mma,
-            tma_atom_a,
-            tma_desc_a,
-            tma_atom_b,
-            tma_desc_b,
-            labels,
-            _max,
-            _accu,
-            _logprobs,
-            a_smem_layout_staged,
-            b_smem_layout_staged,
-            self.cluster_layout_vmnk,
-            problem_mnk,
-            ignore_index,
-            rank,
-        ).launch(
-            grid=grid,
-            block=[self.threads_per_cta, 1, 1],
-            cluster=self.cluster_shape_mnk,
-            stream=stream,
-        )
-        return None
+            self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 128 != 0):
+                raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}")
+
+            self.epi_tile = self.mma_tiler[:2]
+
+            # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+            a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+                tiled_mma, self.mma_tiler, a_dtype, self.num_a_stage
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+            b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+                tiled_mma, self.mma_tiler, b_dtype, self.num_b_stage
+            )
+
+            # TMA loading
+            tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+            tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+            # Swizzle o [(tileM, tileK), loopM, loopK]
+            a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
+            # create tma copy atom for hidden,
+            # and the cooresponding tma descriptor tensor
+            tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A(
+                tma_load_op,
+                hidden,  # gmem_tensor
+                a_smem_layout,  # SMEM layout
+                self.mma_tiler,  # MMA tiler
+                tiled_mma,  # TiledMMA
+                self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK]
+            b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B(
+                tma_load_op,
+                weight,  # gmem_tensor
+                b_smem_layout,  # SMEM layout
+                self.mma_tiler,  # MMA tiler
+                tiled_mma,  # TiledMMA
+                self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
+            )
+            a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+            b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+            self.tma_copy_a_bytes = a_copy_size
+            self.tma_copy_b_bytes = b_copy_size
+
+            assert self.num_a_stage == self.num_b_stage
+
+            @cute.struct
+            class SharedStorage:
+                """
+                The shared storage for the forward kernel.
+                """
+
+                # pipeline barriers, 2 = producer + consumer
+                load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2]
+                mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+                tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+                # tmem holding buffer
+                tmem_holding_buf: cutlass.Int32
+                # SMEM tensors
+                sA: cute.struct.Align[
+                    cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+                sB: cute.struct.Align[
+                    cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+
+            self.shared_storage = SharedStorage
+
+            # launch kernel
+            self.kernel(
+                tiled_mma,
+                tma_atom_a,
+                tma_desc_a,
+                tma_atom_b,
+                tma_desc_b,
+                labels,
+                _max,
+                _accu,
+                _logprobs,
+                a_smem_layout_staged,
+                b_smem_layout_staged,
+                self.cluster_layout_vmnk,
+                problem_mnk,
+                ignore_index,
+                rank,
+            ).launch(
+                grid=grid,
+                block=[self.threads_per_cta, 1, 1],
+                cluster=self.cluster_shape_mnk,
+                stream=stream,
+            )
+            return None
+except ImportError:
+    pass

From 48c52289bf229f1ec6dce11e621d6f1851c55f4d Mon Sep 17 00:00:00 2001
From: Jianbin Chang <shjwudp@gmail.com>
Date: Tue, 2 Dec 2025 17:35:58 +0800
Subject: [PATCH 17/17] Update Dev Branch & Fix CI (#19)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [DEV] pull main Nov 25 (#2395)

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
Signed-off-by: Ananth Subramaniam <ansubramania@nvidia.com>
Signed-off-by: dimapihtar <dpihtar@gmail.com>
Signed-off-by: Youngeun Kwon <youngeunk@nvidia.com>
Signed-off-by: Youngeun <kyeg9404@gmail.com>
Signed-off-by: Maanu Grover <maanug@nvidia.com>
Signed-off-by: ykarnati <ykarnati@nvidia.com>
Signed-off-by: Deepak Narayanan <dnarayanan@nvidia.com>
Signed-off-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
Signed-off-by: Xin Yao <xiny@nvidia.com>
Signed-off-by: Keshav Santhanam <ksanthanam@nvidia.com>
Signed-off-by: Pablo Garay <pagaray@nvidia.com>
Signed-off-by: Asha Anoosheh <aanoosheh@nvidia.com>
Signed-off-by: Chen Cui <chcui@nvidia.com>
Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Co-authored-by: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com>
Co-authored-by: Jared Casper <155158+jaredcasper@users.noreply.github.com>
Co-authored-by: Antoni-Joan Solergibert <asolergibert@nvidia.com>
Co-authored-by: Jianbin Chang <shjwudp@gmail.com>
Co-authored-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Ananth Subramaniam <ansubramania@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com>
Co-authored-by: Siddharth Singh <136645615+sidsingh-nvidia@users.noreply.github.com>
Co-authored-by: Mcore Bot <mcore-bot@nvidia.com>
Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
Co-authored-by: Youngeun Kwon <youngeunk@nvidia.com>
Co-authored-by: Lawrence McAfee <85179052+lmcafee-nvidia@users.noreply.github.com>
Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com>
Co-authored-by: Lawrence McAfee <lmcafee@nvidia.com>
Co-authored-by: AJ Schmidt <ajschmidt8@users.noreply.github.com>
Co-authored-by: Deepak Narayanan <2724038+deepakn94@users.noreply.github.com>
Co-authored-by: helen ngo <helenn@nvidia.com>
Co-authored-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
Co-authored-by: Robert Kirby <rkirby@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <tene@nvidia.com>
Co-authored-by: yeyu-nvidia <yeyu@nvidia.com>
Co-authored-by: Abhinav Khattar <akhattar@nvidia.com>
Co-authored-by: Roger Waleffe <rwaleffe@nvidia.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
Co-authored-by: Tong Liu <liutongt1998@gmail.com>
Co-authored-by: Zhongbo Zhu <42691305+zhongbozhu@users.noreply.github.com>
Co-authored-by: Xiaowei Ren <xren@nvidia.com>
Co-authored-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <teodord.ene@gmail.com>
Co-authored-by: Zijie Yan <zijiey@nvidia.com>
Co-authored-by: root <root@pool0-01101.cm.cluster>
Co-authored-by: Keshav Santhanam <ksanthanam@nvidia.com>
Co-authored-by: Pablo Garay <pagaray@nvidia.com>
Co-authored-by: Asha Anoosheh <aanoosheh@nvidia.com>
Co-authored-by: Kan Zhu <kanz@nvidia.com>
Co-authored-by: Robert Kirby <rkirby@cw-dfw-cs-001-vscode-01.cm.cluster>
Co-authored-by: Jorge Albericio <jalbericiola@nvidia.com>
Co-authored-by: Jon Barker <19699370+jon-barker@users.noreply.github.com>
Co-authored-by: Chen Cui <chcui@nvidia.com>
Co-authored-by: Pablo Garay <palenq@gmail.com>
Co-authored-by: Tong Liu <tongliu@nvidia.com>

* adding action for checking whether PR author is nvidia employee or not for selecting ephemeral ci hosts (#2402)

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: exit failure when PR author is external contributor removed (#2410)

* fix: adding k8s taints for ephermeral jobs (#2420)

* ci: Enable functional tests (#2419)

Signed-off-by: oliver könig <okoenig@nvidia.com>

* Reapply "build: Upgrade deps (NVIDIA#2289)" (#2408)

Signed-off-by: oliver könig <okoenig@nvidia.com>

* fix: use a script to do node tainting in the cicd workflow (#2421)

* Revert "[DEV] pull main Nov 25 (#2395)"

This reverts commit 56682f80b0db4492afeee013a07187eadfa9dc8f.

Signed-off-by: oliver könig <okoenig@nvidia.com>

* [Dev] Support packed seq in MTP (#2043)

Signed-off-by: Li Tao <lit@nvidia.com>
Signed-off-by: lit <lit@nvidia.com>

* Fix runaway Etpt in straggler detector by resetting FLOPs accumulator (#2128)

Signed-off-by: Santosh Bhavani <santosh.bhavani@live.com>
Co-authored-by: Li Ruixiao <cgruixiao@outlook.com>

* [Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)

Signed-off-by: Robin Zhang <robinz@nvidia.com>

* [dev] DeepSeek V3.2 support (#2154)

Signed-off-by: kunlunl <kunlunl@nvidia.com>

* Revert "[Dev] feat(MoE): Refactor cuda_graph_scope - part2 (#2353)"

This reverts commit 92c8482e6dcd11c3666c61bb8d1f7e8d0730ed13.

* Add logs for missing CUDA and Cute.

* autoformat

---------

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Signed-off-by: oliver könig <okoenig@nvidia.com>
Signed-off-by: Ananth Subramaniam <ansubramania@nvidia.com>
Signed-off-by: dimapihtar <dpihtar@gmail.com>
Signed-off-by: Youngeun Kwon <youngeunk@nvidia.com>
Signed-off-by: Youngeun <kyeg9404@gmail.com>
Signed-off-by: Maanu Grover <maanug@nvidia.com>
Signed-off-by: ykarnati <ykarnati@nvidia.com>
Signed-off-by: Deepak Narayanan <dnarayanan@nvidia.com>
Signed-off-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Zhongbo Zhu <zhongboz@nvidia.com>
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
Signed-off-by: Xin Yao <xiny@nvidia.com>
Signed-off-by: Keshav Santhanam <ksanthanam@nvidia.com>
Signed-off-by: Pablo Garay <pagaray@nvidia.com>
Signed-off-by: Asha Anoosheh <aanoosheh@nvidia.com>
Signed-off-by: Chen Cui <chcui@nvidia.com>
Signed-off-by: Li Tao <lit@nvidia.com>
Signed-off-by: lit <lit@nvidia.com>
Signed-off-by: Santosh Bhavani <santosh.bhavani@live.com>
Signed-off-by: Robin Zhang <robinz@nvidia.com>
Signed-off-by: kunlunl <kunlunl@nvidia.com>
Co-authored-by: Deyu Fu <Deyu.Foo@gmail.com>
Co-authored-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Co-authored-by: Yashaswi Karnati <144376261+yashaswikarnati@users.noreply.github.com>
Co-authored-by: Jared Casper <155158+jaredcasper@users.noreply.github.com>
Co-authored-by: Antoni-Joan Solergibert <asolergibert@nvidia.com>
Co-authored-by: oliver könig <okoenig@nvidia.com>
Co-authored-by: Ananth Subramaniam <ansubramania@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <34819528+tdene@users.noreply.github.com>
Co-authored-by: Siddharth Singh <136645615+sidsingh-nvidia@users.noreply.github.com>
Co-authored-by: Mcore Bot <mcore-bot@nvidia.com>
Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com>
Co-authored-by: Youngeun Kwon <youngeunk@nvidia.com>
Co-authored-by: Lawrence McAfee <85179052+lmcafee-nvidia@users.noreply.github.com>
Co-authored-by: Maanu Grover <109391026+maanug-nv@users.noreply.github.com>
Co-authored-by: Lawrence McAfee <lmcafee@nvidia.com>
Co-authored-by: AJ Schmidt <ajschmidt8@users.noreply.github.com>
Co-authored-by: Deepak Narayanan <2724038+deepakn94@users.noreply.github.com>
Co-authored-by: helen ngo <helenn@nvidia.com>
Co-authored-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
Co-authored-by: Robert Kirby <rkirby@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <tene@nvidia.com>
Co-authored-by: yeyu-nvidia <yeyu@nvidia.com>
Co-authored-by: Abhinav Khattar <akhattar@nvidia.com>
Co-authored-by: Roger Waleffe <rwaleffe@nvidia.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
Co-authored-by: Tong Liu <liutongt1998@gmail.com>
Co-authored-by: Zhongbo Zhu <42691305+zhongbozhu@users.noreply.github.com>
Co-authored-by: Xiaowei Ren <xren@nvidia.com>
Co-authored-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Teodor-Dumitru Ene <teodord.ene@gmail.com>
Co-authored-by: Zijie Yan <zijiey@nvidia.com>
Co-authored-by: root <root@pool0-01101.cm.cluster>
Co-authored-by: Keshav Santhanam <ksanthanam@nvidia.com>
Co-authored-by: Pablo Garay <pagaray@nvidia.com>
Co-authored-by: Asha Anoosheh <aanoosheh@nvidia.com>
Co-authored-by: Kan Zhu <kanz@nvidia.com>
Co-authored-by: Robert Kirby <rkirby@cw-dfw-cs-001-vscode-01.cm.cluster>
Co-authored-by: Jorge Albericio <jalbericiola@nvidia.com>
Co-authored-by: Jon Barker <19699370+jon-barker@users.noreply.github.com>
Co-authored-by: Chen Cui <chcui@nvidia.com>
Co-authored-by: Pablo Garay <palenq@gmail.com>
Co-authored-by: Tong Liu <tongliu@nvidia.com>
Co-authored-by: Michael Wojcikiewicz <mwojcikiewic@nvidia.com>
Co-authored-by: Li Tao <lit@nvidia.com>
Co-authored-by: Santosh Bhavani <santosh.bhavani@live.com>
Co-authored-by: Li Ruixiao <cgruixiao@outlook.com>
Co-authored-by: Robin Zhang <robinz@nvidia.com>
Co-authored-by: Kunlun Li <94586211+kunlunl@users.noreply.github.com>
---
 .github/actions/action.yml                    |   57 +-
 .../check-nvidia-sso-membership/action.yml    |  139 +
 .github/workflows/cicd-main.yml               |  113 +-
 .gitlab/scripts/build.sh                      |    5 +-
 docker/Dockerfile.ci.dev                      |    1 +
 gpt_builders.py                               |    7 +-
 .../core/dist_checkpointing/exchange_utils.py |    2 +-
 megatron/core/dist_checkpointing/mapping.py   |    2 +-
 .../core/dist_checkpointing/validation.py     |    2 +-
 .../fusions/fused_linear_cross_entropy.py     |   14 +-
 .../blackwell/bwd_partial_dlogits.py          |   69 +-
 .../linear_cross_entropy/blackwell/entry.py   |  126 +-
 .../blackwell/fwd_mainloop.py                 |   89 +-
 ...rimental_attention_variant_module_specs.py |  132 +
 megatron/core/models/gpt/gpt_layer_specs.py   |   52 +-
 megatron/core/models/gpt/gpt_model.py         |   14 +-
 .../gpt/linear_attention_module_specs.py      |   27 -
 megatron/core/transformer/attention.py        |    1 +
 .../experimental_attention_variant/dsa.py     |  822 +++++
 .../transformer/multi_latent_attention.py     |   87 +-
 .../transformer/multi_token_prediction.py     |  118 +-
 .../core/transformer/transformer_config.py    |   42 +-
 megatron/training/arguments.py                |   35 +-
 megatron/training/training.py                 |   25 +-
 pyproject.toml                                |   35 +-
 .../download_unit_tests_dataset.py            |  205 +-
 ...pt-dynamic-inference-with-coordinator.yaml |    4 +-
 .../recipes/gpt-dynamic-inference.yaml        |    8 +-
 .../recipes/gpt-static-inference.yaml         |   10 +-
 tests/test_utils/recipes/gpt.yaml             |  122 +-
 .../recipes/mamba-static-inference.yaml       |    6 +-
 tests/test_utils/recipes/mamba.yaml           |   10 +-
 .../recipes/moe-dynamic-inference.yaml        |    6 +-
 .../recipes/moe-static-inference.yaml         |    8 +-
 tests/test_utils/recipes/moe.yaml             |   24 +-
 .../test_utils/recipes/multimodal-llava.yaml  |    6 +-
 tests/unit_tests/conftest.py                  |    9 +-
 .../test_fused_linear_cross_entropy.py        |   26 +-
 tests/unit_tests/ssm/test_gated_delta_net.py  |    4 +-
 .../transformer/test_attention_variant_dsa.py | 1271 ++++++++
 .../test_multi_token_prediction.py            |  208 +-
 uv.lock                                       | 2832 ++++++++---------
 42 files changed, 4668 insertions(+), 2107 deletions(-)
 create mode 100644 .github/actions/check-nvidia-sso-membership/action.yml
 create mode 100644 megatron/core/models/gpt/experimental_attention_variant_module_specs.py
 delete mode 100644 megatron/core/models/gpt/linear_attention_module_specs.py
 create mode 100644 megatron/core/transformer/experimental_attention_variant/dsa.py
 create mode 100644 tests/unit_tests/transformer/test_attention_variant_dsa.py

diff --git a/.github/actions/action.yml b/.github/actions/action.yml
index 8c6ca3a6865..5c35385b036 100644
--- a/.github/actions/action.yml
+++ b/.github/actions/action.yml
@@ -11,28 +11,28 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-name: "Test Template"
-description: "Template for running NeMo tests in a containerized environment"
+name: 'Test Template'
+description: 'Template for running NeMo tests in a containerized environment'
 
 inputs:
   container-image:
-    description: "Container image to use for test"
+    description: 'Container image to use for test'
     required: true
   timeout:
-    description: "Max runtime of test in minutes"
+    description: 'Max runtime of test in minutes'
     required: false
-    default: "30"
+    default: '30'
   script:
-    description: "Test script to execute"
+    description: 'Test script to execute'
     required: true
   is-optional:
-    description: "Pass this job on failure."
+    description: 'Pass this job on failure.'
     required: false
-    default: "false"
+    default: 'false'
   is_unit_test:
-    description: "Upload coverage as unit test"
+    description: 'Upload coverage as unit test'
     required: false
-    default: "false"
+    default: 'false'
   tag:
     description: Latest or legacy test suite
     required: true
@@ -43,11 +43,11 @@ inputs:
     description: Model to launch
     required: false
   PAT:
-    description: "GitHub Personal Access Token"
+    description: 'GitHub Personal Access Token'
     required: true
 
 runs:
-  using: "composite"
+  using: 'composite'
   steps:
     - name: Checkout repository
       uses: actions/checkout@v2
@@ -114,6 +114,16 @@ runs:
         HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
         echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
 
+    - name: Has Run functional tests label
+      shell: bash -x -e -u -o pipefail {0}
+      id: has-run-functional-tests-label
+      env:
+        GH_TOKEN: ${{ github.token }}
+      run: |
+        PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
+        HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
+        echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
+
     - name: Create run-script (e2e test)
       shell: bash -x -e -u -o pipefail {0}
       if: inputs.is_unit_test == 'false'
@@ -126,16 +136,19 @@ runs:
         set -euxo pipefail
 
         if [ "${{ steps.has-run-tests-label.outputs.main }}" == "true" ]; then
-            ARGS=(
-              --scope mr-github
-              --enable-lightweight-mode
-            )
-          else
-            ARGS=(
-              --scope mr-slim
-              --enable-lightweight-mode
-            )
-          fi
+          ARGS=(
+            --scope mr-github
+            --enable-lightweight-mode
+          )
+        elif [ "${{ steps.has-run-functional-tests-label.outputs.main }}" == "true" ]; then
+          ARGS=(
+            --scope mr-github
+          )
+        else
+          ARGS=(
+            --scope mr-github-slim
+          )
+        fi
 
         export PYTHONPATH=$(pwd)
         export NEMORUN_HOME=$(pwd)
diff --git a/.github/actions/check-nvidia-sso-membership/action.yml b/.github/actions/check-nvidia-sso-membership/action.yml
new file mode 100644
index 00000000000..71926c4547d
--- /dev/null
+++ b/.github/actions/check-nvidia-sso-membership/action.yml
@@ -0,0 +1,139 @@
+name: 'Check NVIDIA SSO Membership'
+description: 'Check if a GitHub username exists in the NVIDIA SSO users list from github-audits'
+author: 'NVIDIA'
+
+inputs:
+  username:
+    description: 'GitHub username to check'
+    required: true
+  github_audits_repo:
+    description: 'Repository containing SSO users file'
+    required: false
+    default: 'NVIDIA-GitHub-Management/github-audits'
+  github_audits_version:
+    description: 'Release version tag'
+    required: false
+    default: 'v0.1.0'
+  sso_users_filename:
+    description: 'Filename of SSO users JSON'
+    required: false
+    default: 'users_sso.json'
+  github_token:
+    description: 'GitHub token with access to github-audits repo'
+    required: true
+
+outputs:
+  is_member:
+    description: 'Boolean - true if user is in NVIDIA SSO list, false otherwise'
+    value: ${{ steps.check-membership.outputs.is_member }}
+  is_org_member:
+    description: 'Boolean - true if user has NVIDIA or NVIDIA-NeMo in org_roles'
+    value: ${{ steps.check-membership.outputs.is_org_member }}
+  user_orgs:
+    description: 'Comma-separated list of orgs user is member of'
+    value: ${{ steps.check-membership.outputs.user_orgs }}
+  sso_file_available:
+    description: 'Boolean - true if SSO file was successfully downloaded'
+    value: ${{ steps.download-sso.outputs.sso_file_available }}
+  user_count:
+    description: 'Number of users in the SSO file (0 if download failed)'
+    value: ${{ steps.download-sso.outputs.user_count }}
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Download NVIDIA SSO users from github-audits
+      id: download-sso
+      shell: bash
+      env:
+        GH_TOKEN: ${{ inputs.github_token }}
+      run: |
+        echo "Downloading ${{ inputs.sso_users_filename }} from ${{ inputs.github_audits_repo }} ${{ inputs.github_audits_version }} release..."
+
+        # Download the release asset using gh CLI
+        gh release download ${{ inputs.github_audits_version }} \
+          --repo ${{ inputs.github_audits_repo }} \
+          --pattern ${{ inputs.sso_users_filename }} \
+          --clobber 2>&1 || {
+            echo "ERROR: Failed to download ${{ inputs.sso_users_filename }} from github-audits release"
+            echo "sso_file_available=false" >> $GITHUB_OUTPUT
+            echo "user_count=0" >> $GITHUB_OUTPUT
+            exit 0
+          }
+
+        # Verify file was downloaded and is valid JSON
+        if [ ! -f ${{ inputs.sso_users_filename }} ]; then
+          echo "ERROR: ${{ inputs.sso_users_filename }} file not found after download"
+          echo "sso_file_available=false" >> $GITHUB_OUTPUT
+          echo "user_count=0" >> $GITHUB_OUTPUT
+          exit 0
+        fi
+
+        # Validate JSON structure
+        if ! jq -e 'type == "object"' ${{ inputs.sso_users_filename }} > /dev/null 2>&1; then
+          echo "ERROR: ${{ inputs.sso_users_filename }} is not a valid JSON object"
+          echo "sso_file_available=false" >> $GITHUB_OUTPUT
+          echo "user_count=0" >> $GITHUB_OUTPUT
+          exit 0
+        fi
+
+        USER_COUNT=$(jq 'length' ${{ inputs.sso_users_filename }})
+        echo "Successfully downloaded ${{ inputs.sso_users_filename }} with $USER_COUNT NVIDIA SSO users"
+        echo "sso_file_available=true" >> $GITHUB_OUTPUT
+        echo "user_count=$USER_COUNT" >> $GITHUB_OUTPUT
+
+    - name: Check if user is in SSO list
+      id: check-membership
+      shell: bash
+      run: |
+        USERNAME="${{ inputs.username }}"
+        SSO_FILE="${{ inputs.sso_users_filename }}"
+
+        echo "Checking if $USERNAME is in NVIDIA SSO users list..."
+
+        # Check if SSO file is available
+        if [ "${{ steps.download-sso.outputs.sso_file_available }}" != "true" ] || [ ! -f "$SSO_FILE" ]; then
+          echo "ERROR: $SSO_FILE not available - cannot check membership"
+          echo "is_member=false" >> $GITHUB_OUTPUT
+          echo "is_org_member=false" >> $GITHUB_OUTPUT
+          echo "user_orgs=" >> $GITHUB_OUTPUT
+          exit 0
+        fi
+
+        # Check if username exists as a key in the JSON object
+        if jq -e --arg user "$USERNAME" 'has($user)' "$SSO_FILE" > /dev/null 2>&1; then
+          echo "$USERNAME found in NVIDIA SSO users"
+          echo "is_member=true" >> $GITHUB_OUTPUT
+
+          # Extract and check org membership
+          IS_ORG_MEMBER=$(jq -r --arg user "$USERNAME" '
+            .[$user].org_roles // [] |
+            map(select(test("^(NVIDIA|NVIDIA-NeMo):Member$"))) |
+            length > 0
+          ' "$SSO_FILE")
+
+          USER_ORGS=$(jq -r --arg user "$USERNAME" '
+            .[$user].org_roles // [] |
+            map(split(":")[0]) |
+            unique |
+            join(",")
+          ' "$SSO_FILE")
+
+          echo "is_org_member=$IS_ORG_MEMBER" >> $GITHUB_OUTPUT
+          echo "user_orgs=$USER_ORGS" >> $GITHUB_OUTPUT
+
+          if [ "$IS_ORG_MEMBER" == "true" ]; then
+            echo "$USERNAME is a member of NVIDIA or NVIDIA-NeMo org"
+          else
+            echo "$USERNAME has @nvidia.com email but is not in NVIDIA or NVIDIA-NeMo org (orgs: $USER_ORGS)"
+          fi
+        else
+          echo "$USERNAME NOT found in NVIDIA SSO users"
+          echo "is_member=false" >> $GITHUB_OUTPUT
+          echo "is_org_member=false" >> $GITHUB_OUTPUT
+          echo "user_orgs=" >> $GITHUB_OUTPUT
+        fi
+
+branding:
+  icon: 'shield'
+  color: 'green'
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 4a4a1a2cad1..a5a7a82287e 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -20,8 +20,8 @@ on:
     branches:
       - dev
       - main
-      - "pull-request/[0-9]+"
-      - "deploy-release/*"
+      - 'pull-request/[0-9]+'
+      - 'deploy-release/*'
   merge_group:
     types: [checks_requested]
   workflow_dispatch:
@@ -44,6 +44,8 @@ jobs:
     if: github.repository == 'NVIDIA/Megatron-LM'
     outputs:
       is_external_contributor: ${{ github.event.pull_request.user.type == 'User' }}
+      is_maintainer: ${{ steps.check-membership.outputs.is_maintainer }}
+      selected_runner: ${{ steps.check-membership.outputs.is_maintainer == 'true' && 'nvidia-ci-aws-gpu-x8' || 'nvidia-ci-aws-gpu-x8-ephemeral' }}
     permissions:
       issues: write
       pull-requests: write
@@ -61,7 +63,14 @@ jobs:
         if: startsWith(github.ref, 'refs/heads/pull-request/')
         uses: nv-gha-runners/get-pr-info@main
 
-      - name: Check membership
+      - name: Check NVIDIA SSO membership
+        id: check-sso
+        uses: ./.github/actions/check-nvidia-sso-membership
+        with:
+          username: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
+          github_token: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
+
+      - name: Set maintainer status
         id: check-membership
         env:
           IS_MAIN_BRANCH: ${{ github.ref == 'refs/heads/main' }}
@@ -69,38 +78,15 @@ jobs:
           IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
           SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
         run: |
-          PR_AUTHOR=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').user.login }}
-
+          # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups
           if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
             echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
             exit 0
           fi
 
-          echo "Checking if $PR_AUTHOR is a repo collaborator..."
-          API_URL="https://api.github.com/repos/$REPO/collaborators/$PR_AUTHOR"
-          REPO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer $GITHUB_TOKEN" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            $API_URL)
-
-          echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA-NeMo..."
-          API_URL="https://api.github.com/orgs/NVIDIA-NeMo/members/$PR_AUTHOR"
-          ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer $GITHUB_TOKEN" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            $API_URL)
-
-          echo "Checking if $PR_AUTHOR is an org collaborator to NVIDIA..."
-          API_URL="https://api.github.com/orgs/NVIDIA/members/$PR_AUTHOR"
-          ORG_NVIDIA_MEMBERSHIP_RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -L \
-            -H "Accept: application/vnd.github+json" \
-            -H "Authorization: Bearer $GITHUB_TOKEN" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            $API_URL)
-
-          if [ "$REPO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_NEMO_MEMBERSHIP_RESPONSE" -eq 204 ] || [ "$ORG_NVIDIA_MEMBERSHIP_RESPONSE" -eq 204 ]; then
+          # Use SSO membership check result
+          IS_MEMBER="${{ steps.check-sso.outputs.is_member }}"
+          if [ "$IS_MEMBER" == "true" ]; then
             echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
           else
             echo "is_maintainer=false" | tee -a $GITHUB_OUTPUT
@@ -113,7 +99,7 @@ jobs:
         with:
           issue-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
           repository: ${{ github.repository }}
-          body-includes: "<!--external-contributor-comment-->"
+          body-includes: '<!--external-contributor-comment-->'
 
       - name: Delete comment
         uses: actions/github-script@v7
@@ -142,14 +128,6 @@ jobs:
 
             Thank you for your understanding.
 
-      - name: exit
-        run: |
-          if [ "${{ steps.check-membership.outputs.is_maintainer }}" == "true" ]; then
-            exit 0
-          else
-            exit 1
-          fi
-
   pre-flight:
     needs: [is-not-external-contributor]
     if: github.repository == 'NVIDIA/Megatron-LM'
@@ -213,9 +191,8 @@ jobs:
           echo "is_merge_group: ${{ needs.pre-flight.outputs.is_merge_group }}"
 
   cicd-container-build:
-    needs: [pre-flight, cicd-wait-in-queue]
-    runs-on: nvidia-ci-aws-gpu-x8
-    environment: nemo-ci
+    needs: [is-not-external-contributor, pre-flight, cicd-wait-in-queue]
+    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
     if: |
       (
         success()
@@ -225,6 +202,11 @@ jobs:
       && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     steps:
+      - name: Taint node for job isolation
+        if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
+        shell: bash
+        run: taint-node.sh
+
       - name: Checkout
         uses: actions/checkout@v4
 
@@ -255,11 +237,9 @@ jobs:
 
       - name: Download test data
         shell: bash
-        env:
-          GH_TOKEN: ${{ secrets.PAT }}
         run: |
           echo "::group::Download test data"
-          pip install --no-cache-dir pygithub click
+          pip install --no-cache-dir click requests
           python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
           echo "::endgroup::"
 
@@ -364,13 +344,13 @@ jobs:
       matrix:
         include: ${{ fromJson(needs.cicd-parse-unit-tests.outputs.unit-tests) }}
     needs:
+      - is-not-external-contributor
       - pre-flight
       - cicd-wait-in-queue
       - cicd-container-build
       - cicd-parse-unit-tests
-    runs-on: nvidia-ci-aws-gpu-x8
-    name: "${{ matrix.bucket }} - latest"
-    environment: nemo-ci
+    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
+    name: '${{ matrix.bucket }} - latest'
     if: |
       (
         success()
@@ -384,6 +364,11 @@ jobs:
       PIP_NO_PYTHON_VERSION_WARNING: 1
       PIP_ROOT_USER_ACTION: ignore
     steps:
+      - name: Taint node for job isolation
+        if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
+        shell: bash
+        run: taint-node.sh
+
       - name: Checkout
         uses: actions/checkout@v4
       - name: main
@@ -392,7 +377,7 @@ jobs:
           test_case: ${{ matrix.bucket }}
           tag: latest
           timeout: ${{ matrix.timeout || 30 }}
-          is_unit_test: "true"
+          is_unit_test: 'true'
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
 
@@ -432,10 +417,20 @@ jobs:
           HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || echo "false"
           echo "main=$HAS_RUN_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
 
+      - name: Has Run functional tests label
+        id: has-run-functional-tests-label
+        env:
+          GH_TOKEN: ${{ secrets.PAT }}
+        run: |
+          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
+          HAS_RUN_FUNCTIONAL_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run functional tests")') || echo "false"
+          echo "main=$HAS_RUN_FUNCTIONAL_TESTS_LABEL" | tee -a $GITHUB_OUTPUT
+
       - name: Parse functional tests
         id: main
         env:
           HAS_RUN_TESTS_LABEL: ${{ steps.has-run-tests-label.outputs.main }}
+          HAS_RUN_FUNCTIONAL_TESTS_LABEL: ${{ steps.has-run-functional-tests-label.outputs.main }}
         run: |
           export PYTHONPATH=$(pwd)
 
@@ -444,10 +439,13 @@ jobs:
               --scope mr-github
               --enable-lightweight-mode
             )
+          elif [ "$HAS_RUN_FUNCTIONAL_TESTS_LABEL" == "true" ]; then
+            ARGS=(
+              --scope mr-github
+            )
           else
             ARGS=(
-              --scope mr-slim
-              --enable-lightweight-mode
+              --scope mr-github-slim
             )
           fi
 
@@ -478,13 +476,13 @@ jobs:
       matrix:
         include: ${{ fromJson(needs.cicd-parse-integration-tests.outputs.integration-tests) }}
     needs:
+      - is-not-external-contributor
       - pre-flight
       - cicd-wait-in-queue
       - cicd-parse-integration-tests
       - cicd-unit-tests-latest
-    runs-on: nvidia-ci-aws-gpu-x8
-    name: "${{ matrix.model }}/${{ matrix.test_case }} - latest"
-    environment: nemo-ci
+    runs-on: ${{ needs.is-not-external-contributor.outputs.selected_runner }}
+    name: '${{ matrix.model }}/${{ matrix.test_case }} - latest'
     env:
       PIP_DISABLE_PIP_VERSION_CHECK: 1
       PIP_NO_PYTHON_VERSION_WARNING: 1
@@ -498,6 +496,11 @@ jobs:
       && needs.pre-flight.outputs.is_merge_group == 'false'
       && !cancelled()
     steps:
+      - name: Taint node for job isolation
+        if: contains(needs.is-not-external-contributor.outputs.selected_runner, 'ephemeral')
+        shell: bash
+        run: taint-node.sh
+
       - name: Checkout
         uses: actions/checkout@v4
       - name: main
@@ -507,7 +510,7 @@ jobs:
           model: ${{ matrix.model }}
           tag: latest
           timeout: ${{ matrix.timeout || 30 }}
-          is_unit_test: "false"
+          is_unit_test: 'false'
           PAT: ${{ secrets.PAT }}
           container-image: ${{ env.container-registry }}/megatron-lm:${{ github.sha }}
 
diff --git a/.gitlab/scripts/build.sh b/.gitlab/scripts/build.sh
index 960af104628..e64434e834d 100644
--- a/.gitlab/scripts/build.sh
+++ b/.gitlab/scripts/build.sh
@@ -7,9 +7,9 @@ eval "IMAGE=\$$IMAGE"
 # Start a named container in detached mode
 docker run -d --name download_test_data -w /workdir/ python:3.12-slim bash -c 'sleep infinity'
 docker cp tests/. download_test_data:/workdir/tests
-docker exec -e GH_TOKEN=$GH_TOKEN download_test_data bash -c '
+docker exec download_test_data bash -c '
     ls -al /workdir/
-    pip install --no-cache-dir pygithub click
+    pip install --no-cache-dir click requests
     python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
 '
 docker cp download_test_data:/workdir/assets ./
@@ -50,6 +50,7 @@ DOCKER_BUILDKIT=1 docker build \
     --builder=container \
     --build-arg JET_API_VERSION=$JET_API_VERSION \
     --cache-from type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID} \
+    --cache-from type=registry,ref=${IMAGE}-buildcache:dev \
     --cache-from type=registry,ref=${IMAGE}-buildcache:main \
     --build-arg FROM_IMAGE_NAME=$BASE_IMAGE \
     --push \
diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev
index 6596fc01aaf..482c6af460c 100644
--- a/docker/Dockerfile.ci.dev
+++ b/docker/Dockerfile.ci.dev
@@ -36,6 +36,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         --no-install-package torch \
         --no-install-package torchvision \
         --no-install-package triton \
+        --no-install-package transformer-engine-cu12 \
         --no-install-package nvidia-cublas-cu12 \
         --no-install-package nvidia-cuda-cupti-cu12 \
         --no-install-package nvidia-cuda-nvrtc-cu12 \
diff --git a/gpt_builders.py b/gpt_builders.py
index 9fa1aff72c7..61d159b9967 100644
--- a/gpt_builders.py
+++ b/gpt_builders.py
@@ -42,7 +42,8 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None):
         else:
             use_te = args.transformer_impl == "transformer_engine"
 
-            if args.num_experts or (args.linear_attention_type is not None):
+            linear_attention_variants = ["gated_delta_net"]
+            if args.num_experts or args.experimental_attention_variant in linear_attention_variants:
                 # Define the decoder block spec
                 transformer_layer_spec = get_gpt_decoder_block_spec(
                     config,
@@ -114,7 +115,7 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
-            args.linear_attention_type,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             qk_l2_norm=args.qk_l2_norm,
             use_kitchen=config.use_kitchen,
@@ -126,7 +127,7 @@ def _get_transformer_layer_spec(use_te, config):
             args.moe_grouped_gemm,
             args.qk_layernorm,
             args.multi_latent_attention,
-            args.linear_attention_type,
+            args.experimental_attention_variant,
             moe_use_legacy_grouped_gemm=args.moe_use_legacy_grouped_gemm,
             normalization=args.normalization,
             use_kitchen=config.use_kitchen,
diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py
index def79fb778e..2f791449057 100644
--- a/megatron/core/dist_checkpointing/exchange_utils.py
+++ b/megatron/core/dist_checkpointing/exchange_utils.py
@@ -63,7 +63,7 @@ class ShardDistribution(NamedTuple):
 def _shard_size(sh_ten: ShardedTensor):
     """Returns size in bytes of a given sharded tensor."""
     if sh_ten.flattened_range is None:
-        numel = np.product(sh_ten.local_shape)
+        numel = np.prod(sh_ten.local_shape)
     else:
         numel = sh_ten.flattened_range.stop - sh_ten.flattened_range.start
     return numel * torch._utils._element_size(sh_ten.dtype)
diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py
index d38ea57eee0..45a105666ab 100644
--- a/megatron/core/dist_checkpointing/mapping.py
+++ b/megatron/core/dist_checkpointing/mapping.py
@@ -216,7 +216,7 @@ def local_coordinates(self) -> Tuple[np.ndarray, ...]:
             )
 
         # TODO: np.unravel_index?
-        mask = np.zeros(np.product(self.local_shape), dtype=bool)
+        mask = np.zeros(np.prod(self.local_shape), dtype=bool)
         mask[self.flattened_range] = True
         return np.nonzero(mask.reshape(self.local_shape))
 
diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py
index 96945055319..9bcb59bdbf4 100644
--- a/megatron/core/dist_checkpointing/validation.py
+++ b/megatron/core/dist_checkpointing/validation.py
@@ -519,7 +519,7 @@ def _validate_sharding_for_key_flattened(tensors_by_shard):
         all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop))
 
     starts, stops = map(np.asarray, zip(*sorted(all_slices)))
-    expected_size = np.product(local_shape)
+    expected_size = np.prod(local_shape)
     if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]):
         raise CheckpointingException(
             f"Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}"
diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
index 3bb3b5c14f1..b533fef7aa3 100644
--- a/megatron/core/fusions/fused_linear_cross_entropy.py
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -159,10 +159,16 @@ def forward(
         ```
         """
         with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
-            logprobs, _maximum, _acc, _num_valid_tokens, tp_rank, tp_world_size, global_hidden = (
-                _get_platform().forward_func(
-                    hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel
-                )
+            (
+                logprobs,
+                _maximum,
+                _acc,
+                _num_valid_tokens,
+                tp_rank,
+                tp_world_size,
+                global_hidden,
+            ) = _get_platform().forward_func(
+                hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel
             )
             ctx.save_for_backward(global_hidden, weight, labels, _maximum, _acc, _num_valid_tokens)
             ctx.tp_group = tp_group
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
index 17ad627322e..3178e8c6909 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-try:
-    from typing import Optional, Tuple, Type
+import logging
+from typing import Optional, Tuple, Type
 
+try:
     import cuda.bindings.driver as cuda  # type: ignore
     import cutlass
     import cutlass.cute as cute
@@ -13,7 +14,6 @@
 
     SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
-
     def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
         """
         Create a thread cooperative group.
@@ -22,7 +22,6 @@ def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
             pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size
         )
 
-
     class BwdPartialDlogits:
         """
         This class implements the backward kernel for partial d_logits.
@@ -109,10 +108,14 @@ def _setup_attributes(
             mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
             # it requires k-mode to be 128B aligned
             mma_inst_tile_k: int = 4
-            self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
+            self.mma_tiler = (
+                self.mma_tiler[0],
+                self.mma_tiler[1],
+                mma_inst_shape_k * mma_inst_tile_k,
+            )
 
-            self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = self._compute_stages(
-                tiled_mma, self.mma_tiler, a_dtype, b_dtype
+            self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = (
+                self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
             )
             self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
             assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
@@ -205,9 +208,13 @@ def kernel(
 
             # -------- tensor partition ------------ #
             # swizzle o [(tileM, tileK), loopM, loopK, stage]
-            sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
+            sA = storage.sA.get_tensor(
+                a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner
+            )
             # swizzle o [(tileN, tileK), loopN, loopK, stage]
-            sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+            sB = storage.sB.get_tensor(
+                b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner
+            )
 
             # FIXME: if 2 CTAs, modify here
             thr_mma = tiled_mma.get_slice(0)
@@ -336,10 +343,14 @@ def kernel(
                     tCtC[((None, None), 0, None)],
                     (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
                 )
-                tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
+                tiled_copy_t2r = tcgen05.make_tmem_copy(
+                    copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+                )
                 thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
                 tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
-                tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
+                tTMEM_load_tAcc = cute.group_modes(
+                    tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
+                )
 
                 # predicates
                 cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
@@ -379,7 +390,9 @@ def kernel(
                 tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean)
                 # to align shape with gMax and gAccu
                 tMCAcc_mask = cute.append_ones(tMCAcc_mask)
-                tMCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0]))
+                tMCAcc_mask[0] = cute.elem_less(
+                    pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0])
+                )
                 # [(1, 1), 1, 1]
                 tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels))
                 tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type)
@@ -416,7 +429,9 @@ def kernel(
                 )
                 # blackwell supports STG.256
                 copy_atom_r2g = cute.make_copy_atom(
-                    cute.nvgpu.CopyUniversalOp(), gDlogits_partial.element_type, num_bits_per_copy=256
+                    cute.nvgpu.CopyUniversalOp(),
+                    gDlogits_partial.element_type,
+                    num_bits_per_copy=256,
                 )
                 tiled_copy_r2g = cute.make_tiled_copy_tv(
                     copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv
@@ -430,7 +445,8 @@ def kernel(
                     for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
                         for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
                             tR2GCAcc_pred[elem, row, col] = cute.elem_less(
-                                pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0], problem_mnk[0]
+                                pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0],
+                                problem_mnk[0],
                             ) and cute.elem_less(
                                 split_idx * self.vocab_per_split
                                 + pidn * self.epi_tile[1]
@@ -442,7 +458,9 @@ def kernel(
 
                 # for type conversion
                 dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type)
-                dLogits_half = cute.tiled_divide(dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1))
+                dLogits_half = cute.tiled_divide(
+                    dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1)
+                )
                 dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
 
                 mma_pipeline.consumer_wait(mma_consumer_state)
@@ -455,7 +473,8 @@ def kernel(
                     min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]),
                 )
                 num_n_subtiles: cutlass.Int64 = cute.ceil_div(
-                    (block_vocab_right_idx - block_vocab_left_idx), cute.size(tTMEM_load_rAcc, mode=[0])
+                    (block_vocab_right_idx - block_vocab_left_idx),
+                    cute.size(tTMEM_load_rAcc, mode=[0]),
                 )
                 for n_subtile in cutlass.range(num_n_subtiles):
                     cute.copy(
@@ -464,7 +483,9 @@ def kernel(
                         tTMEM_load_rAcc,
                     )
 
-                    for idx in cutlass.range(cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True):
+                    for idx in cutlass.range(
+                        cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
+                    ):
                         # exp_logits
                         tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
 
@@ -499,7 +520,9 @@ def kernel(
             self.cta_sync_barrier.arrive_and_wait()
             if warp_idx == self.empty_warp_ids[0]:
                 cute.arch.relinquish_tmem_alloc_permit()
-                cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
+                cute.arch.dealloc_tmem(
+                    tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs
+                )
 
         @cute.jit
         def __call__(
@@ -545,7 +568,12 @@ def __call__(
             b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
 
             tiled_mma = sm100_utils.make_trivial_tiled_mma(
-                a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
+                a_dtype,
+                a_major_mode,
+                b_major_mode,
+                self.acc_dtype,
+                self.cta_group,
+                self.mma_tiler[:2],
             )
             self._setup_attributes(tiled_mma, a_dtype, b_dtype)
 
@@ -634,5 +662,6 @@ class SharedStorage:
                 cluster=self.cluster_shape_mnk,
                 stream=stream,
             )
+
 except ImportError:
-    pass
+    logging.warning("Cutlass or CUDA bindings not found. BwdPartialDlogits will not be available.")
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
index 7ca2e5c91fb..dc369a7c558 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -1,11 +1,12 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
-try:
-    import typing
-    from dataclasses import dataclass, field
-    from functools import lru_cache
-    import os
+import logging
+import os
+import typing
+from dataclasses import dataclass, field
+from functools import lru_cache
 
+try:
     import cuda.bindings.driver as cuda  # type: ignore
     import cutlass
     import cutlass.cute as cute
@@ -21,7 +22,6 @@
     from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop
     from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
 
-
     @dataclass
     class FwdConfig:
         """
@@ -32,8 +32,9 @@ class FwdConfig:
         _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
         _initialized: bool = field(default=False)
         _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
-        _vocab_per_split: int = field(default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6)))
-
+        _vocab_per_split: int = field(
+            default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6))
+        )
 
     @dataclass
     class BwdConfig:
@@ -42,9 +43,12 @@ class BwdConfig:
         """
 
         _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
-        _vocab_per_split: int = field(default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6)))
-        _backward_method: utils.BackwardMethodEnum = field(default=utils.BackwardMethodEnum.kDlogitsSplitN)
-
+        _vocab_per_split: int = field(
+            default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6))
+        )
+        _backward_method: utils.BackwardMethodEnum = field(
+            default=utils.BackwardMethodEnum.kDlogitsSplitN
+        )
 
     @lru_cache(maxsize=1)
     def _get_fwd_config() -> FwdConfig:
@@ -68,7 +72,9 @@ def forward(
         reduction: typing.Literal["none", "sum", "mean"] = "mean",
         ignore_index: int = -100,
         sequence_parallel: bool = False,
-    ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor]:
+    ) -> typing.Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor
+    ]:
         """
         forward host function
         """
@@ -84,22 +90,29 @@ def forward(
         # weight must be [vocab_size, dim]
         assert weight.dim() == 2
         # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
-        assert (hidden.dim() == 2 and labels.dim() == 1) or (hidden.dim() == 3 and labels.dim() == 2)
+        assert (hidden.dim() == 2 and labels.dim() == 1) or (
+            hidden.dim() == 3 and labels.dim() == 2
+        )
         assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
 
         hidden_view = hidden.view(-1, hidden.shape[-1])
         labels_view = labels.view(-1)
 
-        assert (sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]) or (
-            not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0]
-        )
+        assert (
+            sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]
+        ) or (not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0])
         assert hidden_view.shape[1] == weight.shape[1]
 
         global_hidden = hidden
         if in_tp_mode and sequence_parallel:
             partial_hidden_shape = hidden.shape
-            global_hidden_shape = (partial_hidden_shape[0] * tp_world_size, *partial_hidden_shape[1:])
-            global_hidden = torch.empty(global_hidden_shape, dtype=hidden.dtype, device=hidden.device)
+            global_hidden_shape = (
+                partial_hidden_shape[0] * tp_world_size,
+                *partial_hidden_shape[1:],
+            )
+            global_hidden = torch.empty(
+                global_hidden_shape, dtype=hidden.dtype, device=hidden.device
+            )
             dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group)
             assert global_hidden.is_contiguous()
             hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
@@ -125,11 +138,15 @@ def forward(
         accumulate = torch.empty_like(maximum, dtype=torch.float32)
         num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
         assert (
-            maximum.is_contiguous() and accumulate.is_contiguous() and num_valid_tokens.is_contiguous()
+            maximum.is_contiguous()
+            and accumulate.is_contiguous()
+            and num_valid_tokens.is_contiguous()
         )
         # declare intermediate tensors
         # NOTE: this is a parameter for tuning
-        num_splits = (vocab_size + _get_fwd_config()._vocab_per_split - 1) // _get_fwd_config()._vocab_per_split
+        num_splits = (
+            vocab_size + _get_fwd_config()._vocab_per_split - 1
+        ) // _get_fwd_config()._vocab_per_split
         _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
         _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
         if REDUCTION == utils.EntropyReductionEnum.kNone:
@@ -145,14 +162,16 @@ def forward(
         )
 
         # need to compile the kernel for the first time
-        hidden_packed = from_dlpack(hidden_view.detach(), assumed_align=16).mark_compact_shape_dynamic(
-            mode=0
-        )
+        hidden_packed = from_dlpack(
+            hidden_view.detach(), assumed_align=16
+        ).mark_compact_shape_dynamic(mode=0)
         weight_packed = from_dlpack(weight.detach(), assumed_align=16)
-        labels_packed = from_dlpack(labels_view.detach(), assumed_align=8).mark_compact_shape_dynamic(
+        labels_packed = from_dlpack(
+            labels_view.detach(), assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(
             mode=0
         )
-        logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(mode=0)
         _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(
             mode=0, stride_order=(0, 1)
         )
@@ -165,7 +184,9 @@ def forward(
         # only the number of tokens can vary
         key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
         if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None:
-            fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(vocab_per_split=_get_fwd_config()._vocab_per_split)
+            fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(
+                vocab_per_split=_get_fwd_config()._vocab_per_split
+            )
             fwd_mainloop_compiled_kernel = cute.compile(
                 fwd_mainloop_kernel,
                 hidden_packed,
@@ -226,9 +247,13 @@ def grid(meta):
 
             torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0])
             with torch.cuda.stream(_get_fwd_config()._dedicated_stream):
-                _get_fwd_config()._dedicated_stream.wait_event(_get_fwd_config()._dedicated_events[0])
+                _get_fwd_config()._dedicated_stream.wait_event(
+                    _get_fwd_config()._dedicated_events[0]
+                )
                 dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
-                _get_fwd_config()._dedicated_stream.record_event(_get_fwd_config()._dedicated_events[1])
+                _get_fwd_config()._dedicated_stream.record_event(
+                    _get_fwd_config()._dedicated_events[1]
+                )
 
             def grid(meta):
                 return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
@@ -271,8 +296,15 @@ def grid(meta):
                 REDUCTION.value,
             )
 
-        return logprobs, maximum, accumulate, num_valid_tokens, tp_rank, tp_world_size, global_hidden
-
+        return (
+            logprobs,
+            maximum,
+            accumulate,
+            num_valid_tokens,
+            tp_rank,
+            tp_world_size,
+            global_hidden,
+        )
 
     def backward(
         dlogprobs: torch.Tensor,
@@ -302,9 +334,9 @@ def backward(
 
         REDUCTION = utils.str_to_reduction_enum(reduction)
         dlogprobs_view = dlogprobs.view(-1)
-        assert (REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)) or (
-            REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0
-        )
+        assert (
+            REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)
+        ) or (REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0)
         assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
 
         assert (
@@ -324,7 +356,9 @@ def backward(
             num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
 
             _d_logits = torch.empty(
-                (num_tokens, vocab_per_split), device=global_hidden.device, dtype=global_hidden.dtype
+                (num_tokens, vocab_per_split),
+                device=global_hidden.device,
+                dtype=global_hidden.dtype,
             )
 
             hidden_packed = from_dlpack(
@@ -337,18 +371,24 @@ def backward(
             dlogprobs_packed = from_dlpack(
                 dlogprobs_view.detach(), assumed_align=8
             ).mark_compact_shape_dynamic(mode=0)
-            maximum_packed = from_dlpack(maximum.detach(), assumed_align=8).mark_compact_shape_dynamic(
+            maximum_packed = from_dlpack(
+                maximum.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(
+                mode=0
+            )
+            dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(
                 mode=0
             )
-            accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(mode=0)
-            dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(mode=0)
             scalarNumValidTokens_packed = cute.runtime.make_ptr(
                 cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8
             )
 
             stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
-            key = f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
+            key = (
+                f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
+            )
             if _get_bwd_config()._bwd_kernel.get(key) is None:
                 bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
                     reduction=REDUCTION.value, vocab_per_split=vocab_per_split
@@ -406,7 +446,9 @@ def backward(
                 torch.matmul(
                     valid_d_logits.T,
                     hidden_view,
-                    out=d_weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                    out=d_weight[
+                        split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :
+                    ],
                 )
         else:
             raise NotImplementedError(f"Unsupported backward method: {_backward_method}")
@@ -425,5 +467,9 @@ def backward(
                 d_hidden = d_hidden.view(partial_hidden_shape).clone()
 
         return d_hidden, d_weight
+
 except ImportError:
-    pass
+    logging.warning(
+        "Cutlass or CUDA bindings not found. LinearCrossEntropy Blackwell entry "
+        "points will not be available."
+    )
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
index da095e3fc64..93f5b9523e7 100644
--- a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
@@ -4,9 +4,10 @@
 Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel
 """
 
-try:
-    from typing import Tuple, Type
+import logging
+from typing import Tuple, Type
 
+try:
     import cuda.bindings.driver as cuda  # type: ignore
     import cutlass
     import cutlass.cute as cute
@@ -17,14 +18,12 @@
 
     SM100_TMEM_CAPACITY_COLUMNS: int = 512
 
-
     def make_thread_cooperative_group(size: int):
         """
         Create a thread cooperative group.
         """
         return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
 
-
     class FwdMainLoop:
         """
         This class implements the mainloop for forward process.
@@ -96,7 +95,9 @@ def _compute_stages(
             a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
                 tiled_mma, mma_tiler, a_dtype, 1  # only single stage
             )
-            b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(tiled_mma, mma_tiler, b_dtype, 1)
+            b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(
+                tiled_mma, mma_tiler, b_dtype, 1
+            )
             a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
             b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one)
             num_acc_stage = 2
@@ -121,7 +122,11 @@ def _setup_attributes(
             mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
             # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
             mma_inst_tile_k: int = 4
-            self.mma_tiler = (self.mma_tiler[0], self.mma_tiler[1], mma_inst_shape_k * mma_inst_tile_k)
+            self.mma_tiler = (
+                self.mma_tiler[0],
+                self.mma_tiler[1],
+                mma_inst_shape_k * mma_inst_tile_k,
+            )
 
             self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = (
                 self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
@@ -211,9 +216,13 @@ def kernel(
 
             # -------- SMEM partition ------------ #
             # swizzle o [(tileM, tileK), loopM, loopK, Stage]
-            sA = storage.sA.get_tensor(a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner)
+            sA = storage.sA.get_tensor(
+                a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner
+            )
             # swizzle o [(tileN, tileK), loopN, loopK, stage]
-            sB = storage.sB.get_tensor(b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner)
+            sB = storage.sB.get_tensor(
+                b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner
+            )
 
             # FIXME: if 2 CTAs, modify here
             thr_mma = tiled_mma.get_slice(0)
@@ -328,7 +337,9 @@ def kernel(
                     for k in cutlass.range(cute.size(gA, mode=[2])):
                         ab_pipeline.consumer_wait(ab_consumer_state)
 
-                        for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                        for kblock_idx in cutlass.range(
+                            cute.size(tCsA, mode=[2]), unroll_full=True
+                        ):
                             cute.gemm(
                                 tiled_mma,
                                 cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
@@ -365,11 +376,15 @@ def kernel(
                     tCtC[((None, None), 0, None)],
                     (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
                 )
-                tiled_copy_t2r = tcgen05.make_tmem_copy(copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)])
+                tiled_copy_t2r = tcgen05.make_tmem_copy(
+                    copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+                )
                 thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
                 tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
                 # [(pattern), loopM, loopN, CntTileM, CntTileN]
-                tTMEM_load_tAcc = cute.group_modes(tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1)
+                tTMEM_load_tAcc = cute.group_modes(
+                    tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
+                )
 
                 cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
                 tCcAcc = thr_mma.partition_C(cAcc)
@@ -383,12 +398,18 @@ def kernel(
 
                 # epilogue layouts
                 epilogue_thread_layout = cute.make_layout((128, 1))
-                copy_atom_g2r = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mLabels.element_type)
-                tiled_copy_g2r = cute.make_tiled_copy(copy_atom_g2r, epilogue_thread_layout, (128, 1))
+                copy_atom_g2r = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mLabels.element_type
+                )
+                tiled_copy_g2r = cute.make_tiled_copy(
+                    copy_atom_g2r, epilogue_thread_layout, (128, 1)
+                )
                 thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
 
                 copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32)
-                tiled_copy_r2g = cute.make_tiled_copy(copy_atom_r2g, epilogue_thread_layout, (128, 1))
+                tiled_copy_r2g = cute.make_tiled_copy(
+                    copy_atom_r2g, epilogue_thread_layout, (128, 1)
+                )
                 thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
 
                 # auxiliary tensors
@@ -404,11 +425,13 @@ def kernel(
 
                 # [(1, 1), 1, 1]
                 tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
-                tLabelsrLabels = cute.make_fragment(tLabelsgLabels.shape, tLabelsgLabels.element_type)
+                tLabelsrLabels = cute.make_fragment(
+                    tLabelsgLabels.shape, tLabelsgLabels.element_type
+                )
                 cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask)
-                valid_mask: cutlass.Boolean = (tLabelsrLabels[0] != ignore_index) and tLabelsCAcc_mask[
-                    0
-                ]
+                valid_mask: cutlass.Boolean = (
+                    tLabelsrLabels[0] != ignore_index
+                ) and tLabelsCAcc_mask[0]
 
                 # [tileM, 1]
                 gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn))
@@ -425,7 +448,9 @@ def kernel(
                 tR2GrAccu.fill(0.0)
 
                 # [tileM, 1]
-                gLogprobs = cute.append_ones(cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,)))
+                gLogprobs = cute.append_ones(
+                    cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,))
+                )
                 # [(CPYM, CPYN), loopM, loopN]
                 tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
                 tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
@@ -447,7 +472,9 @@ def kernel(
                     for n_subtile in cutlass.range(num_n_subtiles):
                         cute.copy(
                             tiled_copy_t2r,
-                            tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                            tTMEM_load_tAcc[
+                                (None, None, None, n_subtile, mma_consumer_state.index)
+                            ],
                             tTMEM_load_rAcc,
                         )
 
@@ -467,9 +494,13 @@ def kernel(
                                 tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
 
                                 position: cutlass.Int64 = (
-                                    rank * problem_mnk[1] + pidn * self.vocab_per_split + local_position
+                                    rank * problem_mnk[1]
+                                    + pidn * self.vocab_per_split
+                                    + local_position
+                                )
+                                mask: cutlass.Boolean = valid_mask and (
+                                    position == tLabelsrLabels[0]
                                 )
-                                mask: cutlass.Boolean = valid_mask and (position == tLabelsrLabels[0])
                                 tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx]
 
                     mma_pipeline.consumer_release(mma_consumer_state)
@@ -493,7 +524,9 @@ def kernel(
             self.cta_sync_barrier.arrive_and_wait()
             if warp_idx == self.empty_warp_ids[0]:
                 cute.arch.relinquish_tmem_alloc_permit()
-                cute.arch.dealloc_tmem(tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs)
+                cute.arch.dealloc_tmem(
+                    tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs
+                )
 
         @staticmethod
         def _compute_grid(
@@ -551,7 +584,12 @@ def __call__(
             b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
 
             tiled_mma = sm100_utils.make_trivial_tiled_mma(
-                a_dtype, a_major_mode, b_major_mode, self.acc_dtype, self.cta_group, self.mma_tiler[:2]
+                a_dtype,
+                a_major_mode,
+                b_major_mode,
+                self.acc_dtype,
+                self.cta_group,
+                self.mma_tiler[:2],
             )
 
             self._setup_attributes(tiled_mma, a_dtype, b_dtype)
@@ -650,5 +688,6 @@ class SharedStorage:
                 stream=stream,
             )
             return None
+
 except ImportError:
-    pass
+    logging.warning("Cutlass or CUDA Python bindings not found. FwdMainLoop will not be available.")
diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
new file mode 100644
index 00000000000..cbe59618baf
--- /dev/null
+++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Optional
+
+from megatron.core.models.backends import BackendSpecProvider
+from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.experimental_attention_variant.dsa import (
+    DSAIndexer,
+    DSAIndexerSubmodules,
+    DSAttention,
+    DSAttentionSubmodules,
+)
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.multi_latent_attention import (
+    MLASelfAttention,
+    MLASelfAttentionSubmodules,
+)
+from megatron.core.transformer.spec_utils import ModuleSpec
+
+
+def get_gated_delta_net_module_spec_for_backend(
+    backend: BackendSpecProvider, normalization: Optional[str] = None
+) -> ModuleSpec:
+    """Helper function to get module spec for Linear Attention"""
+    rms_norm = normalization == "RMSNorm"
+    attention = ModuleSpec(
+        module=GatedDeltaNet,
+        submodules=GatedDeltaNetSubmodules(
+            in_proj=backend.column_parallel_layer_norm_linear(),
+            out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False),
+            out_proj=backend.row_parallel_linear(),
+        ),
+        metainfo={"fuse_input_layernorm": True},
+    )
+    return attention
+
+
+def get_dsa_module_spec_for_backend(
+    backend: BackendSpecProvider,
+    qk_layernorm: Optional[bool] = False,
+    qk_l2_norm: Optional[bool] = False,
+    multi_latent_attention: Optional[bool] = False,
+    mla_down_proj_use_column_parallel: Optional[bool] = False,
+    normalization: Optional[str] = None,
+    fallback_to_eager_attn: Optional[bool] = False,
+) -> ModuleSpec:
+    """Helper function to get module spec for Sparse Attention."""
+    assert multi_latent_attention, "Currently only MLA supports sparse attention."
+    assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
+    assert fallback_to_eager_attn is False, "Fallback to eager attention is not supported with DSA."
+
+    linear_q_down_proj = (
+        backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear()
+    )
+    linear_kv_down_proj = (
+        backend.column_parallel_linear() if mla_down_proj_use_column_parallel else backend.linear()
+    )
+    linear_q_up_proj = backend.column_parallel_linear()
+    linear_kv_up_proj = backend.column_parallel_linear()
+
+    # Because TransformerEngine does not support sparse attention yet, we use local
+    # implementation whether the backend is TransformerEngine or not.
+    core_attention = ModuleSpec(
+        module=DSAttention,
+        submodules=DSAttentionSubmodules(
+            indexer=ModuleSpec(
+                module=DSAIndexer,
+                submodules=DSAIndexerSubmodules(
+                    linear_wq_b=backend.linear(),
+                    linear_wk=backend.linear(),
+                    k_norm=backend.layer_norm(rms_norm=False, for_qk=True),
+                    linear_weights_proj=backend.linear(),
+                ),
+            )
+        ),
+    )
+
+    # Adjust for RMS norm.
+    rms_norm = normalization == "RMSNorm"
+    qk_norm = backend.layer_norm(rms_norm=rms_norm, for_qk=True) if qk_layernorm else IdentityOp
+
+    attention = ModuleSpec(
+        module=MLASelfAttention,
+        params={"attn_mask_type": AttnMaskType.causal},
+        submodules=MLASelfAttentionSubmodules(
+            linear_q_proj=backend.column_parallel_linear(),
+            linear_q_down_proj=linear_q_down_proj,
+            linear_q_up_proj=linear_q_up_proj,
+            linear_kv_down_proj=linear_kv_down_proj,
+            linear_kv_up_proj=linear_kv_up_proj,
+            core_attention=core_attention,
+            linear_proj=backend.row_parallel_linear(),
+            q_layernorm=qk_norm,
+            kv_layernorm=qk_norm,
+        ),
+        metainfo={"fuse_input_layernorm": False},
+    )
+
+    return attention
+
+
+def get_experimental_attention_variant_module_spec_for_backend(
+    backend: BackendSpecProvider,
+    sharded_state_dict_keys_map: dict,
+    experimental_attention_variant: Optional[str] = None,
+    qk_layernorm: Optional[bool] = False,
+    qk_l2_norm: Optional[bool] = False,
+    multi_latent_attention: Optional[bool] = False,
+    mla_down_proj_use_column_parallel: Optional[bool] = False,
+    normalization: Optional[str] = None,
+    fallback_to_eager_attn: Optional[bool] = False,
+) -> ModuleSpec:
+    """Helper function to get module spec for Attention"""
+    if experimental_attention_variant == "gated_delta_net":
+        return get_gated_delta_net_module_spec_for_backend(
+            backend=backend, normalization=normalization
+        )
+    elif experimental_attention_variant == "dsa":
+        return get_dsa_module_spec_for_backend(
+            backend=backend,
+            qk_layernorm=qk_layernorm,
+            qk_l2_norm=qk_l2_norm,
+            multi_latent_attention=multi_latent_attention,
+            mla_down_proj_use_column_parallel=mla_down_proj_use_column_parallel,
+            normalization=normalization,
+            fallback_to_eager_attn=fallback_to_eager_attn,
+        )
+    else:
+        raise ValueError(
+            f"Invalid experimental attention variant: {experimental_attention_variant}"
+        )
diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py
index c5c9caa3d67..5395b158749 100755
--- a/megatron/core/models/gpt/gpt_layer_specs.py
+++ b/megatron/core/models/gpt/gpt_layer_specs.py
@@ -5,8 +5,8 @@
 
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.models.backends import BackendSpecProvider, LocalSpecProvider
-from megatron.core.models.gpt.linear_attention_module_specs import (
-    get_linear_attention_module_spec_for_backend,
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    get_experimental_attention_variant_module_spec_for_backend,
 )
 from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
@@ -78,7 +78,7 @@ def get_gpt_layer_with_transformer_engine_spec(
     moe_grouped_gemm: Optional[bool] = False,
     qk_layernorm: Optional[bool] = False,
     multi_latent_attention: Optional[bool] = False,
-    linear_attention_type: Optional[str] = None,
+    experimental_attention_variant: Optional[str] = None,
     fp8: Optional[str] = None,  # pylint: disable=unused-argument
     moe_use_legacy_grouped_gemm: Optional[bool] = False,
     normalization: Optional[str] = None,
@@ -96,7 +96,8 @@ def get_gpt_layer_with_transformer_engine_spec(
         moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
         qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
         multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False.
-        linear_attention_type (str, optional): The type of linear attention. Defaults to None.
+        experimental_attention_variant (str, optional): The type of experimental attention variant.
+                                                        Defaults to None.
         fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
         moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
                                                       Defaults to False.
@@ -133,7 +134,7 @@ def get_gpt_layer_with_transformer_engine_spec(
     attention = get_attention_module_spec_for_backend(
         backend=backend,
         sharded_state_dict_keys_map=sharded_state_dict_keys_map,
-        linear_attention_type=linear_attention_type,
+        experimental_attention_variant=experimental_attention_variant,
         qk_layernorm=qk_layernorm,
         qk_l2_norm=qk_l2_norm,
         multi_latent_attention=multi_latent_attention,
@@ -166,7 +167,7 @@ def get_gpt_layer_local_spec(
     moe_grouped_gemm: Optional[bool] = False,
     qk_layernorm: Optional[bool] = False,
     multi_latent_attention: Optional[bool] = False,
-    linear_attention_type: Optional[str] = None,
+    experimental_attention_variant: Optional[str] = None,
     fp8: Optional[str] = None,  # pylint: disable=unused-argument
     moe_use_legacy_grouped_gemm: Optional[bool] = False,
     normalization: Optional[str] = None,
@@ -181,7 +182,8 @@ def get_gpt_layer_local_spec(
         moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
         qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
         multi_latent_attention (bool, optional): To use multi-latent attention. Defaults to False.
-        linear_attention_type (str, optional): The type of linear attention. Defaults to None.
+        experimental_attention_variant (str, optional): The type of experimental attention variant.
+                                                        Defaults to None.
         fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
         moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
                                                       Defaults to False.
@@ -205,15 +207,17 @@ def get_gpt_layer_local_spec(
             " and will be removed soon. Please update your code accordingly."
         )
 
-    if linear_attention_type is not None:
-        raise NotImplementedError("Linear attention is not supported with local spec yet.")
+    if experimental_attention_variant is not None:
+        raise NotImplementedError(
+            "Experimental attention variant is not supported with local spec yet."
+        )
 
     sharded_state_dict_keys_map = {}
 
     attention = get_attention_module_spec_for_backend(
         backend=backend,
         sharded_state_dict_keys_map=sharded_state_dict_keys_map,
-        linear_attention_type=linear_attention_type,
+        experimental_attention_variant=experimental_attention_variant,
         qk_layernorm=qk_layernorm,
         qk_l2_norm=qk_l2_norm,
         multi_latent_attention=multi_latent_attention,
@@ -278,7 +282,7 @@ def get_transformer_layer_spec_for_backend(
 def get_attention_module_spec_for_backend(
     backend: BackendSpecProvider,
     sharded_state_dict_keys_map: dict,
-    linear_attention_type: Optional[str] = None,
+    experimental_attention_variant: Optional[str] = None,
     qk_layernorm: Optional[bool] = False,
     qk_l2_norm: Optional[bool] = False,
     multi_latent_attention: Optional[bool] = False,
@@ -288,11 +292,17 @@ def get_attention_module_spec_for_backend(
 ) -> ModuleSpec:
     """Helper function to get module spec for Attention"""
 
-    if linear_attention_type is not None:
-        return get_linear_attention_module_spec_for_backend(
-            backend=backend,
-            linear_attention_type=linear_attention_type,
-            normalization=normalization,
+    if experimental_attention_variant is not None:
+        return get_experimental_attention_variant_module_spec_for_backend(
+            backend,
+            sharded_state_dict_keys_map,
+            experimental_attention_variant,
+            qk_layernorm,
+            qk_l2_norm,
+            multi_latent_attention,
+            mla_down_proj_use_column_parallel,
+            normalization,
+            fallback_to_eager_attn,
         )
 
     # Adjust for RMS norm.
@@ -526,13 +536,12 @@ def get_gpt_decoder_layer_specs(
                 num_experts = None
                 moe_grouped_gemm = None
             if attention_type == "linear_attention":
-                if config.linear_attention_type is None:
+                linear_attention_variants = ["gated_delta_net"]
+                if config.experimental_attention_variant not in linear_attention_variants:
                     # Skip if there is no linear attention layer in the model.
                     continue
-                linear_attention_type = config.linear_attention_type
                 multi_latent_attention = None
             else:
-                linear_attention_type = None
                 multi_latent_attention = config.multi_latent_attention
 
             layer_spec_key = f"{mlp_type}_{attention_type}"
@@ -540,7 +549,7 @@ def get_gpt_decoder_layer_specs(
                 num_experts=num_experts,
                 moe_grouped_gemm=moe_grouped_gemm,
                 multi_latent_attention=multi_latent_attention,
-                linear_attention_type=linear_attention_type,
+                experimental_attention_variant=config.experimental_attention_variant,
                 **get_layer_spec_kwargs,
             )
 
@@ -583,7 +592,8 @@ def get_gpt_decoder_layer_specs(
             f"current linear attention pattern: {config.linear_attention_freq}"
         )
     elif config.linear_attention_freq is None:
-        if config.linear_attention_type is None:
+        linear_attention_variants = ["gated_delta_net"]
+        if config.experimental_attention_variant not in linear_attention_variants:
             linear_attention_pattern = [0] * config.num_layers
         else:
             linear_attention_pattern = [1] * config.num_layers
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index b46ea83a4d4..b3f6bdcc728 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -569,9 +569,19 @@ def _postprocess(
                 loss_mask = torch.ones_like(mtp_labels)
             for mtp_layer_number in range(self.config.mtp_num_layers):
                 # Calc loss for the current Multi-Token Prediction (MTP) layers.
-                mtp_labels, _ = roll_tensor(mtp_labels, shifts=-1, dims=-1, cp_group=self.cp_group)
+                mtp_labels, _ = roll_tensor(
+                    mtp_labels,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
+                )
                 loss_mask, num_tokens = roll_tensor(
-                    loss_mask, shifts=-1, dims=-1, cp_group=self.cp_group
+                    loss_mask,
+                    shifts=-1,
+                    dims=-1,
+                    cp_group=self.cp_group,
+                    packed_seq_params=packed_seq_params,
                 )
 
                 # Compute mtp loss without storing logits to save memory.
diff --git a/megatron/core/models/gpt/linear_attention_module_specs.py b/megatron/core/models/gpt/linear_attention_module_specs.py
deleted file mode 100644
index 7e76d845cff..00000000000
--- a/megatron/core/models/gpt/linear_attention_module_specs.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-
-from typing import Optional
-
-from megatron.core.models.backends import BackendSpecProvider
-from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules
-from megatron.core.transformer.spec_utils import ModuleSpec
-
-
-def get_linear_attention_module_spec_for_backend(
-    backend: BackendSpecProvider, linear_attention_type: str, normalization: Optional[str] = None
-) -> ModuleSpec:
-    """Helper function to get module spec for Linear Attention"""
-    rms_norm = normalization == "RMSNorm"
-    if linear_attention_type == "gated_delta_net":
-        attention = ModuleSpec(
-            module=GatedDeltaNet,
-            submodules=GatedDeltaNetSubmodules(
-                in_proj=backend.column_parallel_layer_norm_linear(),
-                out_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False),
-                out_proj=backend.row_parallel_linear(),
-            ),
-            metainfo={"fuse_input_layernorm": True},
-        )
-    else:
-        raise ValueError(f"Invalid linear attention type: {linear_attention_type}")
-    return attention
diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py
index 74031f38219..f6f40027789 100644
--- a/megatron/core/transformer/attention.py
+++ b/megatron/core/transformer/attention.py
@@ -190,6 +190,7 @@ def __init__(
         self.key_hidden_size = self.hidden_size_per_attention_head
         self.val_hidden_size = self.hidden_size_per_attention_head
 
+        # TODO: This is built twice when using MLA, should be refactored.
         self.core_attention = build_module(
             submodules.core_attention,
             config=self.config,
diff --git a/megatron/core/transformer/experimental_attention_variant/dsa.py b/megatron/core/transformer/experimental_attention_variant/dsa.py
new file mode 100644
index 00000000000..fc994490b1b
--- /dev/null
+++ b/megatron/core/transformer/experimental_attention_variant/dsa.py
@@ -0,0 +1,822 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.models.common.embeddings import (
+    RotaryEmbedding,
+    YarnRotaryEmbedding,
+    apply_rotary_pos_emb,
+)
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.mappings import gather_from_sequence_parallel_region
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.transformer_config import TransformerConfig
+
+try:
+    from fast_hadamard_transform import hadamard_transform
+except ImportError:
+    hadamard_transform = None
+
+
+def rotate_activation(x: torch.Tensor) -> torch.Tensor:
+    """Apply Hadamard rotation activation.
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L424-L428
+
+    Args:
+        x: Input tensor (must be bfloat16).
+
+    Returns:
+        Rotated tensor.
+    """
+    assert (
+        x.dtype == torch.bfloat16
+    ), f"rotate_activation only support bf16 input, but got {x.dtype}"
+    assert hadamard_transform is not None, "fast_hadamard_transform is not installed."
+    hidden_size = x.size(-1)
+    return hadamard_transform(x, scale=hidden_size**-0.5)
+
+
+class DSAIndexerLossLoggingHelper:
+    """Helper class for logging sparse attention indexer losses."""
+
+    tracker = {}
+
+    @staticmethod
+    def save_loss_to_tracker(
+        loss: torch.Tensor,
+        layer_number: int,
+        num_layers: int,
+        reduce_group: torch.distributed.ProcessGroup = None,
+        avg_group: torch.distributed.ProcessGroup = None,
+    ):
+        """Save the indexer loss for logging.
+
+        Args:
+            loss: The loss tensor.
+            layer_number: Layer index of the loss, 1-indexed.
+            num_layers: The number of total layers.
+            reduce_group: The group for reducing the loss.
+            avg_group: The group for averaging the loss.
+        """
+        # Skip indexer loss logging if layer_number is None.
+        if layer_number is None:
+            return
+
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            tracker["values"] = torch.zeros(num_layers, device=torch.cuda.current_device())
+        tracker["values"][layer_number - 1] += loss.detach()
+        tracker["reduce_group"] = reduce_group
+        tracker["avg_group"] = avg_group
+
+    @staticmethod
+    def clean_loss_in_tracker():
+        """Clear the indexer losses."""
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" in tracker:
+            tracker["values"].zero_()
+        tracker["reduce_group"] = None
+        tracker["avg_group"] = None
+
+    @staticmethod
+    def reduce_loss_in_tracker():
+        """Collect and reduce the indexer losses across ranks."""
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            return
+        values = tracker["values"]
+
+        torch.distributed.all_reduce(
+            values, group=parallel_state.get_pipeline_model_parallel_group()
+        )
+        # Reduce indexer losses across ranks.
+        if tracker.get('reduce_group') is not None:
+            torch.distributed.all_reduce(values, group=tracker.get('reduce_group'))
+        if tracker.get('avg_group') is not None:
+            torch.distributed.all_reduce(
+                values, group=tracker['avg_group'], op=torch.distributed.ReduceOp.AVG
+            )
+        torch.distributed.all_reduce(
+            values,
+            group=parallel_state.get_data_parallel_group(with_context_parallel=False),
+            op=torch.distributed.ReduceOp.AVG,
+        )
+
+    @staticmethod
+    def track_indexer_metrics(
+        loss_scale: float,
+        iteration: int,
+        writer,
+        wandb_writer=None,
+        total_loss_dict=None,
+        per_layer_logging: bool = False,
+    ):
+        """Track the sparse attention indexer metrics for logging.
+
+        Args:
+            loss_scale: Scale factor for the loss.
+            iteration: Current training iteration.
+            writer: TensorBoard writer.
+            wandb_writer: Weights & Biases writer.
+            total_loss_dict: Dictionary to accumulate total losses.
+            per_layer_logging: Whether to log per-layer losses.
+        """
+        DSAIndexerLossLoggingHelper.reduce_loss_in_tracker()
+        tracker = DSAIndexerLossLoggingHelper.tracker
+        if "values" not in tracker:
+            return
+
+        indexer_loss_values = tracker["values"] * loss_scale
+        num_layers = indexer_loss_values.shape[0]
+
+        # Average across all layers (assuming all layers have sparse attention)
+        avg_indexer_loss = indexer_loss_values.sum() / num_layers
+
+        # Log average loss
+        if total_loss_dict is not None:
+            if "indexer loss" in total_loss_dict:
+                total_loss_dict["indexer loss"] += avg_indexer_loss
+            else:
+                total_loss_dict["indexer loss"] = avg_indexer_loss
+
+        if writer is not None:
+            writer.add_scalar("indexer loss", avg_indexer_loss, iteration)
+
+        if wandb_writer is not None:
+            wandb_writer.log({"indexer loss": avg_indexer_loss}, iteration)
+
+        DSAIndexerLossLoggingHelper.clean_loss_in_tracker()
+
+
+def compute_dsa_indexer_loss(
+    index_scores: torch.Tensor,
+    topk_indices: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    softmax_scale: float,
+    loss_coeff: float,
+    sparse_loss: bool,
+    pg_collection: ProcessGroupCollection,
+) -> torch.Tensor:
+    """
+    Compute KL divergence loss between index_scores and true attention_scores.
+
+    This loss trains the indexer to predict which tokens are important by matching the distribution
+    of true attention scores.
+
+    Reference: Section 2.1 of
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/DeepSeek_V3_2.pdf
+
+    Args:
+        index_scores: Scores predicted by indexer [batch, seqlen_q, seqlen_k].
+        topk_indices: Top-k indices [batch, seqlen_q, index_topk].
+        query: Query tensor [seqlen_q, batch, heads, dim].
+        key: Key tensor [seqlen_k, batch, heads, dim].
+        softmax_scale: Scale coefficient after q @ k^T.
+        loss_coeff: Coefficient for the indexer KL divergence loss.
+        sparse_loss: bool, whether to use sparse indexer loss. If True, only the topk
+            indices will be used to compute the loss.
+        pg_collection: Process group collection, must have TP process group.
+
+    Returns:
+        index_loss: KL divergence loss (scalar).
+    """
+    sq, b, np, hn = query.size()
+    sk = key.size(0)
+
+    # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn]
+    query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn)
+    # [sk, b, np, hn] -> [b, np, hn, sk] -> [b * np, hn, sk]
+    key = key.permute(1, 2, 3, 0).reshape(b * np, hn, sk)
+    # Compute attention scores [b * np, sq, sk]
+    attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale
+    # Reshape to [b, np, sq, sk]
+    attention_scores = attention_scores.reshape(b, np, sq, sk)
+
+    # causal_mask [sq, sk]
+    causal_mask = torch.triu(
+        torch.full((sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device),
+        diagonal=1,
+    )
+    # index_mask [b, sq, sk]
+    index_mask = torch.full(
+        (b, sq, sk), float("-inf"), dtype=torch.float32, device=causal_mask.device
+    ).scatter_(-1, topk_indices, 0)
+
+    # [b, np, sq, skv] + [1, 1, sq, skv] -> [b, np, sq, skv]
+    attention_scores += causal_mask.view(1, 1, sq, sk)
+    if sparse_loss:
+        # [b, np, sq, sk] + [b, 1, sq, sk] -> [b, np, sq, sk]
+        attention_scores += index_mask.view(b, 1, sq, sk)
+        # [b, sq, sk] + [b, sq, sk] -> [b, sq, sk]
+        index_scores += index_mask
+
+    # [b, np, sq, sk] -> [b, np, sq, sk]
+    attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32)
+    # [b, sq, sk] -> [b, sq, sk]
+    index_scores = torch.nn.functional.softmax(index_scores, dim=-1, dtype=torch.float32)
+
+    # Sum attention scores across heads.
+    # [batch, heads, seqlen_q, seqlen_k] -> [batch, seqlen_q, seqlen_k]
+    attention_scores = attention_scores.sum(dim=1)
+    if pg_collection.tp.size() > 1:
+        # attention scores are scattered to TP ranks in head dimension.
+        torch.distributed.all_reduce(attention_scores.contiguous(), group=pg_collection.tp)
+    # L1 normalize target on the last dimension. Doesn't use abs() because attention_scores are
+    # obtained from softmax so they are already non-negative.
+    attention_scores = attention_scores / attention_scores.sum(dim=-1, keepdim=True)
+
+    # Compute KL divergence: KL(target || index) = target(x) * log(target(x) / index(x))
+    # kl_per_element [b, sq, sk]
+    kl_per_element = attention_scores * (
+        torch.log(attention_scores + 1e-10) - torch.log(index_scores + 1e-10)
+    )
+
+    # [b, sq, sk] -> [b, sq] -> [1]
+    # Each token has same weight in the loss.
+    kl_div = kl_per_element.sum(dim=-1).mean()
+
+    # Scale by coefficient.
+    indexer_loss = kl_div * loss_coeff
+
+    return indexer_loss
+
+
+class DSAIndexerLossAutoScaler(torch.autograd.Function):
+    """An AutoScaler that triggers the backward pass and scales the grad for indexer loss.
+
+    This custom autograd function attaches a KL divergence loss to the activation
+    to train the indexer to predict attention scores without affecting the forward pass.
+    """
+
+    main_loss_backward_scale: torch.Tensor = None
+
+    @staticmethod
+    def forward(ctx, output: torch.Tensor, indexer_loss: torch.Tensor):
+        """Preserve the indexer_loss by storing it in the context to avoid garbage collection.
+
+        Args:
+            output: The output tensor (activation).
+            indexer_loss: The indexer KL divergence loss tensor.
+
+        Returns:
+            torch.Tensor: The output tensor unchanged.
+        """
+        ctx.save_for_backward(indexer_loss)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        """Compute and scale the gradient for indexer loss.
+
+        Args:
+            grad_output: The gradient of the output.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled indexer loss
+                gradient.
+        """
+        (indexer_loss,) = ctx.saved_tensors
+        if DSAIndexerLossAutoScaler.main_loss_backward_scale is None:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale = torch.tensor(
+                1.0, device=indexer_loss.device
+            )
+        indexer_loss_backward_scale = DSAIndexerLossAutoScaler.main_loss_backward_scale
+        scaled_indexer_loss_grad = torch.ones_like(indexer_loss) * indexer_loss_backward_scale
+        return grad_output, scaled_indexer_loss_grad
+
+    @staticmethod
+    def set_loss_scale(scale: torch.Tensor):
+        """Set the scale of the indexer loss.
+
+        Args:
+            scale: The scale value to set.
+        """
+        if DSAIndexerLossAutoScaler.main_loss_backward_scale is None:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale = scale
+        else:
+            DSAIndexerLossAutoScaler.main_loss_backward_scale.copy_(scale)
+
+
+@dataclass
+class DSAIndexerSubmodules:
+    """
+    Configuration class for specifying the submodules of an DSA Indexer.
+
+    Args:
+        linear_wq_b: Linear projection for query bottleneck expansion.
+        linear_wk: Linear projection for key.
+        k_norm: Layer normalization for key.
+        linear_weights_proj: Linear projection for attention weights.
+    """
+
+    linear_wq_b: Union[ModuleSpec, type] = None
+    linear_wk: Union[ModuleSpec, type] = None
+    k_norm: Union[ModuleSpec, type] = None
+    linear_weights_proj: Union[ModuleSpec, type] = None
+
+
+@dataclass
+class DSAttentionSubmodules:
+    """
+    Configuration class for specifying the submodules of DSAttention.
+
+    Args:
+        indexer: DSA Indexer module for computing sparse attention indices.
+    """
+
+    indexer: Union[ModuleSpec, type] = None
+
+
+class DSAIndexer(MegatronModule):
+    """
+    DSA Lightning Indexer for DeepSeek Sparse Attention.
+
+    Computes index scores to identify the top-k most relevant key-value pairs for each query in
+    sparse attention.
+
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L431-L480
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: DSAIndexerSubmodules,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ) -> None:
+        """Initialize the indexer.
+
+        Args:
+            config (TransformerConfig): The configuration for the transformer model.
+            submodules (DSAIndexerSubmodules): Indexer submodules specification.
+            pg_collection (ProcessGroupCollection, optional): Process groups for the indexer.
+        """
+        super().__init__(config=config)
+        self.hidden_size = self.config.hidden_size
+        self.qk_pos_emb_head_dim = self.config.qk_pos_emb_head_dim
+        self.q_lora_rank = (
+            self.config.q_lora_rank
+            if self.config.q_lora_rank is not None
+            else self.config.hidden_size
+        )
+
+        self.index_n_heads = self.config.dsa_indexer_n_heads
+        self.index_head_dim = self.config.dsa_indexer_head_dim
+        self.index_topk = self.config.dsa_indexer_topk
+
+        self.softmax_scale: float = self.index_head_dim**-0.5
+
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        self.pg_collection = pg_collection
+
+        # Initialize Position Embedding.
+        if self.config.rope_type == 'rope':
+            self.rotary_pos_emb = RotaryEmbedding(
+                self.qk_pos_emb_head_dim,
+                rotary_percent=self.config.rotary_percent,
+                rotary_base=self.config.rotary_base,
+                cp_group=self.pg_collection.cp,
+            )
+        elif self.config.rope_type == 'yarn':
+            self.rotary_pos_emb = YarnRotaryEmbedding(
+                self.qk_pos_emb_head_dim,
+                rotary_base=self.config.rotary_base,
+                scaling_factor=self.config.rotary_scaling_factor,
+                original_max_position_embeddings=self.config.original_max_position_embeddings,
+                beta_fast=self.config.beta_fast,
+                beta_slow=self.config.beta_slow,
+                mscale=self.config.mscale,
+                mscale_all_dim=self.config.mscale_all_dim,
+                cp_group=self.pg_collection.cp,
+            )
+        else:
+            raise ValueError(
+                f'Unsupported RoPE type: {self.config.rope_type}, supported types are "rope" and '
+                f'"yarn"'
+            )
+
+        self.linear_wq_b = build_module(
+            submodules.linear_wq_b,
+            self.q_lora_rank,
+            self.index_n_heads * self.index_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        self.linear_wk = build_module(
+            submodules.linear_wk,
+            self.hidden_size,
+            self.index_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        k_norm_config = copy.copy(self.config)
+        k_norm_config.normalization = "LayerNorm"
+        self.k_norm = build_module(
+            submodules.k_norm,
+            config=k_norm_config,
+            hidden_size=self.index_head_dim,
+            eps=self.config.layernorm_epsilon,
+        )
+
+        self.linear_weights_proj = build_module(
+            submodules.linear_weights_proj,
+            self.hidden_size,
+            self.index_n_heads,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+    def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, mscale: float):
+        """Apply RoPE to the input tensor."""
+        # x_nope [seqlen, batch, *, index_head_dim - qk_pos_emb_head_dim]
+        # x_pe   [seqlen, batch, *, qk_pos_emb_head_dim]
+        x_nope, x_pe = torch.split(
+            x, [self.index_head_dim - self.qk_pos_emb_head_dim, self.qk_pos_emb_head_dim], dim=-1
+        )
+        x_pe = apply_rotary_pos_emb(
+            x_pe,
+            rotary_pos_emb,
+            config=self.config,
+            cu_seqlens=None,
+            mscale=mscale,
+            cp_group=self.pg_collection.cp,
+        )
+        # [seqlen, batch, *, index_head_dim]
+        x = torch.cat([x_nope, x_pe], dim=-1)
+        return x
+
+    def _compute_index_scores(
+        self, q: torch.Tensor, weights: torch.Tensor, k: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Perform index score using BF16 precision.
+
+        Reference:
+            https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/kernel.py#L254-L274
+        This is a BF16 implementation of the `fp8_index` logic:
+            1. Compute attention scores: q @ k^T;
+            2. Apply ReLU activation;
+            3. Weight by attention weights;
+            4. Sum across attention heads.
+
+        Args:
+            q: BF16 [seqlen_q, batch, index_n_heads, index_head_dim], the query tensor.
+            weights: BF16 [seqlen_q, batch, index_n_heads], the attention weights.
+            k: BF16 [seqlen_k, batch, index_head_dim], the key tensor.
+
+        Returns:
+            index_scores: FP32 [batch, seqlen_q, seqlen_k], the index scores.
+        """
+        # Compute attention scores: q @ k^T
+        # [seqlen_q, batch, index_n_heads, index_head_dim] @ [seqlen_k, batch, index_head_dim]^T
+        #   -> [seqlen_q, batch, index_n_heads, seqlen_k]
+        index_scores = torch.einsum('sbhd,tbd->sbht', q.float(), k.float())
+
+        # Apply ReLU activation.
+        index_scores = torch.relu(index_scores)
+
+        # Weight each head by attention weights.
+        # [seqlen_q, batch, index_n_heads, seqlen_k] * [seqlen_q, batch, index_n_heads, 1]
+        #   -> [seqlen_q, batch, index_n_heads, seqlen_k]
+        index_scores = index_scores * weights.unsqueeze(-1)
+
+        # Sum across attention heads.
+        # [seqlen_q, batch, index_n_heads, seqlen_k] -> [seqlen_q, batch, seqlen_k]
+        index_scores = index_scores.sum(dim=2)
+
+        # Transpose to [batch, seqlen_q, seqlen_k].
+        index_scores = index_scores.transpose(0, 1)
+
+        return index_scores
+
+    def forward_with_scores(
+        self,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass for DSA Indexer that returns both index scores and top-k indices.
+
+        This is used when KL loss is enabled to compare indexer scores with true attention scores.
+
+        Args:
+            x: hidden states [seqlen, batch, hidden_size].
+            qr: Low-rank query tensor [seqlen, batch, q_lora_rank].
+            mask: Attention mask [batch, seqlen, seqlen].
+            packed_seq_params: Packed sequence parameters for variable length sequences.
+
+        Returns:
+            index_scores: Index scores [batch, seqlen, seqlen].
+            topk_indices: Top-k indices [batch, seqlen, index_topk].
+        """
+        assert packed_seq_params is None, "Packed sequence is not supported for DSAttention"
+
+        # =========================================
+        # Prepare RoPE params
+        # =========================================
+        rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
+            None, None, x, self.config, packed_seq_params
+        )
+        if self.config.rope_type == "rope":
+            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=False)
+            mscale = 1.0
+        else:
+            rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=False)
+
+        # =========================================
+        # Gather inputs if sp is enabled
+        # =========================================
+        if self.config.sequence_parallel and self.pg_collection.tp.size() > 1:
+            x = gather_from_sequence_parallel_region(x, group=self.pg_collection.tp)
+            qr = gather_from_sequence_parallel_region(qr, group=self.pg_collection.tp)
+
+        # =========================================
+        # Get sequence length and batch size
+        # =========================================
+        seqlen, bsz, _ = x.size()
+
+        # =========================================
+        # q linear and apply rope to q
+        # =========================================
+        # [seqlen, batch, q_lora_rank] -> [seqlen, batch, index_n_heads * index_head_dim]
+        q, _ = self.linear_wq_b(qr)
+        # [seqlen, batch, index_n_heads * index_head_dim]
+        #   -> [seqlen, batch, index_n_heads, index_head_dim]
+        q = q.reshape(seqlen, bsz, self.index_n_heads, self.index_head_dim)
+        q = self._apply_rope(q, rotary_pos_emb, mscale)
+
+        # =========================================
+        # k linear and apply rope to k
+        # =========================================
+        # [seqlen, batch, hidden_size] -> [seqlen, batch, index_head_dim]
+        k, _ = self.linear_wk(x)
+        k = self.k_norm(k)
+        # [seqlen, batch, index_head_dim] -> [seqlen, batch, 1, index_head_dim]
+        k = k.reshape(seqlen, bsz, 1, self.index_head_dim)
+        k = self._apply_rope(k, rotary_pos_emb, mscale)
+        # [seqlen, batch, 1, index_head_dim] -> [seqlen, batch, index_head_dim]
+        k = k.reshape(seqlen, bsz, self.index_head_dim)
+
+        # =========================================
+        # Rotate activation
+        # =========================================
+        q = rotate_activation(q)
+        k = rotate_activation(k)
+
+        # =========================================
+        # Compute index scores
+        # =========================================
+        # [seqlen, batch, hidden_size] -> [seqlen, batch, index_n_heads]
+        weights, _ = self.linear_weights_proj(x)
+        weights = weights * (self.index_n_heads**-0.5) * self.softmax_scale
+        # [batch, seqlen, seqlen]
+        index_scores = self._compute_index_scores(q, weights, k)
+        if mask is not None:
+            assert mask.dtype == index_scores.dtype, "Mask dtype must match index scores dtype"
+            index_scores = index_scores + mask
+
+        # =========================================
+        # Select top-k indices
+        # =========================================
+        topk_k = min(self.index_topk, seqlen)
+        # [batch, seqlen, index_topk]
+        topk_indices = index_scores.topk(topk_k, dim=-1)[1]
+
+        return index_scores, topk_indices
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+    ):
+        """
+        Forward pass for DSA Indexer.
+
+        Args:
+            x: hidden states [seqlen, batch, hidden_size].
+            qr: Low-rank query tensor [seqlen, batch, q_lora_rank].
+            mask: Attention mask [batch, seqlen, seqlen].
+            packed_seq_params: Packed sequence parameters for variable length sequences.
+
+        Returns:
+            topk_indices: Top-k indices for sparse attention [batch, seqlen, index_topk].
+        """
+        _, topk_indices = self.forward_with_scores(x, qr, mask, packed_seq_params)
+        return topk_indices
+
+
+def unfused_dsa_fn(query, key, value, topk_indices, softmax_scale):
+    """
+    Unfused sparse attention implementation.
+    """
+    sq, b, np, hn = query.size()
+    skv = key.size(0)
+    hnv = value.size(3)
+
+    # ===================================
+    # Raw attention scores [b, np, sq, skv]
+    # ===================================
+    # [sq, b, np, hn] -> [b, np, sq, hn] -> [b * np, sq, hn]
+    query = query.permute(1, 2, 0, 3).reshape(b * np, sq, hn)
+    # [skv, b, np, hn] -> [b, np, hn, skv] -> [b * np, hn, skv]
+    key = key.permute(1, 2, 3, 0).reshape(b * np, hn, skv)
+    # Compute attention scores [b * np, sq, skv]
+    attention_scores = torch.bmm(query.float(), key.float()) * softmax_scale
+    # Reshape to [b, np, sq, skv]
+    attention_scores = attention_scores.reshape(b, np, sq, skv)
+
+    # ===================================
+    # Apply sparse mask from indexer
+    # ===================================
+    # index_mask [b, sq, skv]
+    index_mask = torch.full((b, sq, skv), float("-inf"), device=attention_scores.device)
+    index_mask.scatter_(-1, topk_indices, 0)
+    # causal_mask [sq, skv]
+    causal_mask = torch.triu(
+        torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=index_mask.device),
+        diagonal=1,
+    )
+    # [b, sq, skv] + [1, sq, skv] -> [b, sq, skv]
+    index_mask += causal_mask.view(1, sq, skv)
+    # [b, np, sq, skv] + [b, 1, sq, skv] -> [b, np, sq, skv]
+    attention_scores += index_mask.unsqueeze(1)
+    attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32)
+
+    # ===================================
+    # Output
+    # ===================================
+    # [skv, b, np, hnv] -> [b, np, skv, hnv] -> [b * np, skv, hnv]
+    value = value.permute(1, 2, 0, 3).reshape(b * np, skv, hnv)
+    # Reshape attention_scores: [b, np, sq, skv] -> [b * np, sq, skv]
+    attention_scores = attention_scores.reshape(b * np, sq, skv)
+    # Compute output: [b * np, sq, hnv]
+    output = torch.bmm(attention_scores.to(value.dtype), value)
+    # Reshape output: [b * np, sq, hnv] -> [b, np, sq, hnv] -> [sq, b, np, hnv]
+    output = output.reshape(b, np, sq, hnv).permute(2, 0, 1, 3).contiguous()
+    # Flatten: [sq, b, np, hnv] -> [sq, b, np * hnv]
+    output = output.reshape(sq, b, np * hnv)
+    return output
+
+
+class DSAttention(MegatronModule):
+    """
+    This module implements sparse attention mechanism using an DSA Indexer to compute top-k
+    attention indices for reducing computational complexity.
+
+    Reference:
+        https://github.com/deepseek-ai/DeepSeek-V3.2-Exp/blob/main/inference/model.py#L491-L597
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: DSAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        k_channels: Optional[int] = None,
+        v_channels: Optional[int] = None,
+        cp_comm_type: str = "p2p",
+        pg_collection: ProcessGroupCollection = None,
+    ):
+        super().__init__(config=config)
+
+        self.layer_number = layer_number
+
+        self.indexer = build_module(
+            submodules.indexer, config=self.config, pg_collection=pg_collection
+        )
+
+        if softmax_scale is None:
+            softmax_scale = 1.0 / math.sqrt(
+                k_channels if k_channels is not None else config.kv_channels
+            )
+        self.softmax_scale = softmax_scale
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        attention_mask: torch.Tensor,
+        attn_mask_type: AttnMaskType = None,
+        attention_bias: torch.Tensor = None,
+        packed_seq_params: PackedSeqParams = None,
+    ):
+        """
+        Forward pass for Sparse Attention.
+
+        Args:
+            query: Query tensor [sq, b, np, hn].
+            key: Key tensor [skv, b, np, hn].
+            value: Value tensor [skv, b, np, hnv].
+            x: Original hidden states [sq, b, hidden_size].
+            qr: Low-rank query representation [sq, b, q_lora_rank].
+            attention_mask: Attention mask tensor [b, 1, sq, sk].
+            attn_mask_type: Type of attention mask.
+            attention_bias: Optional attention bias.
+            packed_seq_params: Packed sequence parameters.
+
+        Returns:
+            output: Output tensor [sq, b, hidden_size]
+        """
+        sq, b, np, hn = query.size()
+        skv = key.size(0)
+        hnv = value.size(3)
+
+        # Detach x and qr to prevent gradients of indexer from flowing back to the main model.
+        x = x.detach()
+        qr = qr.detach()
+
+        # Get a FP32 mask with -inf for masked positions.
+        if attn_mask_type is not None:
+            assert attn_mask_type == AttnMaskType.causal, 'Only causal mask is supported for now'
+            # Generate upper triangular mask with -inf above diagonal, 0 elsewhere
+            # torch.triu with diagonal=1 creates upper triangular matrix (excluding main diagonal)
+            # float_mask [sq, skv]
+            float_mask = torch.triu(
+                torch.full((sq, skv), float('-inf'), dtype=torch.float32, device=x.device),
+                diagonal=1,
+            )
+        else:
+            assert attention_mask.shape == (b, 1, sq, skv), 'attention_mask shape mismatch'
+            # [b, 1, sq, skv] -> [b, sq, skv]
+            mask = attention_mask.squeeze()
+            # float_mask [b, sq, skv]
+            float_mask = torch.zeros_like(mask, dtype=torch.float32).masked_fill(
+                mask, float('-inf')
+            )
+
+        # ===================================
+        # Get index scores and top-k indices
+        # ===================================
+        index_scores, topk_indices = self.indexer.forward_with_scores(
+            x, qr, mask=float_mask, packed_seq_params=packed_seq_params
+        )
+
+        # ===================================
+        # Run sparse attention kernel
+        # ===================================
+        output = unfused_dsa_fn(query, key, value, topk_indices, self.softmax_scale)
+
+        # ===================================
+        # Attach indexer loss
+        # ===================================
+        if self.training and torch.is_grad_enabled():
+            # Compute KL divergence loss between indexer scores and true attention scores
+            indexer_loss_coeff = getattr(self.config, 'dsa_indexer_loss_coeff', 0.0)
+            indexer_loss = compute_dsa_indexer_loss(
+                index_scores,
+                topk_indices,
+                query.detach(),
+                key.detach(),
+                self.softmax_scale,
+                indexer_loss_coeff,
+                getattr(self.config, "dsa_indexer_use_sparse_loss", False),
+                self.indexer.pg_collection,
+            )
+            # Save indexer loss for logging
+            if indexer_loss_coeff > 0:
+                DSAIndexerLossLoggingHelper.save_loss_to_tracker(
+                    loss=indexer_loss,
+                    layer_number=self.layer_number,
+                    num_layers=self.config.num_layers,
+                )
+            # Attach loss to output
+            output = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        return output
diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py
index 074523afd7b..3953d933b45 100644
--- a/megatron/core/transformer/multi_latent_attention.py
+++ b/megatron/core/transformer/multi_latent_attention.py
@@ -243,13 +243,28 @@ def forward(
         # Get the query, key and value tensors based on the type of attention -
         # self or cross attn.
         # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128]
-        query, key, value = self.get_query_key_value_tensors(
-            hidden_states,
-            key_value_states,
-            position_ids,
-            packed_seq_params,
-            inference_context=inference_context,
-        )
+        if self.config.experimental_attention_variant is None:
+            query, key, value = self.get_query_key_value_tensors(
+                hidden_states,
+                key_value_states,
+                position_ids,
+                packed_seq_params,
+                inference_context=inference_context,
+            )
+        elif self.config.experimental_attention_variant == "dsa":
+            query, key, value, q_compressed, _ = self.get_query_key_value_tensors(
+                hidden_states,
+                key_value_states,
+                position_ids,
+                packed_seq_params,
+                inference_context=inference_context,
+                return_compressed_tensors=True,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported experimental attention variant: "
+                f"{self.config.experimental_attention_variant}"
+            )
 
         # ===================================================
         # Adjust key, value for inference
@@ -281,14 +296,34 @@ def forward(
 
             if inference_context is None or inference_context.is_static_batching():
                 with get_fine_grained_offloading_context(self.offload_core_attention):
-                    core_attn_out = self.core_attention(
-                        query,
-                        key,
-                        value,
-                        attention_mask,
-                        packed_seq_params=packed_seq_params,
-                        attn_mask_type=attn_mask_type,
-                    )
+                    if self.config.experimental_attention_variant is None:
+                        core_attn_out = self.core_attention(
+                            query,
+                            key,
+                            value,
+                            attention_mask,
+                            packed_seq_params=packed_seq_params,
+                            attn_mask_type=attn_mask_type,
+                        )
+                    elif self.config.experimental_attention_variant == "dsa":
+                        # For dsa we need to pass in the original hidden states and the compressed
+                        # query representation.
+                        core_attn_out = self.core_attention(
+                            query,
+                            key,
+                            value,
+                            x=hidden_states,
+                            qr=q_compressed,
+                            attention_mask=attention_mask,
+                            attn_mask_type=attn_mask_type,
+                            attention_bias=None,
+                            packed_seq_params=packed_seq_params,
+                        )
+                    else:
+                        raise ValueError(
+                            f"Unsupported attention variant: "
+                            f"{self.config.experimental_attention_variant}"
+                        )
             elif self.cache_mla_latents:
                 # Dynamic batching attention kernel.
                 q, k, v = (query, key, value)
@@ -494,6 +529,7 @@ def get_query_key_value_tensors(
         inference_context=None,
         *,
         inference_params=None,
+        return_compressed_tensors=False,
     ):
         """
         Derives `query`, `key` and `value` tensors from `hidden_states`.
@@ -603,6 +639,16 @@ def get_query_key_value_tensors(
             kv_compressed = kv_compressed.squeeze(1)
             k_pos_emb = k_pos_emb.squeeze(1)
 
+        # =========================================
+        # Apply norm
+        # =========================================
+
+        if self.config.q_lora_rank is not None:
+            # q_compressed: [num_tokens, q_lora_rank]
+            q_compressed = self.q_layernorm(q_compressed)
+
+        kv_compressed = self.kv_layernorm(kv_compressed)
+
         # =========================================
         # QKV up projection and RoPE apply
         # =========================================
@@ -613,7 +659,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv(
             if self.config.q_lora_rank is not None:
                 # q_compressed: [num_tokens, q_lora_rank]
                 # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)]
-                q_compressed = self.q_layernorm(q_compressed)
                 q, _ = self.linear_q_up_proj(q_compressed)
             else:
                 # q_compressed: [num_tokens, hidden_size]
@@ -623,8 +668,6 @@ def qkv_up_proj_and_rope_apply_for_cached_latent_kv(
             # q: [num_tokens, n, q_head_dim]
             q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim)
 
-            kv_compressed = self.kv_layernorm(kv_compressed)
-
             # [num_tokens, qk_pos_emb_head_dim] -> [num_tokens, 1, qk_pos_emb_head_dim]
             k_pos_emb = torch.unsqueeze(k_pos_emb, -2)
 
@@ -688,7 +731,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             if self.config.q_lora_rank is not None:
                 # q_compressed: [num_tokens, q_lora_rank]
                 # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)]
-                q_compressed = self.q_layernorm(q_compressed)
                 q, _ = self.linear_q_up_proj(q_compressed)
             else:
                 # q_compressed: [num_tokens, hidden_size]
@@ -698,8 +740,6 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             # q: [num_tokens, n, q_head_dim]
             q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim)
 
-            kv_compressed = self.kv_layernorm(kv_compressed)
-
             # kv: [num_tokens, n * (qk_head_dim + v_head_dim)]
             kv, _ = self.linear_kv_up_proj(kv_compressed)
 
@@ -824,7 +864,10 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb
                 )
 
-        return query, key, value
+        if return_compressed_tensors:
+            return query, key, value, q_compressed, kv_compressed
+        else:
+            return query, key, value
 
     def uncompress_kv_from_cache(self, kv_cached):
         """
diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py
index e79af23ef04..a8f4abfcdd3 100755
--- a/megatron/core/transformer/multi_token_prediction.py
+++ b/megatron/core/transformer/multi_token_prediction.py
@@ -126,7 +126,7 @@ def tie_output_layer_state_dict(
     )
 
 
-def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
+def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None, packed_seq_params=None):
     """Roll the tensor input along the sequence dimension with Context Parallelism (CP) support.
 
     This function extends the original roll_tensor to support Context Parallelism, which allows
@@ -138,15 +138,24 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
     For CP>1: Splits tensor into chunks, performs rolling within each chunk, then exchanges
     boundary elements between adjacent CP ranks to maintain sequence continuity.
 
+    For packed sequences: Respects sequence boundaries when rolling to avoid mixing tokens
+    from different sequences.
+
     Args:
         tensor (Tensor): The input tensor to roll.
         shifts (int): The shift of the tensor (typically -1 for MTP).
         dims (int): The dimension to roll (typically -1 for sequence dimension).
         cp_group (ProcessGroup): The context parallelism process group. If None or size=1,
                                falls back to standard rolling behavior.
+        packed_seq_params (PackedSeqParams): Parameters for packed sequence processing.
+                                            If provided, respects sequence boundaries.
     Returns:
         tuple: (rolled_tensor, sum_of_rolled_tensor)
     """
+    # Handle packed sequences cases
+    if packed_seq_params is not None:
+        return _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group)
+
     # Standard rolling behavior when CP is not enabled (cp_group is None or size=1)
     if cp_group is None or cp_group.size() == 1:
         rolled_tensor = torch.roll(tensor, shifts=shifts, dims=dims)
@@ -215,6 +224,91 @@ def roll_tensor(tensor, shifts=-1, dims=-1, cp_group=None):
     return rolled_tensor, rolled_tensor.sum()
 
 
+def _roll_tensor_packed_seq(tensor, shifts, dims, packed_seq_params, cp_group=None):
+    """Roll tensor with packed sequence support.
+    This function handles rolling for packed sequences by respecting sequence boundaries
+    """
+
+    # Notice: This is a naive implementation to test the correctness,
+    # a better solution will only sync the boundary tokens once.
+    assert (
+        dims == -1 or dims == tensor.dim() - 1
+    ), "Packed sequence roll only supports the last dimension."
+    assert shifts == -1, "Packed sequence roll only supports a single-token left shift."
+    cu_seqlens = packed_seq_params.cu_seqlens_q
+    assert cu_seqlens is not None, "Packed sequence parameters must provide cu_seqlens_q."
+
+    rolled_tensor = tensor.clone()
+
+    cp_size = cp_group.size() if cp_group is not None else 1
+    if cp_size == 1:
+        # CP disabled: roll each packed sequence independently within its boundaries
+        for i in range(len(cu_seqlens) - 1):
+            start_idx = cu_seqlens[i]
+            end_idx = cu_seqlens[i + 1]
+            seq_slice = tensor[..., start_idx:end_idx]
+            rolled_seq = torch.roll(seq_slice, shifts=shifts, dims=dims)
+            # Zero out the last position(s) that would cross sequence boundaries
+            rolled_seq[..., shifts:] = 0
+            rolled_tensor[..., start_idx:end_idx] = rolled_seq
+        return rolled_tensor, rolled_tensor.sum()
+
+    # CP enabled: each rank owns two chunks per sequence (front and mirrored tail).
+    local_rank = torch.distributed.get_rank(group=cp_group)
+    global_ranks = torch.distributed.get_process_group_ranks(group=cp_group)
+    next_rank = global_ranks[(local_rank + 1) % cp_size]
+    prev_rank = global_ranks[(local_rank - 1) % cp_size]
+
+    # Iterate over each sequence individually
+    for i in range(len(cu_seqlens) - 1):
+        start_idx = cu_seqlens[i]
+        end_idx = cu_seqlens[i + 1]
+
+        # the idx has been multiplied by cp_size, need to divide it by cp_size to get the local idx
+        local_start_idx = start_idx // cp_size
+        local_end_idx = end_idx // cp_size
+        tensor_slice = rolled_tensor[..., local_start_idx:local_end_idx].clone()
+
+        # The following code is very similar as the code in roll_tensor function
+        local_chunks = tensor_slice.chunk(2, dim=dims)
+        rolled_chunks = [torch.roll(chunk, shifts=shifts, dims=dims) for chunk in local_chunks]
+
+        tensor_send_list = []
+        tensor_recv_list = []
+        for chunk in rolled_chunks:
+            boundary = chunk.select(dims, shifts).contiguous().clone()
+            tensor_send_list.append(boundary)
+            tensor_recv_list.append(torch.empty_like(boundary))
+
+        ops = []
+        if local_rank != 0:
+            ops.append(torch.distributed.isend(tensor=tensor_send_list[0], dst=prev_rank))
+            ops.append(torch.distributed.irecv(tensor=tensor_recv_list[1], src=prev_rank))
+        else:
+            tensor_recv_list[1].zero_()
+
+        if local_rank != cp_size - 1:
+            ops.append(torch.distributed.irecv(tensor=tensor_recv_list[0], src=next_rank))
+            ops.append(torch.distributed.isend(tensor=tensor_send_list[1], dst=next_rank))
+        else:
+            tensor_recv_list[0].copy_(tensor_send_list[1])
+
+        for op in ops:
+            op.wait()
+
+        index = [slice(None)] * rolled_chunks[0].dim()
+        index[dims] = shifts
+        for chunk, recv in zip(rolled_chunks, tensor_recv_list):
+            chunk[tuple(index)] = recv
+
+        seq_result = torch.cat(rolled_chunks, dim=dims)
+
+        # update the rolled tensor
+        rolled_tensor[..., local_start_idx:local_end_idx] = seq_result
+
+    return rolled_tensor, rolled_tensor.sum()
+
+
 class MTPLossLoggingHelper:
     """Helper class for logging MTP losses."""
 
@@ -595,6 +689,7 @@ def _get_embeddings(
         position_ids: torch.Tensor,
         embedding: Callable,
         hidden_states: torch.Tensor,
+        packed_seq_params: Optional[PackedSeqParams] = None,
     ):
         """
         Preprocesses input data for the Multi-Token Prediction (MTP) layers.
@@ -609,10 +704,23 @@ def _get_embeddings(
                 from gpt model to compute the decoder input.
             hidden_states (torch.Tensor): hidden states tensor of shape [s, b, h] where s is the
                 sequence length, b is the batch size, and h is the hidden size.
+            packed_seq_params (PackedSeqParams): Parameters for packed sequence processing.
         """
         # Calc logits for the current Multi-Token Prediction (MTP) layers.
-        input_ids, _ = roll_tensor(input_ids, shifts=-1, dims=-1, cp_group=self.cp_group)
-        position_ids, _ = roll_tensor(position_ids, shifts=-1, dims=-1, cp_group=self.cp_group)
+        input_ids, _ = roll_tensor(
+            input_ids,
+            shifts=-1,
+            dims=-1,
+            cp_group=self.cp_group,
+            packed_seq_params=packed_seq_params,
+        )
+        position_ids, _ = roll_tensor(
+            position_ids,
+            shifts=-1,
+            dims=-1,
+            cp_group=self.cp_group,
+            packed_seq_params=packed_seq_params,
+        )
         # embedding
         decoder_input = embedding(input_ids=input_ids, position_ids=position_ids)
 
@@ -795,15 +903,13 @@ def forward(
             [s, b, h], and optionally the updated context tensor if cross-attention is used.
         """
         assert context is None, f"multi token prediction + cross attention is not yet supported."
-        assert (
-            packed_seq_params is None
-        ), f"multi token prediction + sequence packing is not yet supported."
 
         input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings(
             input_ids=input_ids,
             position_ids=position_ids,
             embedding=embedding,
             hidden_states=hidden_states,
+            packed_seq_params=packed_seq_params,
         )
 
         if self.config.recompute_granularity == 'full' and self.training:
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index fae2e2f5d4d..656699ea2a2 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -233,11 +233,14 @@ class TransformerConfig(ModelParallelConfig):
     16 SMs can generally achieve good bandwidth."""
 
     ####################
-    # linear attention
+    # attention variant
     ####################
-    linear_attention_type: Optional[str] = None
-    """Type of linear attention to use. Currently support gated_delta_net."""
+    experimental_attention_variant: Optional[str] = None
+    """Type of attention variant to use. Currently support gated_delta_net and dsa."""
 
+    ####################
+    # attention variant: gated_delta_net
+    ####################
     linear_attention_freq: Optional[Union[int, List[int]]] = None
     """Frequency between LA (linear attention) layers 
     and SDPA (scaled dot-product attention) layers.
@@ -260,6 +263,25 @@ class TransformerConfig(ModelParallelConfig):
     linear_num_value_heads: Optional[int] = None
     """Number of value and gate heads for the gated delta net."""
 
+    ####################
+    # attention variant: dsa
+    ####################
+    dsa_indexer_n_heads: Optional[int] = None
+    """Number of DSA indexer heads."""
+
+    dsa_indexer_head_dim: Optional[int] = None
+    """Dimension per DSA indexer head."""
+
+    dsa_indexer_topk: Optional[int] = None
+    """Number of top-k tokens to select in DSA indexer."""
+
+    dsa_indexer_loss_coeff: Optional[float] = None
+    """Coefficient for the DSA indexer KL divergence loss. Set to 0 to disable indexer loss."""
+
+    dsa_indexer_use_sparse_loss: Optional[bool] = None
+    """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the
+    top-k indices."""
+
     ####################
     # initialization
     ####################
@@ -855,17 +877,12 @@ def __post_init__(self):
                 f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
             )
 
-        if self.linear_attention_type is not None:
-            supported_la_types = ["gated_delta_net"]
-            assert self.linear_attention_type in supported_la_types, (
-                f"linear_attention_type ({self.linear_attention_type}) only support"
-                f" one of {supported_la_types}."
-            )
+        if self.experimental_attention_variant in ["gated_delta_net"]:
             assert (
                 self.linear_attention_freq is not None
             ), f"linear_attention_freq must be set for linear attention."
 
-            if self.linear_attention_type == "gated_delta_net":
+            if self.experimental_attention_variant == "gated_delta_net":
                 # Check required parameters
                 assert (
                     self.linear_conv_kernel_dim is not None
@@ -900,6 +917,11 @@ def __post_init__(self):
                     f"Gated delta net does not support context parallel for now,"
                     f" but got {self.context_parallel_size=}."
                 )
+        elif self.experimental_attention_variant == "dsa":
+            assert (
+                self.context_parallel_size == 1
+            ), "Currently context parallelism is not supported by DSAttention!"
+            assert not self.apply_rope_fusion, "RoPE fusion is not supported for DSAttention"
 
         if self.fp8:
             # cannot support first last layer bf16 with delayed scaling
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index 7fca6a20f40..48ba9c8bd5f 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -69,7 +69,7 @@ def add_megatron_arguments(parser: argparse.ArgumentParser):
     parser = _add_vision_args(parser)
     parser = _add_moe_args(parser)
     parser = _add_mla_args(parser)
-    parser = _add_linear_attention_args(parser)
+    parser = _add_experimental_attention_variant_args(parser)
     parser = _add_heterogeneous_args(parser)
     parser = _add_logging_args(parser)
     parser = _add_straggler_detector_args(parser)
@@ -1194,13 +1194,21 @@ def validate_args(args, defaults={}):
             args.no_load_rng = True
             print('Warning: disabling --no-load-rng for upcycling.')
 
+    if args.linear_attention_type is not None:
+        print_rank_0(
+            '--linear-attention-type is deprecated, use --experimental-attention-variant instead.',
+            args.rank,
+        )
+        args.experimental_attention_variant = args.linear_attention_type
+        del args.linear_attention_type
+
     # Muon optimizercheck
     if 'muon' in args.optimizer:
         assert not args.use_distributed_optimizer, "Muon optimizer does not support distributed optimizer for now."
         assert not args.use_torch_fsdp2, "Muon optimizer does not support Torch-FSDP2 for now."
         assert not args.use_megatron_fsdp, "Muon optimizer does not support Megatron-FSDP for now."
         assert args.ckpt_format in ["torch", "torch_dist"], "Muon optimizer supports torch and torch_dist checkpoint format."
-        assert args.linear_attention_type is None, "Muon optimizer does not support linear attention type for now."
+        assert args.experimental_attention_variant is None, "Muon optimizer does not support attention variant for now."
         assert not args.attention_output_gate, "Muon optimizer does not support attention output gate for now."
 
     # Optimizer CPU offload check
@@ -3351,10 +3359,14 @@ def _add_mla_args(parser):
 
     return parser
 
-def _add_linear_attention_args(parser):
-    group = parser.add_argument_group(title="la")
+def _add_experimental_attention_variant_args(parser):
+    group = parser.add_argument_group(title="experimental_attention_variant")
+    group.add_argument('--experimental-attention-variant', default=None, choices=['gated_delta_net', 'dsa'], type=str,
+                       help='Type of attention variant to use. Currently support gated_delta_net and dsa.')
+
+    # Linear attention
     group.add_argument('--linear-attention-type', default=None, choices=['gated_delta_net'], type=str,
-                       help='Type of linear attention to use. Currently support gated_delta_net.')
+                       help='(Deprecated, use --experimental-attention-variant instead) Type of linear attention to use. Currently support gated_delta_net.')
     group.add_argument('--linear-attention-freq', type=la_freq_type, default=None,
                        help='Frequency between LA (linear attention) layers and'
                             ' SDPA (scaled dot-product attention) layers. Accepts either: '
@@ -3374,6 +3386,19 @@ def _add_linear_attention_args(parser):
                        help='Number of query and key heads for the gated delta net.')
     group.add_argument('--linear-num-value-heads', default=32, type=int,
                        help='Number of value and gate heads for the gated delta net.')
+
+    # DSA
+    group.add_argument('--dsa-indexer-n-heads', default=None, type=int,
+                       help='Number of indexer heads for sparse attention. If not set, defaults to num-attention-heads.')
+    group.add_argument('--dsa-indexer-head-dim', default=None, type=int,
+                       help='Dimension per indexer head for sparse attention. If not set, defaults to kv-channels.')
+    group.add_argument('--dsa-indexer-topk', default=None, type=int,
+                       help='Number of top-k tokens to select in sparse attention indexer.')
+    group.add_argument('--dsa-indexer-loss-coeff', default=0.0, type=float,
+                       help='Coefficient for the indexer KL divergence loss. Set to 0 to disable indexer loss.')
+    group.add_argument('--dsa-indexer-use-sparse-loss', action='store_true',
+                       help='Use sparse indexer loss. If set, the indexer loss will be computed using the top-k indices.')
+
     return parser
 
 def _add_heterogeneous_args(parser):
diff --git a/megatron/training/training.py b/megatron/training/training.py
index 9986f931641..5c9de623ce5 100644
--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -91,6 +91,7 @@
 from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
 from megatron.core.transformer.moe import upcycling_utils
 from megatron.core.transformer.moe.moe_utils import track_moe_metrics
+from megatron.core.transformer.experimental_attention_variant.dsa import DSAIndexerLossLoggingHelper
 from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper
 from megatron.core.parallel_state import (
     destroy_global_memory_buffer,
@@ -375,7 +376,8 @@ def transformer_flops():
                 )
             )
 
-        if args.linear_attention_type is not None:
+        linear_attention_variants = ["gated_delta_net"]
+        if args.experimental_attention_variant in linear_attention_variants:
             # Calculate number of dense and MoE Transformer MLPs.
             if isinstance(args.linear_attention_freq, int):
                 linear_attention_pattern = [
@@ -400,7 +402,7 @@ def transformer_flops():
             num_linear_attention_layers = sum(linear_attention_pattern)
             num_standard_attention_layers = num_layers - num_linear_attention_layers
 
-            if args.linear_attention_type == "gated_delta_net":
+            if args.experimental_attention_variant == "gated_delta_net":
                 # Calculate the FLOPs for the gated delta net attention.
                 qk_head_dim = args.linear_key_head_dim
                 v_head_dim = args.linear_value_head_dim
@@ -1698,6 +1700,16 @@ def training_log(
         MTPLossLoggingHelper.track_mtp_metrics(
             mtp_loss_scale, iteration, writer, wandb_writer, total_loss_dict
         )
+    # Track sparse attention indexer loss
+    if args.dsa_indexer_loss_coeff is not None and args.dsa_indexer_loss_coeff > 0:
+        indexer_loss_scale = 1 / get_num_microbatches()
+        DSAIndexerLossLoggingHelper.track_indexer_metrics(
+            loss_scale=indexer_loss_scale,
+            iteration=iteration,
+            writer=writer,
+            wandb_writer=wandb_writer,
+            total_loss_dict=total_loss_dict,
+        )
     if iteration % args.log_interval == 0:
         if args.record_memory_history and (is_last_rank() or torch.distributed.get_backend() == 'fake'):
             snapshot = torch.cuda.memory._snapshot()
@@ -1929,6 +1941,7 @@ def post_training_step_callbacks(
 
     # Straggler detector.
     if iteration % args.log_interval == 0 and args.log_straggler:
+        # Use FLOPs accumulated since last log event and then reset the counter
         stimer.report(num_floating_point_operations_since_last_log_event, args.log_interval)
         num_floating_point_operations_since_last_log_event = 0.0
 
@@ -1970,6 +1983,9 @@ def post_training_step_callbacks(
         if args.manual_gc_interval != 0 and iteration % args.manual_gc_interval == 0:
             gc.collect()
 
+    # Return updated FLOPs accumulator so caller can persist the reset
+    return num_floating_point_operations_since_last_log_event
+
 
 def checkpoint_and_decide_exit(
     model,
@@ -2585,8 +2601,9 @@ def get_e2e_base_metrics():
                 energy_monitor.resume()
 
         # Miscellaneous post-training-step functions (e.g., FT heartbeats, GC).
-        # Some of these only happen at specific iterations.
-        post_training_step_callbacks(
+        # Some of these only happen at specific iterations. Capture updated FLOPs accumulator
+        # (it is reset inside the callback after logging).
+        num_floating_point_operations_since_last_log_event = post_training_step_callbacks(
             model,
             optimizer,
             opt_param_scheduler,
diff --git a/pyproject.toml b/pyproject.toml
index 7f734927c1a..553f898ae6f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ dynamic = ["version", "readme"]
 description = "Megatron Core - a library for efficient and scalable training of transformer based models"
 requires-python = ">=3.10"
 license = { text = "Apache 2.0" }
-dependencies = ["torch", "numpy<2.0.0", "packaging>=24.2"]
+dependencies = ["torch", "numpy", "packaging>=24.2"]
 authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
 maintainers = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
 keywords = [
@@ -67,37 +67,44 @@ Homepage = "https://github.com/NVIDIA/Megatron-LM/megatron/core"
 mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]
 
 dev = [
-    "nvidia-modelopt[torch]>=0.33.0a0,<0.34.0; sys_platform != 'darwin'",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
+    "nvidia-modelopt[torch]; sys_platform != 'darwin'",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.10.0",
+    "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
     "tensorstore~=0.1,!=0.1.46,!=0.1.72",
     "nvtx~=0.2",
     "multi-storage-client~=0.27",
     "opentelemetry-api~=1.33.1",
-    "setuptools<80.0.0",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
     "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
-    "av<16.0.0",                                        # At the time, av 16.0.0 is not compatible with Python 3.12
+    "av",
     "flashinfer-python",
     "wget",
     "onnxscript",
     "flash-linear-attention~=0.3.2",
     "emerging_optimizers",
+    "fastapi~=0.50",                                          # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
 ]
 
 lts = [
     "tqdm",
-    "einops",
-    "tensorstore!=0.1.46,!=0.1.72",
-    "nvtx",
-    "transformers",
-    "zarr",
-    "setuptools<80.0.0",
+    "einops~=0.8",
+    "tensorstore~=0.1,!=0.1.46,!=0.1.72",
+    "nvtx~=0.2",
+    "multi-storage-client~=0.27",
+    "opentelemetry-api~=1.33.1",
+    "mamba-ssm~=2.2",
+    "causal-conv1d~=1.5",
+    "nv-grouped-gemm~=1.1",
+    "megatron-energon[av_decode]~=6.0",
+    "av",
+    "flashinfer-python",
     "wget",
+    "onnxscript",
+    "fastapi~=0.50",                      # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
 ]
 
 [dependency-groups]
@@ -141,7 +148,7 @@ linting = [
     "pylint==3.2.6",
 ]
 ci = ["python-gitlab", "slack-sdk", "pandas"]
-flash_mla = ["flash_mla"]
+no_pypi_wheels = ["flash_mla", "emerging_optimizers"]
 
 [tool.uv]
 default-groups = ["linting", "build", "test"]
@@ -168,7 +175,7 @@ override-dependencies = [
 flash_mla = [
     { git = "https://github.com/deepseek-ai/FlashMLA", rev = "9edee0c022cd0938148a18e334203b0aab43aa19" },
 ]
-transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
+# transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "release_v2.9" } # on `release_v2.9`
 nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "01a9a8ba360f7b2908728ad0516e0ad9d936966d" }
 emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "v0.1.0" }
 
diff --git a/tests/test_utils/python_scripts/download_unit_tests_dataset.py b/tests/test_utils/python_scripts/download_unit_tests_dataset.py
index 04470c2f820..a29394c29de 100644
--- a/tests/test_utils/python_scripts/download_unit_tests_dataset.py
+++ b/tests/test_utils/python_scripts/download_unit_tests_dataset.py
@@ -1,21 +1,35 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 #!/usr/bin/env python3
 """
 Script to fetch the oldest release of NVIDIA/Megatron-LM on GitHub and list its assets.
 Uses the PyGithub SDK to interact with the GitHub API.
 """
 
-import os
-import sys
+import logging
 import tarfile
 import zipfile
 from pathlib import Path
 
 import click
 import requests
-from github import Github
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+ASSETS = [
+    {
+        "name": "datasets.zip",
+        "url": "https://github.com/NVIDIA/Megatron-LM/releases/download/v2.5/datasets.zip",
+    },
+    {
+        "name": "tokenizers.zip",
+        "url": "https://github.com/NVIDIA/Megatron-LM/releases/download/v2.5/tokenizers.zip",
+    },
+]
 
-def download_and_extract_asset(asset_url: str, asset_name: str, assets_dir: Path) -> bool:
+
+def download_and_extract_asset(assets_dir: Path) -> bool:
     """
     Download and extract an asset to the assets directory.
 
@@ -27,144 +41,43 @@ def download_and_extract_asset(asset_url: str, asset_name: str, assets_dir: Path
     Returns:
         bool: True if successful, False otherwise
     """
-    try:
-        # Download the asset
-        print(f"  Downloading {asset_name}...")
-        response = requests.get(asset_url, stream=True)
-        response.raise_for_status()
-
-        # Save to temporary file
-        temp_file = assets_dir / asset_name
-        with open(temp_file, 'wb') as f:
-            for chunk in response.iter_content(chunk_size=8192):
-                f.write(chunk)
-
-        print(f"  Extracting {asset_name} to {assets_dir}...")
-
-        # Extract based on file type
-        if asset_name.endswith('.zip'):
-            with zipfile.ZipFile(temp_file, 'r') as zip_ref:
-                zip_ref.extractall(assets_dir)
-        elif asset_name.endswith(('.tar.gz', '.tgz')):
-            with tarfile.open(temp_file, 'r:gz') as tar_ref:
-                tar_ref.extractall(assets_dir)
-        elif asset_name.endswith('.tar'):
-            with tarfile.open(temp_file, 'r') as tar_ref:
-                tar_ref.extractall(assets_dir)
-        else:
-            print(f"  Warning: Unknown file type for {asset_name}, skipping extraction")
-            return False
-
-        # Clean up temporary file
-        temp_file.unlink()
-        print(f"  Successfully extracted to {assets_dir}")
-        return True
-
-    except Exception as e:
-        print(f"  Error downloading/extracting {asset_name}: {e}")
-        return False
-
-
-def get_oldest_release_and_assets(
-    repo_name: str = "NVIDIA/Megatron-LM", assets_dir: str = "assets"
-) -> None:
-    """
-    Fetch the oldest release of a GitHub repository and list its assets.
-
-    Args:
-        repo_name: The repository name in format "owner/repo"
-        assets_dir: Directory to extract assets to
-    """
-    try:
-        # Initialize GitHub client
-        g = Github(login_or_token=os.getenv('GH_TOKEN', None))
-
-        # Get the repository
-        repo = g.get_repo(repo_name)
-        print(f"Repository: {repo.full_name}")
-        print(f"Description: {repo.description}")
-        print(f"URL: {repo.html_url}")
-        print("-" * 80)
-
-        # Get all releases
-        releases = list(repo.get_releases())
-
-        if not releases:
-            print("No releases found for this repository.")
-            return
-
-        # Sort releases by creation date to find the oldest
-        releases.sort(key=lambda x: x.created_at)
-        oldest_release = releases[0]
-
-        print(f"Oldest Release:")
-        print(f"  Tag: {oldest_release.tag_name}")
-        print(f"  Title: {oldest_release.title}")
-        print(f"  Created: {oldest_release.created_at}")
-        print(f"  Published: {oldest_release.published_at}")
-        print(f"  Draft: {oldest_release.draft}")
-        print(f"  Prerelease: {oldest_release.prerelease}")
-        print(f"  URL: {oldest_release.html_url}")
-
-        if oldest_release.body:
-            print(f"  Description: {oldest_release.body[:200]}...")
-
-        print("-" * 80)
-
-        # List assets
-        assets = list(oldest_release.get_assets())
-
-        if not assets:
-            print("No assets found for this release.")
-            return
-
-        print(f"Assets ({len(assets)} total):")
-        print("-" * 80)
-
-        for i, asset in enumerate(assets, 1):
-            print(f"{i}. {asset.name}")
-            print(f"   Size: {asset.size} bytes ({asset.size / 1024 / 1024:.2f} MB)")
-            print(f"   Downloads: {asset.download_count}")
-            print(f"   Content Type: {asset.content_type}")
-            print(f"   URL: {asset.browser_download_url}")
-            print(f"   Created: {asset.created_at}")
-            print(f"   Updated: {asset.updated_at}")
-            print()
-
-        # Summary
-        total_size = sum(asset.size for asset in assets)
-        total_downloads = sum(asset.download_count for asset in assets)
-
-        print(f"Summary:")
-        print(f"  Total assets: {len(assets)}")
-        print(f"  Total size: {total_size} bytes ({total_size / 1024 / 1024:.2f} MB)")
-        print(f"  Total downloads: {total_downloads}")
-
-        # Download and extract assets if requested
-        if assets:
-            print("-" * 80)
-            print("Downloading and extracting assets...")
-
-            # Create assets directory
-            assets_path = Path(assets_dir)
-            assets_path.mkdir(parents=True, exist_ok=True)
-            print(f"Created assets directory: {assets_path.absolute()}")
-
-            successful_downloads = 0
-            for asset in assets:
-                print(f"\nProcessing asset: {asset.name}")
-                if download_and_extract_asset(asset.browser_download_url, asset.name, assets_path):
-                    successful_downloads += 1
-
-            print(f"\nDownload Summary:")
-            print(
-                f"  Successfully downloaded and extracted: {successful_downloads}/{len(assets)} assets"
-            )
-            print(f"  Assets directory: {assets_path.absolute()}")
-
-    except Exception as e:
-        print(f"Error: {e}")
-        sys.exit(1)
+    for asset in ASSETS:
+        asset_name, asset_url = asset.values()
+        try:
+            # Download the asset
+            logger.info(f"  Downloading {asset_name}...")
+            response = requests.get(asset_url, stream=True)
+            response.raise_for_status()
+
+            # Save to temporary file
+            temp_file = assets_dir / asset_name
+            with open(temp_file, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+
+            logger.info(f"  Extracting {asset_name} to {assets_dir}...")
+
+            # Extract based on file type
+            if asset_name.endswith('.zip'):
+                with zipfile.ZipFile(temp_file, 'r') as zip_ref:
+                    zip_ref.extractall(assets_dir)
+            elif asset_name.endswith(('.tar.gz', '.tgz')):
+                with tarfile.open(temp_file, 'r:gz') as tar_ref:
+                    tar_ref.extractall(assets_dir)
+            elif asset_name.endswith('.tar'):
+                with tarfile.open(temp_file, 'r') as tar_ref:
+                    tar_ref.extractall(assets_dir)
+            else:
+                logger.warning(
+                    f"  Warning: Unknown file type for {asset_name}, skipping extraction"
+                )
+
+            # Clean up temporary file
+            temp_file.unlink()
+            logger.info(f"  Successfully extracted to {assets_dir}")
+
+        except Exception as e:
+            logger.error(f"  Error downloading/extracting {asset_name}: {e}")
 
 
 @click.command()
@@ -174,10 +87,12 @@ def get_oldest_release_and_assets(
 @click.option('--assets-dir', default='assets', help='Directory to extract assets to')
 def main(repo, assets_dir):
     """Fetch the oldest release of a GitHub repository and download its assets."""
-    print(f"Fetching oldest release of {repo}...")
-    print("=" * 80)
+    logger.info(f"Fetching oldest release of {repo}...")
+    logger.info("=" * 80)
+
+    Path(assets_dir).mkdir(parents=True, exist_ok=True)
 
-    get_oldest_release_and_assets(repo_name=repo, assets_dir=assets_dir)
+    download_and_extract_asset(Path(assets_dir))
 
 
 if __name__ == "__main__":
diff --git a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml
index c61128aaca2..1b4786e8230 100644
--- a/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml
+++ b/tests/test_utils/recipes/gpt-dynamic-inference-with-coordinator.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
@@ -62,5 +62,5 @@ products:
   - test_case: [gpt_dynamic_inference_tp1_pp1_dp8_583m_logitsmatch_zmq]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/gpt-dynamic-inference.yaml b/tests/test_utils/recipes/gpt-dynamic-inference.yaml
index 66fa6887de8..a3853c3d9e1 100644
--- a/tests/test_utils/recipes/gpt-dynamic-inference.yaml
+++ b/tests/test_utils/recipes/gpt-dynamic-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
@@ -62,15 +62,15 @@ products:
   - test_case: [gpt_dynamic_inference_tp8_pp1_583m_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_fp8_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_dynamic_inference_tp1_pp1_583m_cuda_graphs_logitsmatch_decode_graphs_only]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/gpt-static-inference.yaml b/tests/test_utils/recipes/gpt-static-inference.yaml
index 033c6c35116..39c2c3c934e 100644
--- a/tests/test_utils/recipes/gpt-static-inference.yaml
+++ b/tests/test_utils/recipes/gpt-static-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: gpt
   build: mcore-pyt-{environment}
   nodes: 1
@@ -57,20 +57,20 @@ products:
   - test_case: [gpt_static_inference_tp1_pp1_583m_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_tp1_pp1_583m_cudagraphs]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_tp1_pp1_583m_fp8_cudagraphs]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml
index 0b3606fd702..0b068c55220 100644
--- a/tests/test_utils/recipes/gpt.yaml
+++ b/tests/test_utils/recipes/gpt.yaml
@@ -110,7 +110,7 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
@@ -124,201 +124,201 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp1_resume_torch_dist_uniform_full_recompute]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   # - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_cp4_a2a_p2p_nondeterministic]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #     - environment: [lts]
   #       scope: [nightly] # Non-deterministic: #487
   - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # outdated TE: #501
   - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_rope_embeddings_interleaved_no_fusion]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_disable_bias_linear]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # non-determinism: #436
   - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_persistent_disable_bias_linear]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_swiglu]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # non-determinism: #437
   - test_case: [gpt3_mcore_te_tp1_pp4_resume_torch_dist_untie_embeddings_and_outputs]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_calculate_per_token_loss]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
       # - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist]
       #   products:
       #     - environment: [dev]
-      #       scope: [mr, mr-github]
+      #       scope: [mr]
       #       platforms: [dgx_h100] # Hangs: #513
       # - environment: [lts]
       #   scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
     products:
       # - environment: [dev]
-      #   scope: [mr, mr-github]
+      #   scope: [mr]
       #   platforms: [dgx_h100] # Hangs: #513
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_untied]
     products:
       # - environment: [dev]
-      #   scope: [mr, mr-github] # Hangs: #513
+      #   scope: [mr] # Hangs: #513
       #   platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap]
     products:
       # - environment: [dev]
-      #   scope: [mr, mr-github] # Hangs: #513
+      #   scope: [mr] # Hangs: #513
       #   platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_cp2_nondeterministic]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_multi_dist_optimizer_instances]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_dp_last]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_dp_last]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_nondeterministic]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_calculate_per_token_loss_nondeterministic]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_cross_entropy_loss_fusion]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
@@ -326,14 +326,14 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
@@ -345,96 +345,96 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp2_mla]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_defer_embedding_wgrad_compute]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_no_create_attention_mask_in_dataloader]
     products:
       # - environment: [dev]
-      #   scope: [mr, mr-github] # Hangs: #513
+      #   scope: [mr] # Hangs: #513
       #   platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_reshard_1x4xNone]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_qk_layernorm_test_mode]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_mcore_tp2_pp2_uninstall_te]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
   - test_case: [gpt3_7b_tp1_pp4_memory_speed]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # OOM: #434
   - test_case: [gpt3_7b_tp4_pp1_memory_speed]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # OOM: #434
   - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_fsdp_dtensor]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly]
   - test_case: [gpt3_mcore_te_tp2_pp1_modelopt_distill_resume]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts]
       #   scope: [nightly] # Outdated: #502
   # - test_case: [gpt3_mcore_te_tp2_pp1_fsdp2_resume_torch_dist]
   #   products:
   # - environment: [dev]
-  #   scope: [mr, mr-github] # Broken: #484
+  #   scope: [mr] # Broken: #484
   # - environment: [lts]
   #   scope: [nightly] # Requires PyT 2.4: #481
   #######################################################################
@@ -450,57 +450,57 @@ products:
   # - test_case: [gpt3_mcore_reruns_persistent_2]
   #   products:
   # - environment: [dev]
-  #   scope: [mr, mr-github]
+  #   scope: [mr]
   #   platforms: [dgx_h100]
   # - environment: [lts]
   #   scope: [nightly]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_overlap_optimizer]
     products:
       - environment: [lts]
-        scope: [mr, mr-github]
+        scope: [mr]
       - environment: [dev]
-        scope: [mr, mr-github, mr-slim]
+        scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp4_pp1_dist_optimizer_overlap_grad_reduce_param_gather]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
-        scope: [mr, mr-github]
+        scope: [mr]
   - test_case: [gpt3_mcore_te_tp4_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather]
     products:
       - environment: [lts]
-        scope: [mr, mr-github]
+        scope: [mr]
       - environment: [dev]
-        scope: [mr, mr-github, mr-slim]
+        scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       - environment: [lts]
-        scope: [mr, mr-github]
+        scope: [mr]
   # - test_case: [gpt3_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100]
   # - test_case: [gpt3_mcore_te_tp1_pp1_frozen_resume_torch_dist_dist_optimizer]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100]
   # - test_case: [gpt3_mcore_te_tp1_pp4_frozen_resume_torch_dist_swiglu]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100]
   # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_cp2_nondeterministic]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_a100, dgx_h100]
   # - test_case: [gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap]
   #   products:
@@ -550,4 +550,4 @@ products:
   # - test_case: [gpt3_mcore_tp2_pp2_resume_torch_dist_uninstall_te]
   #   products:
   #     - environment: [dev, lts]
-  #       scope: [mr, mr-github] # Non-deterministic: #483
+  #       scope: [mr] # Non-deterministic: #483
diff --git a/tests/test_utils/recipes/mamba-static-inference.yaml b/tests/test_utils/recipes/mamba-static-inference.yaml
index 06107618916..9645b1b0b8a 100644
--- a/tests/test_utils/recipes/mamba-static-inference.yaml
+++ b/tests/test_utils/recipes/mamba-static-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: hybrid
   build: mcore-pyt-{environment}
   nodes: 1
@@ -57,10 +57,10 @@ products:
   - test_case: [hybrid_static_inference_tp1_pp1_2B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/mamba.yaml b/tests/test_utils/recipes/mamba.yaml
index bb742200d26..92b799d3d1c 100644
--- a/tests/test_utils/recipes/mamba.yaml
+++ b/tests/test_utils/recipes/mamba.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: hybrid
   build: mcore-pyt-{environment}
   nodes: 1
@@ -58,7 +58,7 @@ products:
   - test_case: [hybrid_mr_mcore_te_tp1_pp1_cp1_dgx_a100_1N8G]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts] # disabled until triton is bumped
       #   scope: [nightly]
@@ -67,14 +67,14 @@ products:
   # - test_case: [hybrid_mr_mcore_te_tp1_pp4_cp1_dgx_a100_1N8G]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #     - environment: [lts] # disabled until triton is bumped
   #       scope: [nightly]
 
   - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp1_dgx_a100_1N8G]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts] # disabled until triton is bumped
       #   scope: [nightly]
@@ -82,7 +82,7 @@ products:
   - test_case: [hybrid_mr_mcore_te_tp2_pp1_cp4_dgx_a100_1N8G]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
       # - environment: [lts] # disabled until triton is bumped
       #   scope: [nightly]
diff --git a/tests/test_utils/recipes/moe-dynamic-inference.yaml b/tests/test_utils/recipes/moe-dynamic-inference.yaml
index 9bb23f8a322..6d8fdc533e1 100644
--- a/tests/test_utils/recipes/moe-dynamic-inference.yaml
+++ b/tests/test_utils/recipes/moe-dynamic-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: moe
   build: mcore-pyt-{environment}
   nodes: 1
@@ -57,10 +57,10 @@ products:
   - test_case: [gpt_dynamic_inference_tp4_pp1_ep4_16B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr-broken, mr-github]
+        scope: [mr-broken]
         platforms: [dgx_h100]
   - test_case: [gpt_dynamic_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/moe-static-inference.yaml b/tests/test_utils/recipes/moe-static-inference.yaml
index 136606d0955..9cebb66f2e2 100644
--- a/tests/test_utils/recipes/moe-static-inference.yaml
+++ b/tests/test_utils/recipes/moe-static-inference.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: moe
   build: mcore-pyt-{environment}
   nodes: 1
@@ -57,15 +57,15 @@ products:
   - test_case: [gpt_static_inference_tp1_pp1_ep1_16B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_tp4_pp1_ep4_16B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt_static_inference_cuda_graphs_pad_tp4_pp1_ep4_16B_logitsmatch]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/test_utils/recipes/moe.yaml b/tests/test_utils/recipes/moe.yaml
index 2d4e8c4c94c..285d16c99f3 100644
--- a/tests/test_utils/recipes/moe.yaml
+++ b/tests/test_utils/recipes/moe.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: moe
   build: mcore-pyt-{environment}
   nodes: 1
@@ -84,27 +84,27 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp2_resume_torch_dist_reshard_2x1x4_te_8experts2parallel_dist_optimizer]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_resume_torch_dist_attn_cudagraph]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100] # hang: #513
   # - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_selective_recompute_experimental]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100] # hang: #513
   - test_case: [gpt3_mcore_te_tp2_pp1_resume_torch_dist_te_8experts2parallel_multi_dist_optimizer_instances]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router]
     products:
@@ -114,12 +114,12 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   # - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon]
   #   products:
@@ -152,12 +152,12 @@ products:
   # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100]
   # - test_case: [gpt3_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_groupedGEMM]
   #   products:
   #     - environment: [dev]
-  #       scope: [mr, mr-github]
+  #       scope: [mr]
   #       platforms: [dgx_h100]
   ###########################
   # Merge train tests       #
@@ -165,12 +165,12 @@ products:
   - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer]
     products:
       - environment: [dev]
-        scope: [mr, mr-github, mr-slim]
+        scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed]
     products:
       - environment: [dev]
-        scope: [mr, mr-github, mr-slim]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8]
     products:
diff --git a/tests/test_utils/recipes/multimodal-llava.yaml b/tests/test_utils/recipes/multimodal-llava.yaml
index 0e199764c09..72702de33c5 100644
--- a/tests/test_utils/recipes/multimodal-llava.yaml
+++ b/tests/test_utils/recipes/multimodal-llava.yaml
@@ -3,7 +3,7 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: "{test_case}_{environment}_{platforms}"
+  name: '{test_case}_{environment}_{platforms}'
   model: multimodal-llava
   build: mcore-pyt-{environment}
   nodes: 1
@@ -61,10 +61,10 @@ products:
   - test_case: [multimodal_llava_mcore_te_tp1_pp1]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
   - test_case: [multimodal_llava_mcore_te_tp4_sp_cp2]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr]
         platforms: [dgx_h100]
diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py
index 611f9ae6098..e251a3c1e7e 100644
--- a/tests/unit_tests/conftest.py
+++ b/tests/unit_tests/conftest.py
@@ -1,5 +1,6 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import os
-import sys
 from pathlib import Path
 
 import pytest
@@ -8,9 +9,7 @@
 
 from megatron.core import config
 from megatron.core.utils import is_te_min_version
-from tests.test_utils.python_scripts.download_unit_tests_dataset import (
-    get_oldest_release_and_assets,
-)
+from tests.test_utils.python_scripts.download_unit_tests_dataset import download_and_extract_asset
 from tests.unit_tests.dist_checkpointing import TempNamedDir
 from tests.unit_tests.test_utilities import Utils
 
@@ -83,7 +82,7 @@ def ensure_test_data():
 
         try:
             # Download assets to /opt/data
-            get_oldest_release_and_assets(assets_dir=str(data_path))
+            download_and_extract_asset(assets_dir=str(data_path))
 
             print("Test data downloaded successfully.")
 
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
index 873505fe51c..3ac8e7f6200 100644
--- a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -38,6 +38,7 @@ class DistContext:
     group: dist.ProcessGroup
     is_chief: bool
 
+
 # 2. Create a module-scoped fixture
 # This runs ONE time per file, no matter how many test classes you have.
 @pytest.fixture(scope="module")
@@ -48,7 +49,7 @@ def distributed_context():
 
     # --- SETUP ---
     is_external_init = dist.is_initialized()
-    
+
     if not is_external_init:
         # Initialize only if not already done (e.g., by another test runner)
         dist.init_process_group(
@@ -67,15 +68,10 @@ def distributed_context():
     rank = dist.get_rank()
     world_size = dist.get_world_size()
     group = dist.group.WORLD
-    
+
     print(f"[INFO]: Initialized Rank: {rank} / {world_size}")
 
-    context = DistContext(
-        rank=rank,
-        world_size=world_size,
-        group=group,
-        is_chief=(rank == 0)
-    )
+    context = DistContext(rank=rank, world_size=world_size, group=group, is_chief=(rank == 0))
 
     # Yield control to the tests
     yield context
@@ -194,6 +190,7 @@ def init_gpt_dataloader(
     dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
     return dataloader
 
+
 # skip it for good
 @pytest.mark.skipif(
     ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2) or True,
@@ -258,9 +255,7 @@ def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
 @pytest.mark.skipif(
     "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1", reason="Requires single GPU"
 )
-@pytest.mark.skipif(
-    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
-)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
 class TestFusedLinearCrossEntropyDataParallel:
     def cleanup(self):
         torch.cuda.empty_cache()
@@ -562,9 +557,7 @@ def custom_storage():
     ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
     reason="Requires torchrun with multiple GPUs",
 )
-@pytest.mark.skipif(
-    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
-)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
 @pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropyTensorParallel:
     @pytest.fixture(autouse=True)
@@ -576,7 +569,6 @@ def setup_attrs(self, distributed_context):
         self.tp_rank = distributed_context.rank
         self.tp_world_size = distributed_context.world_size
         self.is_chief = distributed_context.is_chief
-    
 
     def cleanup(self):
         torch.cuda.empty_cache()
@@ -1005,9 +997,7 @@ def custom_storage():
     "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
     reason="Requires torchrun with multiple GPUs",
 )
-@pytest.mark.skipif(
-    get_device_arch_version() != 10, reason="Requires GPU architecture = 10"
-)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
 @pytest.mark.usefixtures("distributed_context")
 class TestFusedLinearCrossEntropySequenceParallel:
     @pytest.fixture(autouse=True)
diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py
index dbf8d203634..89a185e3755 100644
--- a/tests/unit_tests/ssm/test_gated_delta_net.py
+++ b/tests/unit_tests/ssm/test_gated_delta_net.py
@@ -88,7 +88,7 @@ def setup_method(self, tp_size, sp, cp_size):
             context_parallel_size=cp_size,
         )
         gdn_submodules = get_gpt_layer_with_transformer_engine_spec(
-            linear_attention_type="gated_delta_net", normalization="RMSNorm"
+            experimental_attention_variant="gated_delta_net", normalization="RMSNorm"
         ).submodules.self_attention.submodules
 
         self.gdn = GatedDeltaNet(
@@ -157,7 +157,7 @@ def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp):
     # Model initialization function
     def initialize_gpt_model(config, pre_process=True, post_process=True, vp_stage=None):
         layer_spec = get_gpt_layer_with_transformer_engine_spec(
-            linear_attention_type="gated_delta_net", normalization=normalization
+            experimental_attention_variant="gated_delta_net", normalization=normalization
         )
         gpt_model = GPTModel(
             config=config,
diff --git a/tests/unit_tests/transformer/test_attention_variant_dsa.py b/tests/unit_tests/transformer/test_attention_variant_dsa.py
new file mode 100644
index 00000000000..bd106aa6f0e
--- /dev/null
+++ b/tests/unit_tests/transformer/test_attention_variant_dsa.py
@@ -0,0 +1,1271 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+import megatron.core.parallel_state as parallel_state
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.experimental_attention_variant.dsa import (
+    DSAIndexer,
+    DSAIndexerLossAutoScaler,
+    DSAIndexerSubmodules,
+    DSAttention,
+    DSAttentionSubmodules,
+    compute_dsa_indexer_loss,
+    rotate_activation,
+)
+from megatron.core.transformer.transformer_config import MLATransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from fast_hadamard_transform import hadamard_transform as _hadamard_transform
+
+    HAVE_HADAMARD = True
+except ImportError:
+    HAVE_HADAMARD = False
+    _hadamard_transform = None
+
+
+def mock_hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+    """Mock implementation of hadamard_transform for testing without the library installed.
+
+    This is a simple identity-like transformation that preserves shape and applies scaling.
+    """
+    return x * scale
+
+
+@pytest.fixture(autouse=True)
+def patch_hadamard_if_needed():
+    """Automatically patch hadamard_transform in dsa module if not installed."""
+    if not HAVE_HADAMARD:
+        with patch(
+            'megatron.core.transformer.experimental_attention_variant.dsa.hadamard_transform',
+            mock_hadamard_transform,
+        ):
+            yield
+    else:
+        yield
+
+
+class TestRotateActivation:
+    """Test rotate_activation function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_rotate_activation_shape(self):
+        """Test that rotate_activation preserves shape."""
+        batch_size = 2
+        seq_len = 16
+        hidden_size = 128
+
+        x = torch.randn(seq_len, batch_size, hidden_size, dtype=torch.bfloat16).cuda()
+        output = rotate_activation(x)
+
+        assert output.shape == x.shape
+        assert output.dtype == torch.bfloat16
+
+    def test_rotate_activation_dtype_check(self):
+        """Test that rotate_activation only accepts bfloat16."""
+        x = torch.randn(16, 2, 128, dtype=torch.float32).cuda()
+
+        with pytest.raises(AssertionError, match="only support bf16"):
+            rotate_activation(x)
+
+
+@pytest.mark.parametrize("seqlen_and_topk", [[16, 32], [64, 32]])
+class TestComputeDSAIndexerLoss:
+    """Test compute_dsa_indexer_loss function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp'])
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_loss_shape(self, seqlen_and_topk):
+        """Test that indexer loss returns a scalar."""
+        batch_size = 2
+        seqlen = seqlen_and_topk[0]
+        num_heads = 4
+        head_dim = 128
+        index_topk = seqlen_and_topk[1]
+
+        # Create dummy index scores
+        index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda()
+
+        # Apply causal mask to index_scores before computing topk
+        causal_mask = torch.triu(
+            torch.full(
+                (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device
+            ),
+            diagonal=1,
+        )
+        # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen]
+        masked_index_scores = index_scores + causal_mask
+
+        # Get topk indices from masked index_scores
+        topk_k = min(index_topk, seqlen)
+        topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1]
+
+        query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        softmax_scale = head_dim**-0.5
+
+        loss = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=False,
+            pg_collection=self.pg_collection,
+        )
+
+        assert loss.shape == torch.Size([])
+        assert loss.dtype == torch.float32
+        assert loss >= 0  # KL divergence should be non-negative
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_loss_sparse(self, seqlen_and_topk):
+        """Test sparse indexer loss computation."""
+        batch_size = 2
+        seqlen = seqlen_and_topk[0]
+        num_heads = 4
+        head_dim = 128
+        index_topk = seqlen_and_topk[1]
+
+        # Create dummy index scores
+        index_scores = torch.randn(batch_size, seqlen, seqlen, dtype=torch.float32).cuda()
+
+        # Apply causal mask to index_scores before computing topk
+        causal_mask = torch.triu(
+            torch.full(
+                (seqlen, seqlen), float('-inf'), dtype=torch.float32, device=index_scores.device
+            ),
+            diagonal=1,
+        )
+        # [batch_size, seqlen, seqlen] + [seqlen, seqlen] -> [batch_size, seqlen, seqlen]
+        masked_index_scores = index_scores + causal_mask
+
+        # Get topk indices from masked index_scores
+        topk_k = min(index_topk, seqlen)
+        topk_indices = masked_index_scores.topk(topk_k, dim=-1)[1]
+
+        query = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seqlen, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        softmax_scale = head_dim**-0.5
+
+        loss_sparse = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=True,
+            pg_collection=self.pg_collection,
+        )
+
+        loss_dense = compute_dsa_indexer_loss(
+            index_scores=index_scores,
+            topk_indices=topk_indices,
+            query=query,
+            key=key,
+            softmax_scale=softmax_scale,
+            loss_coeff=1.0,
+            sparse_loss=False,
+            pg_collection=self.pg_collection,
+        )
+
+        # Sparse loss should be different from dense loss
+        if seqlen > index_topk:
+            assert loss_sparse != loss_dense
+        else:
+            assert loss_sparse == loss_dense
+        assert loss_sparse >= 0
+        assert loss_dense >= 0
+
+
+class TestDSAIndexerLossAutoScaler:
+    """Test DSAIndexerLossAutoScaler autograd function."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_forward_pass(self):
+        """Test that forward pass preserves output."""
+        output = torch.randn(16, 2, 128).cuda()
+        output.requires_grad_(True)
+        indexer_loss = torch.tensor(0.5).cuda()
+        indexer_loss.requires_grad_(True)
+
+        result = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        assert torch.allclose(result, output, atol=0, rtol=0)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_backward_pass(self):
+        """Test that backward pass triggers indexer loss backward and scales gradient correctly."""
+        output = torch.randn(16, 2, 128).cuda()
+        output.requires_grad_(True)
+
+        # Create indexer_loss with computation graph
+        # This simulates compute_dsa_indexer_loss which computes KL divergence
+        dummy_input = torch.randn(10).cuda()
+        dummy_input.requires_grad_(True)
+        indexer_loss = dummy_input.mean()
+
+        # Set loss scale
+        scale = torch.tensor(2.0).cuda()
+        DSAIndexerLossAutoScaler.set_loss_scale(scale)
+
+        # Apply the autograd function
+        result = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        # Trigger backward
+        main_loss = result.sum()
+        main_loss.backward()
+
+        # Check that gradients flow back to output
+        assert output.grad is not None, "Gradient should flow back to parameters"
+
+        # Check that indexer_loss backward was triggered
+        assert dummy_input.grad is not None, "Indexer loss backward should be triggered"
+
+        # Verify the gradient is scaled correctly
+        expected_grad_per_element = scale.item() / len(dummy_input)
+        assert torch.allclose(
+            dummy_input.grad,
+            torch.full_like(dummy_input, expected_grad_per_element),
+            rtol=0,
+            atol=0,
+        ), f"Gradient should be scaled by loss scale, expected {expected_grad_per_element}, got {dummy_input.grad[0].item()}"
+
+
+@pytest.mark.parametrize("seqlen", [16, 64])
+class TestDSAIndexer:
+    """Test DSA Indexer module basic functionality with TP=1."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        # Create MLA config with sparse attention parameters
+        self.index_topk = 32
+        self.config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=self.index_topk,
+        )
+
+        # Create indexer submodules spec
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(
+            required_pgs=['tp', 'cp']
+        )
+        self.indexer = DSAIndexer(self.config, indexer_submodules, self.pg_collection)
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_dsa_indexer_constructor(self, seqlen):
+        """Test indexer initialization."""
+        assert isinstance(self.indexer, DSAIndexer)
+        assert self.indexer.hidden_size == 256
+        assert self.indexer.index_n_heads == 8
+        assert self.indexer.index_head_dim == 64
+        assert self.indexer.index_topk == 32
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward(self, seqlen):
+        """Test indexer forward pass."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Forward pass
+        topk_indices = self.indexer(x, qr)
+
+        # Check output shape
+        assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen))
+        assert topk_indices.dtype == torch.long
+        assert torch.all((topk_indices >= 0) & (topk_indices < seqlen))
+        # Make sure no duplicate indices are selected
+        assert torch.all(
+            torch.sort(topk_indices, dim=-1).values[:, :, 1:]
+            != torch.sort(topk_indices, dim=-1).values[:, :, :-1]
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward_with_scores(self, seqlen):
+        """Test indexer forward pass with scores."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Forward pass with scores
+        index_scores, topk_indices = self.indexer.forward_with_scores(x, qr)
+
+        # Check output shapes
+        assert index_scores.shape == (batch_size, seqlen, seqlen)
+        assert topk_indices.shape == (batch_size, seqlen, min(self.config.dsa_indexer_topk, seqlen))
+        assert index_scores.dtype == torch.float32
+        assert topk_indices.dtype == torch.long
+        assert torch.all((topk_indices >= 0) & (topk_indices < seqlen))
+        # Make sure no duplicate indices are selected
+        assert torch.all(
+            torch.sort(topk_indices, dim=-1).values[:, :, 1:]
+            != torch.sort(topk_indices, dim=-1).values[:, :, :-1]
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_with_mask(self, seqlen):
+        """Test indexer with attention mask."""
+        batch_size = 2
+
+        self.indexer.cuda()
+
+        # Create input tensors
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+        mask = torch.triu(
+            torch.full((batch_size, seqlen, seqlen), float('-inf'), dtype=torch.float32).cuda(),
+            diagonal=1,
+        )
+
+        # Forward pass with mask
+        index_scores, topk_indices = self.indexer.forward_with_scores(x, qr, mask=mask)
+
+        # Check that masked positions are not selected
+        # For causal mask, topk_indices[b, i, :] should all be <= i (except for the case that
+        # i < index_topk).
+        for b in range(batch_size):
+            for i in range(seqlen):
+                assert torch.all(topk_indices[b, i] <= max(self.index_topk, i))
+
+
+class TestDSAttention:
+    """Test DSAttention module basic functionality with TP=1."""
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        # Create MLA config with sparse attention parameters
+        self.config = MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+            dsa_indexer_loss_coeff=1.0,
+            dsa_indexer_use_sparse_loss=False,
+        )
+
+        # Create sparse attention submodules spec
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+        indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules)
+        sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec)
+
+        self.pg_collection = ProcessGroupCollection.use_mpu_process_groups(
+            required_pgs=['tp', 'cp']
+        )
+
+        self.sparse_attention = DSAttention(
+            config=self.config,
+            submodules=sparse_attention_submodules,
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_dsa_constructor(self):
+        """Test sparse attention initialization."""
+        assert isinstance(self.sparse_attention, DSAttention)
+        assert hasattr(self.sparse_attention, 'indexer')
+        assert isinstance(self.sparse_attention.indexer, DSAIndexer)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_forward(self):
+        """Test sparse attention forward pass."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.cuda()
+
+        # Create input tensors [seq_len, batch, num_heads, head_dim]
+        query = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass
+        output = self.sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Check output shape
+        assert output.shape == (seq_len, batch_size, self.config.hidden_size)
+        assert output.dtype == torch.bfloat16
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_backward(self):
+        """Test sparse attention backward pass with indexer loss."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.train()
+        self.sparse_attention.cuda()
+
+        # Create input tensors
+        query = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass
+        output = self.sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss = output.sum()
+        loss.backward()
+
+        # Check that gradients are computed for inputs
+        assert query.grad is not None
+        assert key.grad is not None
+        assert value.grad is not None
+
+        # Check that indexer parameters have gradients
+        for name, param in self.sparse_attention.indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Indexer parameter {name} has no gradient"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_topk_selection(self):
+        """Test that sparse attention correctly selects top-k indices."""
+        seq_len = 16
+        batch_size = 2
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.hidden_size // num_heads
+
+        self.sparse_attention.eval()
+        self.sparse_attention.cuda()
+
+        # Create input tensors
+        query = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+        value = torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16).cuda()
+
+        # Original hidden states and low-rank query
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Create causal attention mask
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        with torch.no_grad():
+            # Get topk indices from indexer
+            _, topk_indices = self.sparse_attention.indexer.forward_with_scores(x, qr)
+
+            # Forward pass
+            output = self.sparse_attention(
+                query=query,
+                key=key,
+                value=value,
+                x=x,
+                qr=qr,
+                attention_mask=attention_mask,
+                attn_mask_type=AttnMaskType.causal,
+            )
+
+        # Check that topk_indices are valid
+        assert torch.all(topk_indices >= 0)
+        assert torch.all(topk_indices < seq_len)
+        assert topk_indices.shape[2] == min(self.config.dsa_indexer_topk, seq_len)
+
+
+# ======================================================================================
+# Tensor Parallel Consistency Tests
+# ======================================================================================
+
+
+@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4, 8])
+@pytest.mark.parametrize("sequence_parallel", [False, True])
+class TestIndexerTensorParallel:
+    """Test DSA Indexer with different TP sizes and SP settings, compare with TP=1 baseline."""
+
+    def _create_config(self, sequence_parallel=False):
+        """Helper to create MLA config."""
+        # Get TP size from parallel_state
+        tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+        return MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            tensor_model_parallel_size=tensor_model_parallel_size,
+            sequence_parallel=sequence_parallel,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+        )
+
+    def _create_indexer(self, config, pg_collection):
+        """Helper to create indexer."""
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+
+        return DSAIndexer(config, indexer_submodules, pg_collection)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_weight_consistency(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that indexer weights are identical across ALL GPUs."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer = self._create_indexer(config, pg_collection).cuda()
+
+        # Check that all weights are identical across ALL ranks (not just TP group)
+        world_size = torch.distributed.get_world_size()
+        world_rank = torch.distributed.get_rank()
+
+        if world_size > 1:
+            for name, param in indexer.named_parameters():
+                # Gather weights from ALL ranks in WORLD group
+                param_list = [torch.zeros_like(param.data) for _ in range(world_size)]
+                torch.distributed.all_gather(param_list, param.data)
+
+                # All weights should be identical across all GPUs
+                for i in range(1, world_size):
+                    assert torch.allclose(
+                        param_list[0], param_list[i], rtol=0, atol=0
+                    ), f"Parameter {name} differs between rank 0 and rank {i} (world)"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_forward_consistency(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that indexer gives consistent results across different TP sizes and SP settings."""
+        # First run with TP=1 to get baseline
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tp1 = self._create_config(sequence_parallel=False)  # TP=1 doesn't use SP
+        pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer_tp1 = self._create_indexer(config_tp1, pg_collection_tp1).cuda()
+
+        seq_len = 64
+        batch_size = 2
+
+        # Create one common input (all ranks create same input with same seed)
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+
+        # Forward pass with gradients enabled
+        index_scores_tp1, topk_indices_tp1 = indexer_tp1.forward_with_scores(x_input, qr_input)
+
+        # Backward pass
+        loss_tp1 = index_scores_tp1.sum()
+        loss_tp1.backward()
+
+        # Save gradients from TP=1
+        indexer_tp1_grads = {
+            name: param.grad.clone().cpu()
+            for name, param in indexer_tp1.named_parameters()
+            if param.grad is not None
+        }
+
+        Utils.destroy_model_parallel()
+
+        # Now run with target TP size
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tpn = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer_tpn = self._create_indexer(config_tpn, pg_collection_tpn).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        if sequence_parallel:
+            tp_rank = parallel_state.get_tensor_model_parallel_rank()
+            seq_per_rank = seq_len // tensor_model_parallel_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x_tpn = x_input[start_idx:end_idx]
+            qr_tpn = qr_input[start_idx:end_idx]
+        else:
+            # No SP: all TP ranks see full input
+            x_tpn = x_input
+            qr_tpn = qr_input
+
+        # Forward pass with gradients enabled
+        index_scores_tpn, topk_indices_tpn = indexer_tpn.forward_with_scores(x_tpn, qr_tpn)
+
+        # Backward pass
+        loss_tpn = index_scores_tpn.sum()
+        loss_tpn.backward()
+
+        # Compare forward outputs
+        assert index_scores_tpn.shape == index_scores_tp1.shape
+        assert topk_indices_tpn.shape == topk_indices_tp1.shape
+
+        # Check that index scores are close (allow for floating point accumulation errors)
+        assert torch.allclose(
+            index_scores_tpn, index_scores_tp1, rtol=0, atol=0
+        ), f"Index scores mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        # Check that topk indices are exactly the same
+        assert torch.equal(
+            topk_indices_tpn, topk_indices_tp1
+        ), f"Top-k indices mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        # Compare gradients - indexer grads should be identical (duplicated weights)
+        for name, param in indexer_tpn.named_parameters():
+            if param.grad is not None and name in indexer_tp1_grads:
+                assert torch.allclose(
+                    param.grad.cpu(), indexer_tp1_grads[name], rtol=0, atol=0
+                ), f"Indexer gradient {name} mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_indexer_gradient_sync(self, tensor_model_parallel_size, sequence_parallel):
+        """Test that gradients are properly synchronized within TP group."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(sequence_parallel=sequence_parallel)
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        indexer = self._create_indexer(config, pg_collection).cuda()
+
+        seq_len = 64
+        batch_size = 2
+
+        # Create one common input (all ranks create same input with same seed)
+        x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        if sequence_parallel:
+            tp_rank = parallel_state.get_tensor_model_parallel_rank()
+            tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            seq_per_rank = seq_len // tp_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x = x_input[start_idx:end_idx]
+            qr = qr_input[start_idx:end_idx]
+        else:
+            # No SP: all TP ranks see full input
+            x = x_input
+            qr = qr_input
+
+        # Forward and backward
+        index_scores, topk_indices = indexer.forward_with_scores(x, qr)
+        loss = index_scores.sum()
+        loss.backward()
+
+        # Check that all parameters have gradients
+        for name, param in indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Parameter {name} has no gradient"
+
+        # After TP sync, check that gradients are identical within TP group
+        # Note: We only check TP group because DDP sync happens separately
+        tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        if tp_size > 1:
+            for name, param in indexer.named_parameters():
+                if param.requires_grad and param.grad is not None:
+                    # Gather gradients from all ranks in TP group only
+                    grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)]
+                    torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp)
+
+                    # All gradients should be identical within TP group after sync
+                    for i in range(1, tp_size):
+                        assert torch.allclose(
+                            grad_list[0], grad_list[i], rtol=0, atol=0
+                        ), f"Gradient for {name} differs between TP rank 0 and rank {i} after TP sync"
+
+        Utils.destroy_model_parallel()
+
+
+@pytest.mark.parametrize("tensor_model_parallel_size", [2, 4])
+@pytest.mark.parametrize("sequence_parallel", [False, True])
+@pytest.mark.parametrize("use_sparse_indexer_loss", [False, True])
+class TestDSAttentionTensorParallel:
+    """Test DSAttention with different TP sizes, SP settings, and sparse indexer loss."""
+
+    def _create_config(self, sequence_parallel=False, use_sparse_indexer_loss=False):
+        """Helper to create MLA config."""
+        # Get TP size from parallel_state
+        tensor_model_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
+
+        return MLATransformerConfig(
+            num_layers=2,
+            hidden_size=256,
+            num_attention_heads=16,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+            tensor_model_parallel_size=tensor_model_parallel_size,
+            sequence_parallel=sequence_parallel,
+            # MLA specific configs
+            q_lora_rank=64,
+            kv_lora_rank=64,
+            qk_head_dim=64,
+            qk_pos_emb_head_dim=32,
+            v_head_dim=64,
+            rope_type='rope',
+            rotary_base=10000,
+            rotary_percent=1.0,
+            # Sparse attention specific configs
+            dsa_indexer_n_heads=8,
+            dsa_indexer_head_dim=64,
+            dsa_indexer_topk=32,
+            dsa_indexer_loss_coeff=1.0,
+            dsa_indexer_use_sparse_loss=use_sparse_indexer_loss,
+        )
+
+    def _create_sparse_attention(self, config, pg_collection):
+        """Helper to create sparse attention."""
+        from megatron.core.extensions.transformer_engine import TELinear, TENorm
+        from megatron.core.transformer.spec_utils import ModuleSpec
+
+        indexer_submodules = DSAIndexerSubmodules(
+            linear_wq_b=ModuleSpec(module=TELinear),
+            linear_wk=ModuleSpec(module=TELinear),
+            k_norm=ModuleSpec(module=TENorm),
+            linear_weights_proj=ModuleSpec(module=TELinear),
+        )
+        indexer_spec = ModuleSpec(module=DSAIndexer, submodules=indexer_submodules)
+        sparse_attention_submodules = DSAttentionSubmodules(indexer=indexer_spec)
+
+        return DSAttention(
+            config=config,
+            submodules=sparse_attention_submodules,
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=pg_collection,
+        )
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_weight_consistency(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that sparse attention indexer weights are identical across ALL GPUs."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention = self._create_sparse_attention(config, pg_collection).cuda()
+
+        # Check that all indexer weights are identical across ALL ranks
+        world_size = torch.distributed.get_world_size()
+        world_rank = torch.distributed.get_rank()
+
+        if world_size > 1:
+            for name, param in sparse_attention.indexer.named_parameters():
+                # Gather weights from ALL ranks in WORLD group
+                param_list = [torch.zeros_like(param.data) for _ in range(world_size)]
+                torch.distributed.all_gather(param_list, param.data)
+
+                # All weights should be identical across all GPUs
+                for i in range(1, world_size):
+                    torch.testing.assert_close(param_list[0], param_list[i], rtol=0, atol=0)
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_forward_consistency(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that sparse attention gives consistent results across different TP, SP, and sparse loss settings."""
+        # First run with TP=1 to get baseline
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tp1 = self._create_config(
+            sequence_parallel=False, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )  # TP=1 doesn't use SP
+        pg_collection_tp1 = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention_tp1 = self._create_sparse_attention(config_tp1, pg_collection_tp1).cuda()
+
+        seq_len = 64
+        batch_size = 2
+        num_heads = config_tp1.num_attention_heads
+        head_dim = config_tp1.hidden_size // num_heads
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        value_input = (
+            torch.randn(seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Forward pass with gradients enabled
+        sparse_attention_tp1.train()
+        output_tp1 = sparse_attention_tp1(
+            query=query_input,
+            key=key_input,
+            value=value_input,
+            x=x_input,
+            qr=qr_input,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss_tp1 = output_tp1.sum()
+        loss_tp1.backward()
+
+        # Save gradients from TP=1
+        indexer_tp1_grads = {
+            name: param.grad.clone()
+            for name, param in sparse_attention_tp1.indexer.named_parameters()
+            if param.grad is not None
+        }
+        query_tp1_grad = query_input.grad.clone().cpu()
+        key_tp1_grad = key_input.grad.clone().cpu()
+        value_tp1_grad = value_input.grad.clone().cpu()
+
+        Utils.destroy_model_parallel()
+
+        # Now run with target TP size
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config_tpn = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection_tpn = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention_tpn = self._create_sparse_attention(config_tpn, pg_collection_tpn).cuda()
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        key_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        value_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        x_input = torch.randn(
+            seq_len, batch_size, config_tp1.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+        qr_input = torch.randn(
+            seq_len, batch_size, config_tp1.q_lora_rank, dtype=torch.bfloat16
+        ).cuda()
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        # Prepare input: split along seqlen if SP is enabled
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        if sequence_parallel:
+            seq_per_rank = seq_len // tensor_model_parallel_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x_tpn = x_input[start_idx:end_idx]
+            qr_tpn = qr_input[start_idx:end_idx]
+        else:
+            x_tpn = x_input
+            qr_tpn = qr_input
+
+        query_input = query_input.detach()
+        key_input = key_input.detach()
+        value_input = value_input.detach()
+        head_per_rank = num_heads // tensor_model_parallel_size
+        start_head = tp_rank * head_per_rank
+        end_head = (tp_rank + 1) * head_per_rank
+        query_tpn = query_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        key_tpn = key_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        value_tpn = value_input[:, :, start_head:end_head, :].clone().requires_grad_(True)
+        attention_mask_tpn = attention_mask
+
+        # Forward pass with gradients enabled
+        sparse_attention_tpn.train()
+        output_tpn = sparse_attention_tpn(
+            query=query_tpn,
+            key=key_tpn,
+            value=value_tpn,
+            x=x_tpn,
+            qr=qr_tpn,
+            attention_mask=attention_mask_tpn,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        # Backward pass
+        loss_tpn = output_tpn.sum()
+        loss_tpn.backward()
+
+        from megatron.core.tensor_parallel.mappings import gather_from_tensor_model_parallel_region
+
+        output_tpn_gathered = gather_from_tensor_model_parallel_region(
+            output_tpn, group=pg_collection_tpn.tp
+        )
+        assert output_tpn_gathered.shape == output_tp1.shape
+        assert torch.allclose(
+            output_tpn_gathered.detach(), output_tp1.detach(), rtol=0, atol=0
+        ), f"Sparse attention outputs mismatch between TP=1 and TP={tensor_model_parallel_size}, SP={sequence_parallel}, sparse_loss={use_sparse_indexer_loss}"
+
+        # 1. Check indexer gradients.
+        for name, param in sparse_attention_tpn.indexer.named_parameters():
+            if param.grad is not None and name in indexer_tp1_grads:
+                torch.testing.assert_close(
+                    param.grad, indexer_tp1_grads[name], rtol=1e-5, atol=1e-5
+                )
+
+        # 2. Query/Key/Value gradients need to be gathered along num_heads dim (dim 2) if SP is enabled
+        # Flatten last two dims: [seq_len, batch, num_heads, head_dim] -> [seq_len, batch, num_heads * head_dim]
+        sq, b, nh, hd = query_tpn.grad.shape
+        query_grad_flat = query_tpn.grad.reshape(sq, b, nh * hd)
+        key_grad_flat = key_tpn.grad.reshape(sq, b, nh * hd)
+        value_grad_flat = value_tpn.grad.reshape(sq, b, nh * hd)
+
+        # Gather along last dim
+        query_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            query_grad_flat, group=pg_collection_tpn.tp
+        )
+        key_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            key_grad_flat, group=pg_collection_tpn.tp
+        )
+        value_grad_gathered_flat = gather_from_tensor_model_parallel_region(
+            value_grad_flat, group=pg_collection_tpn.tp
+        )
+
+        # Reshape back: [seq_len, batch, num_heads * head_dim] -> [seq_len, batch, num_heads, head_dim]
+        query_tpn_grad_gathered = query_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+        key_tpn_grad_gathered = key_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+        value_tpn_grad_gathered = value_grad_gathered_flat.reshape(sq, b, num_heads, hd)
+
+        assert torch.allclose(
+            query_tpn_grad_gathered.cpu(), query_tp1_grad, rtol=0, atol=0
+        ), f"Query gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+        assert torch.allclose(
+            key_tpn_grad_gathered.cpu(), key_tp1_grad, rtol=0, atol=0
+        ), f"Key gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+        assert torch.allclose(
+            value_tpn_grad_gathered.cpu(), value_tp1_grad, rtol=0, atol=0
+        ), f"Value gradient mismatch between TP=1 and TP={tensor_model_parallel_size}"
+
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dsa_gradient_sync(
+        self, tensor_model_parallel_size, sequence_parallel, use_sparse_indexer_loss
+    ):
+        """Test that indexer gradients are properly synchronized within TP group."""
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        config = self._create_config(
+            sequence_parallel=sequence_parallel, use_sparse_indexer_loss=use_sparse_indexer_loss
+        )
+        pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        sparse_attention = self._create_sparse_attention(config, pg_collection).cuda()
+        sparse_attention.train()
+
+        seq_len = 64
+        batch_size = 2
+        num_heads = config.num_attention_heads
+        head_dim = config.hidden_size // num_heads
+
+        # Create one common input (all ranks create same input with same seed)
+        query_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        key_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        value_input = torch.randn(
+            seq_len, batch_size, num_heads, head_dim, dtype=torch.bfloat16
+        ).cuda()
+        x_input = torch.randn(seq_len, batch_size, config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr_input = torch.randn(seq_len, batch_size, config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        # Prepare input: split along seqlen if SP is enabled
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        if sequence_parallel:
+            tp_size = parallel_state.get_tensor_model_parallel_world_size()
+            seq_per_rank = seq_len // tp_size
+            start_idx = tp_rank * seq_per_rank
+            end_idx = (tp_rank + 1) * seq_per_rank
+            x = x_input[start_idx:end_idx]
+            qr = qr_input[start_idx:end_idx]
+        else:
+            x = x_input
+            qr = qr_input
+
+        # query, key, value should be split along num_heads dim
+        head_per_rank = num_heads // tensor_model_parallel_size
+        start_head = tp_rank * head_per_rank
+        end_head = (tp_rank + 1) * head_per_rank
+        query = query_input[:, :, start_head:end_head, :]
+        key = key_input[:, :, start_head:end_head, :]
+        value = value_input[:, :, start_head:end_head, :]
+
+        attention_mask = torch.ones(batch_size, 1, seq_len, seq_len, dtype=torch.bool).cuda()
+        attention_mask = torch.tril(attention_mask)
+
+        query.requires_grad_(True)
+        key.requires_grad_(True)
+        value.requires_grad_(True)
+
+        # Forward and backward
+        output = sparse_attention(
+            query=query,
+            key=key,
+            value=value,
+            x=x,
+            qr=qr,
+            attention_mask=attention_mask,
+            attn_mask_type=AttnMaskType.causal,
+        )
+
+        loss = output.sum()
+        loss.backward()
+
+        # Check that gradients exist before sync
+        assert query.grad is not None
+        assert key.grad is not None
+        assert value.grad is not None
+
+        # Check that indexer parameters have gradients
+        for name, param in sparse_attention.indexer.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Indexer parameter {name} has no gradient"
+
+        # Check that indexer gradients are identical within TP group
+        tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        if tp_size > 1:
+            for name, param in sparse_attention.indexer.named_parameters():
+                if param.requires_grad and param.grad is not None:
+                    # Gather gradients from all ranks in TP group only
+                    grad_list = [torch.zeros_like(param.grad) for _ in range(tp_size)]
+                    torch.distributed.all_gather(grad_list, param.grad, group=pg_collection.tp)
+
+                    # All gradients should be identical within TP group after sync
+                    for i in range(1, tp_size):
+                        assert torch.allclose(
+                            grad_list[0], grad_list[i], rtol=0, atol=0
+                        ), f"Indexer gradient for {name} differs between TP rank 0 and rank {i} after TP sync"
+
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py
index 9b9d2c67881..ddfa9bfba16 100644
--- a/tests/unit_tests/transformer/test_multi_token_prediction.py
+++ b/tests/unit_tests/transformer/test_multi_token_prediction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import os
 import sys
@@ -14,11 +14,14 @@
 )
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.num_microbatches_calculator import destroy_num_microbatches_calculator
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.parallel_state import get_context_parallel_group
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.multi_token_prediction import (
     MTPLossLoggingHelper,
     MultiTokenPredictionBlock,
+    roll_tensor,
 )
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import is_te_min_version
@@ -245,6 +248,66 @@ def get_batch(self, seq_length, micro_batch_size):
         }
         return batch
 
+    def get_packed_batch(self, seq_lengths, micro_batch_size):
+        """
+        Create a packed sequence batch with multiple sequences of varying lengths.
+
+        Args:
+            seq_lengths: List of sequence lengths (e.g., [10, 15, 8] for 3 sequences)
+            micro_batch_size: Batch size (typically 1 for packed sequences)
+
+        Returns:
+            batch: Dictionary containing packed sequences and PackedSeqParams
+        """
+        total_seq_length = sum(seq_lengths)
+
+        # Create packed input_ids, labels, and position_ids
+        input_ids_list = []
+        labels_list = []
+        position_ids_list = []
+
+        for seq_len in seq_lengths:
+            data = list(range(seq_len))
+            input_ids_list.extend(data)
+            labels_list.extend([x + 1 for x in data])
+            position_ids_list.extend(data)
+
+        # Convert to tensors with shape [batch, total_seq_length]
+        input_ids = torch.tensor(input_ids_list, dtype=torch.int64).unsqueeze(0).cuda()
+        labels = torch.tensor(labels_list, dtype=torch.int64).unsqueeze(0).cuda()
+        position_ids = torch.tensor(position_ids_list, dtype=torch.int64).unsqueeze(0).cuda()
+
+        # Create attention mask for packed sequences (all ones for simplicity)
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, total_seq_length, total_seq_length), dtype=bool
+        ).cuda()
+
+        # Create loss mask with shape [batch, total_seq_length]
+        loss_mask = torch.ones(micro_batch_size, total_seq_length).cuda()
+
+        # Create cumulative sequence lengths for PackedSeqParams
+        cu_seqlens = torch.tensor(
+            [0] + [sum(seq_lengths[: i + 1]) for i in range(len(seq_lengths))], dtype=torch.int32
+        ).cuda()
+
+        packed_seq_params = PackedSeqParams(
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_kv=cu_seqlens,
+            max_seqlen_q=max(seq_lengths),
+            max_seqlen_kv=max(seq_lengths),
+            qkv_format='thd',
+        )
+
+        batch = {
+            'tokens': input_ids,
+            'labels': labels,
+            'loss_mask': loss_mask,
+            'attention_mask': attention_mask,
+            'position_ids': position_ids,
+            'packed_seq_params': packed_seq_params,
+        }
+        return batch
+
     @pytest.mark.skipif(
         not HAVE_TE or not is_te_min_version("2.1.0"),
         reason="grouped_gemm requires TransformerEngine >= 2.1.0",
@@ -404,6 +467,149 @@ def test_fp8_support(self, full_recompute):
         loss = output.mean()
         loss.backward()
 
+    @pytest.mark.skipif(
+        not HAVE_TE or not is_te_min_version("2.1.0"),
+        reason="grouped_gemm requires TransformerEngine >= 2.1.0",
+    )
+    @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (2, 1), (2, 2)])
+    def test_packed_sequences(self, tp, cp):
+        """Test MTP with packed sequences."""
+        # Create args with packed sequences support
+        seq_lengths = [16, 24, 12]  # Three sequences of different lengths
+        total_seq_length = sum(seq_lengths)
+
+        args = self.create_test_args(tp, cp, total_seq_length, micro_batch_size=1)
+        set_args(args)
+
+        torch.manual_seed(_SEED)
+        Utils.initialize_model_parallel(tensor_model_parallel_size=tp, context_parallel_size=cp)
+
+        # Get packed batch
+        batch = self.get_packed_batch(seq_lengths, micro_batch_size=1)
+        tokens = batch['tokens']
+        labels = batch['labels']
+        loss_mask = batch['loss_mask']
+        attention_mask = batch['attention_mask']
+        position_ids = batch['position_ids']
+        packed_seq_params = batch['packed_seq_params']
+
+        # Create model
+        gpt_model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
+            self.model_provider, ModelType.encoder_or_decoder
+        )
+
+        # Forward pass with packed sequences
+        output = gpt_model[0].forward(
+            input_ids=tokens,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            loss_mask=loss_mask,
+            packed_seq_params=packed_seq_params,
+        )
+
+        # Verify output shape
+        assert output.shape[0] == 1  # batch size
+        assert output.shape[1] == total_seq_length
+
+        # Verify MTP loss was computed
+        tracker = MTPLossLoggingHelper.tracker
+        assert "values" in tracker
+        mtp_loss = tracker['values'].clone()
+        assert mtp_loss.shape[0] == args.mtp_num_layers
+        MTPLossLoggingHelper.clean_loss_in_tracker()
+
+        # Backward pass
+        loss = output.mean()
+        loss.backward()
+
+        # Verify gradients exist
+        for name, param in gpt_model[0].named_parameters():
+            assert param.main_grad is not None, f"Gradient missing for {name}"
+
+    @pytest.mark.parametrize("cp", [1, 2])
+    def test_roll_tensor_with_packed_sequences(self, cp):
+        """Test roll_tensor function with packed sequences, with and without CP.
+
+        For CP=1: Tests standard packed sequence rolling with verified expected values
+        For CP=2: Tests CP-enabled rolling executes without errors
+        """
+        Utils.initialize_model_parallel(tensor_model_parallel_size=1, context_parallel_size=cp)
+        cp_group = get_context_parallel_group() if cp > 1 else None
+        cp_rank = torch.distributed.get_rank(group=cp_group) if cp_group is not None else 0
+
+        if cp == 1:
+            # Test case: Simple packed sequences (CP disabled)
+            tensor = torch.tensor([1, 2, 3, 4, 5], dtype=torch.float32).cuda()
+            cu_seqlens = torch.tensor([0, 3, 5], dtype=torch.int32).cuda()
+
+            packed_seq_params = PackedSeqParams(
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_kv=cu_seqlens,
+                max_seqlen_q=3,
+                max_seqlen_kv=3,
+                qkv_format='thd',
+            )
+
+            # Roll by -1 (shift left)
+            rolled, sum_val = roll_tensor(
+                tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params
+            )
+
+            # Expected: [2, 3, 0, 5, 0] - boundaries at indices 2 and 4 are zeroed
+            expected = torch.tensor([2, 3, 0, 5, 0], dtype=torch.float32).cuda()
+            assert torch.equal(rolled, expected), f"Expected {expected}, got {rolled}"
+        else:
+            # Test case: Packed sequences with CP=2
+            # Two sequences:
+            #   seq1 = [1, 2, 3, 4, 5, 6, 7, 8]
+            #   seq2 = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
+
+            if cp_rank == 0:
+                # CP Rank 0: first half of each sequence
+                tensor = torch.tensor(
+                    [1, 2, 7, 8, 11, 12, 13, 20, 21, 22], dtype=torch.float32
+                ).cuda()
+                expected = torch.tensor(
+                    [2, 3, 8, 0, 12, 13, 14, 21, 22, 0], dtype=torch.float32
+                ).cuda()
+            else:
+                # CP Rank 1: second half of each sequence
+                tensor = torch.tensor(
+                    [3, 4, 5, 6, 14, 15, 16, 17, 18, 19], dtype=torch.float32
+                ).cuda()
+                expected = torch.tensor(
+                    [4, 5, 6, 7, 15, 16, 17, 18, 19, 20], dtype=torch.float32
+                ).cuda()
+
+            cu_seqlens = torch.tensor([0, 8, 20], dtype=torch.int32).cuda()
+
+            packed_seq_params = PackedSeqParams(
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_kv=cu_seqlens,
+                max_seqlen_q=6,  # max(4, 6) - max local seq length per sequence
+                max_seqlen_kv=6,
+                qkv_format='thd',
+            )
+
+            # Roll by -1 (shift left) with CP communication
+            rolled, sum_val = roll_tensor(
+                tensor, shifts=-1, dims=0, cp_group=cp_group, packed_seq_params=packed_seq_params
+            )
+
+            # Verify the rolled tensor matches expected values
+            assert (
+                rolled.shape == expected.shape
+            ), f"Shape mismatch: expected {expected.shape}, got {rolled.shape}"
+            assert torch.equal(
+                rolled, expected
+            ), f"CP Rank {cp_rank}: Expected\n{expected}\nbut got\n{rolled}\nDiff:\n{rolled - expected}"
+
+            # Verify sum is correct
+            assert sum_val.numel() == 1, "Sum should be a scalar"
+
+        Utils.destroy_model_parallel()
+
 
 class TestMTPLossLoggingHelper:
     def setup_method(self, method):
diff --git a/uv.lock b/uv.lock
index f636a791f12..af8e548b625 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2,50 +2,16 @@ version = 1
 revision = 2
 requires-python = ">=3.10"
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+    "python_full_version < '3.11' and sys_platform == 'linux'",
+    "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
 conflicts = [[
     { package = "megatron-core", extra = "dev" },
@@ -82,7 +48,7 @@ wheels = [
 
 [[package]]
 name = "aiobotocore"
-version = "2.25.1"
+version = "2.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
@@ -91,11 +57,11 @@ dependencies = [
     { name = "jmespath" },
     { name = "multidict" },
     { name = "python-dateutil" },
-    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" } },
+    { name = "wrapt" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/62/94/2e4ec48cf1abb89971cb2612d86f979a6240520f0a659b53a43116d344dc/aiobotocore-2.25.1.tar.gz", hash = "sha256:ea9be739bfd7ece8864f072ec99bb9ed5c7e78ebb2b0b15f29781fbe02daedbc", size = 120560, upload-time = "2025-10-28T22:33:21.787Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4d/f8/99fa90d9c25b78292899fd4946fce97b6353838b5ecc139ad8ba1436e70c/aiobotocore-2.26.0.tar.gz", hash = "sha256:50567feaf8dfe2b653570b4491f5bc8c6e7fb9622479d66442462c021db4fadc", size = 122026, upload-time = "2025-11-28T07:54:59.956Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/95/2a/d275ec4ce5cd0096665043995a7d76f5d0524853c76a3d04656de49f8808/aiobotocore-2.25.1-py3-none-any.whl", hash = "sha256:eb6daebe3cbef5b39a0bb2a97cffbe9c7cb46b2fcc399ad141f369f3c2134b1f", size = 86039, upload-time = "2025-10-28T22:33:19.949Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/58/3bf0b7d474607dc7fd67dd1365c4e0f392c8177eaf4054e5ddee3ebd53b5/aiobotocore-2.26.0-py3-none-any.whl", hash = "sha256:a793db51c07930513b74ea7a95bd79aaa42f545bdb0f011779646eafa216abec", size = 87333, upload-time = "2025-11-28T07:54:58.457Z" },
 ]
 
 [[package]]
@@ -229,11 +195,11 @@ wheels = [
 
 [[package]]
 name = "aioitertools"
-version = "0.12.0"
+version = "0.13.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/06/de/38491a84ab323b47c7f86e94d2830e748780525f7a10c8600b67ead7e9ea/aioitertools-0.12.0.tar.gz", hash = "sha256:c2a9055b4fbb7705f561b9d86053e8af5d10cc845d22c32008c43490b2d8dd6b", size = 19369, upload-time = "2024-09-02T03:33:40.349Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/fd/3c/53c4a17a05fb9ea2313ee1777ff53f5e001aefd5cc85aa2f4c2d982e1e38/aioitertools-0.13.0.tar.gz", hash = "sha256:620bd241acc0bbb9ec819f1ab215866871b4bbd1f73836a55f799200ee86950c", size = 19322, upload-time = "2025-11-06T22:17:07.609Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/85/13/58b70a580de00893223d61de8fea167877a3aed97d4a5e1405c9159ef925/aioitertools-0.12.0-py3-none-any.whl", hash = "sha256:fc1f5fac3d737354de8831cbba3eb04f79dd649d8f3afb4c5b114925e662a796", size = 24345, upload-time = "2024-09-02T03:34:59.454Z" },
+    { url = "https://files.pythonhosted.org/packages/10/a1/510b0a7fadc6f43a6ce50152e69dbd86415240835868bb0bd9b5b88b1e06/aioitertools-0.13.0-py3-none-any.whl", hash = "sha256:0be0292b856f08dfac90e31f4739432f4cb6d7520ab9eb73e143f4f2fa5259be", size = 24182, upload-time = "2025-11-06T22:17:06.502Z" },
 ]
 
 [[package]]
@@ -269,11 +235,11 @@ wheels = [
 
 [[package]]
 name = "annotated-doc"
-version = "0.0.3"
+version = "0.0.4"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d7/a6/dc46877b911e40c00d395771ea710d5e77b6de7bacd5fdcd78d70cc5a48f/annotated_doc-0.0.3.tar.gz", hash = "sha256:e18370014c70187422c33e945053ff4c286f453a984eba84d0dbfa0c935adeda", size = 5535, upload-time = "2025-10-24T14:57:10.718Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/b7/cf592cb5de5cb3bade3357f8d2cf42bf103bbe39f459824b4939fd212911/annotated_doc-0.0.3-py3-none-any.whl", hash = "sha256:348ec6664a76f1fd3be81f43dffbee4c7e8ce931ba71ec67cc7f4ade7fbbb580", size = 5488, upload-time = "2025-10-24T14:57:09.462Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" },
 ]
 
 [[package]]
@@ -308,44 +274,38 @@ wheels = [
 
 [[package]]
 name = "apache-tvm-ffi"
-version = "0.1.1"
+version = "0.1.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d8/e8/7db1ca6db40877d190a8538cc378f740aae247c6fe063815898607c2d2ca/apache_tvm_ffi-0.1.1.tar.gz", hash = "sha256:728ce3f4ae02b89a7147b718f7f670afac3c6d1f96df38d488757274643709fc", size = 1259223, upload-time = "2025-11-04T02:43:38.154Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/8a/ad/550aff4c9652ee8297f90a04c3ab4143ece1d373101010d85b5c9a9a2e7d/apache_tvm_ffi-0.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:af0de7bb9581ac9e090276cba37c4e7ffaeed601a2b2b546bf0e2daed3810cec", size = 1723658, upload-time = "2025-11-04T02:42:37.628Z" },
-    { url = "https://files.pythonhosted.org/packages/48/5a/01e65f4a6c2b146f7c40f6d8d663d76b60c3be324159f8fb8223ea505738/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb7d6828652803cb8c0e13d1f06d01fc6bfb8e79e77e3de7e6fd4b5fae5ee9d2", size = 1882437, upload-time = "2025-11-04T02:42:39.647Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/bd/b52b71d03637d7a82388c2e90d48dddec2c46121be1333c9851d6a135824/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1fe072b55a7949720a792a9d455c0659aa097825e709a16a4667d720137b8b5c", size = 1954949, upload-time = "2025-11-04T02:42:41.119Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/ef/ff85926928694785f2399a4c5b793bcfecf8c3cf806dedf9202b7db73b8b/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b25178b265903dabd9a35bd767db26928be3b7869f681fe1d6e1aed93d7c0799", size = 1837395, upload-time = "2025-11-04T02:42:42.954Z" },
-    { url = "https://files.pythonhosted.org/packages/de/69/f048bda5e5445a89200737062a202cb39097d3b1902e886654de9cd6b624/apache_tvm_ffi-0.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5552af3c625750361d1b7d646d499a28caf94858967e74c9cce6ed7d4629b28", size = 1947740, upload-time = "2025-11-04T02:42:44.49Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/df/295f71613502edeb39a39b30c8bbb9ec8fcc06bd95b3043dd99b55fa98a8/apache_tvm_ffi-0.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:c102ba5899ce106c8068a3f21155c106790b5b0141fba52a52ed6e9aeb286aff", size = 1710966, upload-time = "2025-11-04T02:42:46.037Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/a9/544767d7058f825c0ceb5bc25760ad3a821b2efcc6a3dbe2e3988a3aee86/apache_tvm_ffi-0.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7cbf31c472920cdc5b3f75f2d2720b8a6b37ddbdb11d573fa94524815ea5a144", size = 1725662, upload-time = "2025-11-04T02:42:47.528Z" },
-    { url = "https://files.pythonhosted.org/packages/54/c3/fe1a9f8968d5ce2d3b674e397c2bf01961e32a72b723817478c67c9780e3/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7602bc37019387a4705677b6e742059c7e1973a899b6918af235febcb3d3b47", size = 1884278, upload-time = "2025-11-04T02:42:48.998Z" },
-    { url = "https://files.pythonhosted.org/packages/24/b9/80cbba18b2d7d9013031d8c13671986912275b9ca6aaea70a1dd9b361c39/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7941f82a2ae4549f55c07d82d37c5765628d70f29dace98628393fcea525e870", size = 1957018, upload-time = "2025-11-04T02:42:50.538Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/0c/d27beb98d6841a3929468648433ed2c53e4da953fadb73c754b9372b2356/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e0d6d8e0888ee3a3defd2cbe1eff7a65c05900b4e8fa0e18c890048fc6a44a6", size = 1839279, upload-time = "2025-11-04T02:42:52.438Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/10/d7cf7779c65047ad2ca652234a174c2908d936cb69bc4f5156e17382fa91/apache_tvm_ffi-0.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:549c2150e1c2d7ca7912cad173f62a192aec90cd981c024bd246161283ea5d78", size = 1950476, upload-time = "2025-11-04T02:42:54.159Z" },
-    { url = "https://files.pythonhosted.org/packages/53/71/bb5ee4bca52a37a8f9580ab1f1de1be5366808a194981c324a756dabbe15/apache_tvm_ffi-0.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:3fbcfe526b458bc8edeafdc769388782d3bb4321c46a987e50bcece93ae78af8", size = 1711278, upload-time = "2025-11-04T02:42:55.56Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/1e/f8d16dbe2303d1e7348037b4207d6c1093c554573484c97c8f3cde61a060/apache_tvm_ffi-0.1.1-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:f2c0164a5c6286f9c333ddedeb448b855cbc1225688d0a4c9aeab006ddfa1180", size = 1701072, upload-time = "2025-11-04T02:42:57.28Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/47/f7a55e9b5b741f901ed9101a3ef46fd250f2c1519a6479e055432ff4f308/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:33cc35738e0c44f2a67e550457b6b7dc7de9109ca64422a9e7063b1ba43c336e", size = 1854467, upload-time = "2025-11-04T02:43:00.158Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/db/f3adbe1e2d092fbb18908971a25ceb5496669ec65d01a28b7dd57f471ae0/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9db6484259120b1bdc600f736084ee3d574775b1f4a3e8fef110323e3a9d2b6", size = 1930968, upload-time = "2025-11-04T02:43:01.96Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/da/7f678675ccc8af1c7d313322f3875e2c829f1faaa58c0d982431beeb3b3e/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7bd812058ce9046cb69fd7b3e18538d1d0eefa1719822a1441b00bb841f7af4", size = 1811173, upload-time = "2025-11-04T02:43:03.404Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/11/c8b3b7d69ceebd219dcb06f5e4a3997edea3bc2e0bbdd8f57ae65bba4f2f/apache_tvm_ffi-0.1.1-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:807def3039fb336a228c120ca8c32eb794bdfd2d7aff218c8611f287ad913736", size = 1922690, upload-time = "2025-11-04T02:43:04.846Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/0b/f816735d761049e53eb388264238655f58fcb42a31e0d1848a4fb6a6556b/apache_tvm_ffi-0.1.1-cp312-abi3-win_amd64.whl", hash = "sha256:624b4430ca3949f85fffd9ef498ebaf1155ff0ac659fc764eec6c6fd66ec7986", size = 1690969, upload-time = "2025-11-04T02:43:06.581Z" },
-    { url = "https://files.pythonhosted.org/packages/12/aa/df81df8f8b39d3c41fbac41b1e6661d192d9987a3ef317fabcefecf727a6/apache_tvm_ffi-0.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c93d9de81c1ba9560fcc696cf84d777f88016eb53f05ee2d6288ddcb95a5e72f", size = 1732582, upload-time = "2025-11-04T02:43:08.042Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/55/861090532e4accd855e119f0e67e0e482b42abb866c9505edd8956148ebc/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f9e0227179a0ce83384132b34757fd05f492270f1c031eae615870a5641b5039", size = 1870196, upload-time = "2025-11-04T02:43:09.911Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/c6/470493934559e371ad699e1764649176efc5e022267c6dd0a565217177ad/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:78e75e193d675b9639e6fd0c33c60c3a4259d4c9f848f60baa6a3194df7e1fea", size = 1941999, upload-time = "2025-11-04T02:43:11.467Z" },
-    { url = "https://files.pythonhosted.org/packages/85/b8/84eba0d266c9b10beae59a6863ef5c68044e20a6f12d46a42116e80db774/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49792622720421525a18e378d848411731d32fcb05a00b6e54b84d05ff46cc22", size = 1823965, upload-time = "2025-11-04T02:43:12.941Z" },
-    { url = "https://files.pythonhosted.org/packages/64/73/ca73a43260a1374b1f34d0e6fcf6f8af16f66867a89dfd562b26184af1bd/apache_tvm_ffi-0.1.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:039293086d44e7f601bf8231e369198afe7ad38986330969ddb1a5fc7622976b", size = 1933779, upload-time = "2025-11-04T02:43:14.543Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/91/687c3b9ff3313addeebc1188ac50b299a82944ef1784b91890fc6f250ebd/apache_tvm_ffi-0.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:3f6cbd214bee2e52719d5264f05a2685c955ae7b096980f0361d917a5a9f47a6", size = 1751905, upload-time = "2025-11-04T02:43:16.286Z" },
-]
-
-[[package]]
-name = "asciitree"
-version = "0.3.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/2d/6a/885bc91484e1aa8f618f6f0228d76d0e67000b0fdd6090673b777e311913/asciitree-0.3.3.tar.gz", hash = "sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e", size = 3951, upload-time = "2016-09-05T19:10:42.681Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/f0/af641a18833f35b37f01ecbdbf9baa0095805475adf8cd52ebeb7698fa8c/apache_tvm_ffi-0.1.3.tar.gz", hash = "sha256:d33f0bc0d028cddf321d69724c916504272a7f03dfc1d8e507d9d0f88b6f7cbf", size = 1276869, upload-time = "2025-11-21T05:11:00.562Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/13/ad0af6fb5203df6c92e404c5465d44a60bae7de0741a93fb1a3b4829692e/apache_tvm_ffi-0.1.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8999f431b3acd04a2d79f38e2ebfbb089d0f43ed87528674d7bda6d3f796ddc", size = 1743043, upload-time = "2025-11-21T05:10:05.255Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/64/f362d0010daacea93a928de0c31df6b7d40ef8cd57e9117535ee0adc2704/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:81f187d08d9040ec98b22fb6906c68b1df60b41567f2b507293f53f630b0136f", size = 1895551, upload-time = "2025-11-21T05:10:07.223Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/98/daa0f491312ebe4dccc7d84799c0b5b1bc5eee6b1093208a4fbb98175579/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dacfd2974a60a6b531a5fe8a3985f60368fc88a8ab3872c381fc1a80315d3d24", size = 1969790, upload-time = "2025-11-21T05:10:09.032Z" },
+    { url = "https://files.pythonhosted.org/packages/87/9c/68e30812874e60b141b99202dd3c4e4de964a7cb62cf6455de170b3a5111/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff65bf8a96dbbd2725937ff1502e52571e7a90d81d355a21a303328dd06449cc", size = 1844888, upload-time = "2025-11-21T05:10:10.871Z" },
+    { url = "https://files.pythonhosted.org/packages/49/97/ffe70c4679aebef0c1e32eec3970dc7e35113995d318aeb8c2ef0e4a3eb9/apache_tvm_ffi-0.1.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:48ad3df2224f1b0943344895c6cba2f3f0a53bc67ddafdd3e9d7a34f56100aa9", size = 1953886, upload-time = "2025-11-21T05:10:12.55Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/f3/e03e5716a4e025d060585a9ca3123ce76e13dff8f464cda4d5e48ef9a26a/apache_tvm_ffi-0.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:6d56b2026aa614bd56d20375e5062ddb8d4baebd7a6b93476bbe3f0339cfa095", size = 1725820, upload-time = "2025-11-21T05:10:14.043Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/f0/d19a0b8e97e102f8376e18cd8234cc0a5f37d5c935ce74bf587e15f8450e/apache_tvm_ffi-0.1.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fae211bb8693c118109e106b73393164e3ca878823185cfd6e03765e04056f37", size = 1742398, upload-time = "2025-11-21T05:10:15.384Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/0c/699e26a3b7db2c1627ac87335deccf8a8b6cb2e218766fe9acd5aadb5f78/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:79ff39b5d6a2ed8665f4b91282391a052e8c7c76ac0f12f776ad0747f212f201", size = 1895272, upload-time = "2025-11-21T05:10:17.164Z" },
+    { url = "https://files.pythonhosted.org/packages/22/39/f64a1f1a23dc3298d3f50ceb275eb9b98b6898ea3df52e6d95fed756610c/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e2cc20f00d98e263ca35fef9a139fe65992988deddd570498ff77c11780ce22e", size = 1969033, upload-time = "2025-11-21T05:10:18.855Z" },
+    { url = "https://files.pythonhosted.org/packages/51/dc/fb9e25b83a57ae7b4df7308d839febf13d2e77b481ea79800e89f1eee470/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b5b2d1c8c421aaa0685fcc77347566da68e45d8d2dc150c2ee957906b1186d62", size = 1844972, upload-time = "2025-11-21T05:10:20.201Z" },
+    { url = "https://files.pythonhosted.org/packages/63/f2/ef1521e617254c2fe38b2f60440694de426b2402b225e1cc4ae04e9a22c2/apache_tvm_ffi-0.1.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:adbc2f3b496d67199adaa999baecb9a3c9137cf1fc32163a4834950062bd0dd7", size = 1954220, upload-time = "2025-11-21T05:10:21.571Z" },
+    { url = "https://files.pythonhosted.org/packages/96/7c/1cadf17119f75b4d22761f8c003a767e63d456aac3f738ae42403ef7d990/apache_tvm_ffi-0.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:d797b29f70ea8c1843f4141a6b12b9770579a2b770f76898a96b721d2f987a23", size = 1725528, upload-time = "2025-11-21T05:10:23.043Z" },
+    { url = "https://files.pythonhosted.org/packages/21/b4/9983c1df90d239cc15055469c795a894bab85ffd75f9325d2f5e392dbf09/apache_tvm_ffi-0.1.3-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:71d1de0c139cae3824c1e8b511acf6b2bfd37deccfc640cb83b80ba17b33d6e3", size = 1719369, upload-time = "2025-11-21T05:10:24.768Z" },
+    { url = "https://files.pythonhosted.org/packages/01/e3/1b47af4391863351d9db42ab1ed116e3eba2c4ef49c1e161e4cd0ba379d9/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b0bc38da581c54c862840960c5bf0da5bb78aa007630d6f026675d1d4b1df898", size = 1867353, upload-time = "2025-11-21T05:10:26.481Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/6e/0d12246b90534be733accdfbfe6e2d5bde8d7c722293c21821fe10b09412/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:48160e8fa0235e8f3fad45102c4e856edb798c8b2954603f80f6721e3c0fd7ef", size = 1945829, upload-time = "2025-11-21T05:10:27.831Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/89/c4ad96b76a6e2d38795871bfb048c74aa60d1a7c01fab48cbe4e8c10f1a2/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b1c215d4608e17d7f2382f3c6b2903a4696255727ac905041f3a005c50a98afc", size = 1817481, upload-time = "2025-11-21T05:10:29.543Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/c7/2f6bc83fcc987c2eb00037c3f27f1d182c2f0d8976a16807ef1395a8ece1/apache_tvm_ffi-0.1.3-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b75cc773bc29db64bb69f11d260ec66e88ad0a4a951d25650f69d3b2c9f9a186", size = 1927595, upload-time = "2025-11-21T05:10:30.882Z" },
+    { url = "https://files.pythonhosted.org/packages/12/a0/597c522588abef7fcf3fe38492cf832eed8ba9123f01d3c33dfaec174dcc/apache_tvm_ffi-0.1.3-cp312-abi3-win_amd64.whl", hash = "sha256:86fd1e1012ec2ec25213f714f5f28e6f6b897360776872d5f71c4be8cae8aeb8", size = 1706236, upload-time = "2025-11-21T05:10:32.25Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/76/8404875ee3fb61a3c97026e2eaab8d97e7f974601e444d5abb37a765c686/apache_tvm_ffi-0.1.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0ef290a792d6e3734e2fe1ff19b2b82e6bd3af6714216c7fe32d0a39c0d0e8df", size = 1750006, upload-time = "2025-11-21T05:10:33.594Z" },
+    { url = "https://files.pythonhosted.org/packages/98/98/7989ccb343044f97491cb1e46e675da75defc82a56495c320dcb1e31583b/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c7b137ab0c7ec6507f61e88885ddbd3541d7d14d8ca25938f5fa106ca06996d3", size = 1880792, upload-time = "2025-11-21T05:10:35.239Z" },
+    { url = "https://files.pythonhosted.org/packages/64/2e/f772e75f947ebfa2faa305980ba2c172ae26a53f66c8f0c1f8915c4fa690/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d5187a90cf1c0663b8071f34f621f49ba83866412298deed9c4a94d1d991711b", size = 1953343, upload-time = "2025-11-21T05:10:36.879Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/a8/7d1d75f70d5a2cd283ded60784d9657c59fa7516f4b3c32437f70901d117/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54001ceab111e708a1638fd9e40713d9d55f6a073037a2d4a9f1982f8dda3c69", size = 1829560, upload-time = "2025-11-21T05:10:38.421Z" },
+    { url = "https://files.pythonhosted.org/packages/21/3a/6bee12cf517ace0bb8fd83bb72f6ca227743a49bab0c30918f523b5428df/apache_tvm_ffi-0.1.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:996d87d6f180250e734ce7b7cce39f234e3ad3369fffb3882c8f29c79d280db4", size = 1937457, upload-time = "2025-11-21T05:10:40.505Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/99/107f082536447dba2a628e1571dd423b577df6bd8e441896e3f8b0929001/apache_tvm_ffi-0.1.3-cp314-cp314t-win_amd64.whl", hash = "sha256:6010c918c62fb19995e70c4f149dfc5c248783da0d22d5c40e84649bd89a9357", size = 1766053, upload-time = "2025-11-21T05:10:41.859Z" },
+]
 
 [[package]]
 name = "astroid"
@@ -379,52 +339,59 @@ wheels = [
 
 [[package]]
 name = "av"
-version = "15.1.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e9/c3/83e6e73d1592bc54436eae0bc61704ae0cff0c3cfbde7b58af9ed67ebb49/av-15.1.0.tar.gz", hash = "sha256:39cda2dc810e11c1938f8cb5759c41d6b630550236b3365790e67a313660ec85", size = 3774192, upload-time = "2025-08-30T04:41:56.076Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3a/6a/91e3e68ae0d1b53b480ec69a96f2ae820fb007bc60e6b821741f31c7ba4e/av-15.1.0-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:cf067b66cee2248220b29df33b60eb4840d9e7b9b75545d6b922f9c41d88c4ee", size = 21781685, upload-time = "2025-08-30T04:39:13.118Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/6d/afa951b9cb615c3bc6d95c4eed280c6cefb52c006f4e15e79043626fab39/av-15.1.0-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:26426163d96fc3bde9a015ba4d60da09ef848d9284fe79b4ca5e60965a008fc5", size = 26962481, upload-time = "2025-08-30T04:39:16.875Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/42/0c384884235c42c439cef28cbd129e4624ad60229119bf3c6c6020805119/av-15.1.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:92f524541ce74b8a12491d8934164a5c57e983da24826547c212f60123de400b", size = 37571839, upload-time = "2025-08-30T04:39:20.325Z" },
-    { url = "https://files.pythonhosted.org/packages/25/c0/5c967b0872fce1add80a8f50fa7ce11e3e3e5257c2b079263570bc854699/av-15.1.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:659f9d6145fb2c58e8b31907283b6ba876570f5dd6e7e890d74c09614c436c8e", size = 39070227, upload-time = "2025-08-30T04:39:24.079Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/81/e333056d49363c35a74b828ed5f87c96dfbcc1a506b49d79a31ac773b94d/av-15.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:07a8ae30c0cfc3132eff320a6b27d18a5e0dda36effd0ae28892888f4ee14729", size = 39619362, upload-time = "2025-08-30T04:39:27.7Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/ae/50cc2af1bf68452cbfec8d1b2554c18f6d167c8ba6d7ad7707797dfd1541/av-15.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e33a76e38f03bb5de026b9f66ccf23dc01ddd2223221096992cb52ac22e62538", size = 40371627, upload-time = "2025-08-30T04:39:31.207Z" },
-    { url = "https://files.pythonhosted.org/packages/50/e6/381edf1779106dd31c9ef1ac9842f643af4465b8a87cbc278d3eaa76229a/av-15.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:aa4bf12bdce20edc2a3b13a2776c474c5ab63e1817d53793714504476eeba82e", size = 31340369, upload-time = "2025-08-30T04:39:34.774Z" },
-    { url = "https://files.pythonhosted.org/packages/47/58/4e44cf6939be7aba96a4abce024e1be11ba7539ecac74d09369b8c03aa05/av-15.1.0-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b785948762a8d45fc58fc24a20251496829ace1817e9a7a508a348d6de2182c3", size = 21767323, upload-time = "2025-08-30T04:39:37.989Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/f6/a946544cdb49f6d892d2761b1d61a8bc6ce912fe57ba06769bdc640c0a7f/av-15.1.0-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:9c7131494a3a318612b4ee4db98fe5bc50eb705f6b6536127c7ab776c524fd8b", size = 26946268, upload-time = "2025-08-30T04:39:40.601Z" },
-    { url = "https://files.pythonhosted.org/packages/70/7c/b33513c0af73d0033af59a98f035b521c5b93445a6af7e9efbf41a6e8383/av-15.1.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2b9623ae848625c59213b610c8665817924f913580c7c5c91e0dc18936deb00d", size = 38062118, upload-time = "2025-08-30T04:39:43.928Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/95/31b7fb34f9fea7c7389240364194f4f56ad2d460095038cc720f50a90bb3/av-15.1.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c8ef597087db560514617143532b1fafc4825ebb2dda9a22418f548b113a0cc7", size = 39571086, upload-time = "2025-08-30T04:39:47.109Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/b0/7b0b45474a4e90c35c11d0032947d8b3c7386872957ce29c6f12add69a74/av-15.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:08eac47a90ebae1e2bd5935f400dd515166019bab4ff5b03c4625fa6ac3a0a5e", size = 40112634, upload-time = "2025-08-30T04:39:50.981Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/04/038b94bc9a1ee10a451c867d4a2fc91e845f83bfc2dae9df25893abcb57f/av-15.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d3f66ff200ea166e606cb3c5cb1bd2fc714effbec2e262a5d67ce60450c8234a", size = 40878695, upload-time = "2025-08-30T04:39:54.493Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/3d/9f8f96c0deeaaf648485a3dbd1699b2f0580f2ce8a36cb616c0138ba7615/av-15.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:57b99544d91121b8bea570e4ddf61700f679a6b677c1f37966bc1a22e1d4cd5c", size = 31335683, upload-time = "2025-08-30T04:39:57.861Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/58/de78b276d20db6ffcd4371283df771721a833ba525a3d57e753d00a9fe79/av-15.1.0-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:40c5df37f4c354ab8190c6fd68dab7881d112f527906f64ca73da4c252a58cee", size = 21760991, upload-time = "2025-08-30T04:40:00.801Z" },
-    { url = "https://files.pythonhosted.org/packages/56/cc/45f85775304ae60b66976360d82ba5b152ad3fd91f9267d5020a51e9a828/av-15.1.0-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:af455ce65ada3d361f80c90c810d9bced4db5655ab9aa513024d6c71c5c476d5", size = 26953097, upload-time = "2025-08-30T04:40:03.998Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/f8/2d781e5e71d02fc829487e775ccb1185e72f95340d05f2e84eb57a11e093/av-15.1.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86226d2474c80c3393fa07a9c366106029ae500716098b72b3ec3f67205524c3", size = 38319710, upload-time = "2025-08-30T04:40:07.701Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/13/37737ef2193e83862ccacff23580c39de251da456a1bf0459e762cca273c/av-15.1.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:11326f197e7001c4ca53a83b2dbc67fd39ddff8cdf62ce6be3b22d9f3f9338bd", size = 39915519, upload-time = "2025-08-30T04:40:11.066Z" },
-    { url = "https://files.pythonhosted.org/packages/26/e9/e8032c7b8f2a4129a03f63f896544f8b7cf068e2db2950326fa2400d5c47/av-15.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a631ea879cc553080ee62874f4284765c42ba08ee0279851a98a85e2ceb3cc8d", size = 40286166, upload-time = "2025-08-30T04:40:14.561Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/23/612c0fd809444d04b8387a2dfd942ccc77829507bd78a387ff65a9d98c24/av-15.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8f383949b010c3e731c245f80351d19dc0c08f345e194fc46becb1cb279be3ff", size = 41150592, upload-time = "2025-08-30T04:40:17.951Z" },
-    { url = "https://files.pythonhosted.org/packages/15/74/6f8e38a3b0aea5f28e72813672ff45b64615f2c69e6a4a558718c95edb9f/av-15.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d5921aa45f4c1f8c1a8d8185eb347e02aa4c3071278a2e2dd56368d54433d643", size = 31336093, upload-time = "2025-08-30T04:40:21.393Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/bc/78b2ffa8235eeffc29aa4a8cc47b02e660cfec32f601f39a00975fb06d0e/av-15.1.0-cp313-cp313-macosx_13_0_arm64.whl", hash = "sha256:2f77853c3119c59d1bff4214ccbe46e3133eccff85ed96adee51c68684443f4e", size = 21726244, upload-time = "2025-08-30T04:40:24.14Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/99/66d69453a2dce028e6e8ebea085d90e880aac03d3a3ab7d8ec16755ffd75/av-15.1.0-cp313-cp313-macosx_13_0_x86_64.whl", hash = "sha256:c0bc4471c156a0a1c70a607502434f477bc8dfe085eef905e55b4b0d66bcd3a5", size = 26918663, upload-time = "2025-08-30T04:40:27.557Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/51/1a7dfbeda71f2772bc46d758af0e7fab1cc8388ce4bc7f24aecbc4bfd764/av-15.1.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:37839d4fa1407f047af82560dfc0f94d8d6266071eff49e1cbe16c4483054621", size = 38041408, upload-time = "2025-08-30T04:40:30.811Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/97/2c4e0288ad4359b6064cb06ae79c2ff3a84ac73d27e91f2161b75fcd86fa/av-15.1.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:729179cd8622815e8b6f6854d13a806fe710576e08895c77e5e4ad254609de9a", size = 39642563, upload-time = "2025-08-30T04:40:34.617Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/94/2362502149e276d00957edabcc201a5f4d5109a8a7b4fd30793714a532f3/av-15.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4abdf085bfa4eec318efccff567831b361ea56c045cc38366811552e3127c665", size = 40022119, upload-time = "2025-08-30T04:40:37.703Z" },
-    { url = "https://files.pythonhosted.org/packages/df/58/1a0ce1b3835d9728da0a7a54aeffaa0a2b1a88405eaed9322efd55212a54/av-15.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f985661644879e4520d28a995fcb2afeb951bc15a1d51412eb8e5f36da85b6fe", size = 40885158, upload-time = "2025-08-30T04:40:40.952Z" },
-    { url = "https://files.pythonhosted.org/packages/30/e6/054bb64e424d90b77ed5fc6a7358e4013fb436154c998fc90a89a186313f/av-15.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:7d7804a44c8048bb4b014a99353dd124663a12cd1d4613ba2bd3b457c3b1d539", size = 31312256, upload-time = "2025-08-30T04:40:44.224Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/8b/89eae6dca10d7d2b83c131025a31ccc750be78699ac0304439faa1d1df99/av-15.1.0-cp314-cp314-macosx_13_0_arm64.whl", hash = "sha256:5dd73c6447947edcb82e5fecf96e1f146aeda0f169c7ad4c54df4d9f66f63fde", size = 21730645, upload-time = "2025-08-30T04:40:47.259Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/f0/abffaf69405ed68041524be12a1e294faf396971d6a0e70eb00e93687df7/av-15.1.0-cp314-cp314-macosx_13_0_x86_64.whl", hash = "sha256:a81cd515934a5d51290aa66b059b7ed29c4a212e704f3c5e99e32877ff1c312c", size = 26913753, upload-time = "2025-08-30T04:40:50.445Z" },
-    { url = "https://files.pythonhosted.org/packages/37/9e/7af078bcfc3cd340c981ac5d613c090ab007023d2ac13b05acd52f22f069/av-15.1.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:57cc7a733a7e7d7a153682f35c9cf5d01e8269367b049c954779de36fc3d0b10", size = 38027048, upload-time = "2025-08-30T04:40:54.076Z" },
-    { url = "https://files.pythonhosted.org/packages/02/76/1f9dac11ad713e3619288993ea04e9c9cf4ec0f04e5ee81e83b8129dd8f3/av-15.1.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:a77b75bdb6899a64302ff923a5246e0747b3f0a3ecee7d61118db407a22c3f53", size = 39565396, upload-time = "2025-08-30T04:40:57.84Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/32/2188c46e2747247458ffc26b230c57dd28e61f65ff7b9e6223a411af5e98/av-15.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d0a1154ce081f1720082a133cfe12356c59f62dad2b93a7a1844bf1dcd010d85", size = 40015050, upload-time = "2025-08-30T04:41:01.091Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/41/b57fbce9994580619d7574817ece0fe0e7b822cde2af57904549d0150b8d/av-15.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8a7bf5a34dee15c86790414fa86a144e6d0dcc788bc83b565fdcbc080b4fbc90", size = 40821225, upload-time = "2025-08-30T04:41:04.349Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/36/e85cd1f0d3369c6764ad422882895d082f7ececb66d3df8aeae3234ef7a6/av-15.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:e30c9a6fd9734784941384a2e25fad3c22881a7682f378914676aa7e795acdb7", size = 31311750, upload-time = "2025-08-30T04:41:07.744Z" },
-    { url = "https://files.pythonhosted.org/packages/80/d8/08a681758a4e49adfda409a6a35eff533f42654c6a6cfa102bc5cae1a728/av-15.1.0-cp314-cp314t-macosx_13_0_arm64.whl", hash = "sha256:60666833d7e65ebcfc48034a072de74349edbb62c9aaa3e6722fef31ca028eb6", size = 21828343, upload-time = "2025-08-30T04:41:10.81Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/52/29bec3fe68669b21f7d1ab5d94e21f597b8dfd37f50a3e3c9af6a8da925c/av-15.1.0-cp314-cp314t-macosx_13_0_x86_64.whl", hash = "sha256:53fbdae45aa2a49a22e864ff4f4017416ef62c060a172085d3247ba0a101104e", size = 27001666, upload-time = "2025-08-30T04:41:13.822Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/54/2c1d1faced66d708f5df328e800997cb47f90b500a214130c3a0f2ad601e/av-15.1.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:e6c51061667983dc801502aff9140bbc4f0e0d97f879586f17fb2f9a7e49c381", size = 39496753, upload-time = "2025-08-30T04:41:16.759Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/76/06ded5e52c4dcc2d9b5184c6da8de5ea77bd7ecb79a59a2b9700f1984949/av-15.1.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:2f80ec387f04aa34868662b11018b5f09654ae1530a61e24e92a142a24b10b62", size = 40784729, upload-time = "2025-08-30T04:41:20.491Z" },
-    { url = "https://files.pythonhosted.org/packages/52/ef/797b76f3b39c99a96e387f501bbc07dca340b27d3dda12862fe694066b63/av-15.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4975e03177d37d8165c99c8d494175675ba8acb72458fb5d7e43f746a53e0374", size = 41284953, upload-time = "2025-08-30T04:41:23.949Z" },
-    { url = "https://files.pythonhosted.org/packages/31/47/e4656f00e62fd059ea5a40b492dea784f5aecfe1dfac10c0d7a0664ce200/av-15.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8f78f3dad11780b4cdd024cdb92ce43cb170929297c00f2f4555c2b103f51e55", size = 41985340, upload-time = "2025-08-30T04:41:27.561Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/c9/15bb4fd7a1f39d70db35af2b9c20a0ae19e4220eb58a8b8446e903b98d72/av-15.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9a20c5eba3ec49c2f4b281797021923fc68a86aeb66c5cda4fd0252fa8004951", size = 31487337, upload-time = "2025-08-30T04:41:30.591Z" },
+version = "16.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/15/c3/fd72a0315bc6c943ced1105aaac6e0ec1be57c70d8a616bd05acaa21ffee/av-16.0.1.tar.gz", hash = "sha256:dd2ce779fa0b5f5889a6d9e00fbbbc39f58e247e52d31044272648fe16ff1dbf", size = 3904030, upload-time = "2025-10-13T12:28:51.082Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e8/3c/eefa29b7d0f5afdf7af9197bbecad8ec2ad06bcb5ac7e909c05a624b00a6/av-16.0.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:8b141aaa29a3afc96a1d467d106790782c1914628b57309eaadb8c10c299c9c0", size = 27206679, upload-time = "2025-10-13T12:24:41.145Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/89/a474feb07d5b94aa5af3771b0fe328056e2e0a840039b329f4fa2a1fd13a/av-16.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:4b8a08a59a5be0082af063d3f4b216e3950340121c6ea95b505a3f5f5cc8f21d", size = 21774556, upload-time = "2025-10-13T12:24:44.332Z" },
+    { url = "https://files.pythonhosted.org/packages/be/e5/4361010dcac398bc224823e4b2a47803845e159af9f95164662c523770dc/av-16.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:792e7fc3c08eae005ff36486983966476e553cbb55aaeb0ec99adc4909377320", size = 38176763, upload-time = "2025-10-13T12:24:46.98Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/db/b27bdd20c9dc80de5b8792dae16dd6f4edf16408c0c7b28070c6228a8057/av-16.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:4e8ef5df76d8d0ee56139789f80bb90ad1a82a7e6df6e080e2e95c06fa22aea7", size = 39696277, upload-time = "2025-10-13T12:24:50.951Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/c8/dd48e6a3ac1e922c141475a0dc30e2b6dfdef9751b3274829889a9281cce/av-16.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4f7a6985784a7464f078e419c71f5528c3e550ee5d605e7149b4a37a111eb136", size = 39576660, upload-time = "2025-10-13T12:24:55.773Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/f0/223d047e2e60672a2fb5e51e28913de8d52195199f3e949cbfda1e6cd64b/av-16.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3f45c8d7b803b6faa2a25a26de5964a0a897de68298d9c9672c7af9d65d8b48a", size = 40752775, upload-time = "2025-10-13T12:25:00.827Z" },
+    { url = "https://files.pythonhosted.org/packages/18/73/73acad21c9203bc63d806e8baf42fe705eb5d36dafd1996b71ab5861a933/av-16.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:58e6faf1d9328d8cc6be14c5aadacb7d2965ed6d6ae1af32696993096543ff00", size = 32302328, upload-time = "2025-10-13T12:25:06.042Z" },
+    { url = "https://files.pythonhosted.org/packages/49/d3/f2a483c5273fccd556dfa1fce14fab3b5d6d213b46e28e54e254465a2255/av-16.0.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:e310d1fb42879df9bad2152a8db6d2ff8bf332c8c36349a09d62cc122f5070fb", size = 27191982, upload-time = "2025-10-13T12:25:10.622Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/39/dff28bd252131b3befd09d8587992fe18c09d5125eaefc83a6434d5f56ff/av-16.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:2f4b357e5615457a84e6b6290916b22864b76b43d5079e1a73bc27581a5b9bac", size = 21760305, upload-time = "2025-10-13T12:25:14.882Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/4d/2312d50a09c84a9b4269f7fea5de84f05dd2b7c7113dd961d31fad6c64c4/av-16.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:286665c77034c3a98080169b8b5586d5568a15da81fbcdaf8099252f2d232d7c", size = 38691616, upload-time = "2025-10-13T12:25:20.063Z" },
+    { url = "https://files.pythonhosted.org/packages/15/9a/3d2d30b56252f998e53fced13720e2ce809c4db477110f944034e0fa4c9f/av-16.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f88de8e5b8ea29e41af4d8d61df108323d050ccfbc90f15b13ec1f99ce0e841e", size = 40216464, upload-time = "2025-10-13T12:25:24.848Z" },
+    { url = "https://files.pythonhosted.org/packages/98/cb/3860054794a47715b4be0006105158c7119a57be58d9e8882b72e4d4e1dd/av-16.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0cdb71ebe4d1b241cf700f8f0c44a7d2a6602b921e16547dd68c0842113736e1", size = 40094077, upload-time = "2025-10-13T12:25:30.238Z" },
+    { url = "https://files.pythonhosted.org/packages/41/58/79830fb8af0a89c015250f7864bbd427dff09c70575c97847055f8a302f7/av-16.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:28c27a65d40e8cf82b6db2543f8feeb8b56d36c1938f50773494cd3b073c7223", size = 41279948, upload-time = "2025-10-13T12:25:35.24Z" },
+    { url = "https://files.pythonhosted.org/packages/83/79/6e1463b04382f379f857113b851cf5f9d580a2f7bd794211cd75352f4e04/av-16.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ffea39ac7574f234f5168f9b9602e8d4ecdd81853238ec4d661001f03a6d3f64", size = 32297586, upload-time = "2025-10-13T12:25:39.826Z" },
+    { url = "https://files.pythonhosted.org/packages/44/78/12a11d7a44fdd8b26a65e2efa1d8a5826733c8887a989a78306ec4785956/av-16.0.1-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:e41a8fef85dfb2c717349f9ff74f92f9560122a9f1a94b1c6c9a8a9c9462ba71", size = 27206375, upload-time = "2025-10-13T12:25:44.423Z" },
+    { url = "https://files.pythonhosted.org/packages/27/19/3a4d3882852a0ee136121979ce46f6d2867b974eb217a2c9a070939f55ad/av-16.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:6352a64b25c9f985d4f279c2902db9a92424e6f2c972161e67119616f0796cb9", size = 21752603, upload-time = "2025-10-13T12:25:49.122Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/6e/f7abefba6e008e2f69bebb9a17ba38ce1df240c79b36a5b5fcacf8c8fcfd/av-16.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:5201f7b4b5ed2128118cb90c2a6d64feedb0586ca7c783176896c78ffb4bbd5c", size = 38931978, upload-time = "2025-10-13T12:25:55.021Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/7a/1305243ab47f724fdd99ddef7309a594e669af7f0e655e11bdd2c325dfae/av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:daecc2072b82b6a942acbdaa9a2e00c05234c61fef976b22713983c020b07992", size = 40549383, upload-time = "2025-10-13T12:26:00.897Z" },
+    { url = "https://files.pythonhosted.org/packages/32/b2/357cc063185043eb757b4a48782bff780826103bcad1eb40c3ddfc050b7e/av-16.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6573da96e8bebc3536860a7def108d7dbe1875c86517072431ced702447e6aea", size = 40241993, upload-time = "2025-10-13T12:26:06.993Z" },
+    { url = "https://files.pythonhosted.org/packages/20/bb/ced42a4588ba168bf0ef1e9d016982e3ba09fde6992f1dda586fd20dcf71/av-16.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4bc064e48a8de6c087b97dd27cf4ef8c13073f0793108fbce3ecd721201b2502", size = 41532235, upload-time = "2025-10-13T12:26:12.488Z" },
+    { url = "https://files.pythonhosted.org/packages/15/37/c7811eca0f318d5fd3212f7e8c3d8335f75a54907c97a89213dc580b8056/av-16.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0c669b6b6668c8ae74451c15ec6d6d8a36e4c3803dc5d9910f607a174dd18f17", size = 32296912, upload-time = "2025-10-13T12:26:19.187Z" },
+    { url = "https://files.pythonhosted.org/packages/86/59/972f199ccc4f8c9e51f59e0f8962a09407396b3f6d11355e2c697ba555f9/av-16.0.1-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:4c61c6c120f5c5d95c711caf54e2c4a9fb2f1e613ac0a9c273d895f6b2602e44", size = 27170433, upload-time = "2025-10-13T12:26:24.673Z" },
+    { url = "https://files.pythonhosted.org/packages/53/9d/0514cbc185fb20353ab25da54197fbd169a233e39efcbb26533c36a9dbb9/av-16.0.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ecc2e41320c69095f44aff93470a0d32c30892b2dbad0a08040441c81efa379", size = 21717654, upload-time = "2025-10-13T12:26:29.12Z" },
+    { url = "https://files.pythonhosted.org/packages/32/8c/881409dd124b4e07d909d2b70568acb21126fc747656390840a2238651c9/av-16.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:036f0554d6faef3f4a94acaeb0cedd388e3ab96eb0eb5a14ec27c17369c466c9", size = 38651601, upload-time = "2025-10-13T12:26:33.919Z" },
+    { url = "https://files.pythonhosted.org/packages/35/fd/867ba4cc3ab504442dc89b0c117e6a994fc62782eb634c8f31304586f93e/av-16.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:876415470a62e4a3550cc38db2fc0094c25e64eea34d7293b7454125d5958190", size = 40278604, upload-time = "2025-10-13T12:26:39.2Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/87/63cde866c0af09a1fa9727b4f40b34d71b0535785f5665c27894306f1fbc/av-16.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:56902a06bd0828d13f13352874c370670882048267191ff5829534b611ba3956", size = 39984854, upload-time = "2025-10-13T12:26:44.581Z" },
+    { url = "https://files.pythonhosted.org/packages/71/3b/8f40a708bff0e6b0f957836e2ef1f4d4429041cf8d99a415a77ead8ac8a3/av-16.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe988c2bf0fc2d952858f791f18377ea4ae4e19ba3504793799cd6c2a2562edf", size = 41270352, upload-time = "2025-10-13T12:26:50.817Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/b5/c114292cb58a7269405ae13b7ba48c7d7bfeebbb2e4e66c8073c065a4430/av-16.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:708a66c248848029bf518f0482b81c5803846f1b597ef8013b19c014470b620f", size = 32273242, upload-time = "2025-10-13T12:26:55.788Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/e9/a5b714bc078fdcca8b46c8a0b38484ae5c24cd81d9c1703d3e8ae2b57259/av-16.0.1-cp313-cp313t-macosx_11_0_x86_64.whl", hash = "sha256:79a77ee452537030c21a0b41139bedaf16629636bf764b634e93b99c9d5f4558", size = 27248984, upload-time = "2025-10-13T12:27:00.564Z" },
+    { url = "https://files.pythonhosted.org/packages/06/ef/ff777aaf1f88e3f6ce94aca4c5806a0c360e68d48f9d9f0214e42650f740/av-16.0.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:080823a6ff712f81e7089ae9756fb1512ca1742a138556a852ce50f58e457213", size = 21828098, upload-time = "2025-10-13T12:27:05.433Z" },
+    { url = "https://files.pythonhosted.org/packages/34/d7/a484358d24a42bedde97f61f5d6ee568a7dd866d9df6e33731378db92d9e/av-16.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:04e00124afa8b46a850ed48951ddda61de874407fb8307d6a875bba659d5727e", size = 40051697, upload-time = "2025-10-13T12:27:10.525Z" },
+    { url = "https://files.pythonhosted.org/packages/73/87/6772d6080837da5d5c810a98a95bde6977e1f5a6e2e759e8c9292af9ec69/av-16.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:bc098c1c6dc4e7080629a7e9560e67bd4b5654951e17e5ddfd2b1515cfcd37db", size = 41352596, upload-time = "2025-10-13T12:27:16.217Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/58/fe448c60cf7f85640a0ed8936f16bac874846aa35e1baa521028949c1ea3/av-16.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e6ffd3559a72c46a76aa622630751a821499ba5a780b0047ecc75105d43a6b61", size = 41183156, upload-time = "2025-10-13T12:27:21.574Z" },
+    { url = "https://files.pythonhosted.org/packages/85/c6/a039a0979d0c278e1bed6758d5a6186416c3ccb8081970df893fdf9a0d99/av-16.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7a3f1a36b550adadd7513f4f5ee956f9e06b01a88e59f3150ef5fec6879d6f79", size = 42302331, upload-time = "2025-10-13T12:27:26.953Z" },
+    { url = "https://files.pythonhosted.org/packages/18/7b/2ca4a9e3609ff155436dac384e360f530919cb1e328491f7df294be0f0dc/av-16.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:c6de794abe52b8c0be55d8bb09ade05905efa74b1a5ab4860b4b9c2bfb6578bf", size = 32462194, upload-time = "2025-10-13T12:27:32.942Z" },
+    { url = "https://files.pythonhosted.org/packages/14/9a/6d17e379906cf53a7a44dfac9cf7e4b2e7df2082ba2dbf07126055effcc1/av-16.0.1-cp314-cp314-macosx_11_0_x86_64.whl", hash = "sha256:4b55ba69a943ae592ad7900da67129422954789de9dc384685d6b529925f542e", size = 27167101, upload-time = "2025-10-13T12:27:38.886Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/34/891816cd82d5646cb5a51d201d20be0a578232536d083b7d939734258067/av-16.0.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:d4a0c47b6c9bbadad8909b82847f5fe64a608ad392f0b01704e427349bcd9a47", size = 21722708, upload-time = "2025-10-13T12:27:43.29Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/20/c24ad34038423ab8c9728cef3301e0861727c188442dcfd70a4a10834c63/av-16.0.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:8bba52f3035708456f6b1994d10b0371b45cfd8f917b5e84ff81aef4ec2f08bf", size = 38638842, upload-time = "2025-10-13T12:27:49.776Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/32/034412309572ba3ad713079d07a3ffc13739263321aece54a3055d7a4f1f/av-16.0.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:08e34c7e7b5e55e29931180bbe21095e1874ac120992bf6b8615d39574487617", size = 40197789, upload-time = "2025-10-13T12:27:55.688Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/9c/40496298c32f9094e7df28641c5c58aa6fb07554dc232a9ac98a9894376f/av-16.0.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0d6250ab9db80c641b299987027c987f14935ea837ea4c02c5f5182f6b69d9e5", size = 39980829, upload-time = "2025-10-13T12:28:01.507Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/7e/5c38268ac1d424f309b13b2de4597ad28daea6039ee5af061e62918b12a8/av-16.0.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7b621f28d8bcbb07cdcd7b18943ddc040739ad304545715ae733873b6e1b739d", size = 41205928, upload-time = "2025-10-13T12:28:08.431Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/07/3176e02692d8753a6c4606021c60e4031341afb56292178eee633b6760a4/av-16.0.1-cp314-cp314-win_amd64.whl", hash = "sha256:92101f49082392580c9dba4ba2fe5b931b3bb0fb75a1a848bfb9a11ded68be91", size = 32272836, upload-time = "2025-10-13T12:28:13.405Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/47/10e03b88de097385d1550cbb6d8de96159131705c13adb92bd9b7e677425/av-16.0.1-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:07c464bf2bc362a154eccc82e235ef64fd3aaf8d76fc8ed63d0ae520943c6d3f", size = 27248864, upload-time = "2025-10-13T12:28:17.467Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/60/7447f206bec3e55e81371f1989098baa2fe9adb7b46c149e6937b7e7c1ca/av-16.0.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:750da0673864b669c95882c7b25768cd93ece0e47010d74ebcc29dbb14d611f8", size = 21828185, upload-time = "2025-10-13T12:28:21.461Z" },
+    { url = "https://files.pythonhosted.org/packages/68/48/ee2680e7a01bc4911bbe902b814346911fa2528697a44f3043ee68e0f07e/av-16.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0b7c0d060863b2e341d07cd26851cb9057b7979814148b028fb7ee5d5eb8772d", size = 40040572, upload-time = "2025-10-13T12:28:26.585Z" },
+    { url = "https://files.pythonhosted.org/packages/da/68/2c43d28871721ae07cde432d6e36ae2f7035197cbadb43764cc5bf3d4b33/av-16.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e67c2eca6023ca7d76b0709c5f392b23a5defba499f4c262411f8155b1482cbd", size = 41344288, upload-time = "2025-10-13T12:28:32.512Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/7f/1d801bff43ae1af4758c45eee2eaae64f303bbb460e79f352f08587fd179/av-16.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e3243d54d84986e8fbdc1946db634b0c41fe69b6de35a99fa8b763e18503d040", size = 41175142, upload-time = "2025-10-13T12:28:38.356Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/06/bb363138687066bbf8997c1433dbd9c81762bae120955ea431fb72d69d26/av-16.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bcf73efab5379601e6510abd7afe5f397d0f6defe69b1610c2f37a4a17996b", size = 42293932, upload-time = "2025-10-13T12:28:43.442Z" },
+    { url = "https://files.pythonhosted.org/packages/92/15/5e713098a085f970ccf88550194d277d244464d7b3a7365ad92acb4b6dc1/av-16.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6368d4ff153d75469d2a3217bc403630dc870a72fe0a014d9135de550d731a86", size = 32460624, upload-time = "2025-10-13T12:28:48.767Z" },
 ]
 
 [[package]]
@@ -667,16 +634,16 @@ wheels = [
 
 [[package]]
 name = "botocore"
-version = "1.40.61"
+version = "1.41.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jmespath" },
     { name = "python-dateutil" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/28/a3/81d3a47c2dbfd76f185d3b894f2ad01a75096c006a2dd91f237dca182188/botocore-1.40.61.tar.gz", hash = "sha256:a2487ad69b090f9cccd64cf07c7021cd80ee9c0655ad974f87045b02f3ef52cd", size = 14393956, upload-time = "2025-10-28T19:26:46.108Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/90/22/7fe08c726a2e3b11a0aef8bf177e83891c9cb2dc1809d35c9ed91a9e60e6/botocore-1.41.5.tar.gz", hash = "sha256:0367622b811597d183bfcaab4a350f0d3ede712031ce792ef183cabdee80d3bf", size = 14668152, upload-time = "2025-11-26T20:27:38.026Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/38/c5/f6ce561004db45f0b847c2cd9b19c67c6bf348a82018a48cb718be6b58b0/botocore-1.40.61-py3-none-any.whl", hash = "sha256:17ebae412692fd4824f99cde0f08d50126dc97954008e5ba2b522eb049238aa7", size = 14055973, upload-time = "2025-10-28T19:26:42.15Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/4e/21cd0b8f365449f1576f93de1ec8718ed18a7a3bc086dfbdeb79437bba7a/botocore-1.41.5-py3-none-any.whl", hash = "sha256:3fef7fcda30c82c27202d232cfdbd6782cb27f20f8e7e21b20606483e66ee73a", size = 14337008, upload-time = "2025-11-26T20:27:35.208Z" },
 ]
 
 [[package]]
@@ -719,11 +686,11 @@ sdist = { url = "https://files.pythonhosted.org/packages/64/cb/104778c728dc3d5ea
 
 [[package]]
 name = "certifi"
-version = "2025.10.5"
+version = "2025.11.12"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4c/5b/b6ce21586237c77ce67d01dc5507039d444b630dd76611bbca2d8e5dcd91/certifi-2025.10.5.tar.gz", hash = "sha256:47c09d31ccf2acf0be3f701ea53595ee7e0b8fa08801c6624be771df09ae7b43", size = 164519, upload-time = "2025-10-05T04:12:15.808Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e4/37/af0d2ef3967ac0d6113837b44a4f0bfe1328c2b9763bd5b1744520e5cfed/certifi-2025.10.5-py3-none-any.whl", hash = "sha256:0f212c2744a9bb6de0c56639a6f68afe01ecd92d91f14ae897c4fe7bbeeef0de", size = 163286, upload-time = "2025-10-05T04:12:14.03Z" },
+    { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" },
 ]
 
 [[package]]
@@ -899,14 +866,14 @@ wheels = [
 
 [[package]]
 name = "click"
-version = "8.3.0"
+version = "8.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "colorama", marker = "sys_platform == 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" },
+    { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" },
 ]
 
 [[package]]
@@ -938,101 +905,101 @@ wheels = [
 
 [[package]]
 name = "coverage"
-version = "7.11.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/1c/38/ee22495420457259d2f3390309505ea98f98a5eed40901cf62196abad006/coverage-7.11.0.tar.gz", hash = "sha256:167bd504ac1ca2af7ff3b81d245dfea0292c5032ebef9d66cc08a7d28c1b8050", size = 811905, upload-time = "2025-10-15T15:15:08.542Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/12/95/c49df0aceb5507a80b9fe5172d3d39bf23f05be40c23c8d77d556df96cec/coverage-7.11.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eb53f1e8adeeb2e78962bade0c08bfdc461853c7969706ed901821e009b35e31", size = 215800, upload-time = "2025-10-15T15:12:19.824Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/c6/7bb46ce01ed634fff1d7bb53a54049f539971862cc388b304ff3c51b4f66/coverage-7.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d9a03ec6cb9f40a5c360f138b88266fd8f58408d71e89f536b4f91d85721d075", size = 216198, upload-time = "2025-10-15T15:12:22.549Z" },
-    { url = "https://files.pythonhosted.org/packages/94/b2/75d9d8fbf2900268aca5de29cd0a0fe671b0f69ef88be16767cc3c828b85/coverage-7.11.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0d7f0616c557cbc3d1c2090334eddcbb70e1ae3a40b07222d62b3aa47f608fab", size = 242953, upload-time = "2025-10-15T15:12:24.139Z" },
-    { url = "https://files.pythonhosted.org/packages/65/ac/acaa984c18f440170525a8743eb4b6c960ace2dbad80dc22056a437fc3c6/coverage-7.11.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:e44a86a47bbdf83b0a3ea4d7df5410d6b1a0de984fbd805fa5101f3624b9abe0", size = 244766, upload-time = "2025-10-15T15:12:25.974Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/0d/938d0bff76dfa4a6b228c3fc4b3e1c0e2ad4aa6200c141fcda2bd1170227/coverage-7.11.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:596763d2f9a0ee7eec6e643e29660def2eef297e1de0d334c78c08706f1cb785", size = 246625, upload-time = "2025-10-15T15:12:27.387Z" },
-    { url = "https://files.pythonhosted.org/packages/38/54/8f5f5e84bfa268df98f46b2cb396b1009734cfb1e5d6adb663d284893b32/coverage-7.11.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ef55537ff511b5e0a43edb4c50a7bf7ba1c3eea20b4f49b1490f1e8e0e42c591", size = 243568, upload-time = "2025-10-15T15:12:28.799Z" },
-    { url = "https://files.pythonhosted.org/packages/68/30/8ba337c2877fe3f2e1af0ed7ff4be0c0c4aca44d6f4007040f3ca2255e99/coverage-7.11.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cbabd8f4d0d3dc571d77ae5bdbfa6afe5061e679a9d74b6797c48d143307088", size = 244665, upload-time = "2025-10-15T15:12:30.297Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/fb/c6f1d6d9a665536b7dde2333346f0cc41dc6a60bd1ffc10cd5c33e7eb000/coverage-7.11.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e24045453384e0ae2a587d562df2a04d852672eb63051d16096d3f08aa4c7c2f", size = 242681, upload-time = "2025-10-15T15:12:32.326Z" },
-    { url = "https://files.pythonhosted.org/packages/be/38/1b532319af5f991fa153c20373291dc65c2bf532af7dbcffdeef745c8f79/coverage-7.11.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:7161edd3426c8d19bdccde7d49e6f27f748f3c31cc350c5de7c633fea445d866", size = 242912, upload-time = "2025-10-15T15:12:34.079Z" },
-    { url = "https://files.pythonhosted.org/packages/67/3d/f39331c60ef6050d2a861dc1b514fa78f85f792820b68e8c04196ad733d6/coverage-7.11.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d4ed4de17e692ba6415b0587bc7f12bc80915031fc9db46a23ce70fc88c9841", size = 243559, upload-time = "2025-10-15T15:12:35.809Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/55/cb7c9df9d0495036ce582a8a2958d50c23cd73f84a23284bc23bd4711a6f/coverage-7.11.0-cp310-cp310-win32.whl", hash = "sha256:765c0bc8fe46f48e341ef737c91c715bd2a53a12792592296a095f0c237e09cf", size = 218266, upload-time = "2025-10-15T15:12:37.429Z" },
-    { url = "https://files.pythonhosted.org/packages/68/a8/b79cb275fa7bd0208767f89d57a1b5f6ba830813875738599741b97c2e04/coverage-7.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:24d6f3128f1b2d20d84b24f4074475457faedc3d4613a7e66b5e769939c7d969", size = 219169, upload-time = "2025-10-15T15:12:39.25Z" },
-    { url = "https://files.pythonhosted.org/packages/49/3a/ee1074c15c408ddddddb1db7dd904f6b81bc524e01f5a1c5920e13dbde23/coverage-7.11.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d58ecaa865c5b9fa56e35efc51d1014d4c0d22838815b9fce57a27dd9576847", size = 215912, upload-time = "2025-10-15T15:12:40.665Z" },
-    { url = "https://files.pythonhosted.org/packages/70/c4/9f44bebe5cb15f31608597b037d78799cc5f450044465bcd1ae8cb222fe1/coverage-7.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b679e171f1c104a5668550ada700e3c4937110dbdd153b7ef9055c4f1a1ee3cc", size = 216310, upload-time = "2025-10-15T15:12:42.461Z" },
-    { url = "https://files.pythonhosted.org/packages/42/01/5e06077cfef92d8af926bdd86b84fb28bf9bc6ad27343d68be9b501d89f2/coverage-7.11.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca61691ba8c5b6797deb221a0d09d7470364733ea9c69425a640f1f01b7c5bf0", size = 246706, upload-time = "2025-10-15T15:12:44.001Z" },
-    { url = "https://files.pythonhosted.org/packages/40/b8/7a3f1f33b35cc4a6c37e759137533119560d06c0cc14753d1a803be0cd4a/coverage-7.11.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:aef1747ede4bd8ca9cfc04cc3011516500c6891f1b33a94add3253f6f876b7b7", size = 248634, upload-time = "2025-10-15T15:12:45.768Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/41/7f987eb33de386bc4c665ab0bf98d15fcf203369d6aacae74f5dd8ec489a/coverage-7.11.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a1839d08406e4cba2953dcc0ffb312252f14d7c4c96919f70167611f4dee2623", size = 250741, upload-time = "2025-10-15T15:12:47.222Z" },
-    { url = "https://files.pythonhosted.org/packages/23/c1/a4e0ca6a4e83069fb8216b49b30a7352061ca0cb38654bd2dc96b7b3b7da/coverage-7.11.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e0eb0a2dcc62478eb5b4cbb80b97bdee852d7e280b90e81f11b407d0b81c4287", size = 246837, upload-time = "2025-10-15T15:12:48.904Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/03/ced062a17f7c38b4728ff76c3acb40d8465634b20b4833cdb3cc3a74e115/coverage-7.11.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bc1fbea96343b53f65d5351d8fd3b34fd415a2670d7c300b06d3e14a5af4f552", size = 248429, upload-time = "2025-10-15T15:12:50.73Z" },
-    { url = "https://files.pythonhosted.org/packages/97/af/a7c6f194bb8c5a2705ae019036b8fe7f49ea818d638eedb15fdb7bed227c/coverage-7.11.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:214b622259dd0cf435f10241f1333d32caa64dbc27f8790ab693428a141723de", size = 246490, upload-time = "2025-10-15T15:12:52.646Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/c3/aab4df02b04a8fde79068c3c41ad7a622b0ef2b12e1ed154da986a727c3f/coverage-7.11.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:258d9967520cca899695d4eb7ea38be03f06951d6ca2f21fb48b1235f791e601", size = 246208, upload-time = "2025-10-15T15:12:54.586Z" },
-    { url = "https://files.pythonhosted.org/packages/30/d8/e282ec19cd658238d60ed404f99ef2e45eed52e81b866ab1518c0d4163cf/coverage-7.11.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:cf9e6ff4ca908ca15c157c409d608da77a56a09877b97c889b98fb2c32b6465e", size = 247126, upload-time = "2025-10-15T15:12:56.485Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/17/a635fa07fac23adb1a5451ec756216768c2767efaed2e4331710342a3399/coverage-7.11.0-cp311-cp311-win32.whl", hash = "sha256:fcc15fc462707b0680cff6242c48625da7f9a16a28a41bb8fd7a4280920e676c", size = 218314, upload-time = "2025-10-15T15:12:58.365Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/29/2ac1dfcdd4ab9a70026edc8d715ece9b4be9a1653075c658ee6f271f394d/coverage-7.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:865965bf955d92790f1facd64fe7ff73551bd2c1e7e6b26443934e9701ba30b9", size = 219203, upload-time = "2025-10-15T15:12:59.902Z" },
-    { url = "https://files.pythonhosted.org/packages/03/21/5ce8b3a0133179115af4c041abf2ee652395837cb896614beb8ce8ddcfd9/coverage-7.11.0-cp311-cp311-win_arm64.whl", hash = "sha256:5693e57a065760dcbeb292d60cc4d0231a6d4b6b6f6a3191561e1d5e8820b745", size = 217879, upload-time = "2025-10-15T15:13:01.35Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/db/86f6906a7c7edc1a52b2c6682d6dd9be775d73c0dfe2b84f8923dfea5784/coverage-7.11.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9c49e77811cf9d024b95faf86c3f059b11c0c9be0b0d61bc598f453703bd6fd1", size = 216098, upload-time = "2025-10-15T15:13:02.916Z" },
-    { url = "https://files.pythonhosted.org/packages/21/54/e7b26157048c7ba555596aad8569ff903d6cd67867d41b75287323678ede/coverage-7.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a61e37a403a778e2cda2a6a39abcc895f1d984071942a41074b5c7ee31642007", size = 216331, upload-time = "2025-10-15T15:13:04.403Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/19/1ce6bf444f858b83a733171306134a0544eaddf1ca8851ede6540a55b2ad/coverage-7.11.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:c79cae102bb3b1801e2ef1511fb50e91ec83a1ce466b2c7c25010d884336de46", size = 247825, upload-time = "2025-10-15T15:13:05.92Z" },
-    { url = "https://files.pythonhosted.org/packages/71/0b/d3bcbbc259fcced5fb67c5d78f6e7ee965f49760c14afd931e9e663a83b2/coverage-7.11.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:16ce17ceb5d211f320b62df002fa7016b7442ea0fd260c11cec8ce7730954893", size = 250573, upload-time = "2025-10-15T15:13:07.471Z" },
-    { url = "https://files.pythonhosted.org/packages/58/8d/b0ff3641a320abb047258d36ed1c21d16be33beed4152628331a1baf3365/coverage-7.11.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:80027673e9d0bd6aef86134b0771845e2da85755cf686e7c7c59566cf5a89115", size = 251706, upload-time = "2025-10-15T15:13:09.4Z" },
-    { url = "https://files.pythonhosted.org/packages/59/c8/5a586fe8c7b0458053d9c687f5cff515a74b66c85931f7fe17a1c958b4ac/coverage-7.11.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4d3ffa07a08657306cd2215b0da53761c4d73cb54d9143b9303a6481ec0cd415", size = 248221, upload-time = "2025-10-15T15:13:10.964Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/ff/3a25e3132804ba44cfa9a778cdf2b73dbbe63ef4b0945e39602fc896ba52/coverage-7.11.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a3b6a5f8b2524fd6c1066bc85bfd97e78709bb5e37b5b94911a6506b65f47186", size = 249624, upload-time = "2025-10-15T15:13:12.5Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/12/ff10c8ce3895e1b17a73485ea79ebc1896a9e466a9d0f4aef63e0d17b718/coverage-7.11.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fcc0a4aa589de34bc56e1a80a740ee0f8c47611bdfb28cd1849de60660f3799d", size = 247744, upload-time = "2025-10-15T15:13:14.554Z" },
-    { url = "https://files.pythonhosted.org/packages/16/02/d500b91f5471b2975947e0629b8980e5e90786fe316b6d7299852c1d793d/coverage-7.11.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:dba82204769d78c3fd31b35c3d5f46e06511936c5019c39f98320e05b08f794d", size = 247325, upload-time = "2025-10-15T15:13:16.438Z" },
-    { url = "https://files.pythonhosted.org/packages/77/11/dee0284fbbd9cd64cfce806b827452c6df3f100d9e66188e82dfe771d4af/coverage-7.11.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:81b335f03ba67309a95210caf3eb43bd6fe75a4e22ba653ef97b4696c56c7ec2", size = 249180, upload-time = "2025-10-15T15:13:17.959Z" },
-    { url = "https://files.pythonhosted.org/packages/59/1b/cdf1def928f0a150a057cab03286774e73e29c2395f0d30ce3d9e9f8e697/coverage-7.11.0-cp312-cp312-win32.whl", hash = "sha256:037b2d064c2f8cc8716fe4d39cb705779af3fbf1ba318dc96a1af858888c7bb5", size = 218479, upload-time = "2025-10-15T15:13:19.608Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/55/e5884d55e031da9c15b94b90a23beccc9d6beee65e9835cd6da0a79e4f3a/coverage-7.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:d66c0104aec3b75e5fd897e7940188ea1892ca1d0235316bf89286d6a22568c0", size = 219290, upload-time = "2025-10-15T15:13:21.593Z" },
-    { url = "https://files.pythonhosted.org/packages/23/a8/faa930cfc71c1d16bc78f9a19bb73700464f9c331d9e547bfbc1dbd3a108/coverage-7.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:d91ebeac603812a09cf6a886ba6e464f3bbb367411904ae3790dfe28311b15ad", size = 217924, upload-time = "2025-10-15T15:13:23.39Z" },
-    { url = "https://files.pythonhosted.org/packages/60/7f/85e4dfe65e400645464b25c036a26ac226cf3a69d4a50c3934c532491cdd/coverage-7.11.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:cc3f49e65ea6e0d5d9bd60368684fe52a704d46f9e7fc413918f18d046ec40e1", size = 216129, upload-time = "2025-10-15T15:13:25.371Z" },
-    { url = "https://files.pythonhosted.org/packages/96/5d/dc5fa98fea3c175caf9d360649cb1aa3715e391ab00dc78c4c66fabd7356/coverage-7.11.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f39ae2f63f37472c17b4990f794035c9890418b1b8cca75c01193f3c8d3e01be", size = 216380, upload-time = "2025-10-15T15:13:26.976Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/f5/3da9cc9596708273385189289c0e4d8197d37a386bdf17619013554b3447/coverage-7.11.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7db53b5cdd2917b6eaadd0b1251cf4e7d96f4a8d24e174bdbdf2f65b5ea7994d", size = 247375, upload-time = "2025-10-15T15:13:28.923Z" },
-    { url = "https://files.pythonhosted.org/packages/65/6c/f7f59c342359a235559d2bc76b0c73cfc4bac7d61bb0df210965cb1ecffd/coverage-7.11.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10ad04ac3a122048688387828b4537bc9cf60c0bf4869c1e9989c46e45690b82", size = 249978, upload-time = "2025-10-15T15:13:30.525Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/8c/042dede2e23525e863bf1ccd2b92689692a148d8b5fd37c37899ba882645/coverage-7.11.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4036cc9c7983a2b1f2556d574d2eb2154ac6ed55114761685657e38782b23f52", size = 251253, upload-time = "2025-10-15T15:13:32.174Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/a9/3c58df67bfa809a7bddd786356d9c5283e45d693edb5f3f55d0986dd905a/coverage-7.11.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:7ab934dd13b1c5e94b692b1e01bd87e4488cb746e3a50f798cb9464fd128374b", size = 247591, upload-time = "2025-10-15T15:13:34.147Z" },
-    { url = "https://files.pythonhosted.org/packages/26/5b/c7f32efd862ee0477a18c41e4761305de6ddd2d49cdeda0c1116227570fd/coverage-7.11.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:59a6e5a265f7cfc05f76e3bb53eca2e0dfe90f05e07e849930fecd6abb8f40b4", size = 249411, upload-time = "2025-10-15T15:13:38.425Z" },
-    { url = "https://files.pythonhosted.org/packages/76/b5/78cb4f1e86c1611431c990423ec0768122905b03837e1b4c6a6f388a858b/coverage-7.11.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:df01d6c4c81e15a7c88337b795bb7595a8596e92310266b5072c7e301168efbd", size = 247303, upload-time = "2025-10-15T15:13:40.464Z" },
-    { url = "https://files.pythonhosted.org/packages/87/c9/23c753a8641a330f45f221286e707c427e46d0ffd1719b080cedc984ec40/coverage-7.11.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:8c934bd088eed6174210942761e38ee81d28c46de0132ebb1801dbe36a390dcc", size = 247157, upload-time = "2025-10-15T15:13:42.087Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/42/6e0cc71dc8a464486e944a4fa0d85bdec031cc2969e98ed41532a98336b9/coverage-7.11.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a03eaf7ec24078ad64a07f02e30060aaf22b91dedf31a6b24d0d98d2bba7f48", size = 248921, upload-time = "2025-10-15T15:13:43.715Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/1c/743c2ef665e6858cccb0f84377dfe3a4c25add51e8c7ef19249be92465b6/coverage-7.11.0-cp313-cp313-win32.whl", hash = "sha256:695340f698a5f56f795b2836abe6fb576e7c53d48cd155ad2f80fd24bc63a040", size = 218526, upload-time = "2025-10-15T15:13:45.336Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/d5/226daadfd1bf8ddbccefbd3aa3547d7b960fb48e1bdac124e2dd13a2b71a/coverage-7.11.0-cp313-cp313-win_amd64.whl", hash = "sha256:2727d47fce3ee2bac648528e41455d1b0c46395a087a229deac75e9f88ba5a05", size = 219317, upload-time = "2025-10-15T15:13:47.401Z" },
-    { url = "https://files.pythonhosted.org/packages/97/54/47db81dcbe571a48a298f206183ba8a7ba79200a37cd0d9f4788fcd2af4a/coverage-7.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:0efa742f431529699712b92ecdf22de8ff198df41e43aeaaadf69973eb93f17a", size = 217948, upload-time = "2025-10-15T15:13:49.096Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/8b/cb68425420154e7e2a82fd779a8cc01549b6fa83c2ad3679cd6c088ebd07/coverage-7.11.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:587c38849b853b157706407e9ebdca8fd12f45869edb56defbef2daa5fb0812b", size = 216837, upload-time = "2025-10-15T15:13:51.09Z" },
-    { url = "https://files.pythonhosted.org/packages/33/55/9d61b5765a025685e14659c8d07037247de6383c0385757544ffe4606475/coverage-7.11.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:b971bdefdd75096163dd4261c74be813c4508477e39ff7b92191dea19f24cd37", size = 217061, upload-time = "2025-10-15T15:13:52.747Z" },
-    { url = "https://files.pythonhosted.org/packages/52/85/292459c9186d70dcec6538f06ea251bc968046922497377bf4a1dc9a71de/coverage-7.11.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:269bfe913b7d5be12ab13a95f3a76da23cf147be7fa043933320ba5625f0a8de", size = 258398, upload-time = "2025-10-15T15:13:54.45Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/e2/46edd73fb8bf51446c41148d81944c54ed224854812b6ca549be25113ee0/coverage-7.11.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:dadbcce51a10c07b7c72b0ce4a25e4b6dcb0c0372846afb8e5b6307a121eb99f", size = 260574, upload-time = "2025-10-15T15:13:56.145Z" },
-    { url = "https://files.pythonhosted.org/packages/07/5e/1df469a19007ff82e2ca8fe509822820a31e251f80ee7344c34f6cd2ec43/coverage-7.11.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ed43fa22c6436f7957df036331f8fe4efa7af132054e1844918866cd228af6c", size = 262797, upload-time = "2025-10-15T15:13:58.635Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/50/de216b31a1434b94d9b34a964c09943c6be45069ec704bfc379d8d89a649/coverage-7.11.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9516add7256b6713ec08359b7b05aeff8850c98d357784c7205b2e60aa2513fa", size = 257361, upload-time = "2025-10-15T15:14:00.409Z" },
-    { url = "https://files.pythonhosted.org/packages/82/1e/3f9f8344a48111e152e0fd495b6fff13cc743e771a6050abf1627a7ba918/coverage-7.11.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:eb92e47c92fcbcdc692f428da67db33337fa213756f7adb6a011f7b5a7a20740", size = 260349, upload-time = "2025-10-15T15:14:02.188Z" },
-    { url = "https://files.pythonhosted.org/packages/65/9b/3f52741f9e7d82124272f3070bbe316006a7de1bad1093f88d59bfc6c548/coverage-7.11.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d06f4fc7acf3cabd6d74941d53329e06bab00a8fe10e4df2714f0b134bfc64ef", size = 258114, upload-time = "2025-10-15T15:14:03.907Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/8b/918f0e15f0365d50d3986bbd3338ca01178717ac5678301f3f547b6619e6/coverage-7.11.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:6fbcee1a8f056af07ecd344482f711f563a9eb1c2cad192e87df00338ec3cdb0", size = 256723, upload-time = "2025-10-15T15:14:06.324Z" },
-    { url = "https://files.pythonhosted.org/packages/44/9e/7776829f82d3cf630878a7965a7d70cc6ca94f22c7d20ec4944f7148cb46/coverage-7.11.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dbbf012be5f32533a490709ad597ad8a8ff80c582a95adc8d62af664e532f9ca", size = 259238, upload-time = "2025-10-15T15:14:08.002Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/b8/49cf253e1e7a3bedb85199b201862dd7ca4859f75b6cf25ffa7298aa0760/coverage-7.11.0-cp313-cp313t-win32.whl", hash = "sha256:cee6291bb4fed184f1c2b663606a115c743df98a537c969c3c64b49989da96c2", size = 219180, upload-time = "2025-10-15T15:14:09.786Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/e1/1a541703826be7ae2125a0fb7f821af5729d56bb71e946e7b933cc7a89a4/coverage-7.11.0-cp313-cp313t-win_amd64.whl", hash = "sha256:a386c1061bf98e7ea4758e4313c0ab5ecf57af341ef0f43a0bf26c2477b5c268", size = 220241, upload-time = "2025-10-15T15:14:11.471Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/d1/5ee0e0a08621140fd418ec4020f595b4d52d7eb429ae6a0c6542b4ba6f14/coverage-7.11.0-cp313-cp313t-win_arm64.whl", hash = "sha256:f9ea02ef40bb83823b2b04964459d281688fe173e20643870bb5d2edf68bc836", size = 218510, upload-time = "2025-10-15T15:14:13.46Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/06/e923830c1985ce808e40a3fa3eb46c13350b3224b7da59757d37b6ce12b8/coverage-7.11.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c770885b28fb399aaf2a65bbd1c12bf6f307ffd112d6a76c5231a94276f0c497", size = 216110, upload-time = "2025-10-15T15:14:15.157Z" },
-    { url = "https://files.pythonhosted.org/packages/42/82/cdeed03bfead45203fb651ed756dfb5266028f5f939e7f06efac4041dad5/coverage-7.11.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a3d0e2087dba64c86a6b254f43e12d264b636a39e88c5cc0a01a7c71bcfdab7e", size = 216395, upload-time = "2025-10-15T15:14:16.863Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/ba/e1c80caffc3199aa699813f73ff097bc2df7b31642bdbc7493600a8f1de5/coverage-7.11.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:73feb83bb41c32811973b8565f3705caf01d928d972b72042b44e97c71fd70d1", size = 247433, upload-time = "2025-10-15T15:14:18.589Z" },
-    { url = "https://files.pythonhosted.org/packages/80/c0/5b259b029694ce0a5bbc1548834c7ba3db41d3efd3474489d7efce4ceb18/coverage-7.11.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c6f31f281012235ad08f9a560976cc2fc9c95c17604ff3ab20120fe480169bca", size = 249970, upload-time = "2025-10-15T15:14:20.307Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/86/171b2b5e1aac7e2fd9b43f7158b987dbeb95f06d1fbecad54ad8163ae3e8/coverage-7.11.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9570ad567f880ef675673992222746a124b9595506826b210fbe0ce3f0499cd", size = 251324, upload-time = "2025-10-15T15:14:22.419Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/7e/7e10414d343385b92024af3932a27a1caf75c6e27ee88ba211221ff1a145/coverage-7.11.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8badf70446042553a773547a61fecaa734b55dc738cacf20c56ab04b77425e43", size = 247445, upload-time = "2025-10-15T15:14:24.205Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/3b/e4f966b21f5be8c4bf86ad75ae94efa0de4c99c7bbb8114476323102e345/coverage-7.11.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a09c1211959903a479e389685b7feb8a17f59ec5a4ef9afde7650bd5eabc2777", size = 249324, upload-time = "2025-10-15T15:14:26.234Z" },
-    { url = "https://files.pythonhosted.org/packages/00/a2/8479325576dfcd909244d0df215f077f47437ab852ab778cfa2f8bf4d954/coverage-7.11.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:5ef83b107f50db3f9ae40f69e34b3bd9337456c5a7fe3461c7abf8b75dd666a2", size = 247261, upload-time = "2025-10-15T15:14:28.42Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/d8/3a9e2db19d94d65771d0f2e21a9ea587d11b831332a73622f901157cc24b/coverage-7.11.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f91f927a3215b8907e214af77200250bb6aae36eca3f760f89780d13e495388d", size = 247092, upload-time = "2025-10-15T15:14:30.784Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/b1/bbca3c472544f9e2ad2d5116b2379732957048be4b93a9c543fcd0207e5f/coverage-7.11.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:cdbcd376716d6b7fbfeedd687a6c4be019c5a5671b35f804ba76a4c0a778cba4", size = 248755, upload-time = "2025-10-15T15:14:32.585Z" },
-    { url = "https://files.pythonhosted.org/packages/89/49/638d5a45a6a0f00af53d6b637c87007eb2297042186334e9923a61aa8854/coverage-7.11.0-cp314-cp314-win32.whl", hash = "sha256:bab7ec4bb501743edc63609320aaec8cd9188b396354f482f4de4d40a9d10721", size = 218793, upload-time = "2025-10-15T15:14:34.972Z" },
-    { url = "https://files.pythonhosted.org/packages/30/cc/b675a51f2d068adb3cdf3799212c662239b0ca27f4691d1fff81b92ea850/coverage-7.11.0-cp314-cp314-win_amd64.whl", hash = "sha256:3d4ba9a449e9364a936a27322b20d32d8b166553bfe63059bd21527e681e2fad", size = 219587, upload-time = "2025-10-15T15:14:37.047Z" },
-    { url = "https://files.pythonhosted.org/packages/93/98/5ac886876026de04f00820e5094fe22166b98dcb8b426bf6827aaf67048c/coverage-7.11.0-cp314-cp314-win_arm64.whl", hash = "sha256:ce37f215223af94ef0f75ac68ea096f9f8e8c8ec7d6e8c346ee45c0d363f0479", size = 218168, upload-time = "2025-10-15T15:14:38.861Z" },
-    { url = "https://files.pythonhosted.org/packages/14/d1/b4145d35b3e3ecf4d917e97fc8895bcf027d854879ba401d9ff0f533f997/coverage-7.11.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:f413ce6e07e0d0dc9c433228727b619871532674b45165abafe201f200cc215f", size = 216850, upload-time = "2025-10-15T15:14:40.651Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/d1/7f645fc2eccd318369a8a9948acc447bb7c1ade2911e31d3c5620544c22b/coverage-7.11.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:05791e528a18f7072bf5998ba772fe29db4da1234c45c2087866b5ba4dea710e", size = 217071, upload-time = "2025-10-15T15:14:42.755Z" },
-    { url = "https://files.pythonhosted.org/packages/54/7d/64d124649db2737ceced1dfcbdcb79898d5868d311730f622f8ecae84250/coverage-7.11.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cacb29f420cfeb9283b803263c3b9a068924474ff19ca126ba9103e1278dfa44", size = 258570, upload-time = "2025-10-15T15:14:44.542Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/3f/6f5922f80dc6f2d8b2c6f974835c43f53eb4257a7797727e6ca5b7b2ec1f/coverage-7.11.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:314c24e700d7027ae3ab0d95fbf8d53544fca1f20345fd30cd219b737c6e58d3", size = 260738, upload-time = "2025-10-15T15:14:46.436Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/5f/9e883523c4647c860b3812b417a2017e361eca5b635ee658387dc11b13c1/coverage-7.11.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:630d0bd7a293ad2fc8b4b94e5758c8b2536fdf36c05f1681270203e463cbfa9b", size = 262994, upload-time = "2025-10-15T15:14:48.3Z" },
-    { url = "https://files.pythonhosted.org/packages/07/bb/43b5a8e94c09c8bf51743ffc65c4c841a4ca5d3ed191d0a6919c379a1b83/coverage-7.11.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e89641f5175d65e2dbb44db15fe4ea48fade5d5bbb9868fdc2b4fce22f4a469d", size = 257282, upload-time = "2025-10-15T15:14:50.236Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/e5/0ead8af411411330b928733e1d201384b39251a5f043c1612970310e8283/coverage-7.11.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c9f08ea03114a637dab06cedb2e914da9dc67fa52c6015c018ff43fdde25b9c2", size = 260430, upload-time = "2025-10-15T15:14:52.413Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/66/03dd8bb0ba5b971620dcaac145461950f6d8204953e535d2b20c6b65d729/coverage-7.11.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce9f3bde4e9b031eaf1eb61df95c1401427029ea1bfddb8621c1161dcb0fa02e", size = 258190, upload-time = "2025-10-15T15:14:54.268Z" },
-    { url = "https://files.pythonhosted.org/packages/45/ae/28a9cce40bf3174426cb2f7e71ee172d98e7f6446dff936a7ccecee34b14/coverage-7.11.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:e4dc07e95495923d6fd4d6c27bf70769425b71c89053083843fd78f378558996", size = 256658, upload-time = "2025-10-15T15:14:56.436Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/7c/3a44234a8599513684bfc8684878fd7b126c2760f79712bb78c56f19efc4/coverage-7.11.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:424538266794db2861db4922b05d729ade0940ee69dcf0591ce8f69784db0e11", size = 259342, upload-time = "2025-10-15T15:14:58.538Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/e6/0108519cba871af0351725ebdb8660fd7a0fe2ba3850d56d32490c7d9b4b/coverage-7.11.0-cp314-cp314t-win32.whl", hash = "sha256:4c1eeb3fb8eb9e0190bebafd0462936f75717687117339f708f395fe455acc73", size = 219568, upload-time = "2025-10-15T15:15:00.382Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/76/44ba876e0942b4e62fdde23ccb029ddb16d19ba1bef081edd00857ba0b16/coverage-7.11.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b56efee146c98dbf2cf5cffc61b9829d1e94442df4d7398b26892a53992d3547", size = 220687, upload-time = "2025-10-15T15:15:02.322Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/0c/0df55ecb20d0d0ed5c322e10a441775e1a3a5d78c60f0c4e1abfe6fcf949/coverage-7.11.0-cp314-cp314t-win_arm64.whl", hash = "sha256:b5c2705afa83f49bd91962a4094b6b082f94aef7626365ab3f8f4bd159c5acf3", size = 218711, upload-time = "2025-10-15T15:15:04.575Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/04/642c1d8a448ae5ea1369eac8495740a79eb4e581a9fb0cbdce56bbf56da1/coverage-7.11.0-py3-none-any.whl", hash = "sha256:4b7589765348d78fb4e5fb6ea35d07564e387da2fc5efff62e0222971f155f68", size = 207761, upload-time = "2025-10-15T15:15:06.439Z" },
+version = "7.12.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/89/26/4a96807b193b011588099c3b5c89fbb05294e5b90e71018e065465f34eb6/coverage-7.12.0.tar.gz", hash = "sha256:fc11e0a4e372cb5f282f16ef90d4a585034050ccda536451901abfb19a57f40c", size = 819341, upload-time = "2025-11-18T13:34:20.766Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/4a/0dc3de1c172d35abe512332cfdcc43211b6ebce629e4cc42e6cd25ed8f4d/coverage-7.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:32b75c2ba3f324ee37af3ccee5b30458038c50b349ad9b88cee85096132a575b", size = 217409, upload-time = "2025-11-18T13:31:53.122Z" },
+    { url = "https://files.pythonhosted.org/packages/01/c3/086198b98db0109ad4f84241e8e9ea7e5fb2db8c8ffb787162d40c26cc76/coverage-7.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb2a1b6ab9fe833714a483a915de350abc624a37149649297624c8d57add089c", size = 217927, upload-time = "2025-11-18T13:31:54.458Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/5f/34614dbf5ce0420828fc6c6f915126a0fcb01e25d16cf141bf5361e6aea6/coverage-7.12.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5734b5d913c3755e72f70bf6cc37a0518d4f4745cde760c5d8e12005e62f9832", size = 244678, upload-time = "2025-11-18T13:31:55.805Z" },
+    { url = "https://files.pythonhosted.org/packages/55/7b/6b26fb32e8e4a6989ac1d40c4e132b14556131493b1d06bc0f2be169c357/coverage-7.12.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b527a08cdf15753279b7afb2339a12073620b761d79b81cbe2cdebdb43d90daa", size = 246507, upload-time = "2025-11-18T13:31:57.05Z" },
+    { url = "https://files.pythonhosted.org/packages/06/42/7d70e6603d3260199b90fb48b537ca29ac183d524a65cc31366b2e905fad/coverage-7.12.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9bb44c889fb68004e94cab71f6a021ec83eac9aeabdbb5a5a88821ec46e1da73", size = 248366, upload-time = "2025-11-18T13:31:58.362Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/4a/d86b837923878424c72458c5b25e899a3c5ca73e663082a915f5b3c4d749/coverage-7.12.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:4b59b501455535e2e5dde5881739897967b272ba25988c89145c12d772810ccb", size = 245366, upload-time = "2025-11-18T13:31:59.572Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/c2/2adec557e0aa9721875f06ced19730fdb7fc58e31b02b5aa56f2ebe4944d/coverage-7.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d8842f17095b9868a05837b7b1b73495293091bed870e099521ada176aa3e00e", size = 246408, upload-time = "2025-11-18T13:32:00.784Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/4b/8bd1f1148260df11c618e535fdccd1e5aaf646e55b50759006a4f41d8a26/coverage-7.12.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c5a6f20bf48b8866095c6820641e7ffbe23f2ac84a2efc218d91235e404c7777", size = 244416, upload-time = "2025-11-18T13:32:01.963Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/13/3a248dd6a83df90414c54a4e121fd081fb20602ca43955fbe1d60e2312a9/coverage-7.12.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5f3738279524e988d9da2893f307c2093815c623f8d05a8f79e3eff3a7a9e553", size = 244681, upload-time = "2025-11-18T13:32:03.408Z" },
+    { url = "https://files.pythonhosted.org/packages/76/30/aa833827465a5e8c938935f5d91ba055f70516941078a703740aaf1aa41f/coverage-7.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e0d68c1f7eabbc8abe582d11fa393ea483caf4f44b0af86881174769f185c94d", size = 245300, upload-time = "2025-11-18T13:32:04.686Z" },
+    { url = "https://files.pythonhosted.org/packages/38/24/f85b3843af1370fb3739fa7571819b71243daa311289b31214fe3e8c9d68/coverage-7.12.0-cp310-cp310-win32.whl", hash = "sha256:7670d860e18b1e3ee5930b17a7d55ae6287ec6e55d9799982aa103a2cc1fa2ef", size = 220008, upload-time = "2025-11-18T13:32:05.806Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/a2/c7da5b9566f7164db9eefa133d17761ecb2c2fde9385d754e5b5c80f710d/coverage-7.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:f999813dddeb2a56aab5841e687b68169da0d3f6fc78ccf50952fa2463746022", size = 220943, upload-time = "2025-11-18T13:32:07.166Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/0c/0dfe7f0487477d96432e4815537263363fb6dd7289743a796e8e51eabdf2/coverage-7.12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aa124a3683d2af98bd9d9c2bfa7a5076ca7e5ab09fdb96b81fa7d89376ae928f", size = 217535, upload-time = "2025-11-18T13:32:08.812Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/f5/f9a4a053a5bbff023d3bec259faac8f11a1e5a6479c2ccf586f910d8dac7/coverage-7.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d93fbf446c31c0140208dcd07c5d882029832e8ed7891a39d6d44bd65f2316c3", size = 218044, upload-time = "2025-11-18T13:32:10.329Z" },
+    { url = "https://files.pythonhosted.org/packages/95/c5/84fc3697c1fa10cd8571919bf9693f693b7373278daaf3b73e328d502bc8/coverage-7.12.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:52ca620260bd8cd6027317bdd8b8ba929be1d741764ee765b42c4d79a408601e", size = 248440, upload-time = "2025-11-18T13:32:12.536Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/36/2d93fbf6a04670f3874aed397d5a5371948a076e3249244a9e84fb0e02d6/coverage-7.12.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f3433ffd541380f3a0e423cff0f4926d55b0cc8c1d160fdc3be24a4c03aa65f7", size = 250361, upload-time = "2025-11-18T13:32:13.852Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/49/66dc65cc456a6bfc41ea3d0758c4afeaa4068a2b2931bf83be6894cf1058/coverage-7.12.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f7bbb321d4adc9f65e402c677cd1c8e4c2d0105d3ce285b51b4d87f1d5db5245", size = 252472, upload-time = "2025-11-18T13:32:15.068Z" },
+    { url = "https://files.pythonhosted.org/packages/35/1f/ebb8a18dffd406db9fcd4b3ae42254aedcaf612470e8712f12041325930f/coverage-7.12.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:22a7aade354a72dff3b59c577bfd18d6945c61f97393bc5fb7bd293a4237024b", size = 248592, upload-time = "2025-11-18T13:32:16.328Z" },
+    { url = "https://files.pythonhosted.org/packages/da/a8/67f213c06e5ea3b3d4980df7dc344d7fea88240b5fe878a5dcbdfe0e2315/coverage-7.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3ff651dcd36d2fea66877cd4a82de478004c59b849945446acb5baf9379a1b64", size = 250167, upload-time = "2025-11-18T13:32:17.687Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/00/e52aef68154164ea40cc8389c120c314c747fe63a04b013a5782e989b77f/coverage-7.12.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:31b8b2e38391a56e3cea39d22a23faaa7c3fc911751756ef6d2621d2a9daf742", size = 248238, upload-time = "2025-11-18T13:32:19.2Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/a4/4d88750bcf9d6d66f77865e5a05a20e14db44074c25fd22519777cb69025/coverage-7.12.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:297bc2da28440f5ae51c845a47c8175a4db0553a53827886e4fb25c66633000c", size = 247964, upload-time = "2025-11-18T13:32:21.027Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/6b/b74693158899d5b47b0bf6238d2c6722e20ba749f86b74454fac0696bb00/coverage-7.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6ff7651cc01a246908eac162a6a86fc0dbab6de1ad165dfb9a1e2ec660b44984", size = 248862, upload-time = "2025-11-18T13:32:22.304Z" },
+    { url = "https://files.pythonhosted.org/packages/18/de/6af6730227ce0e8ade307b1cc4a08e7f51b419a78d02083a86c04ccceb29/coverage-7.12.0-cp311-cp311-win32.whl", hash = "sha256:313672140638b6ddb2c6455ddeda41c6a0b208298034544cfca138978c6baed6", size = 220033, upload-time = "2025-11-18T13:32:23.714Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/a1/e7f63021a7c4fe20994359fcdeae43cbef4a4d0ca36a5a1639feeea5d9e1/coverage-7.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:a1783ed5bd0d5938d4435014626568dc7f93e3cb99bc59188cc18857c47aa3c4", size = 220966, upload-time = "2025-11-18T13:32:25.599Z" },
+    { url = "https://files.pythonhosted.org/packages/77/e8/deae26453f37c20c3aa0c4433a1e32cdc169bf415cce223a693117aa3ddd/coverage-7.12.0-cp311-cp311-win_arm64.whl", hash = "sha256:4648158fd8dd9381b5847622df1c90ff314efbfc1df4550092ab6013c238a5fc", size = 219637, upload-time = "2025-11-18T13:32:27.265Z" },
+    { url = "https://files.pythonhosted.org/packages/02/bf/638c0427c0f0d47638242e2438127f3c8ee3cfc06c7fdeb16778ed47f836/coverage-7.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:29644c928772c78512b48e14156b81255000dcfd4817574ff69def189bcb3647", size = 217704, upload-time = "2025-11-18T13:32:28.906Z" },
+    { url = "https://files.pythonhosted.org/packages/08/e1/706fae6692a66c2d6b871a608bbde0da6281903fa0e9f53a39ed441da36a/coverage-7.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8638cbb002eaa5d7c8d04da667813ce1067080b9a91099801a0053086e52b736", size = 218064, upload-time = "2025-11-18T13:32:30.161Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/8b/eb0231d0540f8af3ffda39720ff43cb91926489d01524e68f60e961366e4/coverage-7.12.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:083631eeff5eb9992c923e14b810a179798bb598e6a0dd60586819fc23be6e60", size = 249560, upload-time = "2025-11-18T13:32:31.835Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/a1/67fb52af642e974d159b5b379e4d4c59d0ebe1288677fbd04bbffe665a82/coverage-7.12.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:99d5415c73ca12d558e07776bd957c4222c687b9f1d26fa0e1b57e3598bdcde8", size = 252318, upload-time = "2025-11-18T13:32:33.178Z" },
+    { url = "https://files.pythonhosted.org/packages/41/e5/38228f31b2c7665ebf9bdfdddd7a184d56450755c7e43ac721c11a4b8dab/coverage-7.12.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e949ebf60c717c3df63adb4a1a366c096c8d7fd8472608cd09359e1bd48ef59f", size = 253403, upload-time = "2025-11-18T13:32:34.45Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/4b/df78e4c8188f9960684267c5a4897836f3f0f20a20c51606ee778a1d9749/coverage-7.12.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6d907ddccbca819afa2cd014bc69983b146cca2735a0b1e6259b2a6c10be1e70", size = 249984, upload-time = "2025-11-18T13:32:35.747Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/51/bb163933d195a345c6f63eab9e55743413d064c291b6220df754075c2769/coverage-7.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b1518ecbad4e6173f4c6e6c4a46e49555ea5679bf3feda5edb1b935c7c44e8a0", size = 251339, upload-time = "2025-11-18T13:32:37.352Z" },
+    { url = "https://files.pythonhosted.org/packages/15/40/c9b29cdb8412c837cdcbc2cfa054547dd83affe6cbbd4ce4fdb92b6ba7d1/coverage-7.12.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:51777647a749abdf6f6fd8c7cffab12de68ab93aab15efc72fbbb83036c2a068", size = 249489, upload-time = "2025-11-18T13:32:39.212Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/da/b3131e20ba07a0de4437a50ef3b47840dfabf9293675b0cd5c2c7f66dd61/coverage-7.12.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:42435d46d6461a3b305cdfcad7cdd3248787771f53fe18305548cba474e6523b", size = 249070, upload-time = "2025-11-18T13:32:40.598Z" },
+    { url = "https://files.pythonhosted.org/packages/70/81/b653329b5f6302c08d683ceff6785bc60a34be9ae92a5c7b63ee7ee7acec/coverage-7.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5bcead88c8423e1855e64b8057d0544e33e4080b95b240c2a355334bb7ced937", size = 250929, upload-time = "2025-11-18T13:32:42.915Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/00/250ac3bca9f252a5fb1338b5ad01331ebb7b40223f72bef5b1b2cb03aa64/coverage-7.12.0-cp312-cp312-win32.whl", hash = "sha256:dcbb630ab034e86d2a0f79aefd2be07e583202f41e037602d438c80044957baa", size = 220241, upload-time = "2025-11-18T13:32:44.665Z" },
+    { url = "https://files.pythonhosted.org/packages/64/1c/77e79e76d37ce83302f6c21980b45e09f8aa4551965213a10e62d71ce0ab/coverage-7.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:2fd8354ed5d69775ac42986a691fbf68b4084278710cee9d7c3eaa0c28fa982a", size = 221051, upload-time = "2025-11-18T13:32:46.008Z" },
+    { url = "https://files.pythonhosted.org/packages/31/f5/641b8a25baae564f9e52cac0e2667b123de961985709a004e287ee7663cc/coverage-7.12.0-cp312-cp312-win_arm64.whl", hash = "sha256:737c3814903be30695b2de20d22bcc5428fdae305c61ba44cdc8b3252984c49c", size = 219692, upload-time = "2025-11-18T13:32:47.372Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/14/771700b4048774e48d2c54ed0c674273702713c9ee7acdfede40c2666747/coverage-7.12.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:47324fffca8d8eae7e185b5bb20c14645f23350f870c1649003618ea91a78941", size = 217725, upload-time = "2025-11-18T13:32:49.22Z" },
+    { url = "https://files.pythonhosted.org/packages/17/a7/3aa4144d3bcb719bf67b22d2d51c2d577bf801498c13cb08f64173e80497/coverage-7.12.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ccf3b2ede91decd2fb53ec73c1f949c3e034129d1e0b07798ff1d02ea0c8fa4a", size = 218098, upload-time = "2025-11-18T13:32:50.78Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/9c/b846bbc774ff81091a12a10203e70562c91ae71badda00c5ae5b613527b1/coverage-7.12.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:b365adc70a6936c6b0582dc38746b33b2454148c02349345412c6e743efb646d", size = 249093, upload-time = "2025-11-18T13:32:52.554Z" },
+    { url = "https://files.pythonhosted.org/packages/76/b6/67d7c0e1f400b32c883e9342de4a8c2ae7c1a0b57c5de87622b7262e2309/coverage-7.12.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bc13baf85cd8a4cfcf4a35c7bc9d795837ad809775f782f697bf630b7e200211", size = 251686, upload-time = "2025-11-18T13:32:54.862Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/75/b095bd4b39d49c3be4bffbb3135fea18a99a431c52dd7513637c0762fecb/coverage-7.12.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:099d11698385d572ceafb3288a5b80fe1fc58bf665b3f9d362389de488361d3d", size = 252930, upload-time = "2025-11-18T13:32:56.417Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/f3/466f63015c7c80550bead3093aacabf5380c1220a2a93c35d374cae8f762/coverage-7.12.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:473dc45d69694069adb7680c405fb1e81f60b2aff42c81e2f2c3feaf544d878c", size = 249296, upload-time = "2025-11-18T13:32:58.074Z" },
+    { url = "https://files.pythonhosted.org/packages/27/86/eba2209bf2b7e28c68698fc13437519a295b2d228ba9e0ec91673e09fa92/coverage-7.12.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:583f9adbefd278e9de33c33d6846aa8f5d164fa49b47144180a0e037f0688bb9", size = 251068, upload-time = "2025-11-18T13:32:59.646Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/55/ca8ae7dbba962a3351f18940b359b94c6bafdd7757945fdc79ec9e452dc7/coverage-7.12.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2089cc445f2dc0af6f801f0d1355c025b76c24481935303cf1af28f636688f0", size = 249034, upload-time = "2025-11-18T13:33:01.481Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/d7/39136149325cad92d420b023b5fd900dabdd1c3a0d1d5f148ef4a8cedef5/coverage-7.12.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:950411f1eb5d579999c5f66c62a40961f126fc71e5e14419f004471957b51508", size = 248853, upload-time = "2025-11-18T13:33:02.935Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/b6/76e1add8b87ef60e00643b0b7f8f7bb73d4bf5249a3be19ebefc5793dd25/coverage-7.12.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b1aab7302a87bafebfe76b12af681b56ff446dc6f32ed178ff9c092ca776e6bc", size = 250619, upload-time = "2025-11-18T13:33:04.336Z" },
+    { url = "https://files.pythonhosted.org/packages/95/87/924c6dc64f9203f7a3c1832a6a0eee5a8335dbe5f1bdadcc278d6f1b4d74/coverage-7.12.0-cp313-cp313-win32.whl", hash = "sha256:d7e0d0303c13b54db495eb636bc2465b2fb8475d4c8bcec8fe4b5ca454dfbae8", size = 220261, upload-time = "2025-11-18T13:33:06.493Z" },
+    { url = "https://files.pythonhosted.org/packages/91/77/dd4aff9af16ff776bf355a24d87eeb48fc6acde54c907cc1ea89b14a8804/coverage-7.12.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce61969812d6a98a981d147d9ac583a36ac7db7766f2e64a9d4d059c2fe29d07", size = 221072, upload-time = "2025-11-18T13:33:07.926Z" },
+    { url = "https://files.pythonhosted.org/packages/70/49/5c9dc46205fef31b1b226a6e16513193715290584317fd4df91cdaf28b22/coverage-7.12.0-cp313-cp313-win_arm64.whl", hash = "sha256:bcec6f47e4cb8a4c2dc91ce507f6eefc6a1b10f58df32cdc61dff65455031dfc", size = 219702, upload-time = "2025-11-18T13:33:09.631Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/62/f87922641c7198667994dd472a91e1d9b829c95d6c29529ceb52132436ad/coverage-7.12.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:459443346509476170d553035e4a3eed7b860f4fe5242f02de1010501956ce87", size = 218420, upload-time = "2025-11-18T13:33:11.153Z" },
+    { url = "https://files.pythonhosted.org/packages/85/dd/1cc13b2395ef15dbb27d7370a2509b4aee77890a464fb35d72d428f84871/coverage-7.12.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04a79245ab2b7a61688958f7a855275997134bc84f4a03bc240cf64ff132abf6", size = 218773, upload-time = "2025-11-18T13:33:12.569Z" },
+    { url = "https://files.pythonhosted.org/packages/74/40/35773cc4bb1e9d4658d4fb669eb4195b3151bef3bbd6f866aba5cd5dac82/coverage-7.12.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:09a86acaaa8455f13d6a99221d9654df249b33937b4e212b4e5a822065f12aa7", size = 260078, upload-time = "2025-11-18T13:33:14.037Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/ee/231bb1a6ffc2905e396557585ebc6bdc559e7c66708376d245a1f1d330fc/coverage-7.12.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:907e0df1b71ba77463687a74149c6122c3f6aac56c2510a5d906b2f368208560", size = 262144, upload-time = "2025-11-18T13:33:15.601Z" },
+    { url = "https://files.pythonhosted.org/packages/28/be/32f4aa9f3bf0b56f3971001b56508352c7753915345d45fab4296a986f01/coverage-7.12.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b57e2d0ddd5f0582bae5437c04ee71c46cd908e7bc5d4d0391f9a41e812dd12", size = 264574, upload-time = "2025-11-18T13:33:17.354Z" },
+    { url = "https://files.pythonhosted.org/packages/68/7c/00489fcbc2245d13ab12189b977e0cf06ff3351cb98bc6beba8bd68c5902/coverage-7.12.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:58c1c6aa677f3a1411fe6fb28ec3a942e4f665df036a3608816e0847fad23296", size = 259298, upload-time = "2025-11-18T13:33:18.958Z" },
+    { url = "https://files.pythonhosted.org/packages/96/b4/f0760d65d56c3bea95b449e02570d4abd2549dc784bf39a2d4721a2d8ceb/coverage-7.12.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4c589361263ab2953e3c4cd2a94db94c4ad4a8e572776ecfbad2389c626e4507", size = 262150, upload-time = "2025-11-18T13:33:20.644Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/71/9a9314df00f9326d78c1e5a910f520d599205907432d90d1c1b7a97aa4b1/coverage-7.12.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:91b810a163ccad2e43b1faa11d70d3cf4b6f3d83f9fd5f2df82a32d47b648e0d", size = 259763, upload-time = "2025-11-18T13:33:22.189Z" },
+    { url = "https://files.pythonhosted.org/packages/10/34/01a0aceed13fbdf925876b9a15d50862eb8845454301fe3cdd1df08b2182/coverage-7.12.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:40c867af715f22592e0d0fb533a33a71ec9e0f73a6945f722a0c85c8c1cbe3a2", size = 258653, upload-time = "2025-11-18T13:33:24.239Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/04/81d8fd64928acf1574bbb0181f66901c6c1c6279c8ccf5f84259d2c68ae9/coverage-7.12.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:68b0d0a2d84f333de875666259dadf28cc67858bc8fd8b3f1eae84d3c2bec455", size = 260856, upload-time = "2025-11-18T13:33:26.365Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/76/fa2a37bfaeaf1f766a2d2360a25a5297d4fb567098112f6517475eee120b/coverage-7.12.0-cp313-cp313t-win32.whl", hash = "sha256:73f9e7fbd51a221818fd11b7090eaa835a353ddd59c236c57b2199486b116c6d", size = 220936, upload-time = "2025-11-18T13:33:28.165Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/52/60f64d932d555102611c366afb0eb434b34266b1d9266fc2fe18ab641c47/coverage-7.12.0-cp313-cp313t-win_amd64.whl", hash = "sha256:24cff9d1f5743f67db7ba46ff284018a6e9aeb649b67aa1e70c396aa1b7cb23c", size = 222001, upload-time = "2025-11-18T13:33:29.656Z" },
+    { url = "https://files.pythonhosted.org/packages/77/df/c303164154a5a3aea7472bf323b7c857fed93b26618ed9fc5c2955566bb0/coverage-7.12.0-cp313-cp313t-win_arm64.whl", hash = "sha256:c87395744f5c77c866d0f5a43d97cc39e17c7f1cb0115e54a2fe67ca75c5d14d", size = 220273, upload-time = "2025-11-18T13:33:31.415Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/2e/fc12db0883478d6e12bbd62d481210f0c8daf036102aa11434a0c5755825/coverage-7.12.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a1c59b7dc169809a88b21a936eccf71c3895a78f5592051b1af8f4d59c2b4f92", size = 217777, upload-time = "2025-11-18T13:33:32.86Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/c1/ce3e525d223350c6ec16b9be8a057623f54226ef7f4c2fee361ebb6a02b8/coverage-7.12.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8787b0f982e020adb732b9f051f3e49dd5054cebbc3f3432061278512a2b1360", size = 218100, upload-time = "2025-11-18T13:33:34.532Z" },
+    { url = "https://files.pythonhosted.org/packages/15/87/113757441504aee3808cb422990ed7c8bcc2d53a6779c66c5adef0942939/coverage-7.12.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5ea5a9f7dc8877455b13dd1effd3202e0bca72f6f3ab09f9036b1bcf728f69ac", size = 249151, upload-time = "2025-11-18T13:33:36.135Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/1d/9529d9bd44049b6b05bb319c03a3a7e4b0a8a802d28fa348ad407e10706d/coverage-7.12.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fdba9f15849534594f60b47c9a30bc70409b54947319a7c4fd0e8e3d8d2f355d", size = 251667, upload-time = "2025-11-18T13:33:37.996Z" },
+    { url = "https://files.pythonhosted.org/packages/11/bb/567e751c41e9c03dc29d3ce74b8c89a1e3396313e34f255a2a2e8b9ebb56/coverage-7.12.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a00594770eb715854fb1c57e0dea08cce6720cfbc531accdb9850d7c7770396c", size = 253003, upload-time = "2025-11-18T13:33:39.553Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/b3/c2cce2d8526a02fb9e9ca14a263ca6fc074449b33a6afa4892838c903528/coverage-7.12.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5560c7e0d82b42eb1951e4f68f071f8017c824ebfd5a6ebe42c60ac16c6c2434", size = 249185, upload-time = "2025-11-18T13:33:42.086Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/a7/967f93bb66e82c9113c66a8d0b65ecf72fc865adfba5a145f50c7af7e58d/coverage-7.12.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d6c2e26b481c9159c2773a37947a9718cfdc58893029cdfb177531793e375cfc", size = 251025, upload-time = "2025-11-18T13:33:43.634Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/b2/f2f6f56337bc1af465d5b2dc1ee7ee2141b8b9272f3bf6213fcbc309a836/coverage-7.12.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:6e1a8c066dabcde56d5d9fed6a66bc19a2883a3fe051f0c397a41fc42aedd4cc", size = 248979, upload-time = "2025-11-18T13:33:46.04Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/7a/bf4209f45a4aec09d10a01a57313a46c0e0e8f4c55ff2965467d41a92036/coverage-7.12.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:f7ba9da4726e446d8dd8aae5a6cd872511184a5d861de80a86ef970b5dacce3e", size = 248800, upload-time = "2025-11-18T13:33:47.546Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/b7/1e01b8696fb0521810f60c5bbebf699100d6754183e6cc0679bf2ed76531/coverage-7.12.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e0f483ab4f749039894abaf80c2f9e7ed77bbf3c737517fb88c8e8e305896a17", size = 250460, upload-time = "2025-11-18T13:33:49.537Z" },
+    { url = "https://files.pythonhosted.org/packages/71/ae/84324fb9cb46c024760e706353d9b771a81b398d117d8c1fe010391c186f/coverage-7.12.0-cp314-cp314-win32.whl", hash = "sha256:76336c19a9ef4a94b2f8dc79f8ac2da3f193f625bb5d6f51a328cd19bfc19933", size = 220533, upload-time = "2025-11-18T13:33:51.16Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/71/1033629deb8460a8f97f83e6ac4ca3b93952e2b6f826056684df8275e015/coverage-7.12.0-cp314-cp314-win_amd64.whl", hash = "sha256:7c1059b600aec6ef090721f8f633f60ed70afaffe8ecab85b59df748f24b31fe", size = 221348, upload-time = "2025-11-18T13:33:52.776Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/5f/ac8107a902f623b0c251abdb749be282dc2ab61854a8a4fcf49e276fce2f/coverage-7.12.0-cp314-cp314-win_arm64.whl", hash = "sha256:172cf3a34bfef42611963e2b661302a8931f44df31629e5b1050567d6b90287d", size = 219922, upload-time = "2025-11-18T13:33:54.316Z" },
+    { url = "https://files.pythonhosted.org/packages/79/6e/f27af2d4da367f16077d21ef6fe796c874408219fa6dd3f3efe7751bd910/coverage-7.12.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:aa7d48520a32cb21c7a9b31f81799e8eaec7239db36c3b670be0fa2403828d1d", size = 218511, upload-time = "2025-11-18T13:33:56.343Z" },
+    { url = "https://files.pythonhosted.org/packages/67/dd/65fd874aa460c30da78f9d259400d8e6a4ef457d61ab052fd248f0050558/coverage-7.12.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:90d58ac63bc85e0fb919f14d09d6caa63f35a5512a2205284b7816cafd21bb03", size = 218771, upload-time = "2025-11-18T13:33:57.966Z" },
+    { url = "https://files.pythonhosted.org/packages/55/e0/7c6b71d327d8068cb79c05f8f45bf1b6145f7a0de23bbebe63578fe5240a/coverage-7.12.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ca8ecfa283764fdda3eae1bdb6afe58bf78c2c3ec2b2edcb05a671f0bba7b3f9", size = 260151, upload-time = "2025-11-18T13:33:59.597Z" },
+    { url = "https://files.pythonhosted.org/packages/49/ce/4697457d58285b7200de6b46d606ea71066c6e674571a946a6ea908fb588/coverage-7.12.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:874fe69a0785d96bd066059cd4368022cebbec1a8958f224f0016979183916e6", size = 262257, upload-time = "2025-11-18T13:34:01.166Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/33/acbc6e447aee4ceba88c15528dbe04a35fb4d67b59d393d2e0d6f1e242c1/coverage-7.12.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5b3c889c0b8b283a24d721a9eabc8ccafcfc3aebf167e4cd0d0e23bf8ec4e339", size = 264671, upload-time = "2025-11-18T13:34:02.795Z" },
+    { url = "https://files.pythonhosted.org/packages/87/ec/e2822a795c1ed44d569980097be839c5e734d4c0c1119ef8e0a073496a30/coverage-7.12.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:8bb5b894b3ec09dcd6d3743229dc7f2c42ef7787dc40596ae04c0edda487371e", size = 259231, upload-time = "2025-11-18T13:34:04.397Z" },
+    { url = "https://files.pythonhosted.org/packages/72/c5/a7ec5395bb4a49c9b7ad97e63f0c92f6bf4a9e006b1393555a02dae75f16/coverage-7.12.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:79a44421cd5fba96aa57b5e3b5a4d3274c449d4c622e8f76882d76635501fd13", size = 262137, upload-time = "2025-11-18T13:34:06.068Z" },
+    { url = "https://files.pythonhosted.org/packages/67/0c/02c08858b764129f4ecb8e316684272972e60777ae986f3865b10940bdd6/coverage-7.12.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:33baadc0efd5c7294f436a632566ccc1f72c867f82833eb59820ee37dc811c6f", size = 259745, upload-time = "2025-11-18T13:34:08.04Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/04/4fd32b7084505f3829a8fe45c1a74a7a728cb251aaadbe3bec04abcef06d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:c406a71f544800ef7e9e0000af706b88465f3573ae8b8de37e5f96c59f689ad1", size = 258570, upload-time = "2025-11-18T13:34:09.676Z" },
+    { url = "https://files.pythonhosted.org/packages/48/35/2365e37c90df4f5342c4fa202223744119fe31264ee2924f09f074ea9b6d/coverage-7.12.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e71bba6a40883b00c6d571599b4627f50c360b3d0d02bfc658168936be74027b", size = 260899, upload-time = "2025-11-18T13:34:11.259Z" },
+    { url = "https://files.pythonhosted.org/packages/05/56/26ab0464ca733fa325e8e71455c58c1c374ce30f7c04cebb88eabb037b18/coverage-7.12.0-cp314-cp314t-win32.whl", hash = "sha256:9157a5e233c40ce6613dead4c131a006adfda70e557b6856b97aceed01b0e27a", size = 221313, upload-time = "2025-11-18T13:34:12.863Z" },
+    { url = "https://files.pythonhosted.org/packages/da/1c/017a3e1113ed34d998b27d2c6dba08a9e7cb97d362f0ec988fcd873dcf81/coverage-7.12.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e84da3a0fd233aeec797b981c51af1cabac74f9bd67be42458365b30d11b5291", size = 222423, upload-time = "2025-11-18T13:34:15.14Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/36/bcc504fdd5169301b52568802bb1b9cdde2e27a01d39fbb3b4b508ab7c2c/coverage-7.12.0-cp314-cp314t-win_arm64.whl", hash = "sha256:01d24af36fedda51c2b1aca56e4330a3710f83b02a5ff3743a6b015ffa7c9384", size = 220459, upload-time = "2025-11-18T13:34:17.222Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/a3/43b749004e3c09452e39bb56347a008f0a0668aad37324a99b5c8ca91d9e/coverage-7.12.0-py3-none-any.whl", hash = "sha256:159d50c0b12e060b15ed3d39f87ed43d4f7f7ad40b8a534f4dd331adbb51104a", size = 209503, upload-time = "2025-11-18T13:34:18.892Z" },
 ]
 
 [package.optional-dependencies]
@@ -1040,82 +1007,6 @@ toml = [
     { name = "tomli", marker = "python_full_version <= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 
-[[package]]
-name = "crc32c"
-version = "2.8"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e3/66/7e97aa77af7cf6afbff26e3651b564fe41932599bc2d3dce0b2f73d4829a/crc32c-2.8.tar.gz", hash = "sha256:578728964e59c47c356aeeedee6220e021e124b9d3e8631d95d9a5e5f06e261c", size = 48179, upload-time = "2025-10-17T06:20:13.61Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c4/a0/28b4686a8db0bb0f77970f4c6ccede90d1d5740a1d4b4703bd54c3e75655/crc32c-2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2c0f4eb01fe7c0a3e3f973a418e04d52101bb077dd77626fd80c658ec60aaf95", size = 66321, upload-time = "2025-10-17T06:18:53.543Z" },
-    { url = "https://files.pythonhosted.org/packages/76/1f/1697f5b8b770f715ed9b264d79e36b4f77ae0527f81f3c749ef08937a32e/crc32c-2.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6baefcfbca82b1a9678455416da24f18629769a76920c640d5a538620a7d12bb", size = 62985, upload-time = "2025-10-17T06:18:54.97Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/e5/333cfa5ffa8d5779733aced2b984b5e5139b4a8ceaa2c6bc563e9a1092f3/crc32c-2.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d7f959fcf6c5aad1c4a653ee1a50f05760dab1d1c35d98ec4d7f0f68643f7612", size = 61517, upload-time = "2025-10-17T06:18:55.795Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/d8/362a009e8140dd926a153b44d56753e3aa7cb50aca243779a84adadbff11/crc32c-2.8-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9bb678507a4e4cf3f0506607b046ecc4ed1c58a19e08a3fb3c2d25441c480bf1", size = 79385, upload-time = "2025-10-17T06:18:56.598Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/9f/0d4ea3aa71ffb15f1285669d23024cc40779388ce32157d339dc2584491c/crc32c-2.8-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1a16f7ffa4c242a909558565567cbba95148603717b53538ea299c98da68e7a9", size = 80965, upload-time = "2025-10-17T06:18:57.384Z" },
-    { url = "https://files.pythonhosted.org/packages/20/44/d77657aaca4a2c0283f2356a3da6f8e91b003567bb8f09daaf540cbf192f/crc32c-2.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0184369aad562d801f91f454c81f56b9ecb966f6b96684c4d6cf82fc8741d2ad", size = 79993, upload-time = "2025-10-17T06:18:58.503Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/c0/07017a93ebf85d9408028b7e03ef96d5c6bfb14cb77cfe90d35eedcc1501/crc32c-2.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:86d2eeb5f0189bd803720abe7387019328ea34c4acde62999e5723f789bc316b", size = 79243, upload-time = "2025-10-17T06:18:59.273Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/1a/b3c5ac4cf2fd1f82395173d0bd8e1a15d09f0bc1eccdf10ea7f8caaccd67/crc32c-2.8-cp310-cp310-win32.whl", hash = "sha256:51da61904a9e753780a2e6011885677d601db1fa840be4b68799643a113e6f08", size = 64888, upload-time = "2025-10-17T06:19:00.089Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/f2/60c45fc7bb2221d3c93c7a872e921be591f40d45228fe46f879b1d8c0424/crc32c-2.8-cp310-cp310-win_amd64.whl", hash = "sha256:b2d6a1f2500daaf2e4b08f97ad0349aa2eff5faaaa5fd3350314a26eade334cd", size = 66639, upload-time = "2025-10-17T06:19:00.974Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/0b/5e03b22d913698e9cc563f39b9f6bbd508606bf6b8e9122cd6bf196b87ea/crc32c-2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e560a97fbb96c9897cb1d9b5076ef12fc12e2e25622530a1afd0de4240f17e1f", size = 66329, upload-time = "2025-10-17T06:19:01.771Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/38/2fe0051ffe8c6a650c8b1ac0da31b8802d1dbe5fa40a84e4b6b6f5583db5/crc32c-2.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6762d276d90331a490ef7e71ffee53b9c0eb053bd75a272d786f3b08d3fe3671", size = 62988, upload-time = "2025-10-17T06:19:02.953Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/30/5837a71c014be83aba1469c58820d287fc836512a0cad6b8fdd43868accd/crc32c-2.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:60670569f5ede91e39f48fb0cb4060e05b8d8704dd9e17ede930bf441b2f73ef", size = 61522, upload-time = "2025-10-17T06:19:03.796Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/29/63972fc1452778e2092ae998c50cbfc2fc93e3fa9798a0278650cd6169c5/crc32c-2.8-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:711743da6ccc70b3c6718c328947b0b6f34a1fe6a6c27cc6c1d69cc226bf70e9", size = 80200, upload-time = "2025-10-17T06:19:04.617Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/3a/60eb49d7bdada4122b3ffd45b0df54bdc1b8dd092cda4b069a287bdfcff4/crc32c-2.8-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5eb4094a2054774f13b26f21bf56792bb44fa1fcee6c6ad099387a43ffbfb4fa", size = 81757, upload-time = "2025-10-17T06:19:05.496Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/63/6efc1b64429ef7d23bd58b75b7ac24d15df327e3ebbe9c247a0f7b1c2ed1/crc32c-2.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fff15bf2bd3e95780516baae935ed12be88deaa5ebe6143c53eb0d26a7bdc7b7", size = 80830, upload-time = "2025-10-17T06:19:06.621Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/eb/0ae9f436f8004f1c88f7429e659a7218a3879bd11a6b18ed1257aad7e98b/crc32c-2.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4c0e11e3826668121fa53e0745635baf5e4f0ded437e8ff63ea56f38fc4f970a", size = 80095, upload-time = "2025-10-17T06:19:07.381Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/81/4afc9d468977a4cd94a2eb62908553345009a7c0d30e74463a15d4b48ec3/crc32c-2.8-cp311-cp311-win32.whl", hash = "sha256:38f915336715d1f1353ab07d7d786f8a789b119e273aea106ba55355dfc9101d", size = 64886, upload-time = "2025-10-17T06:19:08.497Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/e8/94e839c9f7e767bf8479046a207afd440a08f5c59b52586e1af5e64fa4a0/crc32c-2.8-cp311-cp311-win_amd64.whl", hash = "sha256:60e0a765b1caab8d31b2ea80840639253906a9351d4b861551c8c8625ea20f86", size = 66639, upload-time = "2025-10-17T06:19:09.338Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/36/fd18ef23c42926b79c7003e16cb0f79043b5b179c633521343d3b499e996/crc32c-2.8-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:572ffb1b78cce3d88e8d4143e154d31044a44be42cb3f6fbbf77f1e7a941c5ab", size = 66379, upload-time = "2025-10-17T06:19:10.115Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/b8/c584958e53f7798dd358f5bdb1bbfc97483134f053ee399d3eeb26cca075/crc32c-2.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cf827b3758ee0c4aacd21ceca0e2da83681f10295c38a10bfeb105f7d98f7a68", size = 63042, upload-time = "2025-10-17T06:19:10.946Z" },
-    { url = "https://files.pythonhosted.org/packages/62/e6/6f2af0ec64a668a46c861e5bc778ea3ee42171fedfc5440f791f470fd783/crc32c-2.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:106fbd79013e06fa92bc3b51031694fcc1249811ed4364ef1554ee3dd2c7f5a2", size = 61528, upload-time = "2025-10-17T06:19:11.768Z" },
-    { url = "https://files.pythonhosted.org/packages/17/8b/4a04bd80a024f1a23978f19ae99407783e06549e361ab56e9c08bba3c1d3/crc32c-2.8-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6dde035f91ffbfe23163e68605ee5a4bb8ceebd71ed54bb1fb1d0526cdd125a2", size = 80028, upload-time = "2025-10-17T06:19:12.554Z" },
-    { url = "https://files.pythonhosted.org/packages/21/8f/01c7afdc76ac2007d0e6a98e7300b4470b170480f8188475b597d1f4b4c6/crc32c-2.8-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e41ebe7c2f0fdcd9f3a3fd206989a36b460b4d3f24816d53e5be6c7dba72c5e1", size = 81531, upload-time = "2025-10-17T06:19:13.406Z" },
-    { url = "https://files.pythonhosted.org/packages/32/2b/8f78c5a8cc66486be5f51b6f038fc347c3ba748d3ea68be17a014283c331/crc32c-2.8-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ecf66cf90266d9c15cea597d5cc86c01917cd1a238dc3c51420c7886fa750d7e", size = 80608, upload-time = "2025-10-17T06:19:14.223Z" },
-    { url = "https://files.pythonhosted.org/packages/db/86/fad1a94cdeeeb6b6e2323c87f970186e74bfd6fbfbc247bf5c88ad0873d5/crc32c-2.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:59eee5f3a69ad0793d5fa9cdc9b9d743b0cd50edf7fccc0a3988a821fef0208c", size = 79886, upload-time = "2025-10-17T06:19:15.345Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/db/1a7cb6757a1e32376fa2dfce00c815ea4ee614a94f9bff8228e37420c183/crc32c-2.8-cp312-cp312-win32.whl", hash = "sha256:a73d03ce3604aa5d7a2698e9057a0eef69f529c46497b27ee1c38158e90ceb76", size = 64896, upload-time = "2025-10-17T06:19:16.457Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/8e/2024de34399b2e401a37dcb54b224b56c747b0dc46de4966886827b4d370/crc32c-2.8-cp312-cp312-win_amd64.whl", hash = "sha256:56b3b7d015247962cf58186e06d18c3d75a1a63d709d3233509e1c50a2d36aa2", size = 66645, upload-time = "2025-10-17T06:19:17.235Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/d8/3ae227890b3be40955a7144106ef4dd97d6123a82c2a5310cdab58ca49d8/crc32c-2.8-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:36f1e03ee9e9c6938e67d3bcb60e36f260170aa5f37da1185e04ef37b56af395", size = 66380, upload-time = "2025-10-17T06:19:18.009Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/8b/178d3f987cd0e049b484615512d3f91f3d2caeeb8ff336bb5896ae317438/crc32c-2.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b2f3226b94b85a8dd9b3533601d7a63e9e3e8edf03a8a169830ee8303a199aeb", size = 63048, upload-time = "2025-10-17T06:19:18.853Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/a1/48145ae2545ebc0169d3283ebe882da580ea4606bfb67cf4ca922ac3cfc3/crc32c-2.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6e08628bc72d5b6bc8e0730e8f142194b610e780a98c58cb6698e665cb885a5b", size = 61530, upload-time = "2025-10-17T06:19:19.974Z" },
-    { url = "https://files.pythonhosted.org/packages/06/4b/cf05ed9d934cc30e5ae22f97c8272face420a476090e736615d9a6b53de0/crc32c-2.8-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:086f64793c5ec856d1ab31a026d52ad2b895ac83d7a38fce557d74eb857f0a82", size = 80001, upload-time = "2025-10-17T06:19:20.784Z" },
-    { url = "https://files.pythonhosted.org/packages/15/ab/4b04801739faf36345f6ba1920be5b1c70282fec52f8280afd3613fb13e2/crc32c-2.8-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bcf72ee7e0135b3d941c34bb2c26c3fc6bc207106b49fd89aaafaeae223ae209", size = 81543, upload-time = "2025-10-17T06:19:21.557Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/1b/6e38dde5bfd2ea69b7f2ab6ec229fcd972a53d39e2db4efe75c0ac0382ce/crc32c-2.8-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8a717dd9c3fd777d9bc6603717eae172887d402c4ab589d124ebd0184a83f89e", size = 80644, upload-time = "2025-10-17T06:19:22.325Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/45/012176ffee90059ae8ec7131019c71724ea472aa63e72c0c8edbd1fad1d7/crc32c-2.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0450bb845b3c3c7b9bdc0b4e95620ec9a40824abdc8c86d6285c919a90743c1a", size = 79919, upload-time = "2025-10-17T06:19:23.101Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/2b/f557629842f9dec2b3461cb3a0d854bb586ec45b814cea58b082c32f0dde/crc32c-2.8-cp313-cp313-win32.whl", hash = "sha256:765d220bfcbcffa6598ac11eb1e10af0ee4802b49fe126aa6bf79f8ddb9931d1", size = 64896, upload-time = "2025-10-17T06:19:23.88Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/db/fd0f698c15d1e21d47c64181a98290665a08fcbb3940cd559e9c15bda57e/crc32c-2.8-cp313-cp313-win_amd64.whl", hash = "sha256:171ff0260d112c62abcce29332986950a57bddee514e0a2418bfde493ea06bb3", size = 66646, upload-time = "2025-10-17T06:19:24.702Z" },
-    { url = "https://files.pythonhosted.org/packages/db/b9/8e5d7054fe8e7eecab10fd0c8e7ffb01439417bdb6de1d66a81c38fc4a20/crc32c-2.8-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b977a32a3708d6f51703c8557008f190aaa434d7347431efb0e86fcbe78c2a50", size = 66203, upload-time = "2025-10-17T06:19:25.872Z" },
-    { url = "https://files.pythonhosted.org/packages/55/5f/cc926c70057a63cc0c98a3c8a896eb15fc7e74d3034eadd53c94917c6cc3/crc32c-2.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7399b01db4adaf41da2fb36fe2408e75a8d82a179a9564ed7619412e427b26d6", size = 62956, upload-time = "2025-10-17T06:19:26.652Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/8a/0660c44a2dd2cb6ccbb529eb363b9280f5c766f1017bc8355ed8d695bd94/crc32c-2.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4379f73f9cdad31958a673d11a332ec725ca71572401ca865867229f5f15e853", size = 61442, upload-time = "2025-10-17T06:19:27.74Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/5a/6108d2dfc0fe33522ce83ba07aed4b22014911b387afa228808a278e27cd/crc32c-2.8-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2e68264555fab19bab08331550dab58573e351a63ed79c869d455edd3b0aa417", size = 79109, upload-time = "2025-10-17T06:19:28.535Z" },
-    { url = "https://files.pythonhosted.org/packages/84/1e/c054f9e390090c197abf3d2936f4f9effaf0c6ee14569ae03d6ddf86958a/crc32c-2.8-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b48f2486727b8d0e7ccbae4a34cb0300498433d2a9d6b49cb13cb57c2e3f19cb", size = 80987, upload-time = "2025-10-17T06:19:29.305Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/ad/1650e5c3341e4a485f800ea83116d72965030c5d48ccc168fcc685756e4d/crc32c-2.8-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ecf123348934a086df8c8fde7f9f2d716d523ca0707c5a1367b8bb00d8134823", size = 79994, upload-time = "2025-10-17T06:19:30.109Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/3b/f2ed924b177729cbb2ab30ca2902abff653c31d48c95e7b66717a9ca9fcc/crc32c-2.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e636ac60f76de538f7a2c0d0f3abf43104ee83a8f5e516f6345dc283ed1a4df7", size = 79046, upload-time = "2025-10-17T06:19:30.894Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/80/413b05ee6ace613208b31b3670c3135ee1cf451f0e72a9c839b4946acc04/crc32c-2.8-cp313-cp313t-win32.whl", hash = "sha256:8dd4a19505e0253892e1b2f1425cc3bd47f79ae5a04cb8800315d00aad7197f2", size = 64837, upload-time = "2025-10-17T06:19:32.03Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/1b/85eddb6ac5b38496c4e35c20298aae627970c88c3c624a22ab33e84f16c7/crc32c-2.8-cp313-cp313t-win_amd64.whl", hash = "sha256:4bb18e4bd98fb266596523ffc6be9c5b2387b2fa4e505ec56ca36336f49cb639", size = 66574, upload-time = "2025-10-17T06:19:33.143Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/df/50e9079b532ff53dbfc0e66eed781374bd455af02ed5df8b56ad538de4ff/crc32c-2.8-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3a3b2e4bcf7b3ee333050e7d3ff38e2ba46ea205f1d73d8949b248aaffe937ac", size = 66399, upload-time = "2025-10-17T06:19:34.279Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/2e/67e3b0bc3d30e46ea5d16365cc81203286387671e22f2307eb41f19abb9c/crc32c-2.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:445e559e66dff16be54f8a4ef95aa6b01db799a639956d995c5498ba513fccc2", size = 63044, upload-time = "2025-10-17T06:19:35.062Z" },
-    { url = "https://files.pythonhosted.org/packages/36/ea/1723b17437e4344ed8d067456382ecb1f5b535d83fdc5aaebab676c6d273/crc32c-2.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bf3040919e17afa5782e01b1875d6a05f44b8f19c05f211d8b9f8a1deb8bbd9c", size = 61541, upload-time = "2025-10-17T06:19:36.204Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/6a/cbec8a235c5b46a01f319939b538958662159aec0ed3a74944e3a6de21f1/crc32c-2.8-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5607ab8221e1ffd411f64aa40dbb6850cf06dd2908c9debd05d371e1acf62ff3", size = 80139, upload-time = "2025-10-17T06:19:37.351Z" },
-    { url = "https://files.pythonhosted.org/packages/21/31/d096722fe74b692d6e8206c27da1ea5f6b2a12ff92c54a62a6ba2f376254/crc32c-2.8-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f5db4f16816926986d3c94253314920689706ae13a9bf4888b47336c6735ce", size = 81736, upload-time = "2025-10-17T06:19:38.16Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/a2/f75ef716ff7e3c22f385ba6ef30c5de80c19a21ebe699dc90824a1903275/crc32c-2.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:70b0153c4d418b673309d3529334d117e1074c4a3b2d7f676e430d72c14de67b", size = 80795, upload-time = "2025-10-17T06:19:38.948Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/94/6d647a12d96ab087d9b8eacee3da073f981987827d57c7072f89ffc7b6cd/crc32c-2.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5c8933531442042438753755a5c8a9034e4d88b01da9eb796f7e151b31a7256c", size = 80042, upload-time = "2025-10-17T06:19:39.725Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/dc/32b8896b40a0afee7a3c040536d0da5a73e68df2be9fadd21770fd158e16/crc32c-2.8-cp314-cp314-win32.whl", hash = "sha256:cdc83a3fe6c4e5df9457294cfd643de7d95bd4e9382c1dd6ed1e0f0f9169172c", size = 64914, upload-time = "2025-10-17T06:19:40.527Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/b4/4308b27d307e8ecaf8dd1dcc63bbb0e47ae1826d93faa3e62d1ee00ee2d5/crc32c-2.8-cp314-cp314-win_amd64.whl", hash = "sha256:509e10035106df66770fe24b9eb8d9e32b6fb967df17744402fb67772d8b2bc7", size = 66723, upload-time = "2025-10-17T06:19:42.449Z" },
-    { url = "https://files.pythonhosted.org/packages/90/d5/a19d2489fa997a143bfbbf971a5c9a43f8b1ba9e775b1fb362d8fb15260c/crc32c-2.8-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:864359a39777a07b09b28eb31337c0cc603d5c1bf0fc328c3af736a8da624ec0", size = 66201, upload-time = "2025-10-17T06:19:43.273Z" },
-    { url = "https://files.pythonhosted.org/packages/98/c2/5f82f22d2c1242cb6f6fe92aa9a42991ebea86de994b8f9974d9c1d128e2/crc32c-2.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:14511d7cfc5d9f5e1a6c6b64caa6225c2bdc1ed00d725e9a374a3e84073ce180", size = 62956, upload-time = "2025-10-17T06:19:44.099Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/61/3d43d33489cf974fb78bfb3500845770e139ae6d1d83473b660bd8f79a6c/crc32c-2.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:918b7999b52b5dcbcea34081e9a02d46917d571921a3f209956a9a429b2e06e5", size = 61443, upload-time = "2025-10-17T06:19:44.89Z" },
-    { url = "https://files.pythonhosted.org/packages/52/6d/f306ce64a352a3002f76b0fc88a1373f4541f9d34fad3668688610bab14b/crc32c-2.8-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cc445da03fc012a5a03b71da1df1b40139729e6a5571fd4215ab40bfb39689c7", size = 79106, upload-time = "2025-10-17T06:19:45.688Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/b7/1f74965dd7ea762954a69d172dfb3a706049c84ffa45d31401d010a4a126/crc32c-2.8-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e3dde2ec59a8a830511d72a086ead95c0b0b7f0d418f93ea106244c5e77e350", size = 80983, upload-time = "2025-10-17T06:19:46.792Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/50/af93f0d91ccd61833ce77374ebfbd16f5805f5c17d18c6470976d9866d76/crc32c-2.8-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:61d51681a08b6a2a2e771b7f0cd1947fb87cb28f38ed55a01cb7c40b2ac4cdd8", size = 80009, upload-time = "2025-10-17T06:19:47.619Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/fa/94f394beb68a88258af694dab2f1284f55a406b615d7900bdd6235283bc4/crc32c-2.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:67c0716c3b1a02d5235be649487b637eed21f2d070f2b3f63f709dcd2fefb4c7", size = 79066, upload-time = "2025-10-17T06:19:48.409Z" },
-    { url = "https://files.pythonhosted.org/packages/91/c6/a6050e0c64fd73c67a97da96cb59f08b05111e00b958fb87ecdce99f17ac/crc32c-2.8-cp314-cp314t-win32.whl", hash = "sha256:2e8fe863fbbd8bdb6b414a2090f1b0f52106e76e9a9c96a413495dbe5ebe492a", size = 64869, upload-time = "2025-10-17T06:19:49.197Z" },
-    { url = "https://files.pythonhosted.org/packages/08/1f/c7735034e401cb1ea14f996a224518e3a3fa9987cb13680e707328a7d779/crc32c-2.8-cp314-cp314t-win_amd64.whl", hash = "sha256:20a9cfb897693eb6da19e52e2a7be2026fd4d9fc8ae318f086c0d71d5dd2d8e0", size = 66633, upload-time = "2025-10-17T06:19:50.003Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/1d/dd926c68eb8aac8b142a1a10b8eb62d95212c1cf81775644373fe7cceac2/crc32c-2.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5833f4071da7ea182c514ba17d1eee8aec3c5be927d798222fbfbbd0f5eea02c", size = 62345, upload-time = "2025-10-17T06:20:09.39Z" },
-    { url = "https://files.pythonhosted.org/packages/51/be/803404e5abea2ef2c15042edca04bbb7f625044cca879e47f186b43887c2/crc32c-2.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:1dc4da036126ac07b39dd9d03e93e585ec615a2ad28ff12757aef7de175295a8", size = 61229, upload-time = "2025-10-17T06:20:10.236Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/3a/00cc578cd27ed0b22c9be25cef2c24539d92df9fa80ebd67a3fc5419724c/crc32c-2.8-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:15905fa78344654e241371c47e6ed2411f9eeb2b8095311c68c88eccf541e8b4", size = 64108, upload-time = "2025-10-17T06:20:11.072Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/bc/0587ef99a1c7629f95dd0c9d4f3d894de383a0df85831eb16c48a6afdae4/crc32c-2.8-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c596f918688821f796434e89b431b1698396c38bf0b56de873621528fe3ecb1e", size = 64815, upload-time = "2025-10-17T06:20:11.919Z" },
-    { url = "https://files.pythonhosted.org/packages/73/42/94f2b8b92eae9064fcfb8deef2b971514065bd606231f8857ff8ae02bebd/crc32c-2.8-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8d23c4fe01b3844cb6e091044bc1cebdef7d16472e058ce12d9fadf10d2614af", size = 66659, upload-time = "2025-10-17T06:20:12.766Z" },
-]
-
 [[package]]
 name = "cryptography"
 version = "42.0.8"
@@ -1207,40 +1098,40 @@ wheels = [
 
 [[package]]
 name = "cython"
-version = "3.2.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/52/82/01f0b63287cb922e5ba96c5147c30f1e51f541ce91bd178025bb3518b1ba/cython-3.2.0.tar.gz", hash = "sha256:41fdce8237baee2d961c292ed0386903dfe126f131e450a62de0fd7a5280d4b2", size = 3267264, upload-time = "2025-11-05T13:35:04.231Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/57/8d/b2e9578d960d38b1b04a278bf66e13008486aa73e73967186f2015d63d1c/cython-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ee408125b2d218ec7d7a061e09d24715fcab9bf7ea1a4ac01907c3f8ec8730b3", size = 2953775, upload-time = "2025-11-05T13:35:22.291Z" },
-    { url = "https://files.pythonhosted.org/packages/19/dd/cfd684f98bac9e0f505af1cbb7998498c59d713275e920a72b40dab03bfa/cython-3.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c93ce307b05fcd86a5bb0e4a7d7fab238e2f0e9936636097a60bc0e21f2def30", size = 3361627, upload-time = "2025-11-05T13:35:24.519Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/c1/75acdbe9f6292514f0bb92ab1b78df5eedd7049235f4cbd194d2c6c46bfc/cython-3.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:191cfc2fa84642ad41a52d5abaacfb330d9a6653a465e4bf0a5681f66197a967", size = 3529751, upload-time = "2025-11-05T13:35:26.341Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/ce/d0468eb6d87b956902b02909f5007ad61e3839d4c07ab235b514911d869b/cython-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:a259053037ef82959b743b7fde238bd191ee43f88eb8e51101d5f3d8849f1e32", size = 2758839, upload-time = "2025-11-05T13:35:28.36Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/2b/904493fceda95747ba83971b40a66c8cc29ff009313429903f38ee620140/cython-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9e4b2248dc3a98b86aeba65e9862d2cc881d072c163c0fb31b511d4d72e93c8", size = 2946248, upload-time = "2025-11-05T13:35:30.406Z" },
-    { url = "https://files.pythonhosted.org/packages/89/fe/abe926699fe6c580967e30bc4035da54b5e31355ba9b1f4c0cf574228a84/cython-3.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02fb4990a83d5d6f780dda18ed8baa8d587cb6523f57b4d72bc0b41ad3766c96", size = 3236384, upload-time = "2025-11-05T13:35:32.233Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/36/6b6266549802234286438298d494152deb19922a94928d9dcd256659ebd1/cython-3.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a98925517819d62ea25d2cf40057df60a9bcf75fdd1d6ed3882e6ae0730d82f", size = 3372915, upload-time = "2025-11-05T13:35:34.082Z" },
-    { url = "https://files.pythonhosted.org/packages/29/fa/5cf15466b428f9248e38a28515cf0fd98078ae869aa395cfb300315964c4/cython-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:4c959a5d4cd6331e8498822ba47200bd2ff4bf74517c0c91475d5bc21da3b4d5", size = 2762735, upload-time = "2025-11-05T13:35:35.806Z" },
-    { url = "https://files.pythonhosted.org/packages/57/d3/2e6f5f2552c860bb9c00653d092103521846114f6a2ae0648ecf84c0816c/cython-3.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:511d823d9f8a1b850178ec355d6df0a1731b9c20b08ee6d1a780f68215e9013f", size = 2959932, upload-time = "2025-11-05T13:35:37.518Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/bf/7bdc7f231fff6780f78586f939c1740475adecaa03bf256fcb62b2353952/cython-3.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbadeedcb2d135655bcce7380fb28c9e2a75b6810426c12b6e5a6fe6106fafb4", size = 3218588, upload-time = "2025-11-05T13:35:39.642Z" },
-    { url = "https://files.pythonhosted.org/packages/be/81/7d7a81010897dc5abee59691f5fc85849dcc4c8a7687b22ed01bc8d86a7a/cython-3.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92d2394a3e3fe704210b5324eb8118333b514af72c98b1e02a6503945825b231", size = 3381940, upload-time = "2025-11-05T13:35:41.886Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/9d/35e7fb7b591bd9912685a772fcc773d7bb951a8feb6fb9be20addbc38928/cython-3.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:73435e56654a34ece57d4c3304a4556a8402cc4ae2d0e30f71c237a985dc5246", size = 2750886, upload-time = "2025-11-05T13:35:43.629Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/d0/dc4b260e8fde81b23ab4dca56948b3e69617ef470247ec6a3e09370a9849/cython-3.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d900e58e826f9a5a27b0e2b50e33473e9986a5bae375c39b0f2e19f2c545fa23", size = 2950437, upload-time = "2025-11-05T13:35:45.427Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/53/c322bf0486a938ad954a645866b67e978777d79183cf0a042bda6bea11de/cython-3.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9d38cd3aab720d21fa6d6ee168228352f69aea0a95bd4fb84e8879c6ed38fbb", size = 3209331, upload-time = "2025-11-05T13:35:47.278Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/48/55d02dba0606768d3450afd088e2bbcd6f8a54977dce041c2c3c1894631c/cython-3.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92b31d0b7b0a49b3d2aa94faaf75d44a03174cff2616b341a8853c919e511d51", size = 3370974, upload-time = "2025-11-05T13:35:49.534Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/bd/6dab19652b68464572b7a137d07a91ebe86db2a81c35842ff5e49ef23403/cython-3.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:2847b74e76dbad612f6fc7182c12a5f78cffb0d05808fd2c4b638cf02d1aade6", size = 2746274, upload-time = "2025-11-05T13:35:51.522Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/db/de5331ca6489da1761078825709257e1f24e543b4040f86a2502a4b841f9/cython-3.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a0a8274959d538d12f865193dcd67bb5630906e020190c890d2b7c13d31713c6", size = 2961164, upload-time = "2025-11-05T13:35:53.826Z" },
-    { url = "https://files.pythonhosted.org/packages/54/3e/64e37e419331f7c4c540ad25c0b3e6d8f44d597f21ab8861afbc66aa7e02/cython-3.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a1c800833c25195833805c7c3626a2c30b3baaaa9ba361a1af3bbc379662a8d", size = 3249627, upload-time = "2025-11-05T13:35:55.524Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/fc/9faedfcc2de807f77115d97a4910c260dd4693f4fa9e0e3be0d9ae89e260/cython-3.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df15af08c21c18a2e848df5954d6fd3310735089b60405132fa4111e2cf7482a", size = 3375458, upload-time = "2025-11-05T13:35:57.279Z" },
-    { url = "https://files.pythonhosted.org/packages/31/e0/30d449cd97ee0d6395aba18f2646b61b52ab3dc5a3851a346e2d363a7d85/cython-3.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:9d6876af2132757fff1b42a2f4eaa72482f991863160e3f0dc8f2c812b300ebf", size = 2783210, upload-time = "2025-11-05T13:35:59.54Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/6b/9e1e171fe19274465d84dffa4610d46f434b1ae945e946802db396695d67/cython-3.2.0-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:04821ce06598a3aa5c9e0270d98960cfe6556dedbd1418c65e4479162b8ae74a", size = 2869249, upload-time = "2025-11-05T13:36:08.944Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/f1/f461726f664668a96072b2a245bdfae566d68e2eb1393ec72780cc59c21e/cython-3.2.0-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:54b5b1c72a63da822b3f4739a0e31546c0a19f8e834b174906bf817ed5f9d65f", size = 3204332, upload-time = "2025-11-05T13:36:11.386Z" },
-    { url = "https://files.pythonhosted.org/packages/78/d8/73c07ce64cae496e5f5a6dfe3e53574af1a8ef777e2a834d10dae8b67a4e/cython-3.2.0-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6155a6c360e32af1aaa16fa10b0119b49deeadff42a1958973324150870af1b5", size = 2851317, upload-time = "2025-11-05T13:36:13.14Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/d9/d9f321637b8034b5028fa5fe7d1085ffa9351fea350af6510d5cb924c014/cython-3.2.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:861258ac3878b76c57b9b5a379787d772a0bc47fec9167b43986777de542c474", size = 2987155, upload-time = "2025-11-05T13:36:15.018Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/b5/9f9e7d261f083b4066d734b27a7872b0c584fd4c3578196652dbf72b3f62/cython-3.2.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:85dbf955e3193893d0288105afa0fa5f4e835ff587061681f240a4f0487c44fb", size = 2884219, upload-time = "2025-11-05T13:36:17.334Z" },
-    { url = "https://files.pythonhosted.org/packages/88/64/5aeb6e43e0ded9efedc5a516f87a487fdca8e434491cc352e5a805380459/cython-3.2.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:3b3f13822526726bac43275c0e92916bbcc2c30e9f559edc4c1132670b70498d", size = 3218067, upload-time = "2025-11-05T13:36:19.493Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/a0/1958f54cd79d8251a330b9c9652b2a5ceba6a3fcec10782dd03e2a23c74f/cython-3.2.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ab18d09673d219008be5b6174bcbb6dbfd50904e66371f104a8a4698b791472d", size = 3108277, upload-time = "2025-11-05T13:36:21.203Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/84/9b8112160cab922b97edef00616ed18771567d88b5ba9d30d1736880c345/cython-3.2.0-cp39-abi3-win32.whl", hash = "sha256:c9fd986413fc52929b916187630a9abab9f876299951488c4b905ad5346afee6", size = 2430852, upload-time = "2025-11-05T13:36:23.049Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/57/65d3de140b51c45dd6892846bfabdfaaa032e2418f1cb1a2f46058c1fe42/cython-3.2.0-cp39-abi3-win_arm64.whl", hash = "sha256:ee2ea79ddeb721f912e7efea039b9db059c81767ff04fbf9a995f64e1187df99", size = 2435793, upload-time = "2025-11-05T13:36:25.139Z" },
-    { url = "https://files.pythonhosted.org/packages/20/58/1f798ddb7fe6bfddf85f4f97d2d4ad63a491a7b643e85c1e274d0f09138e/cython-3.2.0-py3-none-any.whl", hash = "sha256:73f7f4c75acde5b5b4df05b11fdc2705ec637b99241d1bc2f4ebf345f7a2ea90", size = 1252818, upload-time = "2025-11-05T13:35:00.391Z" },
+version = "3.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/83/36/cce2972e13e83ffe58bc73bfd9d37340b5e5113e8243841a57511c7ae1c2/cython-3.2.1.tar.gz", hash = "sha256:2be1e4d0cbdf7f4cd4d9b8284a034e1989b59fd060f6bd4d24bf3729394d2ed8", size = 3270455, upload-time = "2025-11-12T19:02:59.847Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/87/74/f9fe9e7034f24aef407e7816880c012d8e863bedaa6b42b9ff33e79ea139/cython-3.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f1d10b3731171a33563ba81fdcba39c229e45087269dfbe07a1c00e7dcb2537f", size = 2957374, upload-time = "2025-11-12T19:03:10.132Z" },
+    { url = "https://files.pythonhosted.org/packages/65/47/f9dd519117f520aaf4d723c88fd9e9139262a0379edc01e71a1e9825e082/cython-3.2.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92b814b6066d178a5057b557d372e2a03854e947e41cb9dec21db732fbd14c3c", size = 3366838, upload-time = "2025-11-12T19:03:11.742Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/3e/d967acfafef00056c3ba832692b9bb358ede2919f641e4a2d24828adacc6/cython-3.2.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9fc6abd0532007827d8c6143b2bfedf80c7cb89a3c1c12f058336663489ed2e", size = 3535901, upload-time = "2025-11-12T19:03:13.545Z" },
+    { url = "https://files.pythonhosted.org/packages/68/79/bc46e714ecb010f80a8aa7f7eaf412c53cbabbe7489590d6aba5f4478ba5/cython-3.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:14f1ed135347587cfddcd3c3219667cac4f0ea0b66aa1c4c0187d50a1b92c222", size = 2764043, upload-time = "2025-11-12T19:03:15.584Z" },
+    { url = "https://files.pythonhosted.org/packages/48/d4/ba7b9f341ec168de78bd659600e04bb7de3b2d069bf98b2178a135e88ea4/cython-3.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3cb32c650e7f4476941d1f735cae75a2067d5e3279576273bb8802e8ea907222", size = 2949720, upload-time = "2025-11-12T19:03:17.492Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/47/c42417f424c0b928361f48d7dd0ae72716ee21f647b73ceb16f66b98663e/cython-3.2.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a2b306813d7f28aa0a2c3e4e63ada1427a8109917532df942cd5429db228252", size = 3242127, upload-time = "2025-11-12T19:03:19.227Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/fc/1040460889129551649ec35be45e05169871fbcf71bd8e13c533e86f9468/cython-3.2.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0959d9a36d4f004ce63acc1474b3c606745af98b65e8ae709efd0c10988e9d6b", size = 3377094, upload-time = "2025-11-12T19:03:21.25Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/f2/8c754298eefa40e21af0ae3592837c6e71254900d5aea1c8859e96b11de5/cython-3.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:60c62e734421365135cc2842013d883136054a26c617c001be494235edfc447a", size = 2767824, upload-time = "2025-11-12T19:03:23.317Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/0e/19d5041b87f98ed19c94c388607cd27c1f7458078c3bad5de2dead55b2e1/cython-3.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ea5097d97afd2ab14e98637b7033eba5146de29a5dedf89f5e946076396ab891", size = 2966736, upload-time = "2025-11-12T19:03:25.064Z" },
+    { url = "https://files.pythonhosted.org/packages/84/b8/bcc36d9d2464348106984956608a52a42a01ab44ea64031207dffdebc078/cython-3.2.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4bf12de0475bb6a21e2336a4a04dc4a2b4dd0507a2a3c703e045f3484266605", size = 3221633, upload-time = "2025-11-12T19:03:26.754Z" },
+    { url = "https://files.pythonhosted.org/packages/79/20/7d4807fe4ebcef9f20f2e5f93312d0f5d02f9f76524fd4e37706d04e83f7/cython-3.2.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18c64a0f69a1b8164de70ec7efc72250c589fec21519170de21582300f6aaed9", size = 3389542, upload-time = "2025-11-12T19:03:28.656Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/92/b06ba6721299293bc41e89732070132c453bdbaaeabb8f8cc76851b75345/cython-3.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:5ba14907d5826d8010e82306ce279a0d3650f5b50a4813c80836a17b2213c520", size = 2755307, upload-time = "2025-11-12T19:03:30.684Z" },
+    { url = "https://files.pythonhosted.org/packages/40/28/c6e36c214baeb27ae45b518552e74457536c7c964b1a55b5900b047fa467/cython-3.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b4e850fc7a2f72d19679dd083fe4d20bf66860fceabb4f3207112f240249d708", size = 2957307, upload-time = "2025-11-12T19:03:32.471Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/c8/b0b9ba64f81f2875c42aab5c0979d6454cd1ac6b3c1e2373ad552701565d/cython-3.2.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d20ca4afe993f7dccad3aeddbf4c3536cb0fd3ad6dc7a225935a666a5655af2", size = 3210919, upload-time = "2025-11-12T19:03:34.274Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/33/5d9ca6abba0e77e1851b843dd1b3c4095fbc6373166935e83c4414f80e88/cython-3.2.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5a54a757d01ca6a260b02ce5baf17d9db1c2253566ab5844ee4966ff2a69c19", size = 3373350, upload-time = "2025-11-12T19:03:35.927Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/29/4408c3486ff380a2d6ae0d4b71da5195efcef3c4360017113ee7d1cb7335/cython-3.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:1b81e56584727a328e00d91c164f8f0f2c59b02bf6857c3f000cd830fa571453", size = 2753425, upload-time = "2025-11-12T19:03:38.157Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/32/c1aa03ccadda89487ff31b90d8651c3706ce2744bf4f2c2ae213147e89bd/cython-3.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:d7af6ad01c0fe1965d1d3badaeb6df53c1f37383ebae1ccb405b73f628f87713", size = 2967833, upload-time = "2025-11-12T19:03:40.233Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/dc/3488d3ade0635408a2ebb05561a3009e2f54616bfefd1f107088dfeb2c4c/cython-3.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e3ea7cd085b62acb67c0fbde5cd17a7d9e47992c965e81ec977cf9ea7c59cd65", size = 3256237, upload-time = "2025-11-12T19:03:42.005Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/ba/f3d35d3803c9a424fa8812893847114deb9e2440c1bc67a31ab9ec4b9355/cython-3.2.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:986aea38fdf231e78d73745f83271c5654852c822dc5141a1d3fba64429a6aa6", size = 3383100, upload-time = "2025-11-12T19:03:43.675Z" },
+    { url = "https://files.pythonhosted.org/packages/86/dc/d72dbb2f8e7ca95d2d18fd86f32b2e385996576230e7ecddd7d250786825/cython-3.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:4960e26cd34c1385f21646339f2e0361fcdd2ed3c01cdb50fe734add577ec56a", size = 2790322, upload-time = "2025-11-12T19:03:45.373Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/7e/1194f4ba98b981bbdca945a292e4f49e87ea09d69516b24445409e7cf611/cython-3.2.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:4e9167316bf6ecfea33dcca62f074605648fb93cc053ef46b5deb3e5d12fc0d3", size = 2872858, upload-time = "2025-11-12T19:03:55.074Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/1a/393ca8ffec7ad3f02b8e4bffaba3dba4fb62c4a1c4c0b6dbf3b80e709fe3/cython-3.2.1-cp39-abi3-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3095df6cd470064742f428c937bed7200c5123b9e19ee04aa09ec61281e565a3", size = 3209664, upload-time = "2025-11-12T19:03:56.771Z" },
+    { url = "https://files.pythonhosted.org/packages/37/57/f209f64c609d3d8fac60a572e56da2f621dc1789e399c58db61d5645a31f/cython-3.2.1-cp39-abi3-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:db3f53b2d9afb206075a2605f1150aa019f0733c7795a38eccc6119c2e9c3f7b", size = 2854607, upload-time = "2025-11-12T19:03:59.413Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/af/1e5c73fe52423f40776130b0be914fd9f9f8dc26c4f6ea4c2ed04772d558/cython-3.2.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0fc5e7687ac8f8e2b2fb95648f43e9e074ebaa72fd5cb3d8e20e5f1e8b8e02d9", size = 2991567, upload-time = "2025-11-12T19:04:02.209Z" },
+    { url = "https://files.pythonhosted.org/packages/39/2c/3ea175b6b1fdfb429f9e9c395240d894155b3c0615caced05fef43264cba/cython-3.2.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:bbb3bc152bc0de82b031c8d355418fa4890a92424209d59366c2c0bc9e6cf53c", size = 2889178, upload-time = "2025-11-12T19:04:05.272Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/88/b2ab22a3a3feac78c62354a823c5c0c33659909e9918f53aa05904532b4b/cython-3.2.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:a2022bc48ad0c2c0e0485bf0b54902913a3d81086b7d435f4437620c667799f6", size = 3223755, upload-time = "2025-11-12T19:04:07.262Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/56/9ba58629a03cbffb5965a3c65ccd91fa683d95d588c21a875da72fdc249b/cython-3.2.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:99fdd4ffc2dcb513f4be9ce71c6fedd895b96b1f814655b6bbab196df497b090", size = 3113456, upload-time = "2025-11-12T19:04:09.175Z" },
+    { url = "https://files.pythonhosted.org/packages/56/5b/148c1a7ea5aebe460a70cad716a77e5fd0205be2de9fc5250491eb13ad8c/cython-3.2.1-cp39-abi3-win32.whl", hash = "sha256:06071f85bd5ce040464d43b2f9f287742a79f905e81b709fe904567230f1ed51", size = 2434223, upload-time = "2025-11-12T19:04:11.294Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/54/bb9b0c9db2a92a5e93747ca3027cfc645741411f8f1c6af2fb2a7b82df5d/cython-3.2.1-cp39-abi3-win_arm64.whl", hash = "sha256:e87c131d59480aee1ebac622b64f287c0e1d665ad1a1b7d498ac48accdb36c6b", size = 2439268, upload-time = "2025-11-12T19:04:12.931Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/30/373775b8d933d781d055c1dd0f110f275a101f320dab724c8c63a7c1b945/cython-3.2.1-py3-none-any.whl", hash = "sha256:cd72c46e7bffe8250c52d400e72c8d5d3086437b6aeec5b0eca99ccd337f5834", size = 1254219, upload-time = "2025-11-12T19:02:56.14Z" },
 ]
 
 [[package]]
@@ -1254,7 +1145,8 @@ dependencies = [
     { name = "httpx" },
     { name = "huggingface-hub" },
     { name = "multiprocess" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "pandas" },
     { name = "pyarrow" },
@@ -1291,8 +1183,7 @@ name = "deprecated"
 version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "wrapt", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "wrapt" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" }
 wheels = [
@@ -1340,18 +1231,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" },
 ]
 
-[[package]]
-name = "donfig"
-version = "0.8.1.post1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "pyyaml", marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" },
-]
-
 [[package]]
 name = "ebmlite"
 version = "3.4.1"
@@ -1382,14 +1261,14 @@ dependencies = [
 
 [[package]]
 name = "exceptiongroup"
-version = "1.3.0"
+version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" },
 ]
 
 [[package]]
@@ -1409,7 +1288,7 @@ wheels = [
 
 [[package]]
 name = "fastapi"
-version = "0.121.0"
+version = "0.122.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-doc" },
@@ -1417,18 +1296,9 @@ dependencies = [
     { name = "starlette" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8c/e3/77a2df0946703973b9905fd0cde6172c15e0781984320123b4f5079e7113/fastapi-0.121.0.tar.gz", hash = "sha256:06663356a0b1ee93e875bbf05a31fb22314f5bed455afaaad2b2dad7f26e98fa", size = 342412, upload-time = "2025-11-03T10:25:54.818Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/dd/2c/42277afc1ba1a18f8358561eee40785d27becab8f80a1f945c0a3051c6eb/fastapi-0.121.0-py3-none-any.whl", hash = "sha256:8bdf1b15a55f4e4b0d6201033da9109ea15632cb76cf156e7b8b4019f2172106", size = 109183, upload-time = "2025-11-03T10:25:53.27Z" },
-]
-
-[[package]]
-name = "fasteners"
-version = "0.20"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/2d/18/7881a99ba5244bfc82f06017316ffe93217dbbbcfa52b887caa1d4f2a6d3/fasteners-0.20.tar.gz", hash = "sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8", size = 25087, upload-time = "2025-08-11T10:19:37.785Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/de/3ee97a4f6ffef1fb70bf20561e4f88531633bb5045dc6cebc0f8471f764d/fastapi-0.122.0.tar.gz", hash = "sha256:cd9b5352031f93773228af8b4c443eedc2ac2aa74b27780387b853c3726fb94b", size = 346436, upload-time = "2025-11-24T19:17:47.95Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/51/ac/e5d886f892666d2d1e5cb8c1a41146e1d79ae8896477b1153a21711d3b44/fasteners-0.20-py3-none-any.whl", hash = "sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7", size = 18702, upload-time = "2025-08-11T10:19:35.716Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/93/aa8072af4ff37b795f6bbf43dcaf61115f40f49935c7dbb180c9afc3f421/fastapi-0.122.0-py3-none-any.whl", hash = "sha256:a456e8915dfc6c8914a50d9651133bd47ec96d331c5b44600baa635538a30d67", size = 110671, upload-time = "2025-11-24T19:17:45.96Z" },
 ]
 
 [[package]]
@@ -1513,14 +1383,15 @@ source = { git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd093814
 
 [[package]]
 name = "flashinfer-python"
-version = "0.5.1"
+version = "0.5.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "apache-tvm-ffi" },
     { name = "click" },
     { name = "einops" },
     { name = "ninja" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-cudnn-frontend" },
     { name = "nvidia-cutlass-dsl" },
     { name = "nvidia-ml-py" },
@@ -1530,9 +1401,9 @@ dependencies = [
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/6c/bb/897c3b9d683dcf6490f70e468efb585eebcd673970b13a04ed947b491982/flashinfer_python-0.5.1.tar.gz", hash = "sha256:f12b32d88d8cc10a396456df8ab017f1c4661fbf257e14f4d2461961ec0d090e", size = 4627606, upload-time = "2025-11-04T05:55:02.376Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b4/91/cca69baeff24bb3efd12c7479a026432c8717ee47193694010494c528b22/flashinfer_python-0.5.3.tar.gz", hash = "sha256:100d59b0ede47878d2808cd3a1b9039d7a952d66338bc9f68dac192ae1b2e3f1", size = 4682367, upload-time = "2025-11-20T21:22:46.976Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f4/f1/33dedad087a2bc3d66244126bd5d1c79721ea22d1f2124299f9e5bdaf3b1/flashinfer_python-0.5.1-py3-none-any.whl", hash = "sha256:ec8434d21e53a0ec333734a3c61946a0f7d2f972e344aefa99ba5b87e63aa76a", size = 6932706, upload-time = "2025-11-04T05:55:00.335Z" },
+    { url = "https://files.pythonhosted.org/packages/76/78/6dc7e7da8cb87c9965644ea0d2439457a1bc9256c45ceda0044595be4143/flashinfer_python-0.5.3-py3-none-any.whl", hash = "sha256:b601293b72f9138bad173edc28df84b9f239a013be974e2e79d4ba98aeb38cf5", size = 6998069, upload-time = "2025-11-20T21:22:45.104Z" },
 ]
 
 [[package]]
@@ -1820,7 +1691,7 @@ wheels = [
 
 [[package]]
 name = "hatchling"
-version = "1.27.0"
+version = "1.28.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "packaging" },
@@ -1829,9 +1700,9 @@ dependencies = [
     { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "trove-classifiers" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/8f/8a/cc1debe3514da292094f1c3a700e4ca25442489731ef7c0814358816bb03/hatchling-1.27.0.tar.gz", hash = "sha256:971c296d9819abb3811112fc52c7a9751c8d381898f36533bb16f9791e941fd6", size = 54983, upload-time = "2024-12-15T17:08:11.894Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0b/8e/e480359492affde4119a131da729dd26da742c2c9b604dff74836e47eef9/hatchling-1.28.0.tar.gz", hash = "sha256:4d50b02aece6892b8cd0b3ce6c82cb218594d3ec5836dbde75bf41a21ab004c8", size = 55365, upload-time = "2025-11-27T00:31:13.766Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/08/e7/ae38d7a6dfba0533684e0b2136817d667588ae3ec984c1a4e5df5eb88482/hatchling-1.27.0-py3-none-any.whl", hash = "sha256:d3a2f3567c4f926ea39849cdf924c7e99e6686c9c8e288ae1037c8fa2a5d937b", size = 75794, upload-time = "2024-12-15T17:08:10.364Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/a5/48cb7efb8b4718b1a4c0c331e3364a3a33f614ff0d6afd2b93ee883d3c47/hatchling-1.28.0-py3-none-any.whl", hash = "sha256:dc48722b68b3f4bbfa3ff618ca07cdea6750e7d03481289ffa8be1521d18a961", size = 76075, upload-time = "2025-11-27T00:31:12.544Z" },
 ]
 
 [[package]]
@@ -1956,74 +1827,14 @@ wheels = [
 name = "importlib-metadata"
 version = "8.6.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
 dependencies = [
-    { name = "zipp", marker = "extra == 'extra-13-megatron-core-dev'" },
+    { name = "zipp" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/08/c1395a292bb23fd03bdf572a1357c5a733d3eecbab877641ceacab23db6e/importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580", size = 55767, upload-time = "2025-01-20T22:21:30.429Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/79/9d/0fb148dc4d6fa4a7dd1d8378168d9b4cd8d4560a6fbf6f0121c5fc34eb68/importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e", size = 26971, upload-time = "2025-01-20T22:21:29.177Z" },
 ]
 
-[[package]]
-name = "importlib-metadata"
-version = "8.7.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-]
-dependencies = [
-    { name = "zipp", marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000", size = 56641, upload-time = "2025-04-27T15:29:01.736Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd", size = 27656, upload-time = "2025-04-27T15:29:00.214Z" },
-]
-
 [[package]]
 name = "iniconfig"
 version = "2.3.0"
@@ -2150,7 +1961,7 @@ wheels = [
 
 [[package]]
 name = "leptonai"
-version = "0.26.6"
+version = "0.26.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -2175,7 +1986,7 @@ dependencies = [
     { name = "uvicorn" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/68/b4/e29dfe5a6e63a0e55fc26115a8eef55fbbc004c7677544bbd88798e1c003/leptonai-0.26.6-py3-none-any.whl", hash = "sha256:e76846b52d6ffc186b26a1fa40ebf0432eb1d8108dda1fb2f7785a1f25c803c2", size = 2443372, upload-time = "2025-09-23T08:04:27.984Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/4d/2b5ab13294b23326ba1d8ef6ad703b1d9535bf72a0617030ddd6238eb925/leptonai-0.26.7-py3-none-any.whl", hash = "sha256:74996da36bf177d2b148887dd349627ab8cd78b94623d543bc91ed9ad65ba0e2", size = 2452890, upload-time = "2025-11-07T20:07:14.99Z" },
 ]
 
 [[package]]
@@ -2414,7 +2225,8 @@ wheels = [
 name = "megatron-core"
 source = { editable = "." }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
@@ -2425,6 +2237,7 @@ dev = [
     { name = "causal-conv1d" },
     { name = "einops" },
     { name = "emerging-optimizers" },
+    { name = "fastapi" },
     { name = "flash-linear-attention" },
     { name = "flashinfer-python" },
     { name = "mamba-ssm" },
@@ -2434,27 +2247,31 @@ dev = [
     { name = "nvidia-modelopt", marker = "(sys_platform != 'darwin' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-resiliency-ext" },
     { name = "nvtx" },
-    { name = "onnxscript", version = "0.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "onnxscript", version = "0.5.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "onnxscript" },
     { name = "opentelemetry-api" },
-    { name = "setuptools" },
-    { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
-    { name = "transformer-engine", marker = "extra == 'extra-13-megatron-core-dev'" },
+    { name = "transformer-engine", extra = ["core-cu13", "pytorch"], marker = "extra == 'extra-13-megatron-core-dev'" },
     { name = "wget" },
 ]
 lts = [
+    { name = "av" },
+    { name = "causal-conv1d" },
     { name = "einops" },
+    { name = "fastapi" },
+    { name = "flashinfer-python" },
+    { name = "mamba-ssm" },
+    { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "multi-storage-client" },
+    { name = "nv-grouped-gemm" },
     { name = "nvtx" },
-    { name = "setuptools" },
-    { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "onnxscript" },
+    { name = "opentelemetry-api" },
+    { name = "tensorstore", version = "0.1.78", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.79", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
-    { name = "transformers" },
     { name = "wget" },
-    { name = "zarr", version = "2.18.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "zarr", version = "3.1.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 mlm = [
     { name = "flask-restful" },
@@ -2489,9 +2306,6 @@ docs = [
     { name = "sphinx-autodoc2" },
     { name = "sphinx-copybutton" },
 ]
-flash-mla = [
-    { name = "flash-mla" },
-]
 linting = [
     { name = "black" },
     { name = "flake8" },
@@ -2499,6 +2313,10 @@ linting = [
     { name = "pylint" },
     { name = "ruff" },
 ]
+no-pypi-wheels = [
+    { name = "emerging-optimizers" },
+    { name = "flash-mla" },
+]
 test = [
     { name = "coverage" },
     { name = "nemo-run" },
@@ -2512,48 +2330,54 @@ test = [
     { name = "pytest-random-order" },
     { name = "pyyaml" },
     { name = "tensorboard" },
-    { name = "wrapt", version = "1.17.3", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "wrapt", version = "2.0.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "wrapt" },
 ]
 
 [package.metadata]
 requires-dist = [
-    { name = "av", marker = "extra == 'dev'", specifier = "<16.0.0" },
+    { name = "av", marker = "extra == 'dev'" },
+    { name = "av", marker = "extra == 'lts'" },
     { name = "causal-conv1d", marker = "extra == 'dev'", specifier = "~=1.5" },
+    { name = "causal-conv1d", marker = "extra == 'lts'", specifier = "~=1.5" },
     { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" },
-    { name = "einops", marker = "extra == 'lts'" },
+    { name = "einops", marker = "extra == 'lts'", specifier = "~=0.8" },
     { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" },
+    { name = "fastapi", marker = "extra == 'dev'", specifier = "~=0.50" },
+    { name = "fastapi", marker = "extra == 'lts'", specifier = "~=0.50" },
     { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.3.2" },
     { name = "flashinfer-python", marker = "extra == 'dev'" },
+    { name = "flashinfer-python", marker = "extra == 'lts'" },
     { name = "flask-restful", marker = "extra == 'mlm'" },
     { name = "mamba-ssm", marker = "extra == 'dev'", specifier = "~=2.2" },
+    { name = "mamba-ssm", marker = "extra == 'lts'", specifier = "~=2.2" },
     { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'dev'", specifier = "~=6.0" },
+    { name = "megatron-energon", extras = ["av-decode"], marker = "extra == 'lts'", specifier = "~=6.0" },
     { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" },
-    { name = "numpy", specifier = "<2.0.0" },
+    { name = "multi-storage-client", marker = "extra == 'lts'", specifier = "~=0.27" },
+    { name = "numpy" },
     { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" },
-    { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'", specifier = ">=0.33.0a0,<0.34.0" },
-    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", specifier = ">=0.4.0a0,<0.5.0" },
+    { name = "nv-grouped-gemm", marker = "extra == 'lts'", specifier = "~=1.1" },
+    { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'" },
+    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'" },
     { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" },
-    { name = "nvtx", marker = "extra == 'lts'" },
+    { name = "nvtx", marker = "extra == 'lts'", specifier = "~=0.2" },
     { name = "onnxscript", marker = "extra == 'dev'" },
+    { name = "onnxscript", marker = "extra == 'lts'" },
     { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" },
+    { name = "opentelemetry-api", marker = "extra == 'lts'", specifier = "~=1.33.1" },
     { name = "packaging", specifier = ">=24.2" },
     { name = "sentencepiece", marker = "extra == 'mlm'" },
-    { name = "setuptools", marker = "extra == 'dev'", specifier = "<80.0.0" },
-    { name = "setuptools", marker = "extra == 'lts'", specifier = "<80.0.0" },
     { name = "tensorstore", marker = "extra == 'dev'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
-    { name = "tensorstore", marker = "extra == 'lts'", specifier = "!=0.1.46,!=0.1.72" },
+    { name = "tensorstore", marker = "extra == 'lts'", specifier = "~=0.1,!=0.1.46,!=0.1.72" },
     { name = "tiktoken", marker = "extra == 'mlm'" },
     { name = "torch" },
     { name = "tqdm", marker = "extra == 'dev'" },
     { name = "tqdm", marker = "extra == 'lts'" },
-    { name = "transformer-engine", extras = ["pytorch"], marker = "extra == 'dev'", git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9" },
-    { name = "transformers", marker = "extra == 'lts'" },
+    { name = "transformer-engine", extras = ["core-cu13", "pytorch"], marker = "extra == 'dev'", specifier = ">=2.9.0a0,<2.10.0" },
     { name = "transformers", marker = "extra == 'mlm'" },
     { name = "wandb", marker = "extra == 'mlm'" },
     { name = "wget", marker = "extra == 'dev'" },
     { name = "wget", marker = "extra == 'lts'" },
-    { name = "zarr", marker = "extra == 'lts'" },
 ]
 provides-extras = ["mlm", "dev", "lts"]
 
@@ -2580,7 +2404,6 @@ docs = [
     { name = "sphinx-autodoc2" },
     { name = "sphinx-copybutton" },
 ]
-flash-mla = [{ name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" }]
 linting = [
     { name = "black", specifier = "==24.4.2" },
     { name = "flake8", specifier = "==7.1.0" },
@@ -2588,6 +2411,10 @@ linting = [
     { name = "pylint", specifier = "==3.2.6" },
     { name = "ruff", specifier = "~=0.9.0" },
 ]
+no-pypi-wheels = [
+    { name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.1.0" },
+    { name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" },
+]
 test = [
     { name = "coverage" },
     { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=01a9a8ba360f7b2908728ad0516e0ad9d936966d" },
@@ -2612,7 +2439,8 @@ dependencies = [
     { name = "braceexpand" },
     { name = "click" },
     { name = "multi-storage-client" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pillow" },
     { name = "pyyaml" },
     { name = "s3fs" },
@@ -2637,84 +2465,48 @@ av-decode = [
 
 [[package]]
 name = "ml-dtypes"
-version = "0.4.1"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "numpy", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/fd/15/76f86faa0902836cc133939732f7611ace68cf54148487a99c539c272dc8/ml_dtypes-0.4.1.tar.gz", hash = "sha256:fad5f2de464fd09127e49b7fd1252b9006fb43d2edc1ff112d390c324af5ca7a", size = 692594, upload-time = "2024-09-13T19:07:11.624Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/56/9e/76b84f77c7afee3b116dc8407903a2d5004ba3059a8f3dcdcfa6ebf33fff/ml_dtypes-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1fe8b5b5e70cd67211db94b05cfd58dace592f24489b038dc6f9fe347d2e07d5", size = 397975, upload-time = "2024-09-13T19:06:44.265Z" },
-    { url = "https://files.pythonhosted.org/packages/03/7b/32650e1b2a2713a5923a0af2a8503d0d4a8fc99d1e1e0a1c40e996634460/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c09a6d11d8475c2a9fd2bc0695628aec105f97cab3b3a3fb7c9660348ff7d24", size = 2182570, upload-time = "2024-09-13T19:06:46.189Z" },
-    { url = "https://files.pythonhosted.org/packages/16/86/a9f7569e7e4f5395f927de38a13b92efa73f809285d04f2923b291783dd2/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5e8f75fa371020dd30f9196e7d73babae2abd51cf59bdd56cb4f8de7e13354", size = 2160365, upload-time = "2024-09-13T19:06:48.198Z" },
-    { url = "https://files.pythonhosted.org/packages/04/1b/9a3afb437702503514f3934ec8d7904270edf013d28074f3e700e5dfbb0f/ml_dtypes-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:15fdd922fea57e493844e5abb930b9c0bd0af217d9edd3724479fc3d7ce70e3f", size = 126633, upload-time = "2024-09-13T19:06:50.656Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/76/9835c8609c29f2214359e88f29255fc4aad4ea0f613fb48aa8815ceda1b6/ml_dtypes-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2d55b588116a7085d6e074cf0cdb1d6fa3875c059dddc4d2c94a4cc81c23e975", size = 397973, upload-time = "2024-09-13T19:06:51.748Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/99/e68c56fac5de973007a10254b6e17a0362393724f40f66d5e4033f4962c2/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e138a9b7a48079c900ea969341a5754019a1ad17ae27ee330f7ebf43f23877f9", size = 2185134, upload-time = "2024-09-13T19:06:53.197Z" },
-    { url = "https://files.pythonhosted.org/packages/28/bc/6a2344338ea7b61cd7b46fb24ec459360a5a0903b57c55b156c1e46c644a/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74c6cfb5cf78535b103fde9ea3ded8e9f16f75bc07789054edc7776abfb3d752", size = 2163661, upload-time = "2024-09-13T19:06:54.519Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/d3/ddfd9878b223b3aa9a930c6100a99afca5cfab7ea703662e00323acb7568/ml_dtypes-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:274cc7193dd73b35fb26bef6c5d40ae3eb258359ee71cd82f6e96a8c948bdaa6", size = 126727, upload-time = "2024-09-13T19:06:55.897Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/1a/99e924f12e4b62139fbac87419698c65f956d58de0dbfa7c028fa5b096aa/ml_dtypes-0.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:827d3ca2097085cf0355f8fdf092b888890bb1b1455f52801a2d7756f056f54b", size = 405077, upload-time = "2024-09-13T19:06:57.538Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/8c/7b610bd500617854c8cc6ed7c8cfb9d48d6a5c21a1437a36a4b9bc8a3598/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:772426b08a6172a891274d581ce58ea2789cc8abc1c002a27223f314aaf894e7", size = 2181554, upload-time = "2024-09-13T19:06:59.196Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/c6/f89620cecc0581dc1839e218c4315171312e46c62a62da6ace204bda91c0/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:126e7d679b8676d1a958f2651949fbfa182832c3cd08020d8facd94e4114f3e9", size = 2160488, upload-time = "2024-09-13T19:07:03.131Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/11/a742d3c31b2cc8557a48efdde53427fd5f9caa2fa3c9c27d826e78a66f51/ml_dtypes-0.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:df0fb650d5c582a9e72bb5bd96cfebb2cdb889d89daff621c8fbc60295eba66c", size = 127462, upload-time = "2024-09-13T19:07:04.916Z" },
-]
-
-[[package]]
-name = "ml-dtypes"
-version = "0.5.3"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "numpy", marker = "python_full_version < '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/78/a7/aad060393123cfb383956dca68402aff3db1e1caffd5764887ed5153f41b/ml_dtypes-0.5.3.tar.gz", hash = "sha256:95ce33057ba4d05df50b1f3cfefab22e351868a843b3b15a46c65836283670c9", size = 692316, upload-time = "2025-07-29T18:39:19.454Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ac/bb/1f32124ab6d3a279ea39202fe098aea95b2d81ef0ce1d48612b6bf715e82/ml_dtypes-0.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0a1d68a7cb53e3f640b2b6a34d12c0542da3dd935e560fdf463c0c77f339fc20", size = 667409, upload-time = "2025-07-29T18:38:17.321Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/ac/e002d12ae19136e25bb41c7d14d7e1a1b08f3c0e99a44455ff6339796507/ml_dtypes-0.5.3-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cd5a6c711b5350f3cbc2ac28def81cd1c580075ccb7955e61e9d8f4bfd40d24", size = 4960702, upload-time = "2025-07-29T18:38:19.616Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/12/79e9954e6b3255a4b1becb191a922d6e2e94d03d16a06341ae9261963ae8/ml_dtypes-0.5.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdcf26c2dbc926b8a35ec8cbfad7eff1a8bd8239e12478caca83a1fc2c400dc2", size = 4933471, upload-time = "2025-07-29T18:38:21.809Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/aa/d1eff619e83cd1ddf6b561d8240063d978e5d887d1861ba09ef01778ec3a/ml_dtypes-0.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:aecbd7c5272c82e54d5b99d8435fd10915d1bc704b7df15e4d9ca8dc3902be61", size = 206330, upload-time = "2025-07-29T18:38:23.663Z" },
-    { url = "https://files.pythonhosted.org/packages/af/f1/720cb1409b5d0c05cff9040c0e9fba73fa4c67897d33babf905d5d46a070/ml_dtypes-0.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4a177b882667c69422402df6ed5c3428ce07ac2c1f844d8a1314944651439458", size = 667412, upload-time = "2025-07-29T18:38:25.275Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/d5/05861ede5d299f6599f86e6bc1291714e2116d96df003cfe23cc54bcc568/ml_dtypes-0.5.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9849ce7267444c0a717c80c6900997de4f36e2815ce34ac560a3edb2d9a64cd2", size = 4964606, upload-time = "2025-07-29T18:38:27.045Z" },
-    { url = "https://files.pythonhosted.org/packages/db/dc/72992b68de367741bfab8df3b3fe7c29f982b7279d341aa5bf3e7ef737ea/ml_dtypes-0.5.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c3f5ae0309d9f888fd825c2e9d0241102fadaca81d888f26f845bc8c13c1e4ee", size = 4938435, upload-time = "2025-07-29T18:38:29.193Z" },
-    { url = "https://files.pythonhosted.org/packages/81/1c/d27a930bca31fb07d975a2d7eaf3404f9388114463b9f15032813c98f893/ml_dtypes-0.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:58e39349d820b5702bb6f94ea0cb2dc8ec62ee81c0267d9622067d8333596a46", size = 206334, upload-time = "2025-07-29T18:38:30.687Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/d8/6922499effa616012cb8dc445280f66d100a7ff39b35c864cfca019b3f89/ml_dtypes-0.5.3-cp311-cp311-win_arm64.whl", hash = "sha256:66c2756ae6cfd7f5224e355c893cfd617fa2f747b8bbd8996152cbdebad9a184", size = 157584, upload-time = "2025-07-29T18:38:32.187Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/eb/bc07c88a6ab002b4635e44585d80fa0b350603f11a2097c9d1bfacc03357/ml_dtypes-0.5.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:156418abeeda48ea4797db6776db3c5bdab9ac7be197c1233771e0880c304057", size = 663864, upload-time = "2025-07-29T18:38:33.777Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/89/11af9b0f21b99e6386b6581ab40fb38d03225f9de5f55cf52097047e2826/ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1db60c154989af253f6c4a34e8a540c2c9dce4d770784d426945e09908fbb177", size = 4951313, upload-time = "2025-07-29T18:38:36.45Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/a9/b98b86426c24900b0c754aad006dce2863df7ce0bb2bcc2c02f9cc7e8489/ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b255acada256d1fa8c35ed07b5f6d18bc21d1556f842fbc2d5718aea2cd9e55", size = 4928805, upload-time = "2025-07-29T18:38:38.29Z" },
-    { url = "https://files.pythonhosted.org/packages/50/c1/85e6be4fc09c6175f36fb05a45917837f30af9a5146a5151cb3a3f0f9e09/ml_dtypes-0.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:da65e5fd3eea434ccb8984c3624bc234ddcc0d9f4c81864af611aaebcc08a50e", size = 208182, upload-time = "2025-07-29T18:38:39.72Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/17/cf5326d6867be057f232d0610de1458f70a8ce7b6290e4b4a277ea62b4cd/ml_dtypes-0.5.3-cp312-cp312-win_arm64.whl", hash = "sha256:8bb9cd1ce63096567f5f42851f5843b5a0ea11511e50039a7649619abfb4ba6d", size = 161560, upload-time = "2025-07-29T18:38:41.072Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/87/1bcc98a66de7b2455dfb292f271452cac9edc4e870796e0d87033524d790/ml_dtypes-0.5.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5103856a225465371fe119f2fef737402b705b810bd95ad5f348e6e1a6ae21af", size = 663781, upload-time = "2025-07-29T18:38:42.984Z" },
-    { url = "https://files.pythonhosted.org/packages/fd/2c/bd2a79ba7c759ee192b5601b675b180a3fd6ccf48ffa27fe1782d280f1a7/ml_dtypes-0.5.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cae435a68861660af81fa3c5af16b70ca11a17275c5b662d9c6f58294e0f113", size = 4956217, upload-time = "2025-07-29T18:38:44.65Z" },
-    { url = "https://files.pythonhosted.org/packages/14/f3/091ba84e5395d7fe5b30c081a44dec881cd84b408db1763ee50768b2ab63/ml_dtypes-0.5.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6936283b56d74fbec431ca57ce58a90a908fdbd14d4e2d22eea6d72bb208a7b7", size = 4933109, upload-time = "2025-07-29T18:38:46.405Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/24/054036dbe32c43295382c90a1363241684c4d6aaa1ecc3df26bd0c8d5053/ml_dtypes-0.5.3-cp313-cp313-win_amd64.whl", hash = "sha256:d0f730a17cf4f343b2c7ad50cee3bd19e969e793d2be6ed911f43086460096e4", size = 208187, upload-time = "2025-07-29T18:38:48.24Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/3d/7dc3ec6794a4a9004c765e0c341e32355840b698f73fd2daff46f128afc1/ml_dtypes-0.5.3-cp313-cp313-win_arm64.whl", hash = "sha256:2db74788fc01914a3c7f7da0763427280adfc9cd377e9604b6b64eb8097284bd", size = 161559, upload-time = "2025-07-29T18:38:50.493Z" },
-    { url = "https://files.pythonhosted.org/packages/12/91/e6c7a0d67a152b9330445f9f0cf8ae6eee9b83f990b8c57fe74631e42a90/ml_dtypes-0.5.3-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:93c36a08a6d158db44f2eb9ce3258e53f24a9a4a695325a689494f0fdbc71770", size = 689321, upload-time = "2025-07-29T18:38:52.03Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/6c/b7b94b84a104a5be1883305b87d4c6bd6ae781504474b4cca067cb2340ec/ml_dtypes-0.5.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0e44a3761f64bc009d71ddb6d6c71008ba21b53ab6ee588dadab65e2fa79eafc", size = 5274495, upload-time = "2025-07-29T18:38:53.797Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/38/6266604dffb43378055394ea110570cf261a49876fc48f548dfe876f34cc/ml_dtypes-0.5.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bdf40d2aaabd3913dec11840f0d0ebb1b93134f99af6a0a4fd88ffe924928ab4", size = 5285422, upload-time = "2025-07-29T18:38:56.603Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/88/8612ff177d043a474b9408f0382605d881eeb4125ba89d4d4b3286573a83/ml_dtypes-0.5.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:aec640bd94c4c85c0d11e2733bd13cbb10438fb004852996ec0efbc6cacdaf70", size = 661182, upload-time = "2025-07-29T18:38:58.414Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/2b/0569a5e88b29240d373e835107c94ae9256fb2191d3156b43b2601859eff/ml_dtypes-0.5.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bda32ce212baa724e03c68771e5c69f39e584ea426bfe1a701cb01508ffc7035", size = 4956187, upload-time = "2025-07-29T18:39:00.611Z" },
-    { url = "https://files.pythonhosted.org/packages/51/66/273c2a06ae44562b104b61e6b14444da00061fd87652506579d7eb2c40b1/ml_dtypes-0.5.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c205cac07d24a29840c163d6469f61069ce4b065518519216297fc2f261f8db9", size = 4930911, upload-time = "2025-07-29T18:39:02.405Z" },
-    { url = "https://files.pythonhosted.org/packages/93/ab/606be3e87dc0821bd360c8c1ee46108025c31a4f96942b63907bb441b87d/ml_dtypes-0.5.3-cp314-cp314-win_amd64.whl", hash = "sha256:cd7c0bb22d4ff86d65ad61b5dd246812e8993fbc95b558553624c33e8b6903ea", size = 216664, upload-time = "2025-07-29T18:39:03.927Z" },
-    { url = "https://files.pythonhosted.org/packages/30/a2/e900690ca47d01dffffd66375c5de8c4f8ced0f1ef809ccd3b25b3e6b8fa/ml_dtypes-0.5.3-cp314-cp314-win_arm64.whl", hash = "sha256:9d55ea7f7baf2aed61bf1872116cefc9d0c3693b45cae3916897ee27ef4b835e", size = 160203, upload-time = "2025-07-29T18:39:05.671Z" },
-    { url = "https://files.pythonhosted.org/packages/53/21/783dfb51f40d2660afeb9bccf3612b99f6a803d980d2a09132b0f9d216ab/ml_dtypes-0.5.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:e12e29764a0e66a7a31e9b8bf1de5cc0423ea72979f45909acd4292de834ccd3", size = 689324, upload-time = "2025-07-29T18:39:07.567Z" },
-    { url = "https://files.pythonhosted.org/packages/09/f7/a82d249c711abf411ac027b7163f285487f5e615c3e0716c61033ce996ab/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19f6c3a4f635c2fc9e2aa7d91416bd7a3d649b48350c51f7f715a09370a90d93", size = 5275917, upload-time = "2025-07-29T18:39:09.339Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/3c/541c4b30815ab90ebfbb51df15d0b4254f2f9f1e2b4907ab229300d5e6f2/ml_dtypes-0.5.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ab039ffb40f3dc0aeeeba84fd6c3452781b5e15bef72e2d10bcb33e4bbffc39", size = 5285284, upload-time = "2025-07-29T18:39:11.532Z" },
+version = "0.5.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fe/3a/c5b855752a70267ff729c349e650263adb3c206c29d28cc8ea7ace30a1d5/ml_dtypes-0.5.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b95e97e470fe60ed493fd9ae3911d8da4ebac16bd21f87ffa2b7c588bf22ea2c", size = 679735, upload-time = "2025-11-17T22:31:31.367Z" },
+    { url = "https://files.pythonhosted.org/packages/41/79/7433f30ee04bd4faa303844048f55e1eb939131c8e5195a00a96a0939b64/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4b801ebe0b477be666696bda493a9be8356f1f0057a57f1e35cd26928823e5a", size = 5051883, upload-time = "2025-11-17T22:31:33.658Z" },
+    { url = "https://files.pythonhosted.org/packages/10/b1/8938e8830b0ee2e167fc75a094dea766a1152bde46752cd9bfc57ee78a82/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:388d399a2152dd79a3f0456a952284a99ee5c93d3e2f8dfe25977511e0515270", size = 5030369, upload-time = "2025-11-17T22:31:35.595Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/a3/51886727bd16e2f47587997b802dd56398692ce8c6c03c2e5bb32ecafe26/ml_dtypes-0.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:4ff7f3e7ca2972e7de850e7b8fcbb355304271e2933dd90814c1cb847414d6e2", size = 210738, upload-time = "2025-11-17T22:31:37.43Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/5e/712092cfe7e5eb667b8ad9ca7c54442f21ed7ca8979745f1000e24cf8737/ml_dtypes-0.5.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6c7ecb74c4bd71db68a6bea1edf8da8c34f3d9fe218f038814fd1d310ac76c90", size = 679734, upload-time = "2025-11-17T22:31:39.223Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/cf/912146dfd4b5c0eea956836c01dcd2fce6c9c844b2691f5152aca196ce4f/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc11d7e8c44a65115d05e2ab9989d1e045125d7be8e05a071a48bc76eb6d6040", size = 5056165, upload-time = "2025-11-17T22:31:41.071Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/80/19189ea605017473660e43762dc853d2797984b3c7bf30ce656099add30c/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19b9a53598f21e453ea2fbda8aa783c20faff8e1eeb0d7ab899309a0053f1483", size = 5034975, upload-time = "2025-11-17T22:31:42.758Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/24/70bd59276883fdd91600ca20040b41efd4902a923283c4d6edcb1de128d2/ml_dtypes-0.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:7c23c54a00ae43edf48d44066a7ec31e05fdc2eee0be2b8b50dd1903a1db94bb", size = 210742, upload-time = "2025-11-17T22:31:44.068Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c9/64230ef14e40aa3f1cb254ef623bf812735e6bec7772848d19131111ac0d/ml_dtypes-0.5.4-cp311-cp311-win_arm64.whl", hash = "sha256:557a31a390b7e9439056644cb80ed0735a6e3e3bb09d67fd5687e4b04238d1de", size = 160709, upload-time = "2025-11-17T22:31:46.557Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/b8/3c70881695e056f8a32f8b941126cf78775d9a4d7feba8abcb52cb7b04f2/ml_dtypes-0.5.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a174837a64f5b16cab6f368171a1a03a27936b31699d167684073ff1c4237dac", size = 676927, upload-time = "2025-11-17T22:31:48.182Z" },
+    { url = "https://files.pythonhosted.org/packages/54/0f/428ef6881782e5ebb7eca459689448c0394fa0a80bea3aa9262cba5445ea/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a7f7c643e8b1320fd958bf098aa7ecf70623a42ec5154e3be3be673f4c34d900", size = 5028464, upload-time = "2025-11-17T22:31:50.135Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ad459e99793fa6e13bd5b7e6792c8f9190b4e5a1b45c63aba14a4d0a7f1d5ff", size = 5009002, upload-time = "2025-11-17T22:31:52.001Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/f0/0cfadd537c5470378b1b32bd859cf2824972174b51b873c9d95cfd7475a5/ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:c1a953995cccb9e25a4ae19e34316671e4e2edaebe4cf538229b1fc7109087b7", size = 212222, upload-time = "2025-11-17T22:31:53.742Z" },
+    { url = "https://files.pythonhosted.org/packages/16/2e/9acc86985bfad8f2c2d30291b27cd2bb4c74cea08695bd540906ed744249/ml_dtypes-0.5.4-cp312-cp312-win_arm64.whl", hash = "sha256:9bad06436568442575beb2d03389aa7456c690a5b05892c471215bfd8cf39460", size = 160793, upload-time = "2025-11-17T22:31:55.358Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/a1/4008f14bbc616cfb1ac5b39ea485f9c63031c4634ab3f4cf72e7541f816a/ml_dtypes-0.5.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c760d85a2f82e2bed75867079188c9d18dae2ee77c25a54d60e9cc79be1bc48", size = 676888, upload-time = "2025-11-17T22:31:56.907Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/b7/dff378afc2b0d5a7d6cd9d3209b60474d9819d1189d347521e1688a60a53/ml_dtypes-0.5.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce756d3a10d0c4067172804c9cc276ba9cc0ff47af9078ad439b075d1abdc29b", size = 5036993, upload-time = "2025-11-17T22:31:58.497Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/33/40cd74219417e78b97c47802037cf2d87b91973e18bb968a7da48a96ea44/ml_dtypes-0.5.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:533ce891ba774eabf607172254f2e7260ba5f57bdd64030c9a4fcfbd99815d0d", size = 5010956, upload-time = "2025-11-17T22:31:59.931Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/8b/200088c6859d8221454825959df35b5244fa9bdf263fd0249ac5fb75e281/ml_dtypes-0.5.4-cp313-cp313-win_amd64.whl", hash = "sha256:f21c9219ef48ca5ee78402d5cc831bd58ea27ce89beda894428bc67a52da5328", size = 212224, upload-time = "2025-11-17T22:32:01.349Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/75/dfc3775cb36367816e678f69a7843f6f03bd4e2bcd79941e01ea960a068e/ml_dtypes-0.5.4-cp313-cp313-win_arm64.whl", hash = "sha256:35f29491a3e478407f7047b8a4834e4640a77d2737e0b294d049746507af5175", size = 160798, upload-time = "2025-11-17T22:32:02.864Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/74/e9ddb35fd1dd43b1106c20ced3f53c2e8e7fc7598c15638e9f80677f81d4/ml_dtypes-0.5.4-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:304ad47faa395415b9ccbcc06a0350800bc50eda70f0e45326796e27c62f18b6", size = 702083, upload-time = "2025-11-17T22:32:04.08Z" },
+    { url = "https://files.pythonhosted.org/packages/74/f5/667060b0aed1aa63166b22897fdf16dca9eb704e6b4bbf86848d5a181aa7/ml_dtypes-0.5.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6a0df4223b514d799b8a1629c65ddc351b3efa833ccf7f8ea0cf654a61d1e35d", size = 5354111, upload-time = "2025-11-17T22:32:05.546Z" },
+    { url = "https://files.pythonhosted.org/packages/40/49/0f8c498a28c0efa5f5c95a9e374c83ec1385ca41d0e85e7cf40e5d519a21/ml_dtypes-0.5.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531eff30e4d368cb6255bc2328d070e35836aa4f282a0fb5f3a0cd7260257298", size = 5366453, upload-time = "2025-11-17T22:32:07.115Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/27/12607423d0a9c6bbbcc780ad19f1f6baa2b68b18ce4bddcdc122c4c68dc9/ml_dtypes-0.5.4-cp313-cp313t-win_amd64.whl", hash = "sha256:cb73dccfc991691c444acc8c0012bee8f2470da826a92e3a20bb333b1a7894e6", size = 225612, upload-time = "2025-11-17T22:32:08.615Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/80/5a5929e92c72936d5b19872c5fb8fc09327c1da67b3b68c6a13139e77e20/ml_dtypes-0.5.4-cp313-cp313t-win_arm64.whl", hash = "sha256:3bbbe120b915090d9dd1375e4684dd17a20a2491ef25d640a908281da85e73f1", size = 164145, upload-time = "2025-11-17T22:32:09.782Z" },
+    { url = "https://files.pythonhosted.org/packages/72/4e/1339dc6e2557a344f5ba5590872e80346f76f6cb2ac3dd16e4666e88818c/ml_dtypes-0.5.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:2b857d3af6ac0d39db1de7c706e69c7f9791627209c3d6dedbfca8c7e5faec22", size = 673781, upload-time = "2025-11-17T22:32:11.364Z" },
+    { url = "https://files.pythonhosted.org/packages/04/f9/067b84365c7e83bda15bba2b06c6ca250ce27b20630b1128c435fb7a09aa/ml_dtypes-0.5.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:805cef3a38f4eafae3a5bf9ebdcdb741d0bcfd9e1bd90eb54abd24f928cd2465", size = 5036145, upload-time = "2025-11-17T22:32:12.783Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/bb/82c7dcf38070b46172a517e2334e665c5bf374a262f99a283ea454bece7c/ml_dtypes-0.5.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14a4fd3228af936461db66faccef6e4f41c1d82fcc30e9f8d58a08916b1d811f", size = 5010230, upload-time = "2025-11-17T22:32:14.38Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/93/2bfed22d2498c468f6bcd0d9f56b033eaa19f33320389314c19ef6766413/ml_dtypes-0.5.4-cp314-cp314-win_amd64.whl", hash = "sha256:8c6a2dcebd6f3903e05d51960a8058d6e131fe69f952a5397e5dbabc841b6d56", size = 221032, upload-time = "2025-11-17T22:32:15.763Z" },
+    { url = "https://files.pythonhosted.org/packages/76/a3/9c912fe6ea747bb10fe2f8f54d027eb265db05dfb0c6335e3e063e74e6e8/ml_dtypes-0.5.4-cp314-cp314-win_arm64.whl", hash = "sha256:5a0f68ca8fd8d16583dfa7793973feb86f2fbb56ce3966daf9c9f748f52a2049", size = 163353, upload-time = "2025-11-17T22:32:16.932Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/02/48aa7d84cc30ab4ee37624a2fd98c56c02326785750cd212bc0826c2f15b/ml_dtypes-0.5.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:bfc534409c5d4b0bf945af29e5d0ab075eae9eecbb549ff8a29280db822f34f9", size = 702085, upload-time = "2025-11-17T22:32:18.175Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/e7/85cb99fe80a7a5513253ec7faa88a65306be071163485e9a626fce1b6e84/ml_dtypes-0.5.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2314892cdc3fcf05e373d76d72aaa15fda9fb98625effa73c1d646f331fcecb7", size = 5355358, upload-time = "2025-11-17T22:32:19.7Z" },
+    { url = "https://files.pythonhosted.org/packages/79/2b/a826ba18d2179a56e144aef69e57fb2ab7c464ef0b2111940ee8a3a223a2/ml_dtypes-0.5.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d2ffd05a2575b1519dc928c0b93c06339eb67173ff53acb00724502cda231cf", size = 5366332, upload-time = "2025-11-17T22:32:21.193Z" },
+    { url = "https://files.pythonhosted.org/packages/84/44/f4d18446eacb20ea11e82f133ea8f86e2bf2891785b67d9da8d0ab0ef525/ml_dtypes-0.5.4-cp314-cp314t-win_amd64.whl", hash = "sha256:4381fe2f2452a2d7589689693d3162e876b3ddb0a832cde7a414f8e1adf7eab1", size = 236612, upload-time = "2025-11-17T22:32:22.579Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/3f/3d42e9a78fe5edf792a83c074b13b9b770092a4fbf3462872f4303135f09/ml_dtypes-0.5.4-cp314-cp314t-win_arm64.whl", hash = "sha256:11942cbf2cf92157db91e5022633c0d9474d4dfd813a909383bd23ce828a4b7d", size = 168825, upload-time = "2025-11-17T22:32:23.766Z" },
 ]
 
 [[package]]
@@ -2789,7 +2581,7 @@ wheels = [
 
 [[package]]
 name = "multi-storage-client"
-version = "0.33.0"
+version = "0.36.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -2802,26 +2594,27 @@ dependencies = [
     { name = "python-dateutil" },
     { name = "pyyaml" },
     { name = "tqdm" },
+    { name = "tzdata" },
     { name = "wcmatch" },
     { name = "xattr" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5c/c4/6279fb7d4b8b0a7af060047d592f00f8d49c547adfebe50bcd8d0d2dc8a5/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:df52b3040ef5698c6388fa589bd63812ae0d2f967d358a792abcad5638686590", size = 5282006, upload-time = "2025-10-23T03:45:37.761Z" },
-    { url = "https://files.pythonhosted.org/packages/22/3b/23d8beccd73b887c4552bf884275611255b5028388fa3317365cd56c2a93/multi_storage_client-0.33.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:370da04b1e56a601ba505a29d42fcabc19b583e10d725a37bc0c11ba3573d211", size = 5403083, upload-time = "2025-10-23T03:53:11.998Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/ad/dc355d05fd369da0d800e5f7de24da0393f542c5a6f775f6bcee7edcacb1/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c57749a28ec5d49440f465fd73e4e2feaab18ece9b6e57c73395308b41950f66", size = 3178432, upload-time = "2025-10-23T04:07:00.543Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/ad/97b54419d8a58f696b85504568391a627641152f80650d7d2697fc2702ed/multi_storage_client-0.33.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7d95f5fe094aab00a240bf6aa11dfe85bec293b76b3688ec3a9c33d86c751d2", size = 3351102, upload-time = "2025-10-23T03:47:47.622Z" },
-    { url = "https://files.pythonhosted.org/packages/52/28/1038a68b9df1b179a61967ce9f7d2e80b9954cdb289801afecde5f7660db/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4b5a0f5a0b7684835be20ae6782070884982a86665e9bab317375a56a20294d1", size = 5281523, upload-time = "2025-10-23T04:06:36.671Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/c5/e18de5e2a2671efdc0a12383b8d63f523044ca453525725b3450d0179c0e/multi_storage_client-0.33.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:0db694311f90f44ee8f6f7734a14a0857738a467f2ae201649218a3ecf1f6ab2", size = 5403353, upload-time = "2025-10-23T04:07:25.941Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/c9/d9f65eb2370151dbbb06925f4216ee017e6cdbf7657263fd98e60944e52b/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cbe3a0b856f0b968f9fc693670a521b5a995b625351241ca008f866fdfff62a", size = 3180052, upload-time = "2025-10-23T03:57:32.797Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/38/08b9d84c93b19ae87caf542ae77f17dfa44a85281ba09de660ffcf3a7718/multi_storage_client-0.33.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:018e7e82255feeff973ff02563f11a30f5e507e4cbc87a2167a9568740144ef2", size = 3351389, upload-time = "2025-10-23T04:02:07.348Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/31/c95634a27723b5ba9d2d74158444cc5e40b151b51ae59ca196fc9993f039/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:030b3a592c6352605e9ebdb8d9303dd42daf5d171ffa684f3283d4a5c6e2edfe", size = 5273976, upload-time = "2025-10-23T04:04:35.99Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/cf/82d1778d73c3baaec331da4ae8d01fa7934bcd73336aa88a08d86d080347/multi_storage_client-0.33.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:14dc0ace16d3830917427d6376d14ef62bd053fb2509f893998555ca1e9c4dcb", size = 5400735, upload-time = "2025-10-23T03:58:37.149Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/34/a6194ec725ef80c02de58b5ed3520bb1711807df75a27f7214effd22df34/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2821765d5c6de365b5b1dcdc7cf2ebba719ff4061fd02975639629f8aa319f6", size = 3182623, upload-time = "2025-10-23T04:03:29.551Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/36/7ec85178fd1dd69c278407a82acaccfb806449deda13f3dbd41f653d73bd/multi_storage_client-0.33.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f92f89480c58067fa53c178785b86e7650e16f277a61a732a8a7019173b16129", size = 3352104, upload-time = "2025-10-23T04:08:51.005Z" },
-    { url = "https://files.pythonhosted.org/packages/88/ef/f2eb2efefb0e0588b29ed573b8354ecd72c38e6143da7ed5ecf53e859bf8/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed9af7e77e3cbac1f614816062b36975dcbc610bd3f8c86741d48aa18c718781", size = 5272154, upload-time = "2025-10-23T04:07:49.572Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/49/050aa4fccb2579d2ef5bd0d27169ec98fe85c92bba7a2c31154c491a4f75/multi_storage_client-0.33.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:c9d75e95a266ee858cf20c88ed255021552de67a40af9c8884d2fc22037dcd2b", size = 5399474, upload-time = "2025-10-23T04:09:14.545Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/4b/70c2df3b60c28360f185188d351e9c3958b702614963a09ffb1dc251c1ca/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48195a2ab9e6e9a2763bde17184cad2bdef82684353e210d0d325f20cea18869", size = 3181788, upload-time = "2025-10-23T04:03:10.404Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/96/5008852677fdad10eb9d8dd08a6ea58c6f7e820199a3b2c56607186ac6d5/multi_storage_client-0.33.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd64403efdcee2a6efcf7bfdb01422dd174c146014563b09f44590346fd835e6", size = 3351269, upload-time = "2025-10-23T04:00:34.714Z" },
+    { url = "https://files.pythonhosted.org/packages/be/5f/8011fd041f695670b339c25f059b68207c315250ccc25a08f190bff78318/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:763cdb5e24b78adf33882b1d1c0d15021cc2c0088ffc6e7b0269259f0cd45fd2", size = 5299321, upload-time = "2025-11-26T20:03:58.147Z" },
+    { url = "https://files.pythonhosted.org/packages/51/06/cfd17d307fe29fbbce9f196ec1d8dda3f93fd44711c0adb282d9c393a2b2/multi_storage_client-0.36.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:eb84ea0bdffcfddf9beb7239c6d0b1950a67a0afe36ef970da70ba4ab373c0c9", size = 5420867, upload-time = "2025-11-26T20:05:32.445Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/7f/bf22f9c67c70d5ec2f6a7a4798cb106f3023bf25ba6c21b0ade1a53fa5b3/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff03a0213ce1377abee61e8deb87607f0ccd35c245fbaab2fee51d2e591e833e", size = 3188237, upload-time = "2025-11-26T20:01:51.354Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/20/c0c019b3dc7719f79c1826364fc9c3e1bbe9b00246b1d7414ce2b4defd0b/multi_storage_client-0.36.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f16e577ef4ee6f8ac481b3f2290e7b0525676efd82c71fb694ba4e6c65a8facd", size = 3363259, upload-time = "2025-11-26T20:00:10.679Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/f8/eea6be7f4258c811373dc989e8eaa23a404499c2574059f6fd876d6904e4/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6c913b132573fbd7a5ada63086d3ce2669b913b79206f86867cc674d57b9164d", size = 5299844, upload-time = "2025-11-26T20:00:32.46Z" },
+    { url = "https://files.pythonhosted.org/packages/df/aa/b73441dc17097ee92e7efac5080e2cfb8fe4515dd4dc91ca351829e6b7a9/multi_storage_client-0.36.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:4dd2ccf67deae403098a5e867ce33d35ce348d2acd1a743c9ef485b3b1eea65c", size = 5424007, upload-time = "2025-11-26T19:55:30.305Z" },
+    { url = "https://files.pythonhosted.org/packages/54/d6/850550de6b0dc740ced2f8fbf83f13f757860b5fdaa652e477c567c01f34/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:04b31b6a5d6a3c90a592b23a4b90368fa1dcca8cb03f76a862d307f8b072c1d3", size = 3188451, upload-time = "2025-11-26T19:56:32.191Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/c5/93e038c0cce46cb9b1b8e19f7215ce3e7fa1af5e0a9662f36dfe47062f7e/multi_storage_client-0.36.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:252f84116f674962eabd066e16040f0304f6191c06ab09ef2ec02dbfd2c4d2ea", size = 3366554, upload-time = "2025-11-26T19:58:37.742Z" },
+    { url = "https://files.pythonhosted.org/packages/28/a2/46320db394150a2f0547930b902e8ad045a084fb519f408e2c9b4ca673a0/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2058e8e8f8fd9eef033171b0bf1966596e9862c7f20c2886101ad979996c453b", size = 5293778, upload-time = "2025-11-26T20:07:11.731Z" },
+    { url = "https://files.pythonhosted.org/packages/00/2d/658af3b4104c4f2aa2621469482dca8270490601e98d8f7997361499adaa/multi_storage_client-0.36.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:22b69c7f3c9ffa166f38bafa7e08f6b664a5dbee8c88d5d740bed719e6f410a1", size = 5418642, upload-time = "2025-11-26T19:58:15.717Z" },
+    { url = "https://files.pythonhosted.org/packages/09/2f/6441794bf8dc195d614d63ad2b7068ad7703972fd6f960d43202d29748b1/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b384fb326637e79706ff706e60f384b24fdbcc824420bb66ef615a9ef5ffb4ec", size = 3194133, upload-time = "2025-11-26T20:05:54.618Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/ba/b07361ff84e5bd263e299b03776382f59bd92862573c915dd705a09f3c1d/multi_storage_client-0.36.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7111567b971a68719c0eb68245d49a0a3c3bf5af2f609351446f20ac3e83c0d5", size = 3364563, upload-time = "2025-11-26T20:04:20.3Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/4a/cbd61589a457e2f4fbacd08b7e7dd11cdb74690857f4b40042844b1ff894/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a8137558d5f05e4722c54540e2d6067ea61e9ce3d736fa9cb5c541c7f94d1b48", size = 5293550, upload-time = "2025-11-26T20:03:36.459Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/3d/7499a9d537fa950a9acf11604b1f9372ed2cadd582b55f1c7cb885ce6f40/multi_storage_client-0.36.0-cp313-cp313-macosx_11_0_x86_64.whl", hash = "sha256:5394c5e040c32433b42e902d9fcf03f8a475c5c9ff1cca80743b2cb944c8af9e", size = 5417538, upload-time = "2025-11-26T20:06:16.782Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/c3/1b1adc3b3b8569d258a34dbedb6a8c51fc94b947b2df276e251f0f1e23a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:195e8c8d57d812b73efd41b96cd60825c484d317ec86379fad3e435e9365a4a6", size = 3193426, upload-time = "2025-11-26T20:00:56.034Z" },
+    { url = "https://files.pythonhosted.org/packages/60/f5/f8b97a87d928057b493733760f37de70ae5ffff84b86f6efae101cdd57a2/multi_storage_client-0.36.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8402d0e1cefedf38ad9eefe8b3c56d3a44cfec7775ef711da18e7dbf72669444", size = 3363531, upload-time = "2025-11-26T20:02:35.296Z" },
 ]
 
 [[package]]
@@ -3025,7 +2818,7 @@ dependencies = [
     { name = "jinja2" },
     { name = "leptonai" },
     { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "omegaconf" },
     { name = "packaging" },
     { name = "rich" },
@@ -3049,51 +2842,21 @@ wheels = [
 
 [[package]]
 name = "networkx"
-version = "3.5"
+version = "3.6"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/6c/4f/ccdb8ad3a38e583f214547fd2f7ff1fc160c43a75af88e6aec213404b96a/networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037", size = 2471065, upload-time = "2025-05-29T11:35:07.804Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/eb/8d/776adee7bbf76365fdd7f2552710282c79a4ead5d2a46408c9043a2b70ba/networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec", size = 2034406, upload-time = "2025-05-29T11:35:04.961Z" },
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e8/fc/7b6fd4d22c8c4dc5704430140d8b3f520531d4fe7328b8f8d03f5a7950e8/networkx-3.6.tar.gz", hash = "sha256:285276002ad1f7f7da0f7b42f004bcba70d381e936559166363707fdad3d72ad", size = 2511464, upload-time = "2025-11-24T03:03:47.158Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/07/c7/d64168da60332c17d24c0d2f08bdf3987e8d1ae9d84b5bbd0eec2eb26a55/networkx-3.6-py3-none-any.whl", hash = "sha256:cdb395b105806062473d3be36458d8f1459a4e4b98e236a66c3a48996e07684f", size = 2063713, upload-time = "2025-11-24T03:03:45.21Z" },
 ]
 
 [[package]]
@@ -3138,170 +2901,373 @@ wheels = [
 ]
 
 [[package]]
-name = "numcodecs"
-version = "0.13.1"
+name = "numpy"
+version = "2.2.6"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "python_full_version < '3.11' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
-dependencies = [
-    { name = "numpy", marker = "python_full_version < '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/85/56/8895a76abe4ec94ebd01eeb6d74f587bc4cddd46569670e1402852a5da13/numcodecs-0.13.1.tar.gz", hash = "sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc", size = 5955215, upload-time = "2024-10-09T16:28:00.188Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/14/c0/6d72cde772bcec196b7188731d41282993b2958440f77fdf0db216f722da/numcodecs-0.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b", size = 1580012, upload-time = "2024-10-09T16:27:19.069Z" },
-    { url = "https://files.pythonhosted.org/packages/94/1d/f81fc1fa9210bbea97258242393a1f9feab4f6d8fb201f81f76003005e4b/numcodecs-0.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15", size = 1176919, upload-time = "2024-10-09T16:27:21.634Z" },
-    { url = "https://files.pythonhosted.org/packages/16/e4/b9ec2f4dfc34ecf724bc1beb96a9f6fa9b91801645688ffadacd485089da/numcodecs-0.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28", size = 8625842, upload-time = "2024-10-09T16:27:24.168Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/90/299952e1477954ec4f92813fa03e743945e3ff711bb4f6c9aace431cb3da/numcodecs-0.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab", size = 828638, upload-time = "2024-10-09T16:27:27.063Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/78/34b8e869ef143e88d62e8231f4dbfcad85e5c41302a11fc5bd2228a13df5/numcodecs-0.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666", size = 1580199, upload-time = "2024-10-09T16:27:29.336Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/cf/f70797d86bb585d258d1e6993dced30396f2044725b96ce8bcf87a02be9c/numcodecs-0.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc", size = 1177203, upload-time = "2024-10-09T16:27:31.011Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/b5/d14ad69b63fde041153dfd05d7181a49c0d4864de31a7a1093c8370da957/numcodecs-0.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f", size = 8868743, upload-time = "2024-10-09T16:27:32.833Z" },
-    { url = "https://files.pythonhosted.org/packages/13/d4/27a7b5af0b33f6d61e198faf177fbbf3cb83ff10d9d1a6857b7efc525ad5/numcodecs-0.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b", size = 829603, upload-time = "2024-10-09T16:27:35.415Z" },
-    { url = "https://files.pythonhosted.org/packages/37/3a/bc09808425e7d3df41e5fc73fc7a802c429ba8c6b05e55f133654ade019d/numcodecs-0.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917", size = 1575806, upload-time = "2024-10-09T16:27:37.804Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/cc/dc74d0bfdf9ec192332a089d199f1e543e747c556b5659118db7a437dcca/numcodecs-0.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6", size = 1178233, upload-time = "2024-10-09T16:27:40.169Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/ce/434e8e3970b8e92ae9ab6d9db16cb9bc7aa1cd02e17c11de6848224100a1/numcodecs-0.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53", size = 8857827, upload-time = "2024-10-09T16:27:42.743Z" },
-    { url = "https://files.pythonhosted.org/packages/83/e7/1d8b1b266a92f9013c755b1c146c5ad71a2bff147ecbc67f86546a2e4d6a/numcodecs-0.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca", size = 826539, upload-time = "2024-10-09T16:27:44.808Z" },
-    { url = "https://files.pythonhosted.org/packages/83/8b/06771dead2cc4a8ae1ea9907737cf1c8d37a323392fa28f938a586373468/numcodecs-0.13.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43", size = 1571660, upload-time = "2024-10-09T16:27:47.125Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/ea/d925bf85f92dfe4635356018da9fe4bfecb07b1c72f62b01c1bc47f936b1/numcodecs-0.13.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf", size = 1169925, upload-time = "2024-10-09T16:27:49.512Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/d6/643a3839d571d8e439a2c77dc4b0b8cab18d96ac808e4a81dbe88e959ab6/numcodecs-0.13.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701", size = 8814257, upload-time = "2024-10-09T16:27:52.059Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/c5/f3e56bc9b4e438a287fff738993d6d11abef368c0328a612ac2842ba9fca/numcodecs-0.13.1-cp313-cp313-win_amd64.whl", hash = "sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176", size = 821887, upload-time = "2024-10-09T16:27:55.039Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, upload-time = "2025-05-17T21:27:58.555Z" },
+    { url = "https://files.pythonhosted.org/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" },
+    { url = "https://files.pythonhosted.org/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" },
+    { url = "https://files.pythonhosted.org/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" },
+    { url = "https://files.pythonhosted.org/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" },
+    { url = "https://files.pythonhosted.org/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" },
+    { url = "https://files.pythonhosted.org/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" },
+    { url = "https://files.pythonhosted.org/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" },
+    { url = "https://files.pythonhosted.org/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" },
+    { url = "https://files.pythonhosted.org/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" },
+    { url = "https://files.pythonhosted.org/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" },
+    { url = "https://files.pythonhosted.org/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" },
+    { url = "https://files.pythonhosted.org/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" },
+    { url = "https://files.pythonhosted.org/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/5c/6657823f4f594f72b5471f1db1ab12e26e890bb2e41897522d134d2a3e81/numpy-2.2.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0811bb762109d9708cca4d0b13c4f67146e3c3b7cf8d34018c722adb2d957c84", size = 20867828, upload-time = "2025-05-17T21:37:56.699Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/9e/14520dc3dadf3c803473bd07e9b2bd1b69bc583cb2497b47000fed2fa92f/numpy-2.2.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:287cc3162b6f01463ccd86be154f284d0893d2b3ed7292439ea97eafa8170e0b", size = 14143006, upload-time = "2025-05-17T21:38:18.291Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/06/7e96c57d90bebdce9918412087fc22ca9851cceaf5567a45c1f404480e9e/numpy-2.2.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:f1372f041402e37e5e633e586f62aa53de2eac8d98cbfb822806ce4bbefcb74d", size = 5076765, upload-time = "2025-05-17T21:38:27.319Z" },
+    { url = "https://files.pythonhosted.org/packages/73/ed/63d920c23b4289fdac96ddbdd6132e9427790977d5457cd132f18e76eae0/numpy-2.2.6-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:55a4d33fa519660d69614a9fad433be87e5252f4b03850642f88993f7b2ca566", size = 6617736, upload-time = "2025-05-17T21:38:38.141Z" },
+    { url = "https://files.pythonhosted.org/packages/85/c5/e19c8f99d83fd377ec8c7e0cf627a8049746da54afc24ef0a0cb73d5dfb5/numpy-2.2.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f92729c95468a2f4f15e9bb94c432a9229d0d50de67304399627a943201baa2f", size = 14010719, upload-time = "2025-05-17T21:38:58.433Z" },
+    { url = "https://files.pythonhosted.org/packages/19/49/4df9123aafa7b539317bf6d342cb6d227e49f7a35b99c287a6109b13dd93/numpy-2.2.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bc23a79bfabc5d056d106f9befb8d50c31ced2fbc70eedb8155aec74a45798f", size = 16526072, upload-time = "2025-05-17T21:39:22.638Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/6c/04b5f47f4f32f7c2b0e7260442a8cbcf8168b0e1a41ff1495da42f42a14f/numpy-2.2.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e3143e4451880bed956e706a3220b4e5cf6172ef05fcc397f6f36a550b1dd868", size = 15503213, upload-time = "2025-05-17T21:39:45.865Z" },
+    { url = "https://files.pythonhosted.org/packages/17/0a/5cd92e352c1307640d5b6fec1b2ffb06cd0dabe7d7b8227f97933d378422/numpy-2.2.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b4f13750ce79751586ae2eb824ba7e1e8dba64784086c98cdbbcc6a42112ce0d", size = 18316632, upload-time = "2025-05-17T21:40:13.331Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/9e/4bf918b818e516322db999ac25d00c75788ddfd2d2ade4fa66f1f38097e1/numpy-2.2.6-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0bca768cd85ae743b2affdc762d617eddf3bcf8724435498a1e80132d04879e6", size = 20963467, upload-time = "2025-05-17T21:40:44Z" },
+    { url = "https://files.pythonhosted.org/packages/61/66/d2de6b291507517ff2e438e13ff7b1e2cdbdb7cb40b3ed475377aece69f9/numpy-2.2.6-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fc0c5673685c508a142ca65209b4e79ed6740a4ed6b2267dbba90f34b0b3cfda", size = 14225144, upload-time = "2025-05-17T21:41:05.695Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/25/480387655407ead912e28ba3a820bc69af9adf13bcbe40b299d454ec011f/numpy-2.2.6-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5bd4fc3ac8926b3819797a7c0e2631eb889b4118a9898c84f585a54d475b7e40", size = 5200217, upload-time = "2025-05-17T21:41:15.903Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/4a/6e313b5108f53dcbf3aca0c0f3e9c92f4c10ce57a0a721851f9785872895/numpy-2.2.6-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:fee4236c876c4e8369388054d02d0e9bb84821feb1a64dd59e137e6511a551f8", size = 6712014, upload-time = "2025-05-17T21:41:27.321Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/30/172c2d5c4be71fdf476e9de553443cf8e25feddbe185e0bd88b096915bcc/numpy-2.2.6-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1dda9c7e08dc141e0247a5b8f49cf05984955246a327d4c48bda16821947b2f", size = 14077935, upload-time = "2025-05-17T21:41:49.738Z" },
+    { url = "https://files.pythonhosted.org/packages/12/fb/9e743f8d4e4d3c710902cf87af3512082ae3d43b945d5d16563f26ec251d/numpy-2.2.6-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f447e6acb680fd307f40d3da4852208af94afdfab89cf850986c3ca00562f4fa", size = 16600122, upload-time = "2025-05-17T21:42:14.046Z" },
+    { url = "https://files.pythonhosted.org/packages/12/75/ee20da0e58d3a66f204f38916757e01e33a9737d0b22373b3eb5a27358f9/numpy-2.2.6-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:389d771b1623ec92636b0786bc4ae56abafad4a4c513d36a55dce14bd9ce8571", size = 15586143, upload-time = "2025-05-17T21:42:37.464Z" },
+    { url = "https://files.pythonhosted.org/packages/76/95/bef5b37f29fc5e739947e9ce5179ad402875633308504a52d188302319c8/numpy-2.2.6-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8e9ace4a37db23421249ed236fdcdd457d671e25146786dfc96835cd951aa7c1", size = 18385260, upload-time = "2025-05-17T21:43:05.189Z" },
+    { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" },
+    { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" },
+    { url = "https://files.pythonhosted.org/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" },
+    { url = "https://files.pythonhosted.org/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" },
+    { url = "https://files.pythonhosted.org/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" },
 ]
 
 [[package]]
-name = "numcodecs"
-version = "0.16.3"
+name = "numpy"
+version = "2.3.5"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
     "python_full_version == '3.12.*' and sys_platform != 'linux'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
+sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/43/77/84dd1d2e34d7e2792a236ba180b5e8fcc1e3e414e761ce0253f63d7f572e/numpy-2.3.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de5672f4a7b200c15a4127042170a694d4df43c992948f5e1af57f0174beed10", size = 17034641, upload-time = "2025-11-16T22:49:19.336Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/ea/25e26fa5837106cde46ae7d0b667e20f69cbbc0efd64cba8221411ab26ae/numpy-2.3.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:acfd89508504a19ed06ef963ad544ec6664518c863436306153e13e94605c218", size = 12528324, upload-time = "2025-11-16T22:49:22.582Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/1a/e85f0eea4cf03d6a0228f5c0256b53f2df4bc794706e7df019fc622e47f1/numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ffe22d2b05504f786c867c8395de703937f934272eb67586817b46188b4ded6d", size = 5356872, upload-time = "2025-11-16T22:49:25.408Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/bb/35ef04afd567f4c989c2060cde39211e4ac5357155c1833bcd1166055c61/numpy-2.3.5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:872a5cf366aec6bb1147336480fef14c9164b154aeb6542327de4970282cd2f5", size = 6893148, upload-time = "2025-11-16T22:49:27.549Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/2b/05bbeb06e2dff5eab512dfc678b1cc5ee94d8ac5956a0885c64b6b26252b/numpy-2.3.5-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3095bdb8dd297e5920b010e96134ed91d852d81d490e787beca7e35ae1d89cf7", size = 14557282, upload-time = "2025-11-16T22:49:30.964Z" },
+    { url = "https://files.pythonhosted.org/packages/65/fb/2b23769462b34398d9326081fad5655198fcf18966fcb1f1e49db44fbf31/numpy-2.3.5-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8cba086a43d54ca804ce711b2a940b16e452807acebe7852ff327f1ecd49b0d4", size = 16897903, upload-time = "2025-11-16T22:49:34.191Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/14/085f4cf05fc3f1e8aa95e85404e984ffca9b2275a5dc2b1aae18a67538b8/numpy-2.3.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6cf9b429b21df6b99f4dee7a1218b8b7ffbbe7df8764dc0bd60ce8a0708fed1e", size = 16341672, upload-time = "2025-11-16T22:49:37.2Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/3b/1f73994904142b2aa290449b3bb99772477b5fd94d787093e4f24f5af763/numpy-2.3.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:396084a36abdb603546b119d96528c2f6263921c50df3c8fd7cb28873a237748", size = 18838896, upload-time = "2025-11-16T22:49:39.727Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/b9/cf6649b2124f288309ffc353070792caf42ad69047dcc60da85ee85fea58/numpy-2.3.5-cp311-cp311-win32.whl", hash = "sha256:b0c7088a73aef3d687c4deef8452a3ac7c1be4e29ed8bf3b366c8111128ac60c", size = 6563608, upload-time = "2025-11-16T22:49:42.079Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/44/9fe81ae1dcc29c531843852e2874080dc441338574ccc4306b39e2ff6e59/numpy-2.3.5-cp311-cp311-win_amd64.whl", hash = "sha256:a414504bef8945eae5f2d7cb7be2d4af77c5d1cb5e20b296c2c25b61dff2900c", size = 13078442, upload-time = "2025-11-16T22:49:43.99Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/a7/f99a41553d2da82a20a2f22e93c94f928e4490bb447c9ff3c4ff230581d3/numpy-2.3.5-cp311-cp311-win_arm64.whl", hash = "sha256:0cd00b7b36e35398fa2d16af7b907b65304ef8bb4817a550e06e5012929830fa", size = 10458555, upload-time = "2025-11-16T22:49:47.092Z" },
+    { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" },
+    { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" },
+    { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" },
+    { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" },
+    { url = "https://files.pythonhosted.org/packages/db/69/9cde09f36da4b5a505341180a3f2e6fadc352fd4d2b7096ce9778db83f1a/numpy-2.3.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d0f23b44f57077c1ede8c5f26b30f706498b4862d3ff0a7298b8411dd2f043ff", size = 16728251, upload-time = "2025-11-16T22:50:19.013Z" },
+    { url = "https://files.pythonhosted.org/packages/79/fb/f505c95ceddd7027347b067689db71ca80bd5ecc926f913f1a23e65cf09b/numpy-2.3.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:aa5bc7c5d59d831d9773d1170acac7893ce3a5e130540605770ade83280e7188", size = 12254652, upload-time = "2025-11-16T22:50:21.487Z" },
+    { url = "https://files.pythonhosted.org/packages/78/da/8c7738060ca9c31b30e9301ee0cf6c5ffdbf889d9593285a1cead337f9a5/numpy-2.3.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccc933afd4d20aad3c00bcef049cb40049f7f196e0397f1109dba6fed63267b0", size = 5083172, upload-time = "2025-11-16T22:50:24.562Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/b4/ee5bb2537fb9430fd2ef30a616c3672b991a4129bb1c7dcc42aa0abbe5d7/numpy-2.3.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:afaffc4393205524af9dfa400fa250143a6c3bc646c08c9f5e25a9f4b4d6a903", size = 6622990, upload-time = "2025-11-16T22:50:26.47Z" },
+    { url = "https://files.pythonhosted.org/packages/95/03/dc0723a013c7d7c19de5ef29e932c3081df1c14ba582b8b86b5de9db7f0f/numpy-2.3.5-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c75442b2209b8470d6d5d8b1c25714270686f14c749028d2199c54e29f20b4d", size = 14248902, upload-time = "2025-11-16T22:50:28.861Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/10/ca162f45a102738958dcec8023062dad0cbc17d1ab99d68c4e4a6c45fb2b/numpy-2.3.5-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11e06aa0af8c0f05104d56450d6093ee639e15f24ecf62d417329d06e522e017", size = 16597430, upload-time = "2025-11-16T22:50:31.56Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/51/c1e29be863588db58175175f057286900b4b3327a1351e706d5e0f8dd679/numpy-2.3.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ed89927b86296067b4f81f108a2271d8926467a8868e554eaf370fc27fa3ccaf", size = 16024551, upload-time = "2025-11-16T22:50:34.242Z" },
+    { url = "https://files.pythonhosted.org/packages/83/68/8236589d4dbb87253d28259d04d9b814ec0ecce7cb1c7fed29729f4c3a78/numpy-2.3.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51c55fe3451421f3a6ef9a9c1439e82101c57a2c9eab9feb196a62b1a10b58ce", size = 18533275, upload-time = "2025-11-16T22:50:37.651Z" },
+    { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" },
+    { url = "https://files.pythonhosted.org/packages/13/cb/71744144e13389d577f867f745b7df2d8489463654a918eea2eeb166dfc9/numpy-2.3.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:414802f3b97f3c1eef41e530aaba3b3c1620649871d8cb38c6eaff034c2e16bd", size = 16827292, upload-time = "2025-11-16T22:50:47.715Z" },
+    { url = "https://files.pythonhosted.org/packages/71/80/ba9dc6f2a4398e7f42b708a7fdc841bb638d353be255655498edbf9a15a8/numpy-2.3.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5ee6609ac3604fa7780e30a03e5e241a7956f8e2fcfe547d51e3afa5247ac47f", size = 12378897, upload-time = "2025-11-16T22:50:51.327Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/6d/db2151b9f64264bcceccd51741aa39b50150de9b602d98ecfe7e0c4bff39/numpy-2.3.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:86d835afea1eaa143012a2d7a3f45a3adce2d7adc8b4961f0b362214d800846a", size = 5207391, upload-time = "2025-11-16T22:50:54.542Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ae/429bacace5ccad48a14c4ae5332f6aa8ab9f69524193511d60ccdfdc65fa/numpy-2.3.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:30bc11310e8153ca664b14c5f1b73e94bd0503681fcf136a163de856f3a50139", size = 6721275, upload-time = "2025-11-16T22:50:56.794Z" },
+    { url = "https://files.pythonhosted.org/packages/74/5b/1919abf32d8722646a38cd527bc3771eb229a32724ee6ba340ead9b92249/numpy-2.3.5-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1062fde1dcf469571705945b0f221b73928f34a20c904ffb45db101907c3454e", size = 14306855, upload-time = "2025-11-16T22:50:59.208Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/87/6831980559434973bebc30cd9c1f21e541a0f2b0c280d43d3afd909b66d0/numpy-2.3.5-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce581db493ea1a96c0556360ede6607496e8bf9b3a8efa66e06477267bc831e9", size = 16657359, upload-time = "2025-11-16T22:51:01.991Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/91/c797f544491ee99fd00495f12ebb7802c440c1915811d72ac5b4479a3356/numpy-2.3.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:cc8920d2ec5fa99875b670bb86ddeb21e295cb07aa331810d9e486e0b969d946", size = 16093374, upload-time = "2025-11-16T22:51:05.291Z" },
+    { url = "https://files.pythonhosted.org/packages/74/a6/54da03253afcbe7a72785ec4da9c69fb7a17710141ff9ac5fcb2e32dbe64/numpy-2.3.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9ee2197ef8c4f0dfe405d835f3b6a14f5fee7782b5de51ba06fb65fc9b36e9f1", size = 18594587, upload-time = "2025-11-16T22:51:08.585Z" },
+    { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/97/1a914559c19e32d6b2e233cf9a6a114e67c856d35b1d6babca571a3e880f/numpy-2.3.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:bf06bc2af43fa8d32d30fae16ad965663e966b1a3202ed407b84c989c3221e82", size = 16735706, upload-time = "2025-11-16T22:51:19.558Z" },
+    { url = "https://files.pythonhosted.org/packages/57/d4/51233b1c1b13ecd796311216ae417796b88b0616cfd8a33ae4536330748a/numpy-2.3.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:052e8c42e0c49d2575621c158934920524f6c5da05a1d3b9bab5d8e259e045f0", size = 12264507, upload-time = "2025-11-16T22:51:22.492Z" },
+    { url = "https://files.pythonhosted.org/packages/45/98/2fe46c5c2675b8306d0b4a3ec3494273e93e1226a490f766e84298576956/numpy-2.3.5-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:1ed1ec893cff7040a02c8aa1c8611b94d395590d553f6b53629a4461dc7f7b63", size = 5093049, upload-time = "2025-11-16T22:51:25.171Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/0e/0698378989bb0ac5f1660c81c78ab1fe5476c1a521ca9ee9d0710ce54099/numpy-2.3.5-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:2dcd0808a421a482a080f89859a18beb0b3d1e905b81e617a188bd80422d62e9", size = 6626603, upload-time = "2025-11-16T22:51:27Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/a6/9ca0eecc489640615642a6cbc0ca9e10df70df38c4d43f5a928ff18d8827/numpy-2.3.5-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:727fd05b57df37dc0bcf1a27767a3d9a78cbbc92822445f32cc3436ba797337b", size = 14262696, upload-time = "2025-11-16T22:51:29.402Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/f6/07ec185b90ec9d7217a00eeeed7383b73d7e709dae2a9a021b051542a708/numpy-2.3.5-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fffe29a1ef00883599d1dc2c51aa2e5d80afe49523c261a74933df395c15c520", size = 16597350, upload-time = "2025-11-16T22:51:32.167Z" },
+    { url = "https://files.pythonhosted.org/packages/75/37/164071d1dde6a1a84c9b8e5b414fa127981bad47adf3a6b7e23917e52190/numpy-2.3.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8f7f0e05112916223d3f438f293abf0727e1181b5983f413dfa2fefc4098245c", size = 16040190, upload-time = "2025-11-16T22:51:35.403Z" },
+    { url = "https://files.pythonhosted.org/packages/08/3c/f18b82a406b04859eb026d204e4e1773eb41c5be58410f41ffa511d114ae/numpy-2.3.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2e2eb32ddb9ccb817d620ac1d8dae7c3f641c1e5f55f531a33e8ab97960a75b8", size = 18536749, upload-time = "2025-11-16T22:51:39.698Z" },
+    { url = "https://files.pythonhosted.org/packages/40/79/f82f572bf44cf0023a2fe8588768e23e1592585020d638999f15158609e1/numpy-2.3.5-cp314-cp314-win32.whl", hash = "sha256:66f85ce62c70b843bab1fb14a05d5737741e74e28c7b8b5a064de10142fad248", size = 6335432, upload-time = "2025-11-16T22:51:42.476Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/2e/235b4d96619931192c91660805e5e49242389742a7a82c27665021db690c/numpy-2.3.5-cp314-cp314-win_amd64.whl", hash = "sha256:e6a0bc88393d65807d751a614207b7129a310ca4fe76a74e5c7da5fa5671417e", size = 12919388, upload-time = "2025-11-16T22:51:45.275Z" },
+    { url = "https://files.pythonhosted.org/packages/07/2b/29fd75ce45d22a39c61aad74f3d718e7ab67ccf839ca8b60866054eb15f8/numpy-2.3.5-cp314-cp314-win_arm64.whl", hash = "sha256:aeffcab3d4b43712bb7a60b65f6044d444e75e563ff6180af8f98dd4b905dfd2", size = 10476651, upload-time = "2025-11-16T22:51:47.749Z" },
+    { url = "https://files.pythonhosted.org/packages/17/e1/f6a721234ebd4d87084cfa68d081bcba2f5cfe1974f7de4e0e8b9b2a2ba1/numpy-2.3.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:17531366a2e3a9e30762c000f2c43a9aaa05728712e25c11ce1dbe700c53ad41", size = 16834503, upload-time = "2025-11-16T22:51:50.443Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/1c/baf7ffdc3af9c356e1c135e57ab7cf8d247931b9554f55c467efe2c69eff/numpy-2.3.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d21644de1b609825ede2f48be98dfde4656aefc713654eeee280e37cadc4e0ad", size = 12381612, upload-time = "2025-11-16T22:51:53.609Z" },
+    { url = "https://files.pythonhosted.org/packages/74/91/f7f0295151407ddc9ba34e699013c32c3c91944f9b35fcf9281163dc1468/numpy-2.3.5-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:c804e3a5aba5460c73955c955bdbd5c08c354954e9270a2c1565f62e866bdc39", size = 5210042, upload-time = "2025-11-16T22:51:56.213Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/3b/78aebf345104ec50dd50a4d06ddeb46a9ff5261c33bcc58b1c4f12f85ec2/numpy-2.3.5-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:cc0a57f895b96ec78969c34f682c602bf8da1a0270b09bc65673df2e7638ec20", size = 6724502, upload-time = "2025-11-16T22:51:58.584Z" },
+    { url = "https://files.pythonhosted.org/packages/02/c6/7c34b528740512e57ef1b7c8337ab0b4f0bddf34c723b8996c675bc2bc91/numpy-2.3.5-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:900218e456384ea676e24ea6a0417f030a3b07306d29d7ad843957b40a9d8d52", size = 14308962, upload-time = "2025-11-16T22:52:01.698Z" },
+    { url = "https://files.pythonhosted.org/packages/80/35/09d433c5262bc32d725bafc619e095b6a6651caf94027a03da624146f655/numpy-2.3.5-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:09a1bea522b25109bf8e6f3027bd810f7c1085c64a0c7ce050c1676ad0ba010b", size = 16655054, upload-time = "2025-11-16T22:52:04.267Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/ab/6a7b259703c09a88804fa2430b43d6457b692378f6b74b356155283566ac/numpy-2.3.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:04822c00b5fd0323c8166d66c701dc31b7fbd252c100acd708c48f763968d6a3", size = 16091613, upload-time = "2025-11-16T22:52:08.651Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/88/330da2071e8771e60d1038166ff9d73f29da37b01ec3eb43cb1427464e10/numpy-2.3.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:d6889ec4ec662a1a37eb4b4fb26b6100841804dac55bd9df579e326cdc146227", size = 18591147, upload-time = "2025-11-16T22:52:11.453Z" },
+    { url = "https://files.pythonhosted.org/packages/51/41/851c4b4082402d9ea860c3626db5d5df47164a712cb23b54be028b184c1c/numpy-2.3.5-cp314-cp314t-win32.whl", hash = "sha256:93eebbcf1aafdf7e2ddd44c2923e2672e1010bddc014138b229e49725b4d6be5", size = 6479806, upload-time = "2025-11-16T22:52:14.641Z" },
+    { url = "https://files.pythonhosted.org/packages/90/30/d48bde1dfd93332fa557cff1972fbc039e055a52021fbef4c2c4b1eefd17/numpy-2.3.5-cp314-cp314t-win_amd64.whl", hash = "sha256:c8a9958e88b65c3b27e22ca2a076311636850b612d6bbfb76e8d156aacde2aaf", size = 13105760, upload-time = "2025-11-16T22:52:17.975Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/fd/4b5eb0b3e888d86aee4d198c23acec7d214baaf17ea93c1adec94c9518b9/numpy-2.3.5-cp314-cp314t-win_arm64.whl", hash = "sha256:6203fdf9f3dc5bdaed7319ad8698e685c7a3be10819f41d32a0723e611733b42", size = 10545459, upload-time = "2025-11-16T22:52:20.55Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/65/f9dea8e109371ade9c782b4e4756a82edf9d3366bca495d84d79859a0b79/numpy-2.3.5-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f0963b55cdd70fad460fa4c1341f12f976bb26cb66021a5580329bd498988310", size = 16910689, upload-time = "2025-11-16T22:52:23.247Z" },
+    { url = "https://files.pythonhosted.org/packages/00/4f/edb00032a8fb92ec0a679d3830368355da91a69cab6f3e9c21b64d0bb986/numpy-2.3.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:f4255143f5160d0de972d28c8f9665d882b5f61309d8362fdd3e103cf7bf010c", size = 12457053, upload-time = "2025-11-16T22:52:26.367Z" },
+    { url = "https://files.pythonhosted.org/packages/16/a4/e8a53b5abd500a63836a29ebe145fc1ab1f2eefe1cfe59276020373ae0aa/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:a4b9159734b326535f4dd01d947f919c6eefd2d9827466a696c44ced82dfbc18", size = 5285635, upload-time = "2025-11-16T22:52:29.266Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/2f/37eeb9014d9c8b3e9c55bc599c68263ca44fdbc12a93e45a21d1d56df737/numpy-2.3.5-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2feae0d2c91d46e59fcd62784a3a83b3fb677fead592ce51b5a6fbb4f95965ff", size = 6801770, upload-time = "2025-11-16T22:52:31.421Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/e4/68d2f474df2cb671b2b6c2986a02e520671295647dad82484cde80ca427b/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffac52f28a7849ad7576293c0cb7b9f08304e8f7d738a8cb8a90ec4c55a998eb", size = 14391768, upload-time = "2025-11-16T22:52:33.593Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/50/94ccd8a2b141cb50651fddd4f6a48874acb3c91c8f0842b08a6afc4b0b21/numpy-2.3.5-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63c0e9e7eea69588479ebf4a8a270d5ac22763cc5854e9a7eae952a3908103f7", size = 16729263, upload-time = "2025-11-16T22:52:36.369Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/ee/346fa473e666fe14c52fcdd19ec2424157290a032d4c41f98127bfb31ac7/numpy-2.3.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f16417ec91f12f814b10bafe79ef77e70113a2f5f7018640e7425ff979253425", size = 12967213, upload-time = "2025-11-16T22:52:39.38Z" },
+]
+
+[[package]]
+name = "nv-grouped-gemm"
+version = "1.1.4.post6"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "absl-py" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "torch", marker = "sys_platform == 'never'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/05/79/87c45f32e661b25e0aaa1e325ba166511f57be5dff8f0fcabc12d3e73b64/nv_grouped_gemm-1.1.4.post6.tar.gz", hash = "sha256:dad6115f4b4ff7ceb0bc40ad44e923c13a24fc88cfe1e20b1a6b4c9cf24c445c", size = 26508, upload-time = "2025-10-10T18:52:29.508Z" }
+
+[[package]]
+name = "nv-one-logger-core"
+version = "2.3.1"
+source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "python_full_version >= '3.11'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
+    { name = "overrides" },
+    { name = "pydantic" },
+    { name = "strenum" },
+    { name = "toml" },
+    { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/f6/48/6188e359b90a9d8a1850f2bc888c023e66f4a8b2b496820babbea414f008/numcodecs-0.16.3.tar.gz", hash = "sha256:53d705865faaf0a7927c973af3777532001c8fbb653de119c1e844608614d799", size = 6275704, upload-time = "2025-09-18T18:54:57.221Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/3b/37/963095797035f371e0db6ea761f5aaccb624fc786af217115b423baeb0e2/nv_one_logger_core-2.3.1.tar.gz", hash = "sha256:cbb2f87604c78b96a302f32d87199902129d76153a73a20f8455a250b3246c1d", size = 52640, upload-time = "2025-10-29T21:11:55.812Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d4/cc/917a85972537498f2bbd7914047efc98babc8667587ceb9dcb228378978a/numcodecs-0.16.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:95c9f2a49bef10cf91ad614a761cba9bfe96656b60c12540e1080de5d909b4ca", size = 1642356, upload-time = "2025-09-18T18:54:36.402Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/6a/64c25a089e8537441fe67c09ecb7f3f7fb5d98cd04faf01f605d43aca41c/numcodecs-0.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2afe73d5ebaf9ca0cd5c83aad945da80d29a33d860a80d43a7248491d8813ff", size = 1169186, upload-time = "2025-09-18T18:54:37.838Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/a0/0de627baeb43e2045a3d4b3de99bf8b69af329a33df1ed4cda468d70c1fb/numcodecs-0.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:913f08194d82dcb37594e6705e6d4ae6ccd4b6571500b832fb3e4a155de1dfe8", size = 8341668, upload-time = "2025-09-18T18:54:39.444Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/0f/49d1f74a216149240c4b9403218111f11670bd11af0919fda357bb056bf2/numcodecs-0.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85a7f1cae9eb18b85709af46570bf9c60056e7155c4c8f610e8080c68124d0e5", size = 8866611, upload-time = "2025-09-18T18:54:41.168Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/51/03aece765108fe247717105b5131856546e5428f22a56a14ffdebd017424/numcodecs-0.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:f7bb7f2c46eb7ec8a1c5f8d8fe1a72c222256dd6d6df5af9eaac7a6b905f3575", size = 806787, upload-time = "2025-09-18T18:54:42.78Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/78/e4b34803a3aa1d0769919695de4b133266c18c80c474d32ebc462fa1a9bd/numcodecs-0.16.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c77454d92941a335d148b0b822f5d4783103f392774d5d76283bbf7f21b49529", size = 1681108, upload-time = "2025-09-18T18:54:43.856Z" },
-    { url = "https://files.pythonhosted.org/packages/25/cf/ca36f463b03a4097767d2a1c1b72f31810e8c6384e9449dd9b925203783c/numcodecs-0.16.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:270e7a33ee96bdf5c957acf25a2487002a233811a125a155c400c2f036b69c73", size = 1165589, upload-time = "2025-09-18T18:54:44.954Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/ae/670260c3c4b5ed34a0674561355f3d4ce7fcbdf09a667e5bc841526d271c/numcodecs-0.16.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:12f43fa4a347d1dba775c4506a1c9b15b90144c258433b81f79f1c1b1a990db5", size = 8316365, upload-time = "2025-09-18T18:54:46.073Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/fa/94e022419c751a60ff0f53642ebae5ef81ed3cc3640f958588e3ad3dc18d/numcodecs-0.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44869ef564a50aa545215c6a0d42ba5bbc34e9715523fb2336ada3d1fb2b331d", size = 8846228, upload-time = "2025-09-18T18:54:47.858Z" },
-    { url = "https://files.pythonhosted.org/packages/71/60/f23733589f3e059bf8589508acd23ffeec230bdf179f138a54f5ab16e0a6/numcodecs-0.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:9aae6996172ba10c5f5111b2998709071b5aeba6b58b1ee0b26b61ed6aa7f2f4", size = 806260, upload-time = "2025-09-18T18:54:49.41Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/d5/d3536d06ac1e5fb848a3186958204082b68b106364c9a3669652dd786731/numcodecs-0.16.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:947406b01c20f2ce7ce2e631e7f21b782e8a9d4b57b374a41c9e7b1341a8f3a2", size = 1677129, upload-time = "2025-09-18T18:54:50.5Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/fd/b0513a3428dc2b38ec85eea771703ae69c49f09b9650d6c44c9105c80073/numcodecs-0.16.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7cf50e351398a34b45817974c411527629e88937b7683695e276afd65da6ed6f", size = 1159058, upload-time = "2025-09-18T18:54:51.675Z" },
-    { url = "https://files.pythonhosted.org/packages/98/05/b7c127283cfb154a97abb284363825401b69302d71a28608af66f73257cc/numcodecs-0.16.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7938502fcc060ed9543814f38ca67048b33d7bd2667756e36e6b1060455b17e", size = 8260987, upload-time = "2025-09-18T18:54:52.883Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/46/320d960aff884bc63abaaf846ffa3de4803e83e8070b6f84c5688464839c/numcodecs-0.16.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:010d628c95be1214536fb22c0df4ced58da954b404b1fcb25ddebf64e4a3f7f3", size = 8805295, upload-time = "2025-09-18T18:54:54.698Z" },
-    { url = "https://files.pythonhosted.org/packages/31/ae/acc2e0f1f49ba32afa2174578f170673139248ef86f77e334f2619133867/numcodecs-0.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:e83115e3c32de798c7b7164503e06aae9f9746c1cef564d029616eb44bd6cd90", size = 803204, upload-time = "2025-09-18T18:54:56.192Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/c4/ea91554c4fcbff66057f667690101d7a4b965605741350ac661b03fa6c46/nv_one_logger_core-2.3.1-py3-none-any.whl", hash = "sha256:0c8b77bcdac4daa1ea913bf8d4afd2a057bd5526e3654ac39f67caba157341a6", size = 63066, upload-time = "2025-10-29T21:11:52.753Z" },
 ]
 
-[package.optional-dependencies]
-crc32c = [
-    { name = "crc32c", marker = "python_full_version >= '3.11'" },
+[[package]]
+name = "nv-one-logger-training-telemetry"
+version = "2.3.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nv-one-logger-core" },
+    { name = "strenum" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c5/21/016fa067967734d52f1ccf5a2a37a1a65216f2d7053bc2b85872cce956ca/nv_one_logger_training_telemetry-2.3.1.tar.gz", hash = "sha256:8c67940ea71799afaf1f46df3ba2f52f93aea26321c6f1c1d54aae02efc2a4af", size = 44435, upload-time = "2025-10-29T21:21:42.035Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e5/15/97e6e4ddfe5fc35bcee74a45b7c33fb73abb83713c7dfa26420b971a86c3/nv_one_logger_training_telemetry-2.3.1-py3-none-any.whl", hash = "sha256:5319443829b59378a498c3c62ac98973e14f31be675c229ff2b14e2fe109aa0b", size = 44140, upload-time = "2025-10-29T21:21:40.72Z" },
 ]
 
 [[package]]
-name = "numpy"
-version = "1.26.4"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/94/ace0fdea5241a27d13543ee117cbc65868e82213fb31a8eb7fe9ff23f313/numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0", size = 20631468, upload-time = "2024-02-05T23:48:01.194Z" },
-    { url = "https://files.pythonhosted.org/packages/20/f7/b24208eba89f9d1b58c1668bc6c8c4fd472b20c45573cb767f59d49fb0f6/numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a", size = 13966411, upload-time = "2024-02-05T23:48:29.038Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/a5/4beee6488160798683eed5bdb7eead455892c3b4e1f78d79d8d3f3b084ac/numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4", size = 14219016, upload-time = "2024-02-05T23:48:54.098Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f", size = 18240889, upload-time = "2024-02-05T23:49:25.361Z" },
-    { url = "https://files.pythonhosted.org/packages/24/03/6f229fe3187546435c4f6f89f6d26c129d4f5bed40552899fcf1f0bf9e50/numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a", size = 13876746, upload-time = "2024-02-05T23:49:51.983Z" },
-    { url = "https://files.pythonhosted.org/packages/39/fe/39ada9b094f01f5a35486577c848fe274e374bbf8d8f472e1423a0bbd26d/numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2", size = 18078620, upload-time = "2024-02-05T23:50:22.515Z" },
-    { url = "https://files.pythonhosted.org/packages/d5/ef/6ad11d51197aad206a9ad2286dc1aac6a378059e06e8cf22cd08ed4f20dc/numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07", size = 5972659, upload-time = "2024-02-05T23:50:35.834Z" },
-    { url = "https://files.pythonhosted.org/packages/19/77/538f202862b9183f54108557bfda67e17603fc560c384559e769321c9d92/numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5", size = 15808905, upload-time = "2024-02-05T23:51:03.701Z" },
-    { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" },
-    { url = "https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" },
-    { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" },
-    { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" },
-    { url = "https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" },
-    { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" },
-    { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" },
-    { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" },
-    { url = "https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" },
-    { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" },
+name = "nvidia-cublas-cu12"
+version = "12.8.4.1"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" },
+    { url = "https://files.pythonhosted.org/packages/70/61/7d7b3c70186fb651d0fbd35b01dbfc8e755f69fd58f817f3d0f642df20c3/nvidia_cublas_cu12-12.8.4.1-py3-none-win_amd64.whl", hash = "sha256:47e9b82132fa8d2b4944e708049229601448aaad7e6f296f630f2d1a32de35af", size = 567544208, upload-time = "2025-03-07T01:53:30.535Z" },
 ]
 
 [[package]]
-name = "nv-grouped-gemm"
-version = "1.1.4.post6"
+name = "nvidia-cuda-cupti-cu12"
+version = "12.8.90"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" },
+    { url = "https://files.pythonhosted.org/packages/41/bc/83f5426095d93694ae39fe1311431b5d5a9bb82e48bf0dd8e19be2765942/nvidia_cuda_cupti_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:bb479dcdf7e6d4f8b0b01b115260399bf34154a1a2e9fe11c85c517d87efd98e", size = 7015759, upload-time = "2025-03-07T01:51:11.355Z" },
+]
+
+[[package]]
+name = "nvidia-cuda-nvrtc-cu12"
+version = "12.8.93"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" },
+    { url = "https://files.pythonhosted.org/packages/45/51/52a3d84baa2136cc8df15500ad731d74d3a1114d4c123e043cb608d4a32b/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:7a4b6b2904850fe78e0bd179c4b655c404d4bb799ef03ddc60804247099ae909", size = 73586838, upload-time = "2025-03-07T01:52:13.483Z" },
+]
+
+[[package]]
+name = "nvidia-cuda-runtime-cu12"
+version = "12.8.90"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" },
+    { url = "https://files.pythonhosted.org/packages/30/a5/a515b7600ad361ea14bfa13fb4d6687abf500adc270f19e89849c0590492/nvidia_cuda_runtime_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:c0c6027f01505bfed6c3b21ec546f69c687689aad5f1a377554bc6ca4aa993a8", size = 944318, upload-time = "2025-03-07T01:51:01.794Z" },
+]
+
+[[package]]
+name = "nvidia-cudnn-cu12"
+version = "9.10.2.21"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "numpy" },
-    { name = "torch", marker = "sys_platform == 'never'" },
+    { name = "nvidia-cublas-cu12" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/90/0bd6e586701b3a890fd38aa71c387dab4883d619d6e5ad912ccbd05bfd67/nvidia_cudnn_cu12-9.10.2.21-py3-none-win_amd64.whl", hash = "sha256:c6288de7d63e6cf62988f0923f96dc339cea362decb1bf5b3141883392a7d65e", size = 692992268, upload-time = "2025-06-06T21:55:18.114Z" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/05/79/87c45f32e661b25e0aaa1e325ba166511f57be5dff8f0fcabc12d3e73b64/nv_grouped_gemm-1.1.4.post6.tar.gz", hash = "sha256:dad6115f4b4ff7ceb0bc40ad44e923c13a24fc88cfe1e20b1a6b4c9cf24c445c", size = 26508, upload-time = "2025-10-10T18:52:29.508Z" }
 
 [[package]]
 name = "nvidia-cudnn-frontend"
-version = "1.15.0"
+version = "1.16.0"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/cf/3cd3cc682df5488288c6043fc0977090497ff015a082ab160076fecb080a/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83ecbe6d1145dc208a9ae82aa0b45b2c8f74ed8a43d3a102a13eef2117e2fedd", size = 1835542, upload-time = "2025-11-07T01:28:20.133Z" },
+    { url = "https://files.pythonhosted.org/packages/92/45/87f3f2d94a928be21459949b03b0b8bcea13531d30094ad84a8ae4fca761/nvidia_cudnn_frontend-1.16.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:77cb06b91877c8489363867434ba1d9936f3e10bf7ed98d82e98f5f578611920", size = 1950339, upload-time = "2025-11-07T01:31:41.69Z" },
+    { url = "https://files.pythonhosted.org/packages/be/f5/1662f18084ef4441bfb3a01383cbf77194905b53474dcb51c0d0f373c74b/nvidia_cudnn_frontend-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:ee3f3886f107919dad48cbc905fa6ae9207c8d7d5a24165e55625ea96f0fe40f", size = 1367883, upload-time = "2025-11-07T01:25:17.791Z" },
+    { url = "https://files.pythonhosted.org/packages/10/b7/d0a3a337f5e83f26ff79a7fd63a859181ff2911f1d905d6fbab5fc80170d/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c360d5840d6eb597aade9e9c8780e24aec283b8e6bc97d52881c821a35c92aa9", size = 1837573, upload-time = "2025-11-07T01:29:05.507Z" },
+    { url = "https://files.pythonhosted.org/packages/95/dc/465a14f2d235778405f2e84fce336d07ab045bf1c7df6404bdf8033e06a8/nvidia_cudnn_frontend-1.16.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5c4a8fc573d85a86e08b15d9bf37f729e2487298781867a492a59cde6ac295e2", size = 1952630, upload-time = "2025-11-07T01:32:00.242Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/89/f14435f616603a999975930c4456d6140127f6acb19a877c752beccad837/nvidia_cudnn_frontend-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:a257f10a932ffde9741f644efd3611acf77e2fd89d493d81bc6a8353c48f1ec2", size = 1368775, upload-time = "2025-11-07T01:25:42.252Z" },
+    { url = "https://files.pythonhosted.org/packages/00/39/79b606e805abd67ab4fa72f752a5413a496159f10d94fbdb1d67bb5ae86c/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd6fdd71c0896ff2ca1809d914cbd17f2904d55863f8881f47946e1d634c7a88", size = 1839271, upload-time = "2025-11-07T01:29:53.06Z" },
+    { url = "https://files.pythonhosted.org/packages/09/21/a0e0d50ba8d7b639fe635500fee0d9c0319561b1ae72176d7024ec04b439/nvidia_cudnn_frontend-1.16.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:16efb069d4bda4d3b99134f59f376cfd4d09558298bd96af778fdc7f2851e696", size = 1954062, upload-time = "2025-11-07T01:32:18.556Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/d6/30ae67bb9c010e9459d1211c56d73373eb4e3dd9f57f4c3c1fe0966efcb1/nvidia_cudnn_frontend-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:7b7860db03767c158accbe0b4e9c9553506513cc970ff08ed28c7761681ac466", size = 1368435, upload-time = "2025-11-07T01:26:28.022Z" },
+    { url = "https://files.pythonhosted.org/packages/32/2c/b4376afef0a6342c56e82e3465c1f8f5c719f588293a50dd04019a22ae6e/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b6bcb3a2fbff80538958e21e2227520f082a961164865aaeedaac527f61084f9", size = 1839805, upload-time = "2025-11-07T01:30:31.056Z" },
+    { url = "https://files.pythonhosted.org/packages/71/13/836b90354036154ab82db3861210e5736983fe1fc44bb39c146ad93b333b/nvidia_cudnn_frontend-1.16.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cbdad88b2bec5dde837f8fa7632022334cddb4756f923b5421c06a712cb59d31", size = 1953953, upload-time = "2025-11-07T01:33:03.781Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/30/3025f34f2c86ceef85134dc1f323f8cf2a26d3ffddc5ada48528c80bfae1/nvidia_cudnn_frontend-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:138de2bc4697fabb2eb2f0f601a7e31f8fe97874908e26e33d737276f335473c", size = 1368359, upload-time = "2025-11-07T01:26:51.561Z" },
+]
+
+[[package]]
+name = "nvidia-cufft-cu12"
+version = "11.3.3.83"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nvidia-nvjitlink-cu12" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/ec/ce1629f1e478bb5ccd208986b5f9e0316a78538dd6ab1d0484f012f8e2a1/nvidia_cufft_cu12-11.3.3.83-py3-none-win_amd64.whl", hash = "sha256:7a64a98ef2a7c47f905aaf8931b69a3a43f27c55530c698bb2ed7c75c0b42cb7", size = 192216559, upload-time = "2025-03-07T01:53:57.106Z" },
+]
+
+[[package]]
+name = "nvidia-cufile-cu12"
+version = "1.13.1.3"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" },
+]
+
+[[package]]
+name = "nvidia-curand-cu12"
+version = "10.3.9.90"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/75/70c05b2f3ed5be3bb30b7102b6eb78e100da4bbf6944fd6725c012831cab/nvidia_curand_cu12-10.3.9.90-py3-none-win_amd64.whl", hash = "sha256:f149a8ca457277da854f89cf282d6ef43176861926c7ac85b2a0fbd237c587ec", size = 62765309, upload-time = "2025-03-07T01:54:20.478Z" },
+]
+
+[[package]]
+name = "nvidia-cusolver-cu12"
+version = "11.7.3.90"
 source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nvidia-cublas-cu12" },
+    { name = "nvidia-cusparse-cu12" },
+    { name = "nvidia-nvjitlink-cu12" },
+]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d7/3f/d7bf811f4a76f4e9aa4ef390b11217562bba06f0c77f9e14c765681ccba6/nvidia_cudnn_frontend-1.15.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b4e8c77e848502ad79f8aef6b6c699613a6b5139572aba1f55f626d7bf31b44", size = 1743761, upload-time = "2025-10-10T18:54:15.142Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/b8/286f7fb3f1068acf0014a851f86863ed9fec69aff79a10dcc0dfbffe0523/nvidia_cudnn_frontend-1.15.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:64a926602e52268e09127cf7a227e6b3d7c6e9e2a97fb57eebe88132aec8d9c8", size = 1859188, upload-time = "2025-10-10T18:56:59.386Z" },
-    { url = "https://files.pythonhosted.org/packages/e8/f7/6e55b0122ca5924f0cdbd717392d35a92f43c6ed4b6d64c7d378ee01f301/nvidia_cudnn_frontend-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:7a21ec041fa4009cc8b76b2d26ad73010ab5e005804e4df8b1c1abdba5e23cd5", size = 1296575, upload-time = "2025-10-10T18:45:45.04Z" },
-    { url = "https://files.pythonhosted.org/packages/80/b8/d0f1ab5c309c513fe1e4235e860872fc7ee60876e69b30eb0a20fe8c35d8/nvidia_cudnn_frontend-1.15.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:570c2e028ff9b8293f9625b31484084a638de6fb685802194b8dfe16db5a44b4", size = 1747611, upload-time = "2025-10-10T18:54:51.427Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/52/5b77edb810063c10040ac34e1517ee62690c4f030f0cf68298a4608552bc/nvidia_cudnn_frontend-1.15.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21ac16e4add264839a8db570d5378bb6583bf9539649d80bc8802ded00098a20", size = 1860815, upload-time = "2025-10-10T18:57:17.393Z" },
-    { url = "https://files.pythonhosted.org/packages/de/2b/1fa26eee0479ae0b40582679c1bd08eb78a0b49bb5893ec3edce2a606e9f/nvidia_cudnn_frontend-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:c1be7480e3200606c2f2f49263cc13adc72c2a38e38f31f18e9b3727d99618b2", size = 1297355, upload-time = "2025-10-10T18:46:10.171Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/9c/0c2340454f8c9cc4143fdbccef8218dad1e49042d62b26c1781915617c40/nvidia_cudnn_frontend-1.15.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c2cfe2a0f94bff71614bd3add0ae077f513f7d14909c223afca01ac8056ff84", size = 1749017, upload-time = "2025-10-10T18:55:29.412Z" },
-    { url = "https://files.pythonhosted.org/packages/19/b4/c35104b8fc32986111b611b3080bbcf35fd3fd6794d4aec4e068136ea628/nvidia_cudnn_frontend-1.15.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aab1098ad4c79935b6e8dc251e9145129a04a8dc6ff75eb30871aacdd1487946", size = 1865629, upload-time = "2025-10-10T18:57:35.941Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/d7/6534807d209a27817d101cf86745e335896e96379bf2d207195cfe9f24ab/nvidia_cudnn_frontend-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:13e58a5b001154899f0744165716a7ad24cd7567d759a8229a9ada730a1046b2", size = 1297335, upload-time = "2025-10-10T18:46:35.069Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/75/5a75942aae2bb3a0c1cc44378e9f80c1213a6d7b952c8df19b8845836a34/nvidia_cudnn_frontend-1.15.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fda240405eba3c04866e30b3c1beae26ea7775af4fa4d555cd598695067d32ac", size = 1750048, upload-time = "2025-10-10T18:56:06.057Z" },
-    { url = "https://files.pythonhosted.org/packages/79/70/2ed9802725cb305189dac906a67c799eeb47e4f395b97df0249a750c56fe/nvidia_cudnn_frontend-1.15.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14941c05a6484d3f05f3089cd290c9b1e6614298f37e07cd01789933932c9f28", size = 1867440, upload-time = "2025-10-10T18:57:53.964Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/04/519fd6e3ea12fe7fe98c497c4d51f6c5c87763d02e90ea3102cef32a6ef1/nvidia_cudnn_frontend-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:7c8c6f12534b73b0cd55956c5e9419b7840a01e4c260837606112450ce1ca0d9", size = 1297324, upload-time = "2025-10-10T18:46:53.104Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" },
+    { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" },
+    { url = "https://files.pythonhosted.org/packages/13/c0/76ca8551b8a84146ffa189fec81c26d04adba4bc0dbe09cd6e6fd9b7de04/nvidia_cusolver_cu12-11.7.3.90-py3-none-win_amd64.whl", hash = "sha256:4a550db115fcabc4d495eb7d39ac8b58d4ab5d8e63274d3754df1c0ad6a22d34", size = 256720438, upload-time = "2025-03-07T01:54:39.898Z" },
+]
+
+[[package]]
+name = "nvidia-cusparse-cu12"
+version = "12.5.8.93"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "nvidia-nvjitlink-cu12" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" },
+    { url = "https://files.pythonhosted.org/packages/62/07/f3b2ad63f8e3d257a599f422ae34eb565e70c41031aecefa3d18b62cabd1/nvidia_cusparse_cu12-12.5.8.93-py3-none-win_amd64.whl", hash = "sha256:9a33604331cb2cac199f2e7f5104dfbb8a5a898c367a53dfda9ff2acb6b6b4dd", size = 284937404, upload-time = "2025-03-07T01:55:07.742Z" },
+]
+
+[[package]]
+name = "nvidia-cusparselt-cu12"
+version = "0.7.1"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" },
+    { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/d8/a6b0d0d0c2435e9310f3e2bb0d9c9dd4c33daef86aa5f30b3681defd37ea/nvidia_cusparselt_cu12-0.7.1-py3-none-win_amd64.whl", hash = "sha256:f67fbb5831940ec829c9117b7f33807db9f9678dc2a617fbe781cac17b4e1075", size = 271020911, upload-time = "2025-02-26T00:14:47.204Z" },
 ]
 
 [[package]]
 name = "nvidia-cutlass-dsl"
-version = "4.2.1"
+version = "4.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-python" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b3/0f/1e96ce9fbe07e8c39484fae4d2cf36e328bdf434b311d88ccedccbfed7db/nvidia_cutlass_dsl-4.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1628bacedde042c60c7ebb1aeccce5a82501197f5e5c4fbbf803712fa45fba59", size = 58540319, upload-time = "2025-09-23T14:38:00.634Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/e3/bc6071743d0ad43d837bf633139bfe1202260c28d893e30f247cf0aa8019/nvidia_cutlass_dsl-4.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:aec74b50f700a8ef455f15863de4cb5f1486f72b7bd4becea88624c58c555a13", size = 62233601, upload-time = "2025-09-23T14:39:50.44Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/2a/e65312728338e5bb00b592ce0be12b51e7594a3ef288cd8c99bc1c456968/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:04e605417773957405cad0ac6c2d46139a88aca07a783b4f66e1363f3a91a835", size = 58540069, upload-time = "2025-09-23T14:38:56.002Z" },
-    { url = "https://files.pythonhosted.org/packages/be/f3/20eacdf9876abd892668c191003edc5d7100e45fabfa027d9f3f99d21871/nvidia_cutlass_dsl-4.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:874aa3620b3d3dc6598af2226fa3b78f2e7998b8656929b492259e0c9f778786", size = 62233009, upload-time = "2025-09-23T14:39:23.308Z" },
-    { url = "https://files.pythonhosted.org/packages/1e/1d/f168a3dbd8570e5dbbe0deca217d7b374c977b4a4970ebadf3b6d0f1174f/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:10ace6e2005cb0bc04d158c7660f8ec104ab29aeffb26f1ed3bb0b5a577ccc34", size = 58535504, upload-time = "2025-09-23T14:38:29.028Z" },
-    { url = "https://files.pythonhosted.org/packages/02/ab/5bcc0c8c620af5d4acbc71abce10e3eb3023e50342e6bc29b6461f72530e/nvidia_cutlass_dsl-4.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d7ddc9c1f5bb803718d736c907fac857fc606f1fce630c0b1d741935a72723b9", size = 62230361, upload-time = "2025-09-23T14:40:18.156Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/d5/9b79faaec3fa12c52b7de1e727af94c54184b00f280c79b667ab045550db/nvidia_cutlass_dsl-4.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c0985124a74ba435e1f756aa78e89f64c6d01e4f54de1d5a5d218ebbc1c92eff", size = 58535424, upload-time = "2025-09-23T14:37:33.064Z" },
-    { url = "https://files.pythonhosted.org/packages/43/86/78c8cd3fa1a684f3976535d7ac69e54f4ede165b5abca7979fd0820f74f2/nvidia_cutlass_dsl-4.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9356604afc8f62aac46634b3a12baf8cb3f3a6f2e44e398dcfe6ec98ff1a8d1b", size = 62230122, upload-time = "2025-09-23T14:40:46.621Z" },
+    { url = "https://files.pythonhosted.org/packages/75/c3/3cd4c440f386a24c348c7c67adff5e38bb2405d08579ae3ac9312fa14ee4/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:29d6ccb56955e6528c818591fe752a820305951a73fbb69f9a816b3e228d57f8", size = 58726035, upload-time = "2025-11-28T00:59:03.749Z" },
+    { url = "https://files.pythonhosted.org/packages/35/b5/854b713e2355e6211624dfc9df65aca5ebc2a8aaae97a696def34a4b9c9a/nvidia_cutlass_dsl-4.3.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f54d98339d4fca37d39390933186c4a7987291b57129da9bf45c7746d47786af", size = 58591793, upload-time = "2025-11-28T01:03:01.473Z" },
+    { url = "https://files.pythonhosted.org/packages/45/24/432ab11c9da47742518e008f61c58166b3cced5d39df987155d103d5e18e/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c7b27b3faf2d3cb4e9504ad55129ac58c09aa59f3af6eaabb88f4bda010a2792", size = 58725123, upload-time = "2025-11-28T00:58:11.337Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/07/59509304cac496275a0a7bdae436c267829611b38e4500b2622424c9f737/nvidia_cutlass_dsl-4.3.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:24cfbf55aad55b3dd06ddaa340d13028b4e49b15e0e557105187a9d0bbc260db", size = 58592193, upload-time = "2025-11-28T00:59:54.448Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/c5/f1586c64fcf569b890da776d08a32836a3ef2450cbe9e3ac2971dbecbcce/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:025a8c7a0fb80626e2a893954ea19b2e1ece8d131078c7da12b7fabc2634d04d", size = 58726236, upload-time = "2025-11-28T00:59:29.376Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/5b/fe6a2db1688a690a94f8ad03706fa6db2055d82fab0c4fab764e8c89640f/nvidia_cutlass_dsl-4.3.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b95ce5633e09f12c8d1fcd30c5db06b8325d41b3da0875d3e8a4c110ed5b5cdf", size = 58591826, upload-time = "2025-11-28T01:00:19.559Z" },
+    { url = "https://files.pythonhosted.org/packages/40/fe/5e48c63ff5a510c0edbac5167921a819c70f71daf3b6ead0e0e5346b2a42/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c8e816cc061b34e016906fa87948f2b0fa836a95f27732c14097f3ddda8286e2", size = 58725695, upload-time = "2025-11-28T01:01:32.1Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/ef/34b1bdd375226b818cd810145e207cceb50fd12eaa87e88a6e67820574d4/nvidia_cutlass_dsl-4.3.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f71adcfb56607fc86ea621edcf9503eaa31f66f70efd7ab719c33683db082183", size = 58592065, upload-time = "2025-11-28T01:02:35.83Z" },
 ]
 
 [[package]]
 name = "nvidia-mathdx"
-version = "25.1.1"
+version = "25.6.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/59/00/f1a73ac224d466b31b6eb09794656112e896185678720b05668777e87db3/nvidia_mathdx-25.1.1-py3-none-any.whl", hash = "sha256:4fb948fe4842d24e679f3d0c140c8a0e8e24c3c7ae5eb6e08584253ad94a198b", size = 39894902, upload-time = "2025-05-06T22:58:32.29Z" },
+    { url = "https://files.pythonhosted.org/packages/20/1a/a418b8c1adc58abd87fd69414c19883af5c1b10514e3dbfcc27cde831b13/nvidia_mathdx-25.6.0-py3-none-any.whl", hash = "sha256:22e6ad5d0d005f836be5cbd14e836cf2e9ea42c82deb602707246ce8198eaa96", size = 23013087, upload-time = "2025-11-13T18:25:11.228Z" },
 ]
 
 [[package]]
@@ -3315,13 +3281,13 @@ wheels = [
 
 [[package]]
 name = "nvidia-modelopt"
-version = "0.33.1"
+version = "0.39.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ninja" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-ml-py" },
-    { name = "nvidia-modelopt-core" },
     { name = "packaging" },
     { name = "pulp" },
     { name = "pydantic" },
@@ -3332,52 +3298,76 @@ dependencies = [
     { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "torchprofile" },
-    { name = "torchvision", marker = "sys_platform == 'never'" },
     { name = "tqdm" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ca/cb/4af39357792a96f334c7877ea0380c9337aec210ff4794a7dd95beb7c349/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:6c51091683a117cd40fdb96a0ec28579f2276f6b627db7ccddc370df544e1dd7", size = 751683, upload-time = "2025-08-12T18:37:48.832Z" },
-    { url = "https://files.pythonhosted.org/packages/0a/b1/fc2f468d140ef58e90fac584759d0cc449db9bc4f64668cdff750ef38fef/nvidia_modelopt-0.33.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ef78a98901890f265596ec413dffac177d4a1865201d89a14f29f4fa0cf8e710", size = 751683, upload-time = "2025-08-12T18:36:59.964Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/d5/b03ad3ffa28984b629a72da678fa98f912fc45bac3b514c4a70cf2a82fe3/nvidia_modelopt-0.39.0-py3-none-any.whl", hash = "sha256:32f05317c81be1ff2ffeab749e5258b7bea8e4c6e60a09c760584f25ad03f648", size = 864981, upload-time = "2025-11-13T07:35:42.761Z" },
 ]
 
 [[package]]
-name = "nvidia-modelopt-core"
-version = "0.33.1"
+name = "nvidia-nccl-cu12"
+version = "2.27.5"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/21/d12ca11f5554340684d11958aae6c6e7755cf0aaae10a2d2c9db217228cf/nvidia_modelopt_core-0.33.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:f25f6a817609c693ee39d1bcf2d3aeef462b9769f971590133de8b1b0310885b", size = 1307716, upload-time = "2025-08-12T18:41:12.086Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/df/7bead24d4854274d9f2818f1ae780fc24260aab60b7b6f73e1af4f056ce5/nvidia_modelopt_core-0.33.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:195f32f06d19bc9f9d858811f1864bddcc1db6278974d98ea6309cb3553427f1", size = 1326896, upload-time = "2025-08-12T18:39:48.243Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/36/3318980c670292d827ace5ac6110ab6054d0f2d87e507382842ea9e7c78f/nvidia_modelopt_core-0.33.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:ffd008a90d8867660ae41c98002156b526e368a4cdf39e225fe20f478adce8b2", size = 1376104, upload-time = "2025-08-12T18:41:47.358Z" },
-    { url = "https://files.pythonhosted.org/packages/27/97/99d1ddabe01ab262c18621619c996e1c2c119bc058607d2bc9ce7eb85fe7/nvidia_modelopt_core-0.33.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:be49121b2f74db4cb73955396a7bb83935d92232c5a20bcfd7b8e7cae68e482f", size = 1393729, upload-time = "2025-08-12T18:40:07.86Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/b5/ba79b1c52b634b24e45dca409f133f947217a5c7ec5c256266e4ec5fa3eb/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1ddd9279d8312f8e972b302692a26e6180f1c9fd277232f5925a5589f42b1b76", size = 1338081, upload-time = "2025-08-12T18:40:36.156Z" },
-    { url = "https://files.pythonhosted.org/packages/13/40/4427583475dfd8eb1b8c7522d75d4d059f0512ff03dcc62d6986a22ab918/nvidia_modelopt_core-0.33.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:69d5ace564f2b056c916117be2023f2b7fc01cd1501073915e6b2ced2b8a5394", size = 1363366, upload-time = "2025-08-12T18:39:28.854Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" },
+]
+
+[[package]]
+name = "nvidia-nvjitlink-cu12"
+version = "12.8.93"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/d7/34f02dad2e30c31b10a51f6b04e025e5dd60e5f936af9045a9b858a05383/nvidia_nvjitlink_cu12-12.8.93-py3-none-win_amd64.whl", hash = "sha256:bd93fbeeee850917903583587f4fc3a4eafa022e34572251368238ab5e6bd67f", size = 268553710, upload-time = "2025-03-07T01:56:24.13Z" },
+]
+
+[[package]]
+name = "nvidia-nvshmem-cu12"
+version = "3.3.20"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/92/9d/3dd98852568fb845ec1f7902c90a22b240fe1cbabda411ccedf2fd737b7b/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b0b960da3842212758e4fa4696b94f129090b30e5122fea3c5345916545cff0", size = 124484616, upload-time = "2025-08-04T20:24:59.172Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/6c/99acb2f9eb85c29fc6f3a7ac4dccfd992e22666dd08a642b303311326a97/nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d00f26d3f9b2e3c3065be895e3059d6479ea5c638a3f38c9fec49b1b9dd7c1e5", size = 124657145, upload-time = "2025-08-04T20:25:19.995Z" },
+]
+
+[[package]]
+name = "nvidia-nvtx-cu12"
+version = "12.8.90"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/99/4c9c0c329bf9fc125008c3b54c7c94c0023518d06fc025ae36431375e1fe/nvidia_nvtx_cu12-12.8.90-py3-none-win_amd64.whl", hash = "sha256:619c8304aedc69f02ea82dd244541a83c3d9d40993381b3b590f1adaed3db41e", size = 56492, upload-time = "2025-03-07T01:52:24.69Z" },
 ]
 
 [[package]]
 name = "nvidia-resiliency-ext"
-version = "0.4.1"
+version = "0.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "defusedxml" },
+    { name = "nv-one-logger-core" },
+    { name = "nv-one-logger-training-telemetry" },
     { name = "nvidia-ml-py" },
     { name = "packaging" },
     { name = "psutil" },
-    { name = "pynvml" },
     { name = "pyyaml" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/8c/6547d9fdea9730d4f69a19ca492ccbe221768f8473b82502a78a824acc3d/nvidia_resiliency_ext-0.4.1-cp310-cp310-manylinux_2_31_aarch64.whl", hash = "sha256:cf80599411018ebbf03da64769527dee6b37746b72b8606f919b7999633770b8", size = 442891, upload-time = "2025-07-17T03:53:38.878Z" },
-    { url = "https://files.pythonhosted.org/packages/34/0d/520cab980949ad11bd5291784fea309bcd6654a9c97943a3a87644c1d111/nvidia_resiliency_ext-0.4.1-cp310-cp310-manylinux_2_31_x86_64.whl", hash = "sha256:0c23e621d598ba436549db83deeb3569c19df0194b89fe6169d62b6ead711be3", size = 448044, upload-time = "2025-07-17T03:48:30.851Z" },
-    { url = "https://files.pythonhosted.org/packages/46/77/8cda264b262e2868a4e6ebcddaea112200b1e34b8d5a35a2fe3b4978d137/nvidia_resiliency_ext-0.4.1-cp311-cp311-manylinux_2_31_aarch64.whl", hash = "sha256:d8ca454a8b8abef72e0ff0e33914686c263414e8891471c02a9f6af9d2d6b925", size = 443649, upload-time = "2025-07-17T03:49:16.183Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/53/029cc7493b5833cb8dfa201f15a1e422e2e1cc6308d34c5b0a90028a73fd/nvidia_resiliency_ext-0.4.1-cp311-cp311-manylinux_2_31_x86_64.whl", hash = "sha256:dde6034f29350ac6326cdd861ceec641bdd93be0eddbf034739f4cd9452a4dd9", size = 449189, upload-time = "2025-07-17T03:52:15.24Z" },
-    { url = "https://files.pythonhosted.org/packages/70/05/38d491962273c7905708762279f440520eb79f3c00b67a023497215ad023/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_aarch64.whl", hash = "sha256:b3bd5f01535574b16d0f38bca6e39afe3806c4a2896eee1b321cd944e00025a7", size = 444570, upload-time = "2025-07-17T03:50:58.877Z" },
-    { url = "https://files.pythonhosted.org/packages/18/8b/4cb8aa2bbdf3705d3034c3f3dacdadb03b3b7dd3dc7f5200e64663fb477f/nvidia_resiliency_ext-0.4.1-cp312-cp312-manylinux_2_31_x86_64.whl", hash = "sha256:ca9f8de465af345952bedbea53c90c0e2323d88cfd830ded0e806fad91845c0e", size = 450280, upload-time = "2025-07-17T03:49:55.327Z" },
+    { url = "https://files.pythonhosted.org/packages/df/18/1898cad3bdd643c6bfa5f7aee125a5ef308ab1701ab15106e3e9c66bb416/nvidia_resiliency_ext-0.5.0-cp310-cp310-manylinux_2_39_aarch64.whl", hash = "sha256:97d4b68d3949f3b8370addb474d8662d6ac5008c3c1296420cdeb93a88d6a804", size = 402915, upload-time = "2025-11-13T21:28:34.578Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/48/10fc3f278898e3b2aacc3bea65f0ac4b579e6e0e8447b467742d75adeec1/nvidia_resiliency_ext-0.5.0-cp310-cp310-manylinux_2_39_x86_64.whl", hash = "sha256:ceb04ec5a7bc9301fd6f14449bda6b0d1f37ead4fbe37aa3bf1d7b2ad5b662d4", size = 406483, upload-time = "2025-11-13T21:28:58.732Z" },
+    { url = "https://files.pythonhosted.org/packages/14/17/c19dfed8d4aced307a1c1404f0917ee6c1b319db8092b3cfe2af4e76de6d/nvidia_resiliency_ext-0.5.0-cp311-cp311-manylinux_2_39_aarch64.whl", hash = "sha256:62d396356adcf898cb86a54956eeece29017a41b5872db0b364c8449d23f2f66", size = 404062, upload-time = "2025-11-13T21:29:46.873Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/99/b4324595171c3cdffb03cef070006ab9a3de7fca90a22403576ec6423b69/nvidia_resiliency_ext-0.5.0-cp311-cp311-manylinux_2_39_x86_64.whl", hash = "sha256:c4fcd006ef69300f753bb30d17efbb6bcee6699f044e3532209b2825d22e9977", size = 407027, upload-time = "2025-11-13T21:30:09.124Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/73/232d9f25558f3c6165ff1d15c980a434b47c13e8f527f999cd265859abcf/nvidia_resiliency_ext-0.5.0-cp312-cp312-manylinux_2_39_aarch64.whl", hash = "sha256:81e3d827885e90bed369e67f76dda6709dd4073c2e5fa1228df85d6987cee495", size = 403317, upload-time = "2025-11-13T21:31:24.603Z" },
+    { url = "https://files.pythonhosted.org/packages/44/89/4d7f39416aa3be72ee9f1260a7af56af40f2570f5add1e039d96279a8764/nvidia_resiliency_ext-0.5.0-cp312-cp312-manylinux_2_39_x86_64.whl", hash = "sha256:eb720cd25feabef07f971d4051c7bcac2f9ec73642a9031953d2663307950cb9", size = 407963, upload-time = "2025-11-13T21:30:28.998Z" },
 ]
 
 [[package]]
 name = "nvidia-sphinx-theme"
-version = "0.0.8"
+version = "0.0.9.post1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydata-sphinx-theme" },
@@ -3385,27 +3375,26 @@ dependencies = [
     { name = "sphinx", version = "8.2.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/dd/74/996dbc314da8ed670cd5e040d0b4b5be79ff1fc3db3fe25e63134deebe9a/nvidia_sphinx_theme-0.0.8-py3-none-any.whl", hash = "sha256:18f117aa154a3a156251a75647279c541464f3e75f7df2ae283e720cc7d0bc2c", size = 140678, upload-time = "2025-03-24T21:56:25.621Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/79/017fab2f7167a9a9795665f894d04f77aafceca80821b51589bb4b23ff5c/nvidia_sphinx_theme-0.0.9.post1-py3-none-any.whl", hash = "sha256:21ca60206dff2f380d7783d64bbaf71a5b9cacae53c7d0686f089c16b5a3d45a", size = 143816, upload-time = "2025-11-09T23:16:55.719Z" },
 ]
 
 [[package]]
 name = "nvtx"
-version = "0.2.13"
+version = "0.2.14"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/97/02/b3fd3da4ba51764cfc0e4d2b22d5a61511fa79d825344d4704f8429c0bd6/nvtx-0.2.13.tar.gz", hash = "sha256:9db7ba135168e14e1f038866100bf8ed42d3e00b404e9bc7b6280ee3af828b92", size = 112104, upload-time = "2025-08-05T03:27:16.383Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/53/64/d27e344632116da937100a81054c88b0fd6a259de09d6778e03e8231216b/nvtx-0.2.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:462bdcc65a12b53bfa3e7df564ddfb72092a030a923dccd1cf88c4b771ecae3f", size = 470534, upload-time = "2025-08-04T19:36:19.389Z" },
-    { url = "https://files.pythonhosted.org/packages/34/15/0b56e9b3020613d7d167bc4cdee3ba8686f6320c6aa62e85ed17b54c4dcb/nvtx-0.2.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7874534af889ab7c2c63554c73119d193d2beb7671b551b7f43de5b97ceb5971", size = 474158, upload-time = "2025-08-04T19:39:39.801Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/be/e00ab0d21f4fb46ad66b0eae89d9e9f7d53af65a37c3db2414a590e05e97/nvtx-0.2.13-cp310-cp310-win_amd64.whl", hash = "sha256:4f26d04b5ea5b96096941cb9a7115a73454e9e9d5c247bfcd34ec584559cf9dd", size = 99104, upload-time = "2025-08-04T19:24:01.775Z" },
-    { url = "https://files.pythonhosted.org/packages/22/02/f74e26cedbdb136440d1234a646cedfddf9a43d19586e1ee466d6275e6b6/nvtx-0.2.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1ad794a0c046ef268b2fb3b6812a35bb3bce5cd19207d164689943f0031ac45f", size = 522330, upload-time = "2025-08-04T19:34:49.075Z" },
-    { url = "https://files.pythonhosted.org/packages/1d/55/e1e43201959dd854005c72b8a13ec86b775c349cdcb1d23423d841bbad58/nvtx-0.2.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5640ca4b8be2c19a8fc4ca8403d3c2598165ea27541940b4897138a7b0a717fe", size = 522841, upload-time = "2025-08-04T19:38:27.819Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/8c/89d1f499a4880e30e0b5bdf429cbd1d8c612d09c49c13016384ce9cd156d/nvtx-0.2.13-cp311-cp311-win_amd64.whl", hash = "sha256:be6d53143cb2bd44e04aecdb7f3b34b48ded96f3673ae41362239d9f54bcfe27", size = 99106, upload-time = "2025-08-04T19:22:49.181Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/73/ad21e09dc2534f1e9723bbe5871fa5f03361ac51ca4d411fea6f765b5b6a/nvtx-0.2.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3435cbbffa132f6aaba3abdb01e71a1b961a20858b4cb791883895a25b9305d6", size = 539358, upload-time = "2025-08-04T19:33:16.494Z" },
-    { url = "https://files.pythonhosted.org/packages/12/ab/762da984e7671f7c34ae87e5b70523c3eeb4563759268bfaea07c97f32a6/nvtx-0.2.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453d838dd1424a04303281ee57a73e2b8dca0e03039bc609a945861b8fe7d7d9", size = 545588, upload-time = "2025-08-04T19:37:40.64Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/b6/55bc5916386db70b93cbf543b1e880ead786d9ff0cdcfa262f5a2af46c74/nvtx-0.2.13-cp312-cp312-win_amd64.whl", hash = "sha256:0722d743e0e41e1fb866ebe6446e0cd0d268ca8671313f8da4f8c969956b74d3", size = 99123, upload-time = "2025-08-04T19:24:24.391Z" },
-    { url = "https://files.pythonhosted.org/packages/41/73/98c0669d5f9387a36d56b0e62ea3919124dd8dd7582d896ed1cae2998f57/nvtx-0.2.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1561d2111c698b1b1075899ff9c3fa7ba83603fc27c2e8ef567de6bbbe85ce1", size = 519840, upload-time = "2025-08-04T19:34:00.877Z" },
-    { url = "https://files.pythonhosted.org/packages/14/4b/21e975997def8a387543ba2bbe227551ad466781c39fc67f37f53555f37e/nvtx-0.2.13-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:edd7b729ed0211350258a21dd13422f59bc521de2b2fd21feb6c177af492f4e1", size = 524711, upload-time = "2025-08-04T19:38:03.559Z" },
-    { url = "https://files.pythonhosted.org/packages/21/d7/0ca146afd875f1e02636323840960071f768b5d8ba3e7d37f2ac9192bfd9/nvtx-0.2.13-cp313-cp313-win_amd64.whl", hash = "sha256:f0524bb71443d5a1f19a6409a9a81405fc437e53c5edfc4c44b6f4504ccf46e3", size = 97317, upload-time = "2025-08-04T19:24:46.391Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/ca/fa76ea4985fd8f3d8c437bffec2580b1cac7f2401671089ac842610ae466/nvtx-0.2.14-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b70b2415ab97edf19514be226d5058898922c6b6bb1d7fdd5ef92d1e086f3e0f", size = 695204, upload-time = "2025-11-27T17:28:52.688Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/1f/0aa62d52062d700dbed36dd2ebfddf5133c72180d448cce66545e5ccbe5d/nvtx-0.2.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23ab874f9c70e5433f39e40ca318ffcfc14fb43ed6798e6be5a30f74e4ca831f", size = 686698, upload-time = "2025-11-27T17:23:19.335Z" },
+    { url = "https://files.pythonhosted.org/packages/18/c9/a12d48157221a8e939f3f7ec8f8a543e232fb9248820afb164ff9eb3eaa7/nvtx-0.2.14-cp310-cp310-win_amd64.whl", hash = "sha256:3a22be895546ca609e83e54614b56739200ab6f4d13e15f5685544082b1b7908", size = 119654, upload-time = "2025-11-27T17:32:08.536Z" },
+    { url = "https://files.pythonhosted.org/packages/87/a6/4d473abd7c07a6d1060c0f708e21ddf46a960258532ffc897681db5c0f46/nvtx-0.2.14-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:227f6406d2fe1a4b890be17eb1f4c1f5bd4df8f7032dd1cb8c7651d379f35541", size = 732764, upload-time = "2025-11-27T17:26:21.853Z" },
+    { url = "https://files.pythonhosted.org/packages/94/06/3ab72e5a463af1b95934638cb8377e99f58e5ef21a47cbf69b92267d6602/nvtx-0.2.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0664aa75b24e2ad0abdd0fa52c49e9c8a120652f2194289c85dc2d93cbc6017f", size = 724555, upload-time = "2025-11-27T17:22:36.402Z" },
+    { url = "https://files.pythonhosted.org/packages/18/1d/64f6078a5ab4134af91ba294035ee1ebb3512edaaa9d60d8f0f023178620/nvtx-0.2.14-cp311-cp311-win_amd64.whl", hash = "sha256:10f5971661d61c1a90cd36c3069240452c904ecec4b3a08d0d6fdba1e5398165", size = 119660, upload-time = "2025-11-27T17:32:30.406Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/de/2cc15bb805b1b18317b60837b853ed023757730d0db82de291635fc88bc3/nvtx-0.2.14-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ece46f555e725db879df06549980744f89db5923a77e6f7a5aecda75292421a", size = 727708, upload-time = "2025-11-27T17:25:20.836Z" },
+    { url = "https://files.pythonhosted.org/packages/81/94/b37d634fef8677ce525b5bfd2886737ea2c064bc3576fc84423973ff5b97/nvtx-0.2.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17efe5d903996bceb0c8a12cae80fa9b66bee7ee895923bd9d8ec2a5af1aabd8", size = 737691, upload-time = "2025-11-27T17:21:27.87Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/c1/f633aa32003050ff83626a19402f03c83990a15b4df658a7bf1b590ee83e/nvtx-0.2.14-cp312-cp312-win_amd64.whl", hash = "sha256:f40db4746714d525d3020c702a0df866c2335efd6a27c41e869e577402a53a4b", size = 119193, upload-time = "2025-11-27T17:31:42.943Z" },
+    { url = "https://files.pythonhosted.org/packages/04/a3/603ecdfd5cd97feee59c7e51da4929e22eac8dbe68ac78df53e74152813f/nvtx-0.2.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8cd1f2b464675b4d3c2036b7bbaf975baa9307f0795107dc69c556c0c8d191d", size = 710057, upload-time = "2025-11-27T17:28:08.127Z" },
+    { url = "https://files.pythonhosted.org/packages/97/29/945dd440e6bd459e6064f321ed425dbae7d03d39ffa97a38e5434fbcda27/nvtx-0.2.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6532556d81f782e24eb12c5e0c75e297493d6ab0431177c93c12bb29c523ea9e", size = 717825, upload-time = "2025-11-27T17:22:57.556Z" },
+    { url = "https://files.pythonhosted.org/packages/16/3e/5d7872f2a0809237e3d524f81a7a3c7fbeb98bdc9dcec4723b75a45cd552/nvtx-0.2.14-cp313-cp313-win_amd64.whl", hash = "sha256:cd86f78ed56aede301b03e5ab8cb1aaeb8ba0b5ed683f98f87fbe474996d73f2", size = 118546, upload-time = "2025-11-27T17:30:32.549Z" },
 ]
 
 [[package]]
@@ -3423,141 +3412,75 @@ wheels = [
 
 [[package]]
 name = "onnx"
-version = "1.19.0"
+version = "1.19.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy" },
+    { name = "ml-dtypes" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "protobuf" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5b/bf/b0a63ee9f3759dcd177b28c6f2cb22f2aecc6d9b3efecaabc298883caa5f/onnx-1.19.0.tar.gz", hash = "sha256:aa3f70b60f54a29015e41639298ace06adf1dd6b023b9b30f1bca91bb0db9473", size = 11949859, upload-time = "2025-08-27T02:34:27.107Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/00/b3/8a6f3b05d18dffdc7c18839bd829587c826c8513f4bdbe21ddf37dacce50/onnx-1.19.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e927d745939d590f164e43c5aec7338c5a75855a15130ee795f492fc3a0fa565", size = 18310869, upload-time = "2025-08-27T02:32:47.346Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/92/550d6155ab3f2c00e95add1726397c95b4b79d6eb4928d049ff591ad4c84/onnx-1.19.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c6cdcb237c5c4202463bac50417c5a7f7092997a8469e8b7ffcd09f51de0f4a9", size = 18028144, upload-time = "2025-08-27T02:32:50.306Z" },
-    { url = "https://files.pythonhosted.org/packages/79/21/9bcc715ea6d9aab3f6c583bfc59504a14777e39e0591030e7345f4e40315/onnx-1.19.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed0b85a33deacb65baffe6ca4ce91adf2bb906fa2dee3856c3c94e163d2eb563", size = 18200923, upload-time = "2025-08-27T02:32:54.325Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/90/3a6f0741ff22270e2f4b741f440ab68ba5525ebc94775cd6f2c01f531374/onnx-1.19.0-cp310-cp310-win32.whl", hash = "sha256:89a9cefe75547aec14a796352c2243e36793bbbcb642d8897118595ab0c2395b", size = 16332097, upload-time = "2025-08-27T02:32:56.997Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/4c/ef61d359865712803d488672607023d36bfcd21fa008d8dc1d6ee8e8b23c/onnx-1.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:a16a82bfdf4738691c0a6eda5293928645ab8b180ab033df84080817660b5e66", size = 16451402, upload-time = "2025-08-27T02:33:00.534Z" },
-    { url = "https://files.pythonhosted.org/packages/db/5c/b959b17608cfb6ccf6359b39fe56a5b0b7d965b3d6e6a3c0add90812c36e/onnx-1.19.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:206f00c47b85b5c7af79671e3307147407991a17994c26974565aadc9e96e4e4", size = 18312580, upload-time = "2025-08-27T02:33:03.081Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/ee/ac052bbbc832abe0debb784c2c57f9582444fb5f51d63c2967fd04432444/onnx-1.19.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4d7bee94abaac28988b50da675ae99ef8dd3ce16210d591fbd0b214a5930beb3", size = 18029165, upload-time = "2025-08-27T02:33:05.771Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/c9/8687ba0948d46fd61b04e3952af9237883bbf8f16d716e7ed27e688d73b8/onnx-1.19.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7730b96b68c0c354bbc7857961bb4909b9aaa171360a8e3708d0a4c749aaadeb", size = 18202125, upload-time = "2025-08-27T02:33:09.325Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/16/6249c013e81bd689f46f96c7236d7677f1af5dd9ef22746716b48f10e506/onnx-1.19.0-cp311-cp311-win32.whl", hash = "sha256:7cb7a3ad8059d1a0dfdc5e0a98f71837d82002e441f112825403b137227c2c97", size = 16332738, upload-time = "2025-08-27T02:33:12.448Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/28/34a1e2166e418c6a78e5c82e66f409d9da9317832f11c647f7d4e23846a6/onnx-1.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:d75452a9be868bd30c3ef6aa5991df89bbfe53d0d90b2325c5e730fbd91fff85", size = 16452303, upload-time = "2025-08-27T02:33:15.176Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/b7/639664626e5ba8027860c4d2a639ee02b37e9c322215c921e9222513c3aa/onnx-1.19.0-cp311-cp311-win_arm64.whl", hash = "sha256:23c7959370d7b3236f821e609b0af7763cff7672a758e6c1fc877bac099e786b", size = 16425340, upload-time = "2025-08-27T02:33:17.78Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/94/f56f6ca5e2f921b28c0f0476705eab56486b279f04e1d568ed64c14e7764/onnx-1.19.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:61d94e6498ca636756f8f4ee2135708434601b2892b7c09536befb19bc8ca007", size = 18322331, upload-time = "2025-08-27T02:33:20.373Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/00/8cc3f3c40b54b28f96923380f57c9176872e475face726f7d7a78bd74098/onnx-1.19.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:224473354462f005bae985c72028aaa5c85ab11de1b71d55b06fdadd64a667dd", size = 18027513, upload-time = "2025-08-27T02:33:23.44Z" },
-    { url = "https://files.pythonhosted.org/packages/61/90/17c4d2566fd0117a5e412688c9525f8950d467f477fbd574e6b32bc9cb8d/onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae475c85c89bc4d1f16571006fd21a3e7c0e258dd2c091f6e8aafb083d1ed9b", size = 18202278, upload-time = "2025-08-27T02:33:26.103Z" },
-    { url = "https://files.pythonhosted.org/packages/bc/6e/a9383d9cf6db4ac761a129b081e9fa5d0cd89aad43cf1e3fc6285b915c7d/onnx-1.19.0-cp312-cp312-win32.whl", hash = "sha256:323f6a96383a9cdb3960396cffea0a922593d221f3929b17312781e9f9b7fb9f", size = 16333080, upload-time = "2025-08-27T02:33:28.559Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/2e/3ff480a8c1fa7939662bdc973e41914add2d4a1f2b8572a3c39c2e4982e5/onnx-1.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:50220f3499a499b1a15e19451a678a58e22ad21b34edf2c844c6ef1d9febddc2", size = 16453927, upload-time = "2025-08-27T02:33:31.177Z" },
-    { url = "https://files.pythonhosted.org/packages/57/37/ad500945b1b5c154fe9d7b826b30816ebd629d10211ea82071b5bcc30aa4/onnx-1.19.0-cp312-cp312-win_arm64.whl", hash = "sha256:efb768299580b786e21abe504e1652ae6189f0beed02ab087cd841cb4bb37e43", size = 16426022, upload-time = "2025-08-27T02:33:33.515Z" },
-    { url = "https://files.pythonhosted.org/packages/be/29/d7b731f63d243f815d9256dce0dca3c151dcaa1ac59f73e6ee06c9afbe91/onnx-1.19.0-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:9aed51a4b01acc9ea4e0fe522f34b2220d59e9b2a47f105ac8787c2e13ec5111", size = 18322412, upload-time = "2025-08-27T02:33:36.723Z" },
-    { url = "https://files.pythonhosted.org/packages/58/f5/d3106becb42cb374f0e17ff4c9933a97f1ee1d6a798c9452067f7d3ff61b/onnx-1.19.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ce2cdc3eb518bb832668c4ea9aeeda01fbaa59d3e8e5dfaf7aa00f3d37119404", size = 18026565, upload-time = "2025-08-27T02:33:39.493Z" },
-    { url = "https://files.pythonhosted.org/packages/83/fa/b086d17bab3900754c7ffbabfb244f8e5e5da54a34dda2a27022aa2b373b/onnx-1.19.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8b546bd7958734b6abcd40cfede3d025e9c274fd96334053a288ab11106bd0aa", size = 18202077, upload-time = "2025-08-27T02:33:42.115Z" },
-    { url = "https://files.pythonhosted.org/packages/35/f2/5e2dfb9d4cf873f091c3f3c6d151f071da4295f9893fbf880f107efe3447/onnx-1.19.0-cp313-cp313-win32.whl", hash = "sha256:03086bffa1cf5837430cf92f892ca0cd28c72758d8905578c2bf8ffaf86c6743", size = 16333198, upload-time = "2025-08-27T02:33:45.172Z" },
-    { url = "https://files.pythonhosted.org/packages/79/67/b3751a35c2522f62f313156959575619b8fa66aa883db3adda9d897d8eb2/onnx-1.19.0-cp313-cp313-win_amd64.whl", hash = "sha256:1715b51eb0ab65272e34ef51cb34696160204b003566cd8aced2ad20a8f95cb8", size = 16453836, upload-time = "2025-08-27T02:33:47.779Z" },
-    { url = "https://files.pythonhosted.org/packages/14/b9/1df85effc960fbbb90bb7bc36eb3907c676b104bc2f88bce022bcfdaef63/onnx-1.19.0-cp313-cp313-win_arm64.whl", hash = "sha256:6bf5acdb97a3ddd6e70747d50b371846c313952016d0c41133cbd8f61b71a8d5", size = 16425877, upload-time = "2025-08-27T02:33:50.357Z" },
-    { url = "https://files.pythonhosted.org/packages/23/2b/089174a1427be9149f37450f8959a558ba20f79fca506ba461d59379d3a1/onnx-1.19.0-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:46cf29adea63e68be0403c68de45ba1b6acc9bb9592c5ddc8c13675a7c71f2cb", size = 18348546, upload-time = "2025-08-27T02:33:56.132Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/d6/3458f0e3a9dc7677675d45d7d6528cb84ad321c8670cc10c69b32c3e03da/onnx-1.19.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:246f0de1345498d990a443d55a5b5af5101a3e25a05a2c3a5fe8b7bd7a7d0707", size = 18033067, upload-time = "2025-08-27T02:33:58.661Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/16/6e4130e1b4b29465ee1fb07d04e8d6f382227615c28df8f607ba50909e2a/onnx-1.19.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae0d163ffbc250007d984b8dd692a4e2e4506151236b50ca6e3560b612ccf9ff", size = 18205741, upload-time = "2025-08-27T02:34:01.538Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/d8/f64d010fd024b2a2b11ce0c4ee179e4f8f6d4ccc95f8184961c894c22af1/onnx-1.19.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7c151604c7cca6ae26161c55923a7b9b559df3344938f93ea0074d2d49e7fe78", size = 16453839, upload-time = "2025-08-27T02:34:06.515Z" },
-    { url = "https://files.pythonhosted.org/packages/67/ec/8761048eabef4dad55af4c002c672d139b9bd47c3616abaed642a1710063/onnx-1.19.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:236bc0e60d7c0f4159300da639953dd2564df1c195bce01caba172a712e75af4", size = 18027605, upload-time = "2025-08-27T02:34:08.962Z" },
-]
-
-[[package]]
-name = "onnx-ir"
-version = "0.1.8"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "numpy", marker = "python_full_version >= '3.13'" },
-    { name = "onnx", marker = "python_full_version >= '3.13'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/af/4a/7ea3952e556e7281b8bfe7f7fce016a13fdac85544d6d6af8ebca5cae160/onnx_ir-0.1.8.tar.gz", hash = "sha256:85ea59eaf165b2b107788193480a260e2723cfc7a1dac1bde7085fd0b7e380d7", size = 108961, upload-time = "2025-09-05T15:45:33.887Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/0f/1c/3bb51fa9e278cbc655a1943c8016163d76a6e24137e73e5198ebc20fc965/onnx_ir-0.1.8-py3-none-any.whl", hash = "sha256:61a42021b6249e566ff3b89a03342bc88dce4dc2d984b97cfb060f33ef179f8a", size = 125316, upload-time = "2025-09-05T15:45:31.211Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/27/2f/c619eb65769357e9b6de9212c9a821ab39cd484448e5d6b3fb5fb0a64c6d/onnx-1.19.1.tar.gz", hash = "sha256:737524d6eb3907d3499ea459c6f01c5a96278bb3a0f2ff8ae04786fb5d7f1ed5", size = 12033525, upload-time = "2025-10-10T04:01:34.342Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5b/f3/892eea0206ed13a986239bd508c82b974387ef1b0ffd83ece0ce0725aaf6/onnx-1.19.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:7343250cc5276cf439fe623b8f92e11cf0d1eebc733ae4a8b2e86903bb72ae68", size = 18319433, upload-time = "2025-10-10T03:59:47.236Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/f3/c7ea4a1dfda9b9ddeff914a601ffaf5ed151b3352529f223eae74c03c8d1/onnx-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1fb8f79de7f3920bb82b537f3c6ac70c0ce59f600471d9c3eed2b5f8b079b748", size = 18043327, upload-time = "2025-10-10T03:59:50.854Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/eb/30159bb6a108b03f2b7521410369a5bd8d296be3fbf0b30ab7acd9ef42ad/onnx-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92b9d2dece41cc84213dbbfd1acbc2a28c27108c53bd28ddb6d1043fbfcbd2d5", size = 18216877, upload-time = "2025-10-10T03:59:54.512Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/86/dc034e5a723a20ca45aa8dd76dda53c358a5f955908e1436f42c21bdfb3a/onnx-1.19.1-cp310-cp310-win32.whl", hash = "sha256:c0b1a2b6bb19a0fc9f5de7661a547136d082c03c169a5215e18ff3ececd2a82f", size = 16344116, upload-time = "2025-10-10T03:59:57.991Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/60/537f2c19050f71445ee00ed91e78a396b6189dd1fce61b29ac6a0d651c7e/onnx-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:1c0498c00db05fcdb3426697d330dcecc3f60020015065e2c76fa795f2c9a605", size = 16462819, upload-time = "2025-10-10T04:00:01.157Z" },
+    { url = "https://files.pythonhosted.org/packages/36/07/0019c72924909e4f64b9199770630ab7b8d7914b912b03230e68f5eda7ae/onnx-1.19.1-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:17aaf5832126de0a5197a5864e4f09a764dd7681d3035135547959b4b6b77a09", size = 18320936, upload-time = "2025-10-10T04:00:04.235Z" },
+    { url = "https://files.pythonhosted.org/packages/af/2f/5c47acf740dc35f0decc640844260fbbdc0efa0565657c93fd7ff30f13f3/onnx-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01b292a4d0b197c45d8184545bbc8ae1df83466341b604187c1b05902cb9c920", size = 18044269, upload-time = "2025-10-10T04:00:07.449Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/61/6c457ee8c3a62a3cad0a4bfa4c5436bb3ac4df90c3551d40bee1224b5b51/onnx-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1839af08ab4a909e4af936b8149c27f8c64b96138981024e251906e0539d8bf9", size = 18218092, upload-time = "2025-10-10T04:00:11.135Z" },
+    { url = "https://files.pythonhosted.org/packages/54/d5/ab832e1369505e67926a70e9a102061f89ad01f91aa296c4b1277cb81b25/onnx-1.19.1-cp311-cp311-win32.whl", hash = "sha256:0bdbb676e3722bd32f9227c465d552689f49086f986a696419d865cb4e70b989", size = 16344809, upload-time = "2025-10-10T04:00:14.634Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/b5/6eb4611d24b85002f878ba8476b4cecbe6f9784c0236a3c5eff85236cc0a/onnx-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:1346853df5c1e3ebedb2e794cf2a51e0f33759affd655524864ccbcddad7035b", size = 16464319, upload-time = "2025-10-10T04:00:18.235Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/ff/f0e1f06420c70e20d497fec7c94a864d069943b6312bedd4224c0ab946f8/onnx-1.19.1-cp311-cp311-win_arm64.whl", hash = "sha256:2d69c280c0e665b7f923f499243b9bb84fe97970b7a4668afa0032045de602c8", size = 16437503, upload-time = "2025-10-10T04:00:21.247Z" },
+    { url = "https://files.pythonhosted.org/packages/50/07/f6c5b2cffef8c29e739616d1415aea22f7b7ef1f19c17f02b7cff71f5498/onnx-1.19.1-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:3612193a89ddbce5c4e86150869b9258780a82fb8c4ca197723a4460178a6ce9", size = 18327840, upload-time = "2025-10-10T04:00:24.259Z" },
+    { url = "https://files.pythonhosted.org/packages/93/20/0568ebd52730287ae80cac8ac893a7301c793ea1630984e2519ee92b02a9/onnx-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6c2fd2f744e7a3880ad0c262efa2edf6d965d0bd02b8f327ec516ad4cb0f2f15", size = 18042539, upload-time = "2025-10-10T04:00:27.693Z" },
+    { url = "https://files.pythonhosted.org/packages/14/fd/cd7a0fd10a04f8cc5ae436b63e0022e236fe51b9dbb8ee6317fd48568c72/onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:485d3674d50d789e0ee72fa6f6e174ab81cb14c772d594f992141bd744729d8a", size = 18218271, upload-time = "2025-10-10T04:00:30.495Z" },
+    { url = "https://files.pythonhosted.org/packages/65/68/cc8b8c05469fe08384b446304ad7e6256131ca0463bf6962366eebec98c0/onnx-1.19.1-cp312-cp312-win32.whl", hash = "sha256:638bc56ff1a5718f7441e887aeb4e450f37a81c6eac482040381b140bd9ba601", size = 16345111, upload-time = "2025-10-10T04:00:34.982Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/5e/d1cb16693598a512c2cf9ffe0841d8d8fd2c83ae8e889efd554f5aa427cf/onnx-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:bc7e2e4e163e679721e547958b5a7db875bf822cad371b7c1304aa4401a7c7a4", size = 16465621, upload-time = "2025-10-10T04:00:39.107Z" },
+    { url = "https://files.pythonhosted.org/packages/90/32/da116cc61fdef334782aa7f87a1738431dd1af1a5d1a44bd95d6d51ad260/onnx-1.19.1-cp312-cp312-win_arm64.whl", hash = "sha256:17c215b1c0f20fe93b4cbe62668247c1d2294b9bc7f6be0ca9ced28e980c07b7", size = 16437505, upload-time = "2025-10-10T04:00:42.255Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/b8/ab1fdfe2e8502f4dc4289fc893db35816bd20d080d8370f86e74dda5f598/onnx-1.19.1-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:4e5f938c68c4dffd3e19e4fd76eb98d298174eb5ebc09319cdd0ec5fe50050dc", size = 18327815, upload-time = "2025-10-10T04:00:45.682Z" },
+    { url = "https://files.pythonhosted.org/packages/04/40/eb875745a4b92aea10e5e32aa2830f409c4d7b6f7b48ca1c4eaad96636c5/onnx-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:86e20a5984b017feeef2dbf4ceff1c7c161ab9423254968dd77d3696c38691d0", size = 18041464, upload-time = "2025-10-10T04:00:48.557Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/8e/8586135f40dbe4989cec4d413164bc8fc5c73d37c566f33f5ea3a7f2b6f6/onnx-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d9c467f0f29993c12f330736af87972f30adb8329b515f39d63a0db929cb2c", size = 18218244, upload-time = "2025-10-10T04:00:51.891Z" },
+    { url = "https://files.pythonhosted.org/packages/51/b5/4201254b8683129db5da3fb55aa1f7e56d0a8d45c66ce875dec21ca1ff25/onnx-1.19.1-cp313-cp313-win32.whl", hash = "sha256:65eee353a51b4e4ca3e797784661e5376e2b209f17557e04921eac9166a8752e", size = 16345330, upload-time = "2025-10-10T04:00:54.858Z" },
+    { url = "https://files.pythonhosted.org/packages/69/67/c6d239afbcdbeb6805432969b908b5c9f700c96d332b34e3f99518d76caf/onnx-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:c3bc87e38b53554b1fc9ef7b275c81c6f5c93c90a91935bb0aa8d4d498a6d48e", size = 16465567, upload-time = "2025-10-10T04:00:57.893Z" },
+    { url = "https://files.pythonhosted.org/packages/99/fe/89f1e40f5bc54595ff0dcf5391ce19e578b528973ccc74dd99800196d30d/onnx-1.19.1-cp313-cp313-win_arm64.whl", hash = "sha256:e41496f400afb980ec643d80d5164753a88a85234fa5c06afdeebc8b7d1ec252", size = 16437562, upload-time = "2025-10-10T04:01:00.703Z" },
+    { url = "https://files.pythonhosted.org/packages/86/43/b186ccbc8fe7e93643a6a6d40bbf2bb6ce4fb9469bbd3453c77e270c50ad/onnx-1.19.1-cp313-cp313t-macosx_12_0_universal2.whl", hash = "sha256:5f6274abf0fd74e80e78ecbb44bd44509409634525c89a9b38276c8af47dc0a2", size = 18355703, upload-time = "2025-10-10T04:01:03.735Z" },
+    { url = "https://files.pythonhosted.org/packages/60/f1/22ee4d8b8f9fa4cb1d1b9579da3b4b5187ddab33846ec5ac744af02c0e2b/onnx-1.19.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:07dcd4d83584eb4bf8f21ac04c82643712e5e93ac2a0ed10121ec123cb127e1e", size = 18047830, upload-time = "2025-10-10T04:01:06.552Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/a4/8f3d51e3a095d42cdf2039a590cff06d024f2a10efbd0b1a2a6b3825f019/onnx-1.19.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1975860c3e720db25d37f1619976582828264bdcc64fa7511c321ac4fc01add3", size = 18221126, upload-time = "2025-10-10T04:01:09.77Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/0d/f9d6c2237083f1aac14b37f0b03b0d81f1147a8e2af0c3828165e0a6a67b/onnx-1.19.1-cp313-cp313t-win_amd64.whl", hash = "sha256:9807d0e181f6070ee3a6276166acdc571575d1bd522fc7e89dba16fd6e7ffed9", size = 16465560, upload-time = "2025-10-10T04:01:13.212Z" },
+    { url = "https://files.pythonhosted.org/packages/36/70/8418a58faa7d606d6a92cab69ae8d361b3b3969bf7e7e9a65a86d5d1b674/onnx-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b6ee83e6929d75005482d9f304c502ac7c9b8d6db153aa6b484dae74d0f28570", size = 18042812, upload-time = "2025-10-10T04:01:15.919Z" },
 ]
 
 [[package]]
 name = "onnx-ir"
 version = "0.1.12"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
 dependencies = [
-    { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
-    { name = "numpy", marker = "python_full_version < '3.13'" },
-    { name = "onnx", marker = "python_full_version < '3.13'" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+    { name = "ml-dtypes" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "onnx" },
+    { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6c/1a/2a94112a39d01a9d1490f5ef3c205d8a17fe1ca27f307b026c40d62d8e9f/onnx_ir-0.1.12.tar.gz", hash = "sha256:742e0bff875d0547724187560b3f441833191c8aa939c05f14176f4892784deb", size = 112699, upload-time = "2025-10-28T23:43:54.129Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/36/c4df116f5dcaa82ec7944e5d25624a3811f6603fd190660b0b079ea759fb/onnx_ir-0.1.12-py3-none-any.whl", hash = "sha256:17f86faf8a53b979430bde1bc6022c7a162b0d1534550ddb17a1d37eb993e765", size = 129277, upload-time = "2025-10-28T23:43:52.493Z" },
 ]
 
-[[package]]
-name = "onnxscript"
-version = "0.5.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "numpy", marker = "python_full_version >= '3.13'" },
-    { name = "onnx", marker = "python_full_version >= '3.13'" },
-    { name = "onnx-ir", version = "0.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "packaging", marker = "python_full_version >= '3.13'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/f5/2f/0bb2b6ca727e4d5173f640527f402ab4225def4bc8d667269b83047be8c4/onnxscript-0.5.0.tar.gz", hash = "sha256:4aba215e1f80fbcd07ba0d97d6bca96797fc3e9639eacb5434d35317ce1406aa", size = 588762, upload-time = "2025-09-12T16:57:46.484Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/e7/f7/f0eb0b10771637a8c176a3b0594c65c5ba3cea440847741297901cef2c5e/onnxscript-0.5.0-py3-none-any.whl", hash = "sha256:da33715ac8ec80e0263a5200f1ad1b3532225804c05a13a0d6ea83712b5b4a8f", size = 684685, upload-time = "2025-09-12T16:57:48.869Z" },
-]
-
 [[package]]
 name = "onnxscript"
 version = "0.5.6"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
 dependencies = [
-    { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
-    { name = "numpy", marker = "python_full_version < '3.13'" },
-    { name = "onnx", marker = "python_full_version < '3.13'" },
-    { name = "onnx-ir", version = "0.1.12", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
-    { name = "packaging", marker = "python_full_version < '3.13'" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+    { name = "ml-dtypes" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "onnx" },
+    { name = "onnx-ir" },
+    { name = "packaging" },
+    { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fb/4b/eed2199327bbf12c3443d7835893e3c4c23b1c1a4aa13efe0f7fbe0a6bf9/onnxscript-0.5.6.tar.gz", hash = "sha256:cc3338b2976daffd2af0bb6ac4866a4dca76aefface1666a0d7bc65ad9850822", size = 587017, upload-time = "2025-10-31T03:50:38.656Z" }
 wheels = [
@@ -3570,13 +3493,22 @@ version = "1.33.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "deprecated" },
-    { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" } },
+    { name = "importlib-metadata" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9a/8d/1f5a45fbcb9a7d87809d460f09dc3399e3fbd31d7f3e14888345e9d29951/opentelemetry_api-1.33.1.tar.gz", hash = "sha256:1c6055fc0a2d3f23a50c7e17e16ef75ad489345fd3df1f8b8af7c0bbf8a109e8", size = 65002, upload-time = "2025-05-16T18:52:41.146Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/05/44/4c45a34def3506122ae61ad684139f0bbc4e00c39555d4f7e20e0e001c8a/opentelemetry_api-1.33.1-py3-none-any.whl", hash = "sha256:4db83ebcf7ea93e64637ec6ee6fabee45c5cbe4abd9cf3da95c43828ddb50b83", size = 65771, upload-time = "2025-05-16T18:52:17.419Z" },
 ]
 
+[[package]]
+name = "overrides"
+version = "7.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/36/86/b585f53236dec60aba864e050778b25045f857e17f6e5ea0ae95fe80edd2/overrides-7.7.0.tar.gz", hash = "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a", size = 22812, upload-time = "2024-01-27T21:01:33.423Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2c/ab/fc8290c6a4c722e5514d80f62b2dc4c4df1a68a41d1364e625c35990fcf3/overrides-7.7.0-py3-none-any.whl", hash = "sha256:c7ed9d062f78b8e4c1a7b70bd8796b35ead4d9f510227ef9c5dc7626c60d7e49", size = 17832, upload-time = "2024-01-27T21:01:31.393Z" },
+]
+
 [[package]]
 name = "packaging"
 version = "25.0"
@@ -3591,7 +3523,8 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "python-dateutil" },
     { name = "pytz" },
     { name = "tzdata" },
@@ -3798,14 +3731,14 @@ wheels = [
 
 [[package]]
 name = "prettytable"
-version = "3.16.0"
+version = "3.17.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "wcwidth" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/99/b1/85e18ac92afd08c533603e3393977b6bc1443043115a47bb094f3b98f94f/prettytable-3.16.0.tar.gz", hash = "sha256:3c64b31719d961bf69c9a7e03d0c1e477320906a98da63952bc6698d6164ff57", size = 66276, upload-time = "2025-03-24T19:39:04.008Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/79/45/b0847d88d6cfeb4413566738c8bbf1e1995fad3d42515327ff32cc1eb578/prettytable-3.17.0.tar.gz", hash = "sha256:59f2590776527f3c9e8cf9fe7b66dd215837cca96a9c39567414cbc632e8ddb0", size = 67892, upload-time = "2025-11-14T17:33:20.212Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/c7/5613524e606ea1688b3bdbf48aa64bafb6d0a4ac3750274c43b6158a390f/prettytable-3.16.0-py3-none-any.whl", hash = "sha256:b5eccfabb82222f5aa46b798ff02a8452cf530a352c31bddfa29be41242863aa", size = 33863, upload-time = "2025-03-24T19:39:02.359Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/8c/83087ebc47ab0396ce092363001fa37c17153119ee282700c0713a195853/prettytable-3.17.0-py3-none-any.whl", hash = "sha256:aad69b294ddbe3e1f95ef8886a060ed1666a0b83018bbf56295f6f226c43d287", size = 34433, upload-time = "2025-11-14T17:33:19.093Z" },
 ]
 
 [[package]]
@@ -3958,17 +3891,17 @@ wheels = [
 
 [[package]]
 name = "protobuf"
-version = "6.33.0"
+version = "6.33.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/19/ff/64a6c8f420818bb873713988ca5492cba3a7946be57e027ac63495157d97/protobuf-6.33.0.tar.gz", hash = "sha256:140303d5c8d2037730c548f8c7b93b20bb1dc301be280c378b82b8894589c954", size = 443463, upload-time = "2025-10-15T20:39:52.159Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7e/ee/52b3fa8feb6db4a833dfea4943e175ce645144532e8a90f72571ad85df4e/protobuf-6.33.0-cp310-abi3-win32.whl", hash = "sha256:d6101ded078042a8f17959eccd9236fb7a9ca20d3b0098bbcb91533a5680d035", size = 425593, upload-time = "2025-10-15T20:39:40.29Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/c6/7a465f1825872c55e0341ff4a80198743f73b69ce5d43ab18043699d1d81/protobuf-6.33.0-cp310-abi3-win_amd64.whl", hash = "sha256:9a031d10f703f03768f2743a1c403af050b6ae1f3480e9c140f39c45f81b13ee", size = 436882, upload-time = "2025-10-15T20:39:42.841Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/a9/b6eee662a6951b9c3640e8e452ab3e09f117d99fc10baa32d1581a0d4099/protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:905b07a65f1a4b72412314082c7dbfae91a9e8b68a0cc1577515f8df58ecf455", size = 427521, upload-time = "2025-10-15T20:39:43.803Z" },
-    { url = "https://files.pythonhosted.org/packages/10/35/16d31e0f92c6d2f0e77c2a3ba93185130ea13053dd16200a57434c882f2b/protobuf-6.33.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e0697ece353e6239b90ee43a9231318302ad8353c70e6e45499fa52396debf90", size = 324445, upload-time = "2025-10-15T20:39:44.932Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/eb/2a981a13e35cda8b75b5585aaffae2eb904f8f351bdd3870769692acbd8a/protobuf-6.33.0-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:e0a1715e4f27355afd9570f3ea369735afc853a6c3951a6afe1f80d8569ad298", size = 339159, upload-time = "2025-10-15T20:39:46.186Z" },
-    { url = "https://files.pythonhosted.org/packages/21/51/0b1cbad62074439b867b4e04cc09b93f6699d78fd191bed2bbb44562e077/protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:35be49fd3f4fefa4e6e2aacc35e8b837d6703c37a2168a55ac21e9b1bc7559ef", size = 323172, upload-time = "2025-10-15T20:39:47.465Z" },
-    { url = "https://files.pythonhosted.org/packages/07/d1/0a28c21707807c6aacd5dc9c3704b2aa1effbf37adebd8caeaf68b17a636/protobuf-6.33.0-py3-none-any.whl", hash = "sha256:25c9e1963c6734448ea2d308cfa610e692b801304ba0908d7bfa564ac5132995", size = 170477, upload-time = "2025-10-15T20:39:51.311Z" },
+    { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/93/26213ff72b103ae55bb0d73e7fb91ea570ef407c3ab4fd2f1f27cac16044/protobuf-6.33.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:fe34575f2bdde76ac429ec7b570235bf0c788883e70aee90068e9981806f2490", size = 427522, upload-time = "2025-11-13T16:44:10.475Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/32/df4a35247923393aa6b887c3b3244a8c941c32a25681775f96e2b418f90e/protobuf-6.33.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:f8adba2e44cde2d7618996b3fc02341f03f5bc3f2748be72dc7b063319276178", size = 324445, upload-time = "2025-11-13T16:44:11.869Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/d0/d796e419e2ec93d2f3fa44888861c3f88f722cde02b7c3488fcc6a166820/protobuf-6.33.1-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:0f4cf01222c0d959c2b399142deb526de420be8236f22c71356e2a544e153c53", size = 339161, upload-time = "2025-11-13T16:44:12.778Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/2a/3c5f05a4af06649547027d288747f68525755de692a26a7720dced3652c0/protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:8fd7d5e0eb08cd5b87fd3df49bc193f5cfd778701f47e11d127d0afc6c39f1d1", size = 323171, upload-time = "2025-11-13T16:44:14.035Z" },
+    { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" },
 ]
 
 [[package]]
@@ -4092,7 +4025,7 @@ wheels = [
 
 [[package]]
 name = "pydantic"
-version = "2.12.4"
+version = "2.12.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-types" },
@@ -4100,9 +4033,9 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "typing-inspection" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/96/ad/a17bc283d7d81837c061c49e3eaa27a45991759a1b7eae1031921c6bd924/pydantic-2.12.4.tar.gz", hash = "sha256:0f8cb9555000a4b5b617f66bfd2566264c4984b27589d3b845685983e8ea85ac", size = 821038, upload-time = "2025-11-05T10:50:08.59Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/82/2f/e68750da9b04856e2a7ec56fc6f034a5a79775e9b9a81882252789873798/pydantic-2.12.4-py3-none-any.whl", hash = "sha256:92d3d202a745d46f9be6df459ac5a064fdaa3c1c4cd8adcfa332ccf3c05f871e", size = 463400, upload-time = "2025-11-05T10:50:06.732Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" },
 ]
 
 [[package]]
@@ -4311,51 +4244,39 @@ wheels = [
 
 [[package]]
 name = "pynacl"
-version = "1.6.0"
+version = "1.6.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi", marker = "platform_python_implementation != 'PyPy' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/06/c6/a3124dee667a423f2c637cfd262a54d67d8ccf3e160f3c50f622a85b7723/pynacl-1.6.0.tar.gz", hash = "sha256:cb36deafe6e2bce3b286e5d1f3e1c246e0ccdb8808ddb4550bb2792f2df298f2", size = 3505641, upload-time = "2025-09-10T23:39:22.308Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/70/24/1b639176401255605ba7c2b93a7b1eb1e379e0710eca62613633eb204201/pynacl-1.6.0-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:f46386c24a65383a9081d68e9c2de909b1834ec74ff3013271f1bca9c2d233eb", size = 384141, upload-time = "2025-09-10T23:38:28.675Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/7b/874efdf57d6bf172db0df111b479a553c3d9e8bb4f1f69eb3ffff772d6e8/pynacl-1.6.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:dea103a1afcbc333bc0e992e64233d360d393d1e63d0bc88554f572365664348", size = 808132, upload-time = "2025-09-10T23:38:38.995Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/61/9b53f5913f3b75ac3d53170cdb897101b2b98afc76f4d9d3c8de5aa3ac05/pynacl-1.6.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:04f20784083014e265ad58c1b2dd562c3e35864b5394a14ab54f5d150ee9e53e", size = 1407253, upload-time = "2025-09-10T23:38:40.492Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/0a/b138916b22bbf03a1bdbafecec37d714e7489dd7bcaf80cd17852f8b67be/pynacl-1.6.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bbcc4452a1eb10cd5217318c822fde4be279c9de8567f78bad24c773c21254f8", size = 843719, upload-time = "2025-09-10T23:38:30.87Z" },
-    { url = "https://files.pythonhosted.org/packages/01/3b/17c368197dfb2c817ce033f94605a47d0cc27901542109e640cef263f0af/pynacl-1.6.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51fed9fe1bec9e7ff9af31cd0abba179d0e984a2960c77e8e5292c7e9b7f7b5d", size = 1445441, upload-time = "2025-09-10T23:38:33.078Z" },
-    { url = "https://files.pythonhosted.org/packages/35/3c/f79b185365ab9be80cd3cd01dacf30bf5895f9b7b001e683b369e0bb6d3d/pynacl-1.6.0-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:10d755cf2a455d8c0f8c767a43d68f24d163b8fe93ccfaabfa7bafd26be58d73", size = 825691, upload-time = "2025-09-10T23:38:34.832Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/1f/8b37d25e95b8f2a434a19499a601d4d272b9839ab8c32f6b0fc1e40c383f/pynacl-1.6.0-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:536703b8f90e911294831a7fbcd0c062b837f3ccaa923d92a6254e11178aaf42", size = 1410726, upload-time = "2025-09-10T23:38:36.893Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/93/5a4a4cf9913014f83d615ad6a2df9187330f764f606246b3a744c0788c03/pynacl-1.6.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6b08eab48c9669d515a344fb0ef27e2cbde847721e34bba94a343baa0f33f1f4", size = 801035, upload-time = "2025-09-10T23:38:42.109Z" },
-    { url = "https://files.pythonhosted.org/packages/bf/60/40da6b0fe6a4d5fd88f608389eb1df06492ba2edca93fca0b3bebff9b948/pynacl-1.6.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5789f016e08e5606803161ba24de01b5a345d24590a80323379fc4408832d290", size = 1371854, upload-time = "2025-09-10T23:38:44.16Z" },
-    { url = "https://files.pythonhosted.org/packages/44/b2/37ac1d65008f824cba6b5bf68d18b76d97d0f62d7a032367ea69d4a187c8/pynacl-1.6.0-cp314-cp314t-win32.whl", hash = "sha256:4853c154dc16ea12f8f3ee4b7e763331876316cc3a9f06aeedf39bcdca8f9995", size = 230345, upload-time = "2025-09-10T23:38:48.276Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/5a/9234b7b45af890d02ebee9aae41859b9b5f15fb4a5a56d88e3b4d1659834/pynacl-1.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:347dcddce0b4d83ed3f32fd00379c83c425abee5a9d2cd0a2c84871334eaff64", size = 243103, upload-time = "2025-09-10T23:38:45.503Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/2c/c1a0f19d720ab0af3bc4241af2bdf4d813c3ecdcb96392b5e1ddf2d8f24f/pynacl-1.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2d6cd56ce4998cb66a6c112fda7b1fdce5266c9f05044fa72972613bef376d15", size = 187778, upload-time = "2025-09-10T23:38:46.731Z" },
-    { url = "https://files.pythonhosted.org/packages/63/37/87c72df19857c5b3b47ace6f211a26eb862ada495cc96daa372d96048fca/pynacl-1.6.0-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:f4b3824920e206b4f52abd7de621ea7a44fd3cb5c8daceb7c3612345dfc54f2e", size = 382610, upload-time = "2025-09-10T23:38:49.459Z" },
-    { url = "https://files.pythonhosted.org/packages/0c/64/3ce958a5817fd3cc6df4ec14441c43fd9854405668d73babccf77f9597a3/pynacl-1.6.0-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:16dd347cdc8ae0b0f6187a2608c0af1c8b7ecbbe6b4a06bff8253c192f696990", size = 798744, upload-time = "2025-09-10T23:38:58.531Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/8a/3f0dd297a0a33fa3739c255feebd0206bb1df0b44c52fbe2caf8e8bc4425/pynacl-1.6.0-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:16c60daceee88d04f8d41d0a4004a7ed8d9a5126b997efd2933e08e93a3bd850", size = 1397879, upload-time = "2025-09-10T23:39:00.44Z" },
-    { url = "https://files.pythonhosted.org/packages/41/94/028ff0434a69448f61348d50d2c147dda51aabdd4fbc93ec61343332174d/pynacl-1.6.0-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:25720bad35dfac34a2bcdd61d9e08d6bfc6041bebc7751d9c9f2446cf1e77d64", size = 833907, upload-time = "2025-09-10T23:38:50.936Z" },
-    { url = "https://files.pythonhosted.org/packages/52/bc/a5cff7f8c30d5f4c26a07dfb0bcda1176ab8b2de86dda3106c00a02ad787/pynacl-1.6.0-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8bfaa0a28a1ab718bad6239979a5a57a8d1506d0caf2fba17e524dbb409441cf", size = 1436649, upload-time = "2025-09-10T23:38:52.783Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/20/c397be374fd5d84295046e398de4ba5f0722dc14450f65db76a43c121471/pynacl-1.6.0-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:ef214b90556bb46a485b7da8258e59204c244b1b5b576fb71848819b468c44a7", size = 817142, upload-time = "2025-09-10T23:38:54.4Z" },
-    { url = "https://files.pythonhosted.org/packages/12/30/5efcef3406940cda75296c6d884090b8a9aad2dcc0c304daebb5ae99fb4a/pynacl-1.6.0-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:49c336dd80ea54780bcff6a03ee1a476be1612423010472e60af83452aa0f442", size = 1401794, upload-time = "2025-09-10T23:38:56.614Z" },
-    { url = "https://files.pythonhosted.org/packages/be/e1/a8fe1248cc17ccb03b676d80fa90763760a6d1247da434844ea388d0816c/pynacl-1.6.0-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f3482abf0f9815e7246d461fab597aa179b7524628a4bc36f86a7dc418d2608d", size = 772161, upload-time = "2025-09-10T23:39:01.93Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/76/8a62702fb657d6d9104ce13449db221a345665d05e6a3fdefb5a7cafd2ad/pynacl-1.6.0-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:140373378e34a1f6977e573033d1dd1de88d2a5d90ec6958c9485b2fd9f3eb90", size = 1370720, upload-time = "2025-09-10T23:39:03.531Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/38/9e9e9b777a1c4c8204053733e1a0269672c0bd40852908c9ad6b6eaba82c/pynacl-1.6.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6b393bc5e5a0eb86bb85b533deb2d2c815666665f840a09e0aa3362bb6088736", size = 791252, upload-time = "2025-09-10T23:39:05.058Z" },
-    { url = "https://files.pythonhosted.org/packages/63/ef/d972ce3d92ae05c9091363cf185e8646933f91c376e97b8be79ea6e96c22/pynacl-1.6.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4a25cfede801f01e54179b8ff9514bd7b5944da560b7040939732d1804d25419", size = 1362910, upload-time = "2025-09-10T23:39:06.924Z" },
-    { url = "https://files.pythonhosted.org/packages/35/2c/ee0b373a1861f66a7ca8bdb999331525615061320dd628527a50ba8e8a60/pynacl-1.6.0-cp38-abi3-win32.whl", hash = "sha256:dcdeb41c22ff3c66eef5e63049abf7639e0db4edee57ba70531fc1b6b133185d", size = 226461, upload-time = "2025-09-10T23:39:11.894Z" },
-    { url = "https://files.pythonhosted.org/packages/75/f7/41b6c0b9dd9970173b6acc026bab7b4c187e4e5beef2756d419ad65482da/pynacl-1.6.0-cp38-abi3-win_amd64.whl", hash = "sha256:cf831615cc16ba324240de79d925eacae8265b7691412ac6b24221db157f6bd1", size = 238802, upload-time = "2025-09-10T23:39:08.966Z" },
-    { url = "https://files.pythonhosted.org/packages/8e/0f/462326910c6172fa2c6ed07922b22ffc8e77432b3affffd9e18f444dbfbb/pynacl-1.6.0-cp38-abi3-win_arm64.whl", hash = "sha256:84709cea8f888e618c21ed9a0efdb1a59cc63141c403db8bf56c469b71ad56f2", size = 183846, upload-time = "2025-09-10T23:39:10.552Z" },
-]
-
-[[package]]
-name = "pynvml"
-version = "13.0.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "nvidia-ml-py" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/5c/57/da7dc63a79f59e082e26a66ac02d87d69ea316b35b35b7a00d82f3ce3d2f/pynvml-13.0.1.tar.gz", hash = "sha256:1245991d9db786b4d2f277ce66869bd58f38ac654e38c9397d18f243c8f6e48f", size = 35226, upload-time = "2025-09-05T20:33:25.377Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d7/4a/cac76c174bb439a0c46c9a4413fcbea5c6cabfb01879f7bbdb9fdfaed76c/pynvml-13.0.1-py3-none-any.whl", hash = "sha256:e2b20e0a501eeec951e2455b7ab444759cf048e0e13a57b08049fa2775266aa8", size = 28810, upload-time = "2025-09-05T20:33:24.13Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/b2/46/aeca065d227e2265125aea590c9c47fbf5786128c9400ee0eb7c88931f06/pynacl-1.6.1.tar.gz", hash = "sha256:8d361dac0309f2b6ad33b349a56cd163c98430d409fa503b10b70b3ad66eaa1d", size = 3506616, upload-time = "2025-11-10T16:02:13.195Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/75/d6/4b2dca33ed512de8f54e5c6074aa06eaeb225bfbcd9b16f33a414389d6bd/pynacl-1.6.1-cp314-cp314t-macosx_10_10_universal2.whl", hash = "sha256:7d7c09749450c385301a3c20dca967a525152ae4608c0a096fe8464bfc3df93d", size = 389109, upload-time = "2025-11-10T16:01:28.79Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/30/e8dbb8ff4fa2559bbbb2187ba0d0d7faf728d17cb8396ecf4a898b22d3da/pynacl-1.6.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc734c1696ffd49b40f7c1779c89ba908157c57345cf626be2e0719488a076d3", size = 808254, upload-time = "2025-11-10T16:01:37.839Z" },
+    { url = "https://files.pythonhosted.org/packages/44/f9/f5449c652f31da00249638dbab065ad4969c635119094b79b17c3a4da2ab/pynacl-1.6.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3cd787ec1f5c155dc8ecf39b1333cfef41415dc96d392f1ce288b4fe970df489", size = 1407365, upload-time = "2025-11-10T16:01:40.454Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/2f/9aa5605f473b712065c0a193ebf4ad4725d7a245533f0cd7e5dcdbc78f35/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b35d93ab2df03ecb3aa506be0d3c73609a51449ae0855c2e89c7ed44abde40b", size = 843842, upload-time = "2025-11-10T16:01:30.524Z" },
+    { url = "https://files.pythonhosted.org/packages/32/8d/748f0f6956e207453da8f5f21a70885fbbb2e060d5c9d78e0a4a06781451/pynacl-1.6.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dece79aecbb8f4640a1adbb81e4aa3bfb0e98e99834884a80eb3f33c7c30e708", size = 1445559, upload-time = "2025-11-10T16:01:33.663Z" },
+    { url = "https://files.pythonhosted.org/packages/78/d0/2387f0dcb0e9816f38373999e48db4728ed724d31accdd4e737473319d35/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c2228054f04bf32d558fb89bb99f163a8197d5a9bf4efa13069a7fa8d4b93fc3", size = 825791, upload-time = "2025-11-10T16:01:34.823Z" },
+    { url = "https://files.pythonhosted.org/packages/18/3d/ef6fb7eb072aaf15f280bc66f26ab97e7fc9efa50fb1927683013ef47473/pynacl-1.6.1-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:2b12f1b97346f177affcdfdc78875ff42637cb40dcf79484a97dae3448083a78", size = 1410843, upload-time = "2025-11-10T16:01:36.401Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/fb/23824a017526850ee7d8a1cc4cd1e3e5082800522c10832edbbca8619537/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e735c3a1bdfde3834503baf1a6d74d4a143920281cb724ba29fb84c9f49b9c48", size = 801140, upload-time = "2025-11-10T16:01:42.013Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/d1/ebc6b182cb98603a35635b727d62f094bc201bf610f97a3bb6357fe688d2/pynacl-1.6.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3384a454adf5d716a9fadcb5eb2e3e72cd49302d1374a60edc531c9957a9b014", size = 1371966, upload-time = "2025-11-10T16:01:43.297Z" },
+    { url = "https://files.pythonhosted.org/packages/64/f4/c9d7b6f02924b1f31db546c7bd2a83a2421c6b4a8e6a2e53425c9f2802e0/pynacl-1.6.1-cp314-cp314t-win32.whl", hash = "sha256:d8615ee34d01c8e0ab3f302dcdd7b32e2bcf698ba5f4809e7cc407c8cdea7717", size = 230482, upload-time = "2025-11-10T16:01:47.688Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/2c/942477957fba22da7bf99131850e5ebdff66623418ab48964e78a7a8293e/pynacl-1.6.1-cp314-cp314t-win_amd64.whl", hash = "sha256:5f5b35c1a266f8a9ad22525049280a600b19edd1f785bccd01ae838437dcf935", size = 243232, upload-time = "2025-11-10T16:01:45.208Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/0c/bdbc0d04a53b96a765ab03aa2cf9a76ad8653d70bf1665459b9a0dedaa1c/pynacl-1.6.1-cp314-cp314t-win_arm64.whl", hash = "sha256:d984c91fe3494793b2a1fb1e91429539c6c28e9ec8209d26d25041ec599ccf63", size = 187907, upload-time = "2025-11-10T16:01:46.328Z" },
+    { url = "https://files.pythonhosted.org/packages/49/41/3cfb3b4f3519f6ff62bf71bf1722547644bcfb1b05b8fdbdc300249ba113/pynacl-1.6.1-cp38-abi3-macosx_10_10_universal2.whl", hash = "sha256:a6f9fd6d6639b1e81115c7f8ff16b8dedba1e8098d2756275d63d208b0e32021", size = 387591, upload-time = "2025-11-10T16:01:49.1Z" },
+    { url = "https://files.pythonhosted.org/packages/18/21/b8a6563637799f617a3960f659513eccb3fcc655d5fc2be6e9dc6416826f/pynacl-1.6.1-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e49a3f3d0da9f79c1bec2aa013261ab9fa651c7da045d376bd306cf7c1792993", size = 798866, upload-time = "2025-11-10T16:01:55.688Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/6c/dc38033bc3ea461e05ae8f15a81e0e67ab9a01861d352ae971c99de23e7c/pynacl-1.6.1-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7713f8977b5d25f54a811ec9efa2738ac592e846dd6e8a4d3f7578346a841078", size = 1398001, upload-time = "2025-11-10T16:01:57.101Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/05/3ec0796a9917100a62c5073b20c4bce7bf0fea49e99b7906d1699cc7b61b/pynacl-1.6.1-cp38-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a3becafc1ee2e5ea7f9abc642f56b82dcf5be69b961e782a96ea52b55d8a9fc", size = 834024, upload-time = "2025-11-10T16:01:50.228Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/b7/ae9982be0f344f58d9c64a1c25d1f0125c79201634efe3c87305ac7cb3e3/pynacl-1.6.1-cp38-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4ce50d19f1566c391fedc8dc2f2f5be265ae214112ebe55315e41d1f36a7f0a9", size = 1436766, upload-time = "2025-11-10T16:01:51.886Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/51/b2ccbf89cf3025a02e044dd68a365cad593ebf70f532299f2c047d2b7714/pynacl-1.6.1-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:543f869140f67d42b9b8d47f922552d7a967e6c116aad028c9bfc5f3f3b3a7b7", size = 817275, upload-time = "2025-11-10T16:01:53.351Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/6c/dd9ee8214edf63ac563b08a9b30f98d116942b621d39a751ac3256694536/pynacl-1.6.1-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:a2bb472458c7ca959aeeff8401b8efef329b0fc44a89d3775cffe8fad3398ad8", size = 1401891, upload-time = "2025-11-10T16:01:54.587Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/c1/97d3e1c83772d78ee1db3053fd674bc6c524afbace2bfe8d419fd55d7ed1/pynacl-1.6.1-cp38-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3206fa98737fdc66d59b8782cecc3d37d30aeec4593d1c8c145825a345bba0f0", size = 772291, upload-time = "2025-11-10T16:01:58.111Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/ca/691ff2fe12f3bb3e43e8e8df4b806f6384593d427f635104d337b8e00291/pynacl-1.6.1-cp38-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:53543b4f3d8acb344f75fd4d49f75e6572fce139f4bfb4815a9282296ff9f4c0", size = 1370839, upload-time = "2025-11-10T16:01:59.252Z" },
+    { url = "https://files.pythonhosted.org/packages/30/27/06fe5389d30391fce006442246062cc35773c84fbcad0209fbbf5e173734/pynacl-1.6.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:319de653ef84c4f04e045eb250e6101d23132372b0a61a7acf91bac0fda8e58c", size = 791371, upload-time = "2025-11-10T16:02:01.075Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/7a/e2bde8c9d39074a5aa046c7d7953401608d1f16f71e237f4bef3fb9d7e49/pynacl-1.6.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:262a8de6bba4aee8a66f5edf62c214b06647461c9b6b641f8cd0cb1e3b3196fe", size = 1363031, upload-time = "2025-11-10T16:02:02.656Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/b6/63fd77264dae1087770a1bb414bc604470f58fbc21d83822fc9c76248076/pynacl-1.6.1-cp38-abi3-win32.whl", hash = "sha256:9fd1a4eb03caf8a2fe27b515a998d26923adb9ddb68db78e35ca2875a3830dde", size = 226585, upload-time = "2025-11-10T16:02:07.116Z" },
+    { url = "https://files.pythonhosted.org/packages/12/c8/b419180f3fdb72ab4d45e1d88580761c267c7ca6eda9a20dcbcba254efe6/pynacl-1.6.1-cp38-abi3-win_amd64.whl", hash = "sha256:a569a4069a7855f963940040f35e87d8bc084cb2d6347428d5ad20550a0a1a21", size = 238923, upload-time = "2025-11-10T16:02:04.401Z" },
+    { url = "https://files.pythonhosted.org/packages/35/76/c34426d532e4dce7ff36e4d92cb20f4cbbd94b619964b93d24e8f5b5510f/pynacl-1.6.1-cp38-abi3-win_arm64.whl", hash = "sha256:5953e8b8cfadb10889a6e7bd0f53041a745d1b3d30111386a1bb37af171e6daf", size = 183970, upload-time = "2025-11-10T16:02:05.786Z" },
 ]
 
 [[package]]
@@ -4390,16 +4311,16 @@ wheels = [
 
 [[package]]
 name = "pytest-asyncio"
-version = "1.2.0"
+version = "1.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "backports-asyncio-runner", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pytest" },
     { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/42/86/9e3c5f48f7b7b638b216e4b9e645f54d199d7abbbab7a64a13b4e12ba10f/pytest_asyncio-1.2.0.tar.gz", hash = "sha256:c609a64a2a8768462d0c99811ddb8bd2583c33fd33cf7f21af1c142e824ffb57", size = 50119, upload-time = "2025-09-12T07:33:53.816Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/04/93/2fa34714b7a4ae72f2f8dad66ba17dd9a2c793220719e736dda28b7aec27/pytest_asyncio-1.2.0-py3-none-any.whl", hash = "sha256:8e17ae5e46d8e7efe51ab6494dd2010f4ca8dae51652aa3c8d55acf50bfb2e99", size = 15095, upload-time = "2025-09-12T07:33:52.639Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" },
 ]
 
 [[package]]
@@ -4595,7 +4516,7 @@ wheels = [
 
 [[package]]
 name = "ray"
-version = "2.49.2"
+version = "2.51.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -4608,25 +4529,21 @@ dependencies = [
     { name = "requests" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e4/99/517f224ffd073689c4905bdb185c21d9d8936d75066a96d454878f9e1e47/ray-2.49.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:08bec467576bc030d8bd0638004e1b8e075588929349112988a4bd4928684e8c", size = 66869076, upload-time = "2025-09-19T19:14:37.371Z" },
-    { url = "https://files.pythonhosted.org/packages/61/c5/c2ceba832fe3f47cfd7e11cd7cc7a1bbc2c028424c5bca70435aa4ca1dec/ray-2.49.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3e441bf2acd7f368cf45132752066c5c3b83d88cd5f85762e703774bba4f2b6d", size = 69263514, upload-time = "2025-09-19T19:14:45.519Z" },
-    { url = "https://files.pythonhosted.org/packages/63/0e/830df5a0f7e2b582422ee8ad0cdf2a2a9563aa63bb8e60be9ceec494981c/ray-2.49.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:eae07b3fed45f5b041a8bf9795cd26fad2464be5126efd447e4484905a29b677", size = 69125462, upload-time = "2025-09-19T19:14:51.029Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/85/a340eba596db3f66d3a338aff43942d8bac32732fb4cf4a20ed4bbbd07eb/ray-2.49.2-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:74566876af7bf4e48ea4b9b3b75b34db053d1064cc4d4b1670dc4ce78f6894af", size = 69935752, upload-time = "2025-09-19T19:14:56.191Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/e6/809730d87cdf762e76728ea6bb3f96e38fa2dc7ef7d572a49c0d7ebcde95/ray-2.49.2-cp310-cp310-win_amd64.whl", hash = "sha256:e6becc2026d900ca0ba07eff12a130c9d651a91290bb24d43594842b575cc4e5", size = 26246695, upload-time = "2025-09-19T19:15:00.9Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/63/27c7fb49513c816b825c809dd33a8570b35d511d1b5e568a4b33b0557997/ray-2.49.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:4fb9f9bf62fd5c92d22da20cd2aacb4ade1fb23033765fa9274f0a0c50bc42f6", size = 66869606, upload-time = "2025-09-19T19:15:05.838Z" },
-    { url = "https://files.pythonhosted.org/packages/52/9a/9728d1e9dc5473acf0e4f67081dc323d3333c8c87a1e9260ea8878720017/ray-2.49.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:9ece957a13985f7bbf4077f4ff0204314d7e99a941f95dff2a16b453d5376dc3", size = 69273124, upload-time = "2025-09-19T19:15:11.348Z" },
-    { url = "https://files.pythonhosted.org/packages/38/67/93f0d6d558874a730581059eb6dfa8860991a5410502ea0685dba5e788e4/ray-2.49.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:eada9dd89ccda643a3c6c2cba7016b59898432d126e10b38fed52d74165364f4", size = 69266231, upload-time = "2025-09-19T19:15:16.92Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/2b/f2efd0e7bcef06d51422db1af48cc5695a3f9b40a444f9d270a2d4663252/ray-2.49.2-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:54077dde338c5ffba349a4ab61b72352a3c3be69ea5b4f1b436d98d40b312763", size = 70070382, upload-time = "2025-09-19T19:15:22.048Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/b5/dfe1240e13d88dc68de03ee7c617f7578ef026e8569a42f7eeeb4729c5e3/ray-2.49.2-cp311-cp311-win_amd64.whl", hash = "sha256:41e11802ebbc487380e6c21dc041cb405e69fdda717a4eafdfeea294c6c3f9ca", size = 26243798, upload-time = "2025-09-19T19:15:26.405Z" },
-    { url = "https://files.pythonhosted.org/packages/01/66/0d4e518d611486244b357a6cf58a31d7d184f5558e03d5e482c335749616/ray-2.49.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:d6d612de5c6341b776fc75edeee5b698bb4af7ee84a2ff30552b32a9e6e4a772", size = 66857495, upload-time = "2025-09-19T19:15:31.427Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/4c/76f2c7c0946645fdd8d286a3e00e2c42130d676286de206be5d60d271218/ray-2.49.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:6784e076e4418222ef8ee3b6a8bfeb867d8797803b25bcfcce3bf3bc5414bef1", size = 69262599, upload-time = "2025-09-19T19:15:36.732Z" },
-    { url = "https://files.pythonhosted.org/packages/da/99/23b732c0b7b2ee2ffd28bf632257fb98924a03251d251810cb637512fcab/ray-2.49.2-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:dd0d8d8641d142fafe6d83e87d3c19bd5637d21e34608d3ff69ad71ea3e2f462", size = 69287193, upload-time = "2025-09-19T19:15:42.093Z" },
-    { url = "https://files.pythonhosted.org/packages/69/ca/94791be5c3b68ed0df85589a8ca558334818a47bf2978000f85533245aed/ray-2.49.2-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:2ecaaa51f588ccdda2b61563a8be3843bf65dfaaa83a240588a307f4ebb82471", size = 70114942, upload-time = "2025-09-19T19:15:47.536Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/22/3f4b77498eefb3152a5946f9f544fcf336e7b9970c5c8af8e2d5eed13f0b/ray-2.49.2-cp312-cp312-win_amd64.whl", hash = "sha256:cba59684f031c9e778c588bc925777967e1b49bab3f00c638e4980bfdab07aec", size = 26223595, upload-time = "2025-09-19T19:15:51.803Z" },
-    { url = "https://files.pythonhosted.org/packages/99/dc/a7e569bf7030e0ec50163aed731189e744ca857d74f51b24361ce426697a/ray-2.49.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:2e2fe20fa90562e73630da9ff7932d3ed6507e73291c4d9bdf566537ae9deddf", size = 66803846, upload-time = "2025-09-19T19:15:56.928Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/cf/6667e01f39cd28637f082273e9147f16d5f8fff34e2fb0ca60cc5da76e22/ray-2.49.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:b2f4f0fed936faf688e87ffdcc9356c034513c00259a2f1a8589e345fcfbdbc0", size = 69208426, upload-time = "2025-09-19T19:16:02.085Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/84/5361bcdc9c9fb9f4abbf836801803b7df75c76c16a56493413eb154b8a34/ray-2.49.2-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:b4c7869688c518e902f7b6288edec2365ab4d28a464291e6d0a7040c7d01b5f7", size = 69198140, upload-time = "2025-09-19T19:16:07.413Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/0c/9e49c3da7502f18483e4deb3273a3104d501c5e9cf1664a136b8ea36df48/ray-2.49.2-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:b7d8214cff86df044fec727eeeabccc3bfc9b0271d28d61ba92c09f0d127d01d", size = 70027331, upload-time = "2025-09-19T19:16:12.968Z" },
+    { url = "https://files.pythonhosted.org/packages/72/4b/8ded0ecb0ed08b75af47340fac4b14b15196a76a6d733f3945cc5cb77354/ray-2.51.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e8ce218c85e9f4043c37136fc90b41343bdb844fcdc9520f21c000d1d8d49f89", size = 68039113, upload-time = "2025-11-01T03:23:30.619Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/a7/aba274bd1e1014cb232ee04548cc3d7aab9b84eb13c44d71b72d189421f9/ray-2.51.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:36feb519f31c52d3b4dbcd68ffb2baf93195ceec06ea711e21559096bab95fed", size = 70340511, upload-time = "2025-11-01T03:23:38.217Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/42/a5712f4f8c911ea5b8b3cb406ceef18a1c1bc98490c66fa902cb72391af3/ray-2.51.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:8a21f5914baa3deefcb4fa5f3878e03b589c190b864fe1b80e6dc0cbfba26004", size = 71166513, upload-time = "2025-11-01T03:23:44.123Z" },
+    { url = "https://files.pythonhosted.org/packages/91/1e/eeae1da4ffac6eeeeafce2d11c0b6133fd4df1b3e53bc44d61c30c05b6d9/ray-2.51.1-cp310-cp310-win_amd64.whl", hash = "sha256:a82417b89260ed751a76e9cfaef6d11392ab0da464cde1a9d07a0bb7dc272a7b", size = 26695587, upload-time = "2025-11-01T03:23:49.739Z" },
+    { url = "https://files.pythonhosted.org/packages/43/66/f1e11291d9fdf0634ea763cfb167cf449773d13918bb04390e6263b7129b/ray-2.51.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bd8211fc033be1bce9c039e474e97a9077be593020978fdcfba1d770bdc40ba5", size = 68043927, upload-time = "2025-11-01T03:23:59.655Z" },
+    { url = "https://files.pythonhosted.org/packages/be/89/9a11d0addbba6143f5a34929ed1fdef51159328b9b76a877c0c7f98b2848/ray-2.51.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:d2d7c8af45441ff50bc002352d31e0afec5c85dd5075bf527027178931497bce", size = 70460551, upload-time = "2025-11-01T03:24:05.77Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/67/40a8d63e4cb3ff1a1a5a12db77ca655e21cb13f10e024a9513f24ed11d98/ray-2.51.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:dd353010d2548bc345e46c45795f70291bb460c236aa6a3393b51a9cd861b56f", size = 71280610, upload-time = "2025-11-01T03:24:11.981Z" },
+    { url = "https://files.pythonhosted.org/packages/62/97/90bcfed6b8c986f9ea24def19bbb81480575dd5fa87630eeaa4c92652507/ray-2.51.1-cp311-cp311-win_amd64.whl", hash = "sha256:606c6e0733eb18fc307c9645ea84ccbd1aad8a5ba8bad764bed54b94e926d33c", size = 26691238, upload-time = "2025-11-01T03:24:16.978Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/95/51e44ce79e42f02ca1c4d4c5501e6dd49f3a384c5f6324aceb4e0015988a/ray-2.51.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ef847b025ca758baee4571a1ca001d973897cad772f8e95d7f303d24c38b649e", size = 68029226, upload-time = "2025-11-01T03:24:21.928Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/b5/a93e39e131067edb7cba3385a609f61aaaf7aa54728cd3a7474bfbf3b0fc/ray-2.51.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:0bed9408712bad1511e65683a455302f88d94e5e5cb6a58cc4a154b61d8a0b4a", size = 70502423, upload-time = "2025-11-01T03:24:27.398Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/59/69b7a653ed8176fc7fd894d462ed34bb1477e7fa71700324de99179b5b7e/ray-2.51.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4e786da7862cf73664977d0212a505d6d5a585beadf63e7dc1e1c129259bee20", size = 71353730, upload-time = "2025-11-01T03:24:33.495Z" },
+    { url = "https://files.pythonhosted.org/packages/38/91/0c4fe7aed34baa14d9c050c88f39ff16083d555bd6dcd6c4ffb4332a6f8a/ray-2.51.1-cp312-cp312-win_amd64.whl", hash = "sha256:198fda93074a6863555f4003e9013bb2ba0cd50b59b18c02affdc294b28a2eef", size = 26674921, upload-time = "2025-11-01T03:24:38.394Z" },
+    { url = "https://files.pythonhosted.org/packages/65/1c/3ebf7277d8ae5f99150a5890bff4bdc627021e3a1be7caacd075d2996c7a/ray-2.51.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d81547886435142dbd79bff1d4e4edf578a5f20e3b11bbd4ced49cfafbd37d27", size = 67974221, upload-time = "2025-11-01T03:24:44.118Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/47/13ba6c4d0e97aff94dcf8537f2832d1101c2080a0aea5c973a4de1d4d8bd/ray-2.51.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:3f2bd2acf9b7f4738c17d08592caaad26eafb7a4fc380ad9ab42d5f0a78f73ad", size = 70410610, upload-time = "2025-11-01T03:24:50.075Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/87/3cdf6d0504659d8192baa6576dd7a17ea395a4d969010274f7cc0e894281/ray-2.51.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:265ecd6fd6d4a695b09c686e17d58fca0c09e7198c073628ae7bf4974b03e9ca", size = 71269225, upload-time = "2025-11-01T03:24:55.929Z" },
 ]
 
 [[package]]
@@ -4801,124 +4718,124 @@ wheels = [
 
 [[package]]
 name = "rpds-py"
-version = "0.28.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/48/dc/95f074d43452b3ef5d06276696ece4b3b5d696e7c9ad7173c54b1390cd70/rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea", size = 27419, upload-time = "2025-10-22T22:24:29.327Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/82/f8/13bb772dc7cbf2c3c5b816febc34fa0cb2c64a08e0569869585684ce6631/rpds_py-0.28.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7b6013db815417eeb56b2d9d7324e64fcd4fa289caeee6e7a78b2e11fc9b438a", size = 362820, upload-time = "2025-10-22T22:21:15.074Z" },
-    { url = "https://files.pythonhosted.org/packages/84/91/6acce964aab32469c3dbe792cb041a752d64739c534e9c493c701ef0c032/rpds_py-0.28.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a4c6b05c685c0c03f80dabaeb73e74218c49deea965ca63f76a752807397207", size = 348499, upload-time = "2025-10-22T22:21:17.658Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/93/c05bb1f4f5e0234db7c4917cb8dd5e2e0a9a7b26dc74b1b7bee3c9cfd477/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4794c6c3fbe8f9ac87699b131a1f26e7b4abcf6d828da46a3a52648c7930eba", size = 379356, upload-time = "2025-10-22T22:21:19.847Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/37/e292da436f0773e319753c567263427cdf6c645d30b44f09463ff8216cda/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e8456b6ee5527112ff2354dd9087b030e3429e43a74f480d4a5ca79d269fd85", size = 390151, upload-time = "2025-10-22T22:21:21.569Z" },
-    { url = "https://files.pythonhosted.org/packages/76/87/a4e3267131616e8faf10486dc00eaedf09bd61c87f01e5ef98e782ee06c9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:beb880a9ca0a117415f241f66d56025c02037f7c4efc6fe59b5b8454f1eaa50d", size = 524831, upload-time = "2025-10-22T22:21:23.394Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/c8/4a4ca76f0befae9515da3fad11038f0fce44f6bb60b21fe9d9364dd51fb0/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6897bebb118c44b38c9cb62a178e09f1593c949391b9a1a6fe777ccab5934ee7", size = 404687, upload-time = "2025-10-22T22:21:25.201Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/65/118afe854424456beafbbebc6b34dcf6d72eae3a08b4632bc4220f8240d9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b553dd06e875249fd43efd727785efb57a53180e0fde321468222eabbeaafa", size = 382683, upload-time = "2025-10-22T22:21:26.536Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/bc/0625064041fb3a0c77ecc8878c0e8341b0ae27ad0f00cf8f2b57337a1e63/rpds_py-0.28.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:f0b2044fdddeea5b05df832e50d2a06fe61023acb44d76978e1b060206a8a476", size = 398927, upload-time = "2025-10-22T22:21:27.864Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/1a/fed7cf2f1ee8a5e4778f2054153f2cfcf517748875e2f5b21cf8907cd77d/rpds_py-0.28.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05cf1e74900e8da73fa08cc76c74a03345e5a3e37691d07cfe2092d7d8e27b04", size = 411590, upload-time = "2025-10-22T22:21:29.474Z" },
-    { url = "https://files.pythonhosted.org/packages/c1/64/a8e0f67fa374a6c472dbb0afdaf1ef744724f165abb6899f20e2f1563137/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:efd489fec7c311dae25e94fe7eeda4b3d06be71c68f2cf2e8ef990ffcd2cd7e8", size = 559843, upload-time = "2025-10-22T22:21:30.917Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/ea/e10353f6d7c105be09b8135b72787a65919971ae0330ad97d87e4e199880/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ada7754a10faacd4f26067e62de52d6af93b6d9542f0df73c57b9771eb3ba9c4", size = 584188, upload-time = "2025-10-22T22:21:32.827Z" },
-    { url = "https://files.pythonhosted.org/packages/18/b0/a19743e0763caf0c89f6fc6ba6fbd9a353b24ffb4256a492420c5517da5a/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c2a34fd26588949e1e7977cfcbb17a9a42c948c100cab890c6d8d823f0586457", size = 550052, upload-time = "2025-10-22T22:21:34.702Z" },
-    { url = "https://files.pythonhosted.org/packages/de/bc/ec2c004f6c7d6ab1e25dae875cdb1aee087c3ebed5b73712ed3000e3851a/rpds_py-0.28.0-cp310-cp310-win32.whl", hash = "sha256:f9174471d6920cbc5e82a7822de8dfd4dcea86eb828b04fc8c6519a77b0ee51e", size = 215110, upload-time = "2025-10-22T22:21:36.645Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/de/4ce8abf59674e17187023933547d2018363e8fc76ada4f1d4d22871ccb6e/rpds_py-0.28.0-cp310-cp310-win_amd64.whl", hash = "sha256:6e32dd207e2c4f8475257a3540ab8a93eff997abfa0a3fdb287cae0d6cd874b8", size = 223850, upload-time = "2025-10-22T22:21:38.006Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/34/058d0db5471c6be7bef82487ad5021ff8d1d1d27794be8730aad938649cf/rpds_py-0.28.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:03065002fd2e287725d95fbc69688e0c6daf6c6314ba38bdbaa3895418e09296", size = 362344, upload-time = "2025-10-22T22:21:39.713Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/67/9503f0ec8c055a0782880f300c50a2b8e5e72eb1f94dfc2053da527444dd/rpds_py-0.28.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28ea02215f262b6d078daec0b45344c89e161eab9526b0d898221d96fdda5f27", size = 348440, upload-time = "2025-10-22T22:21:41.056Z" },
-    { url = "https://files.pythonhosted.org/packages/68/2e/94223ee9b32332a41d75b6f94b37b4ce3e93878a556fc5f152cbd856a81f/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dbade8fbf30bcc551cb352376c0ad64b067e4fc56f90e22ba70c3ce205988c", size = 379068, upload-time = "2025-10-22T22:21:42.593Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/25/54fd48f9f680cfc44e6a7f39a5fadf1d4a4a1fd0848076af4a43e79f998c/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c03002f54cc855860bfdc3442928ffdca9081e73b5b382ed0b9e8efe6e5e205", size = 390518, upload-time = "2025-10-22T22:21:43.998Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/85/ac258c9c27f2ccb1bd5d0697e53a82ebcf8088e3186d5d2bf8498ee7ed44/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9699fa7990368b22032baf2b2dce1f634388e4ffc03dfefaaac79f4695edc95", size = 525319, upload-time = "2025-10-22T22:21:45.645Z" },
-    { url = "https://files.pythonhosted.org/packages/40/cb/c6734774789566d46775f193964b76627cd5f42ecf246d257ce84d1912ed/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b9b06fe1a75e05e0713f06ea0c89ecb6452210fd60e2f1b6ddc1067b990e08d9", size = 404896, upload-time = "2025-10-22T22:21:47.544Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/53/14e37ce83202c632c89b0691185dca9532288ff9d390eacae3d2ff771bae/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9f83e7b326a3f9ec3ef84cda98fb0a74c7159f33e692032233046e7fd15da2", size = 382862, upload-time = "2025-10-22T22:21:49.176Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/83/f3642483ca971a54d60caa4449f9d6d4dbb56a53e0072d0deff51b38af74/rpds_py-0.28.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:0d3259ea9ad8743a75a43eb7819324cdab393263c91be86e2d1901ee65c314e0", size = 398848, upload-time = "2025-10-22T22:21:51.024Z" },
-    { url = "https://files.pythonhosted.org/packages/44/09/2d9c8b2f88e399b4cfe86efdf2935feaf0394e4f14ab30c6c5945d60af7d/rpds_py-0.28.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a7548b345f66f6695943b4ef6afe33ccd3f1b638bd9afd0f730dd255c249c9e", size = 412030, upload-time = "2025-10-22T22:21:52.665Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/f5/e1cec473d4bde6df1fd3738be8e82d64dd0600868e76e92dfeaebbc2d18f/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9a40040aa388b037eb39416710fbcce9443498d2eaab0b9b45ae988b53f5c67", size = 559700, upload-time = "2025-10-22T22:21:54.123Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/be/73bb241c1649edbf14e98e9e78899c2c5e52bbe47cb64811f44d2cc11808/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f60c7ea34e78c199acd0d3cda37a99be2c861dd2b8cf67399784f70c9f8e57d", size = 584581, upload-time = "2025-10-22T22:21:56.102Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/9c/ffc6e9218cd1eb5c2c7dbd276c87cd10e8c2232c456b554169eb363381df/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1571ae4292649100d743b26d5f9c63503bb1fedf538a8f29a98dce2d5ba6b4e6", size = 549981, upload-time = "2025-10-22T22:21:58.253Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/50/da8b6d33803a94df0149345ee33e5d91ed4d25fc6517de6a25587eae4133/rpds_py-0.28.0-cp311-cp311-win32.whl", hash = "sha256:5cfa9af45e7c1140af7321fa0bef25b386ee9faa8928c80dc3a5360971a29e8c", size = 214729, upload-time = "2025-10-22T22:21:59.625Z" },
-    { url = "https://files.pythonhosted.org/packages/12/fd/b0f48c4c320ee24c8c20df8b44acffb7353991ddf688af01eef5f93d7018/rpds_py-0.28.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd8d86b5d29d1b74100982424ba53e56033dc47720a6de9ba0259cf81d7cecaa", size = 223977, upload-time = "2025-10-22T22:22:01.092Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/21/c8e77a2ac66e2ec4e21f18a04b4e9a0417ecf8e61b5eaeaa9360a91713b4/rpds_py-0.28.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e27d3a5709cc2b3e013bf93679a849213c79ae0573f9b894b284b55e729e120", size = 217326, upload-time = "2025-10-22T22:22:02.944Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/5c/6c3936495003875fe7b14f90ea812841a08fca50ab26bd840e924097d9c8/rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f", size = 366439, upload-time = "2025-10-22T22:22:04.525Z" },
-    { url = "https://files.pythonhosted.org/packages/56/f9/a0f1ca194c50aa29895b442771f036a25b6c41a35e4f35b1a0ea713bedae/rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424", size = 348170, upload-time = "2025-10-22T22:22:06.397Z" },
-    { url = "https://files.pythonhosted.org/packages/18/ea/42d243d3a586beb72c77fa5def0487daf827210069a95f36328e869599ea/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628", size = 378838, upload-time = "2025-10-22T22:22:07.932Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/78/3de32e18a94791af8f33601402d9d4f39613136398658412a4e0b3047327/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd", size = 393299, upload-time = "2025-10-22T22:22:09.435Z" },
-    { url = "https://files.pythonhosted.org/packages/13/7e/4bdb435afb18acea2eb8a25ad56b956f28de7c59f8a1d32827effa0d4514/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e", size = 518000, upload-time = "2025-10-22T22:22:11.326Z" },
-    { url = "https://files.pythonhosted.org/packages/31/d0/5f52a656875cdc60498ab035a7a0ac8f399890cc1ee73ebd567bac4e39ae/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a", size = 408746, upload-time = "2025-10-22T22:22:13.143Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/cd/49ce51767b879cde77e7ad9fae164ea15dce3616fe591d9ea1df51152706/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84", size = 386379, upload-time = "2025-10-22T22:22:14.602Z" },
-    { url = "https://files.pythonhosted.org/packages/6a/99/e4e1e1ee93a98f72fc450e36c0e4d99c35370220e815288e3ecd2ec36a2a/rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66", size = 401280, upload-time = "2025-10-22T22:22:16.063Z" },
-    { url = "https://files.pythonhosted.org/packages/61/35/e0c6a57488392a8b319d2200d03dad2b29c0db9996f5662c3b02d0b86c02/rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28", size = 412365, upload-time = "2025-10-22T22:22:17.504Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/6a/841337980ea253ec797eb084665436007a1aad0faac1ba097fb906c5f69c/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a", size = 559573, upload-time = "2025-10-22T22:22:19.108Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/5e/64826ec58afd4c489731f8b00729c5f6afdb86f1df1df60bfede55d650bb/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5", size = 583973, upload-time = "2025-10-22T22:22:20.768Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/ee/44d024b4843f8386a4eeaa4c171b3d31d55f7177c415545fd1a24c249b5d/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c", size = 553800, upload-time = "2025-10-22T22:22:22.25Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/89/33e675dccff11a06d4d85dbb4d1865f878d5020cbb69b2c1e7b2d3f82562/rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08", size = 216954, upload-time = "2025-10-22T22:22:24.105Z" },
-    { url = "https://files.pythonhosted.org/packages/af/36/45f6ebb3210887e8ee6dbf1bc710ae8400bb417ce165aaf3024b8360d999/rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c", size = 227844, upload-time = "2025-10-22T22:22:25.551Z" },
-    { url = "https://files.pythonhosted.org/packages/57/91/f3fb250d7e73de71080f9a221d19bd6a1c1eb0d12a1ea26513f6c1052ad6/rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd", size = 217624, upload-time = "2025-10-22T22:22:26.914Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/03/ce566d92611dfac0085c2f4b048cd53ed7c274a5c05974b882a908d540a2/rpds_py-0.28.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e9e184408a0297086f880556b6168fa927d677716f83d3472ea333b42171ee3b", size = 366235, upload-time = "2025-10-22T22:22:28.397Z" },
-    { url = "https://files.pythonhosted.org/packages/00/34/1c61da1b25592b86fd285bd7bd8422f4c9d748a7373b46126f9ae792a004/rpds_py-0.28.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:edd267266a9b0448f33dc465a97cfc5d467594b600fe28e7fa2f36450e03053a", size = 348241, upload-time = "2025-10-22T22:22:30.171Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/00/ed1e28616848c61c493a067779633ebf4b569eccaacf9ccbdc0e7cba2b9d/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85beb8b3f45e4e32f6802fb6cd6b17f615ef6c6a52f265371fb916fae02814aa", size = 378079, upload-time = "2025-10-22T22:22:31.644Z" },
-    { url = "https://files.pythonhosted.org/packages/11/b2/ccb30333a16a470091b6e50289adb4d3ec656fd9951ba8c5e3aaa0746a67/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2412be8d00a1b895f8ad827cc2116455196e20ed994bb704bf138fe91a42724", size = 393151, upload-time = "2025-10-22T22:22:33.453Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/d0/73e2217c3ee486d555cb84920597480627d8c0240ff3062005c6cc47773e/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf128350d384b777da0e68796afdcebc2e9f63f0e9f242217754e647f6d32491", size = 517520, upload-time = "2025-10-22T22:22:34.949Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/91/23efe81c700427d0841a4ae7ea23e305654381831e6029499fe80be8a071/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2036d09b363aa36695d1cc1a97b36865597f4478470b0697b5ee9403f4fe399", size = 408699, upload-time = "2025-10-22T22:22:36.584Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/ee/a324d3198da151820a326c1f988caaa4f37fc27955148a76fff7a2d787a9/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8e1e9be4fa6305a16be628959188e4fd5cd6f1b0e724d63c6d8b2a8adf74ea6", size = 385720, upload-time = "2025-10-22T22:22:38.014Z" },
-    { url = "https://files.pythonhosted.org/packages/19/ad/e68120dc05af8b7cab4a789fccd8cdcf0fe7e6581461038cc5c164cd97d2/rpds_py-0.28.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0a403460c9dd91a7f23fc3188de6d8977f1d9603a351d5db6cf20aaea95b538d", size = 401096, upload-time = "2025-10-22T22:22:39.869Z" },
-    { url = "https://files.pythonhosted.org/packages/99/90/c1e070620042459d60df6356b666bb1f62198a89d68881816a7ed121595a/rpds_py-0.28.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7366b6553cdc805abcc512b849a519167db8f5e5c3472010cd1228b224265cb", size = 411465, upload-time = "2025-10-22T22:22:41.395Z" },
-    { url = "https://files.pythonhosted.org/packages/68/61/7c195b30d57f1b8d5970f600efee72a4fad79ec829057972e13a0370fd24/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b43c6a3726efd50f18d8120ec0551241c38785b68952d240c45ea553912ac41", size = 558832, upload-time = "2025-10-22T22:22:42.871Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/3d/06f3a718864773f69941d4deccdf18e5e47dd298b4628062f004c10f3b34/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0cb7203c7bc69d7c1585ebb33a2e6074492d2fc21ad28a7b9d40457ac2a51ab7", size = 583230, upload-time = "2025-10-22T22:22:44.877Z" },
-    { url = "https://files.pythonhosted.org/packages/66/df/62fc783781a121e77fee9a21ead0a926f1b652280a33f5956a5e7833ed30/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a52a5169c664dfb495882adc75c304ae1d50df552fbd68e100fdc719dee4ff9", size = 553268, upload-time = "2025-10-22T22:22:46.441Z" },
-    { url = "https://files.pythonhosted.org/packages/84/85/d34366e335140a4837902d3dea89b51f087bd6a63c993ebdff59e93ee61d/rpds_py-0.28.0-cp313-cp313-win32.whl", hash = "sha256:2e42456917b6687215b3e606ab46aa6bca040c77af7df9a08a6dcfe8a4d10ca5", size = 217100, upload-time = "2025-10-22T22:22:48.342Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/1c/f25a3f3752ad7601476e3eff395fe075e0f7813fbb9862bd67c82440e880/rpds_py-0.28.0-cp313-cp313-win_amd64.whl", hash = "sha256:e0a0311caedc8069d68fc2bf4c9019b58a2d5ce3cd7cb656c845f1615b577e1e", size = 227759, upload-time = "2025-10-22T22:22:50.219Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/d6/5f39b42b99615b5bc2f36ab90423ea404830bdfee1c706820943e9a645eb/rpds_py-0.28.0-cp313-cp313-win_arm64.whl", hash = "sha256:04c1b207ab8b581108801528d59ad80aa83bb170b35b0ddffb29c20e411acdc1", size = 217326, upload-time = "2025-10-22T22:22:51.647Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/8b/0c69b72d1cee20a63db534be0df271effe715ef6c744fdf1ff23bb2b0b1c/rpds_py-0.28.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f296ea3054e11fc58ad42e850e8b75c62d9a93a9f981ad04b2e5ae7d2186ff9c", size = 355736, upload-time = "2025-10-22T22:22:53.211Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/6d/0c2ee773cfb55c31a8514d2cece856dd299170a49babd50dcffb15ddc749/rpds_py-0.28.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5a7306c19b19005ad98468fcefeb7100b19c79fc23a5f24a12e06d91181193fa", size = 342677, upload-time = "2025-10-22T22:22:54.723Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/1c/22513ab25a27ea205144414724743e305e8153e6abe81833b5e678650f5a/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d9b86aa501fed9862a443c5c3116f6ead8bc9296185f369277c42542bd646b", size = 371847, upload-time = "2025-10-22T22:22:56.295Z" },
-    { url = "https://files.pythonhosted.org/packages/60/07/68e6ccdb4b05115ffe61d31afc94adef1833d3a72f76c9632d4d90d67954/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e5bbc701eff140ba0e872691d573b3d5d30059ea26e5785acba9132d10c8c31d", size = 381800, upload-time = "2025-10-22T22:22:57.808Z" },
-    { url = "https://files.pythonhosted.org/packages/73/bf/6d6d15df80781d7f9f368e7c1a00caf764436518c4877fb28b029c4624af/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5690671cd672a45aa8616d7374fdf334a1b9c04a0cac3c854b1136e92374fe", size = 518827, upload-time = "2025-10-22T22:22:59.826Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/d3/2decbb2976cc452cbf12a2b0aaac5f1b9dc5dd9d1f7e2509a3ee00421249/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f1d92ecea4fa12f978a367c32a5375a1982834649cdb96539dcdc12e609ab1a", size = 399471, upload-time = "2025-10-22T22:23:01.968Z" },
-    { url = "https://files.pythonhosted.org/packages/b1/2c/f30892f9e54bd02e5faca3f6a26d6933c51055e67d54818af90abed9748e/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d252db6b1a78d0a3928b6190156042d54c93660ce4d98290d7b16b5296fb7cc", size = 377578, upload-time = "2025-10-22T22:23:03.52Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/5d/3bce97e5534157318f29ac06bf2d279dae2674ec12f7cb9c12739cee64d8/rpds_py-0.28.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d61b355c3275acb825f8777d6c4505f42b5007e357af500939d4a35b19177259", size = 390482, upload-time = "2025-10-22T22:23:05.391Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/f0/886bd515ed457b5bd93b166175edb80a0b21a210c10e993392127f1e3931/rpds_py-0.28.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:acbe5e8b1026c0c580d0321c8aae4b0a1e1676861d48d6e8c6586625055b606a", size = 402447, upload-time = "2025-10-22T22:23:06.93Z" },
-    { url = "https://files.pythonhosted.org/packages/42/b5/71e8777ac55e6af1f4f1c05b47542a1eaa6c33c1cf0d300dca6a1c6e159a/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8aa23b6f0fc59b85b4c7d89ba2965af274346f738e8d9fc2455763602e62fd5f", size = 552385, upload-time = "2025-10-22T22:23:08.557Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/cb/6ca2d70cbda5a8e36605e7788c4aa3bea7c17d71d213465a5a675079b98d/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7b14b0c680286958817c22d76fcbca4800ddacef6f678f3a7c79a1fe7067fe37", size = 575642, upload-time = "2025-10-22T22:23:10.348Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/d4/407ad9960ca7856d7b25c96dcbe019270b5ffdd83a561787bc682c797086/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bcf1d210dfee61a6c86551d67ee1031899c0fdbae88b2d44a569995d43797712", size = 544507, upload-time = "2025-10-22T22:23:12.434Z" },
-    { url = "https://files.pythonhosted.org/packages/51/31/2f46fe0efcac23fbf5797c6b6b7e1c76f7d60773e525cb65fcbc582ee0f2/rpds_py-0.28.0-cp313-cp313t-win32.whl", hash = "sha256:3aa4dc0fdab4a7029ac63959a3ccf4ed605fee048ba67ce89ca3168da34a1342", size = 205376, upload-time = "2025-10-22T22:23:13.979Z" },
-    { url = "https://files.pythonhosted.org/packages/92/e4/15947bda33cbedfc134490a41841ab8870a72a867a03d4969d886f6594a2/rpds_py-0.28.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7b7d9d83c942855e4fdcfa75d4f96f6b9e272d42fffcb72cd4bb2577db2e2907", size = 215907, upload-time = "2025-10-22T22:23:15.5Z" },
-    { url = "https://files.pythonhosted.org/packages/08/47/ffe8cd7a6a02833b10623bf765fbb57ce977e9a4318ca0e8cf97e9c3d2b3/rpds_py-0.28.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:dcdcb890b3ada98a03f9f2bb108489cdc7580176cb73b4f2d789e9a1dac1d472", size = 353830, upload-time = "2025-10-22T22:23:17.03Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/9f/890f36cbd83a58491d0d91ae0db1702639edb33fb48eeb356f80ecc6b000/rpds_py-0.28.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f274f56a926ba2dc02976ca5b11c32855cbd5925534e57cfe1fda64e04d1add2", size = 341819, upload-time = "2025-10-22T22:23:18.57Z" },
-    { url = "https://files.pythonhosted.org/packages/09/e3/921eb109f682aa24fb76207698fbbcf9418738f35a40c21652c29053f23d/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fe0438ac4a29a520ea94c8c7f1754cdd8feb1bc490dfda1bfd990072363d527", size = 373127, upload-time = "2025-10-22T22:23:20.216Z" },
-    { url = "https://files.pythonhosted.org/packages/23/13/bce4384d9f8f4989f1a9599c71b7a2d877462e5fd7175e1f69b398f729f4/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a358a32dd3ae50e933347889b6af9a1bdf207ba5d1a3f34e1a38cd3540e6733", size = 382767, upload-time = "2025-10-22T22:23:21.787Z" },
-    { url = "https://files.pythonhosted.org/packages/23/e1/579512b2d89a77c64ccef5a0bc46a6ef7f72ae0cf03d4b26dcd52e57ee0a/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e80848a71c78aa328fefaba9c244d588a342c8e03bda518447b624ea64d1ff56", size = 517585, upload-time = "2025-10-22T22:23:23.699Z" },
-    { url = "https://files.pythonhosted.org/packages/62/3c/ca704b8d324a2591b0b0adcfcaadf9c862375b11f2f667ac03c61b4fd0a6/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f586db2e209d54fe177e58e0bc4946bea5fb0102f150b1b2f13de03e1f0976f8", size = 399828, upload-time = "2025-10-22T22:23:25.713Z" },
-    { url = "https://files.pythonhosted.org/packages/da/37/e84283b9e897e3adc46b4c88bb3f6ec92a43bd4d2f7ef5b13459963b2e9c/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae8ee156d6b586e4292491e885d41483136ab994e719a13458055bec14cf370", size = 375509, upload-time = "2025-10-22T22:23:27.32Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/c2/a980beab869d86258bf76ec42dec778ba98151f253a952b02fe36d72b29c/rpds_py-0.28.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:a805e9b3973f7e27f7cab63a6b4f61d90f2e5557cff73b6e97cd5b8540276d3d", size = 392014, upload-time = "2025-10-22T22:23:29.332Z" },
-    { url = "https://files.pythonhosted.org/packages/da/b5/b1d3c5f9d3fa5aeef74265f9c64de3c34a0d6d5cd3c81c8b17d5c8f10ed4/rpds_py-0.28.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5d3fd16b6dc89c73a4da0b4ac8b12a7ecc75b2864b95c9e5afed8003cb50a728", size = 402410, upload-time = "2025-10-22T22:23:31.14Z" },
-    { url = "https://files.pythonhosted.org/packages/74/ae/cab05ff08dfcc052afc73dcb38cbc765ffc86f94e966f3924cd17492293c/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6796079e5d24fdaba6d49bda28e2c47347e89834678f2bc2c1b4fc1489c0fb01", size = 553593, upload-time = "2025-10-22T22:23:32.834Z" },
-    { url = "https://files.pythonhosted.org/packages/70/80/50d5706ea2a9bfc9e9c5f401d91879e7c790c619969369800cde202da214/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:76500820c2af232435cbe215e3324c75b950a027134e044423f59f5b9a1ba515", size = 576925, upload-time = "2025-10-22T22:23:34.47Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/12/85a57d7a5855a3b188d024b099fd09c90db55d32a03626d0ed16352413ff/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bbdc5640900a7dbf9dd707fe6388972f5bbd883633eb68b76591044cfe346f7e", size = 542444, upload-time = "2025-10-22T22:23:36.093Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/65/10643fb50179509150eb94d558e8837c57ca8b9adc04bd07b98e57b48f8c/rpds_py-0.28.0-cp314-cp314-win32.whl", hash = "sha256:adc8aa88486857d2b35d75f0640b949759f79dc105f50aa2c27816b2e0dd749f", size = 207968, upload-time = "2025-10-22T22:23:37.638Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/84/0c11fe4d9aaea784ff4652499e365963222481ac647bcd0251c88af646eb/rpds_py-0.28.0-cp314-cp314-win_amd64.whl", hash = "sha256:66e6fa8e075b58946e76a78e69e1a124a21d9a48a5b4766d15ba5b06869d1fa1", size = 218876, upload-time = "2025-10-22T22:23:39.179Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/e0/3ab3b86ded7bb18478392dc3e835f7b754cd446f62f3fc96f4fe2aca78f6/rpds_py-0.28.0-cp314-cp314-win_arm64.whl", hash = "sha256:a6fe887c2c5c59413353b7c0caff25d0e566623501ccfff88957fa438a69377d", size = 212506, upload-time = "2025-10-22T22:23:40.755Z" },
-    { url = "https://files.pythonhosted.org/packages/51/ec/d5681bb425226c3501eab50fc30e9d275de20c131869322c8a1729c7b61c/rpds_py-0.28.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7a69df082db13c7070f7b8b1f155fa9e687f1d6aefb7b0e3f7231653b79a067b", size = 355433, upload-time = "2025-10-22T22:23:42.259Z" },
-    { url = "https://files.pythonhosted.org/packages/be/ec/568c5e689e1cfb1ea8b875cffea3649260955f677fdd7ddc6176902d04cd/rpds_py-0.28.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b1cde22f2c30ebb049a9e74c5374994157b9b70a16147d332f89c99c5960737a", size = 342601, upload-time = "2025-10-22T22:23:44.372Z" },
-    { url = "https://files.pythonhosted.org/packages/32/fe/51ada84d1d2a1d9d8f2c902cfddd0133b4a5eb543196ab5161d1c07ed2ad/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5338742f6ba7a51012ea470bd4dc600a8c713c0c72adaa0977a1b1f4327d6592", size = 372039, upload-time = "2025-10-22T22:23:46.025Z" },
-    { url = "https://files.pythonhosted.org/packages/07/c1/60144a2f2620abade1a78e0d91b298ac2d9b91bc08864493fa00451ef06e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1460ebde1bcf6d496d80b191d854adedcc619f84ff17dc1c6d550f58c9efbba", size = 382407, upload-time = "2025-10-22T22:23:48.098Z" },
-    { url = "https://files.pythonhosted.org/packages/45/ed/091a7bbdcf4038a60a461df50bc4c82a7ed6d5d5e27649aab61771c17585/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e3eb248f2feba84c692579257a043a7699e28a77d86c77b032c1d9fbb3f0219c", size = 518172, upload-time = "2025-10-22T22:23:50.16Z" },
-    { url = "https://files.pythonhosted.org/packages/54/dd/02cc90c2fd9c2ef8016fd7813bfacd1c3a1325633ec8f244c47b449fc868/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3bbba5def70b16cd1c1d7255666aad3b290fbf8d0fe7f9f91abafb73611a91", size = 399020, upload-time = "2025-10-22T22:23:51.81Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/81/5d98cc0329bbb911ccecd0b9e19fbf7f3a5de8094b4cda5e71013b2dd77e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3114f4db69ac5a1f32e7e4d1cbbe7c8f9cf8217f78e6e002cedf2d54c2a548ed", size = 377451, upload-time = "2025-10-22T22:23:53.711Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/07/4d5bcd49e3dfed2d38e2dcb49ab6615f2ceb9f89f5a372c46dbdebb4e028/rpds_py-0.28.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4b0cb8a906b1a0196b863d460c0222fb8ad0f34041568da5620f9799b83ccf0b", size = 390355, upload-time = "2025-10-22T22:23:55.299Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/79/9f14ba9010fee74e4f40bf578735cfcbb91d2e642ffd1abe429bb0b96364/rpds_py-0.28.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf681ac76a60b667106141e11a92a3330890257e6f559ca995fbb5265160b56e", size = 403146, upload-time = "2025-10-22T22:23:56.929Z" },
-    { url = "https://files.pythonhosted.org/packages/39/4c/f08283a82ac141331a83a40652830edd3a4a92c34e07e2bbe00baaea2f5f/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1e8ee6413cfc677ce8898d9cde18cc3a60fc2ba756b0dec5b71eb6eb21c49fa1", size = 552656, upload-time = "2025-10-22T22:23:58.62Z" },
-    { url = "https://files.pythonhosted.org/packages/61/47/d922fc0666f0dd8e40c33990d055f4cc6ecff6f502c2d01569dbed830f9b/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b3072b16904d0b5572a15eb9d31c1954e0d3227a585fc1351aa9878729099d6c", size = 576782, upload-time = "2025-10-22T22:24:00.312Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/0c/5bafdd8ccf6aa9d3bfc630cfece457ff5b581af24f46a9f3590f790e3df2/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b670c30fd87a6aec281c3c9896d3bae4b205fd75d79d06dc87c2503717e46092", size = 544671, upload-time = "2025-10-22T22:24:02.297Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/37/dcc5d8397caa924988693519069d0beea077a866128719351a4ad95e82fc/rpds_py-0.28.0-cp314-cp314t-win32.whl", hash = "sha256:8014045a15b4d2b3476f0a287fcc93d4f823472d7d1308d47884ecac9e612be3", size = 205749, upload-time = "2025-10-22T22:24:03.848Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/69/64d43b21a10d72b45939a28961216baeb721cc2a430f5f7c3bfa21659a53/rpds_py-0.28.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a4e59c90d9c27c561eb3160323634a9ff50b04e4f7820600a2beb0ac90db578", size = 216233, upload-time = "2025-10-22T22:24:05.471Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/bc/b43f2ea505f28119bd551ae75f70be0c803d2dbcd37c1b3734909e40620b/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f5e7101145427087e493b9c9b959da68d357c28c562792300dd21a095118ed16", size = 363913, upload-time = "2025-10-22T22:24:07.129Z" },
-    { url = "https://files.pythonhosted.org/packages/28/f2/db318195d324c89a2c57dc5195058cbadd71b20d220685c5bd1da79ee7fe/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:31eb671150b9c62409a888850aaa8e6533635704fe2b78335f9aaf7ff81eec4d", size = 350452, upload-time = "2025-10-22T22:24:08.754Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/f2/1391c819b8573a4898cedd6b6c5ec5bc370ce59e5d6bdcebe3c9c1db4588/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b55c1f64482f7d8bd39942f376bfdf2f6aec637ee8c805b5041e14eeb771db", size = 380957, upload-time = "2025-10-22T22:24:10.826Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/5c/e5de68ee7eb7248fce93269833d1b329a196d736aefb1a7481d1e99d1222/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24743a7b372e9a76171f6b69c01aedf927e8ac3e16c474d9fe20d552a8cb45c7", size = 391919, upload-time = "2025-10-22T22:24:12.559Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/4f/2376336112cbfeb122fd435d608ad8d5041b3aed176f85a3cb32c262eb80/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:389c29045ee8bbb1627ea190b4976a310a295559eaf9f1464a1a6f2bf84dde78", size = 528541, upload-time = "2025-10-22T22:24:14.197Z" },
-    { url = "https://files.pythonhosted.org/packages/68/53/5ae232e795853dd20da7225c5dd13a09c0a905b1a655e92bdf8d78a99fd9/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23690b5827e643150cf7b49569679ec13fe9a610a15949ed48b85eb7f98f34ec", size = 405629, upload-time = "2025-10-22T22:24:16.001Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/2d/351a3b852b683ca9b6b8b38ed9efb2347596973849ba6c3a0e99877c10aa/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0c9266c26580e7243ad0d72fc3e01d6b33866cfab5084a6da7576bcf1c4f72", size = 384123, upload-time = "2025-10-22T22:24:17.585Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/15/870804daa00202728cc91cb8e2385fa9f1f4eb49857c49cfce89e304eae6/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4c6c4db5d73d179746951486df97fd25e92396be07fc29ee8ff9a8f5afbdfb27", size = 400923, upload-time = "2025-10-22T22:24:19.512Z" },
-    { url = "https://files.pythonhosted.org/packages/53/25/3706b83c125fa2a0bccceac951de3f76631f6bd0ee4d02a0ed780712ef1b/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3b695a8fa799dd2cfdb4804b37096c5f6dba1ac7f48a7fbf6d0485bcd060316", size = 413767, upload-time = "2025-10-22T22:24:21.316Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/f9/ce43dbe62767432273ed2584cef71fef8411bddfb64125d4c19128015018/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:6aa1bfce3f83baf00d9c5fcdbba93a3ab79958b4c7d7d1f55e7fe68c20e63912", size = 561530, upload-time = "2025-10-22T22:24:22.958Z" },
-    { url = "https://files.pythonhosted.org/packages/46/c9/ffe77999ed8f81e30713dd38fd9ecaa161f28ec48bb80fa1cd9118399c27/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b0f9dceb221792b3ee6acb5438eb1f02b0cb2c247796a72b016dcc92c6de829", size = 585453, upload-time = "2025-10-22T22:24:24.779Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/d2/4a73b18821fd4669762c855fd1f4e80ceb66fb72d71162d14da58444a763/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f", size = 552199, upload-time = "2025-10-22T22:24:26.54Z" },
+version = "0.29.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/98/33/23b3b3419b6a3e0f559c7c0d2ca8fc1b9448382b25245033788785921332/rpds_py-0.29.0.tar.gz", hash = "sha256:fe55fe686908f50154d1dc599232016e50c243b438c3b7432f24e2895b0e5359", size = 69359, upload-time = "2025-11-16T14:50:39.532Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9e/7a/c5b2ff381b74bc742768e8d870f26babac4ef256ba160bdbf8d57af56461/rpds_py-0.29.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:4ae4b88c6617e1b9e5038ab3fccd7bac0842fdda2b703117b2aa99bc85379113", size = 372385, upload-time = "2025-11-16T14:47:36.287Z" },
+    { url = "https://files.pythonhosted.org/packages/28/36/531f1eb4d5bed4a9c150f363a7ec4a98d2dc746151bba5473bc38ee85dec/rpds_py-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7d9128ec9d8cecda6f044001fde4fb71ea7c24325336612ef8179091eb9596b9", size = 362869, upload-time = "2025-11-16T14:47:38.196Z" },
+    { url = "https://files.pythonhosted.org/packages/54/df/7e9c0493a2015d9c82807a2d5f023ea9774e27a4c15b33ef1cdb7456138d/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d37812c3da8e06f2bb35b3cf10e4a7b68e776a706c13058997238762b4e07f4f", size = 391582, upload-time = "2025-11-16T14:47:39.746Z" },
+    { url = "https://files.pythonhosted.org/packages/15/38/42a981c3592ef46fbd7e17adbf8730cc5ec87e6aa1770c658c44bbb52960/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:66786c3fb1d8de416a7fa8e1cb1ec6ba0a745b2b0eee42f9b7daa26f1a495545", size = 405685, upload-time = "2025-11-16T14:47:41.472Z" },
+    { url = "https://files.pythonhosted.org/packages/12/45/628b8c15856c3849c3f52ec6dac93c046ed5faeed4a435af03b70525fd29/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b58f5c77f1af888b5fd1876c9a0d9858f6f88a39c9dd7c073a88e57e577da66d", size = 527067, upload-time = "2025-11-16T14:47:43.036Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/ba/6b56d09badeabd95098016d72a437d4a0fd82d4672ce92a7607df5d70a42/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:799156ef1f3529ed82c36eb012b5d7a4cf4b6ef556dd7cc192148991d07206ae", size = 412532, upload-time = "2025-11-16T14:47:44.484Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/39/2f1f3db92888314b50b8f9641f679188bd24b3665a8cb9923b7201ae8011/rpds_py-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:453783477aa4f2d9104c4b59b08c871431647cb7af51b549bbf2d9eb9c827756", size = 392736, upload-time = "2025-11-16T14:47:46.053Z" },
+    { url = "https://files.pythonhosted.org/packages/60/43/3c3b1dcd827e50f2ae28786d846b8a351080d8a69a3b49bc10ae44cc39b1/rpds_py-0.29.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:24a7231493e3c4a4b30138b50cca089a598e52c34cf60b2f35cebf62f274fdea", size = 406300, upload-time = "2025-11-16T14:47:47.268Z" },
+    { url = "https://files.pythonhosted.org/packages/da/02/bc96021b67f8525e6bcdd68935c4543ada61e1f3dcb067ed037d68b8c6d2/rpds_py-0.29.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7033c1010b1f57bb44d8067e8c25aa6fa2e944dbf46ccc8c92b25043839c3fd2", size = 423641, upload-time = "2025-11-16T14:47:48.878Z" },
+    { url = "https://files.pythonhosted.org/packages/38/e9/c435ddb602ced19a80b8277a41371734f33ad3f91cc4ceb4d82596800a3c/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0248b19405422573621172ab8e3a1f29141362d13d9f72bafa2e28ea0cdca5a2", size = 574153, upload-time = "2025-11-16T14:47:50.435Z" },
+    { url = "https://files.pythonhosted.org/packages/84/82/dc3c32e1f89ecba8a59600d4cd65fe0ad81b6c636ccdbf6cd177fd6a7bac/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:f9f436aee28d13b9ad2c764fc273e0457e37c2e61529a07b928346b219fcde3b", size = 600304, upload-time = "2025-11-16T14:47:51.599Z" },
+    { url = "https://files.pythonhosted.org/packages/35/98/785290e0b7142470735dc1b1f68fb33aae29e5296f062c88396eedf796c8/rpds_py-0.29.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24a16cb7163933906c62c272de20ea3c228e4542c8c45c1d7dc2b9913e17369a", size = 562211, upload-time = "2025-11-16T14:47:53.094Z" },
+    { url = "https://files.pythonhosted.org/packages/30/58/4eeddcb0737c6875f3e30c65dc9d7e7a10dfd5779646a990fa602c6d56c5/rpds_py-0.29.0-cp310-cp310-win32.whl", hash = "sha256:1a409b0310a566bfd1be82119891fefbdce615ccc8aa558aff7835c27988cbef", size = 221803, upload-time = "2025-11-16T14:47:54.404Z" },
+    { url = "https://files.pythonhosted.org/packages/54/77/b35a8dbdcbeb32505500547cdafaa9f8863e85f8faac50ef34464ec5a256/rpds_py-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5523b0009e7c3c1263471b69d8da1c7d41b3ecb4cb62ef72be206b92040a950", size = 235530, upload-time = "2025-11-16T14:47:56.061Z" },
+    { url = "https://files.pythonhosted.org/packages/36/ab/7fb95163a53ab122c74a7c42d2d2f012819af2cf3deb43fb0d5acf45cc1a/rpds_py-0.29.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b9c764a11fd637e0322a488560533112837f5334ffeb48b1be20f6d98a7b437", size = 372344, upload-time = "2025-11-16T14:47:57.279Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/45/f3c30084c03b0d0f918cb4c5ae2c20b0a148b51ba2b3f6456765b629bedd/rpds_py-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fd2164d73812026ce970d44c3ebd51e019d2a26a4425a5dcbdfa93a34abc383", size = 363041, upload-time = "2025-11-16T14:47:58.908Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/e9/4d044a1662608c47a87cbb37b999d4d5af54c6d6ebdda93a4d8bbf8b2a10/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a097b7f7f7274164566ae90a221fd725363c0e9d243e2e9ed43d195ccc5495c", size = 391775, upload-time = "2025-11-16T14:48:00.197Z" },
+    { url = "https://files.pythonhosted.org/packages/50/c9/7616d3ace4e6731aeb6e3cd85123e03aec58e439044e214b9c5c60fd8eb1/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7cdc0490374e31cedefefaa1520d5fe38e82fde8748cbc926e7284574c714d6b", size = 405624, upload-time = "2025-11-16T14:48:01.496Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/e2/6d7d6941ca0843609fd2d72c966a438d6f22617baf22d46c3d2156c31350/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89ca2e673ddd5bde9b386da9a0aac0cab0e76f40c8f0aaf0d6311b6bbf2aa311", size = 527894, upload-time = "2025-11-16T14:48:03.167Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/f7/aee14dc2db61bb2ae1e3068f134ca9da5f28c586120889a70ff504bb026f/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a5d9da3ff5af1ca1249b1adb8ef0573b94c76e6ae880ba1852f033bf429d4588", size = 412720, upload-time = "2025-11-16T14:48:04.413Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/e2/2293f236e887c0360c2723d90c00d48dee296406994d6271faf1712e94ec/rpds_py-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8238d1d310283e87376c12f658b61e1ee23a14c0e54c7c0ce953efdbdc72deed", size = 392945, upload-time = "2025-11-16T14:48:06.252Z" },
+    { url = "https://files.pythonhosted.org/packages/14/cd/ceea6147acd3bd1fd028d1975228f08ff19d62098078d5ec3eed49703797/rpds_py-0.29.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2d6fb2ad1c36f91c4646989811e84b1ea5e0c3cf9690b826b6e32b7965853a63", size = 406385, upload-time = "2025-11-16T14:48:07.575Z" },
+    { url = "https://files.pythonhosted.org/packages/52/36/fe4dead19e45eb77a0524acfdbf51e6cda597b26fc5b6dddbff55fbbb1a5/rpds_py-0.29.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:534dc9df211387547267ccdb42253aa30527482acb38dd9b21c5c115d66a96d2", size = 423943, upload-time = "2025-11-16T14:48:10.175Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/7b/4551510803b582fa4abbc8645441a2d15aa0c962c3b21ebb380b7e74f6a1/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d456e64724a075441e4ed648d7f154dc62e9aabff29bcdf723d0c00e9e1d352f", size = 574204, upload-time = "2025-11-16T14:48:11.499Z" },
+    { url = "https://files.pythonhosted.org/packages/64/ba/071ccdd7b171e727a6ae079f02c26f75790b41555f12ca8f1151336d2124/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a738f2da2f565989401bd6fd0b15990a4d1523c6d7fe83f300b7e7d17212feca", size = 600587, upload-time = "2025-11-16T14:48:12.822Z" },
+    { url = "https://files.pythonhosted.org/packages/03/09/96983d48c8cf5a1e03c7d9cc1f4b48266adfb858ae48c7c2ce978dbba349/rpds_py-0.29.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a110e14508fd26fd2e472bb541f37c209409876ba601cf57e739e87d8a53cf95", size = 562287, upload-time = "2025-11-16T14:48:14.108Z" },
+    { url = "https://files.pythonhosted.org/packages/40/f0/8c01aaedc0fa92156f0391f39ea93b5952bc0ec56b897763858f95da8168/rpds_py-0.29.0-cp311-cp311-win32.whl", hash = "sha256:923248a56dd8d158389a28934f6f69ebf89f218ef96a6b216a9be6861804d3f4", size = 221394, upload-time = "2025-11-16T14:48:15.374Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/a5/a8b21c54c7d234efdc83dc034a4d7cd9668e3613b6316876a29b49dece71/rpds_py-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:539eb77eb043afcc45314d1be09ea6d6cafb3addc73e0547c171c6d636957f60", size = 235713, upload-time = "2025-11-16T14:48:16.636Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/1f/df3c56219523947b1be402fa12e6323fe6d61d883cf35d6cb5d5bb6db9d9/rpds_py-0.29.0-cp311-cp311-win_arm64.whl", hash = "sha256:bdb67151ea81fcf02d8f494703fb728d4d34d24556cbff5f417d74f6f5792e7c", size = 229157, upload-time = "2025-11-16T14:48:17.891Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/50/bc0e6e736d94e420df79be4deb5c9476b63165c87bb8f19ef75d100d21b3/rpds_py-0.29.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a0891cfd8db43e085c0ab93ab7e9b0c8fee84780d436d3b266b113e51e79f954", size = 376000, upload-time = "2025-11-16T14:48:19.141Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/3a/46676277160f014ae95f24de53bed0e3b7ea66c235e7de0b9df7bd5d68ba/rpds_py-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3897924d3f9a0361472d884051f9a2460358f9a45b1d85a39a158d2f8f1ad71c", size = 360575, upload-time = "2025-11-16T14:48:20.443Z" },
+    { url = "https://files.pythonhosted.org/packages/75/ba/411d414ed99ea1afdd185bbabeeaac00624bd1e4b22840b5e9967ade6337/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a21deb8e0d1571508c6491ce5ea5e25669b1dd4adf1c9d64b6314842f708b5d", size = 392159, upload-time = "2025-11-16T14:48:22.12Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/b1/e18aa3a331f705467a48d0296778dc1fea9d7f6cf675bd261f9a846c7e90/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9efe71687d6427737a0a2de9ca1c0a216510e6cd08925c44162be23ed7bed2d5", size = 410602, upload-time = "2025-11-16T14:48:23.563Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/6c/04f27f0c9f2299274c76612ac9d2c36c5048bb2c6c2e52c38c60bf3868d9/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:40f65470919dc189c833e86b2c4bd21bd355f98436a2cef9e0a9a92aebc8e57e", size = 515808, upload-time = "2025-11-16T14:48:24.949Z" },
+    { url = "https://files.pythonhosted.org/packages/83/56/a8412aa464fb151f8bc0d91fb0bb888adc9039bd41c1c6ba8d94990d8cf8/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:def48ff59f181130f1a2cb7c517d16328efac3ec03951cca40c1dc2049747e83", size = 416015, upload-time = "2025-11-16T14:48:26.782Z" },
+    { url = "https://files.pythonhosted.org/packages/04/4c/f9b8a05faca3d9e0a6397c90d13acb9307c9792b2bff621430c58b1d6e76/rpds_py-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7bd570be92695d89285a4b373006930715b78d96449f686af422debb4d3949", size = 395325, upload-time = "2025-11-16T14:48:28.055Z" },
+    { url = "https://files.pythonhosted.org/packages/34/60/869f3bfbf8ed7b54f1ad9a5543e0fdffdd40b5a8f587fe300ee7b4f19340/rpds_py-0.29.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:5a572911cd053137bbff8e3a52d31c5d2dba51d3a67ad902629c70185f3f2181", size = 410160, upload-time = "2025-11-16T14:48:29.338Z" },
+    { url = "https://files.pythonhosted.org/packages/91/aa/e5b496334e3aba4fe4c8a80187b89f3c1294c5c36f2a926da74338fa5a73/rpds_py-0.29.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d583d4403bcbf10cffc3ab5cee23d7643fcc960dff85973fd3c2d6c86e8dbb0c", size = 425309, upload-time = "2025-11-16T14:48:30.691Z" },
+    { url = "https://files.pythonhosted.org/packages/85/68/4e24a34189751ceb6d66b28f18159922828dd84155876551f7ca5b25f14f/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:070befbb868f257d24c3bb350dbd6e2f645e83731f31264b19d7231dd5c396c7", size = 574644, upload-time = "2025-11-16T14:48:31.964Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/cf/474a005ea4ea9c3b4f17b6108b6b13cebfc98ebaff11d6e1b193204b3a93/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fc935f6b20b0c9f919a8ff024739174522abd331978f750a74bb68abd117bd19", size = 601605, upload-time = "2025-11-16T14:48:33.252Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/b1/c56f6a9ab8c5f6bb5c65c4b5f8229167a3a525245b0773f2c0896686b64e/rpds_py-0.29.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8c5a8ecaa44ce2d8d9d20a68a2483a74c07f05d72e94a4dff88906c8807e77b0", size = 564593, upload-time = "2025-11-16T14:48:34.643Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/13/0494cecce4848f68501e0a229432620b4b57022388b071eeff95f3e1e75b/rpds_py-0.29.0-cp312-cp312-win32.whl", hash = "sha256:ba5e1aeaf8dd6d8f6caba1f5539cddda87d511331714b7b5fc908b6cfc3636b7", size = 223853, upload-time = "2025-11-16T14:48:36.419Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/6a/51e9aeb444a00cdc520b032a28b07e5f8dc7bc328b57760c53e7f96997b4/rpds_py-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:b5f6134faf54b3cb83375db0f113506f8b7770785be1f95a631e7e2892101977", size = 239895, upload-time = "2025-11-16T14:48:37.956Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/d4/8bce56cdad1ab873e3f27cb31c6a51d8f384d66b022b820525b879f8bed1/rpds_py-0.29.0-cp312-cp312-win_arm64.whl", hash = "sha256:b016eddf00dca7944721bf0cd85b6af7f6c4efaf83ee0b37c4133bd39757a8c7", size = 230321, upload-time = "2025-11-16T14:48:39.71Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/d9/c5de60d9d371bbb186c3e9bf75f4fc5665e11117a25a06a6b2e0afb7380e/rpds_py-0.29.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1585648d0760b88292eecab5181f5651111a69d90eff35d6b78aa32998886a61", size = 375710, upload-time = "2025-11-16T14:48:41.063Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/b3/0860cdd012291dc21272895ce107f1e98e335509ba986dd83d72658b82b9/rpds_py-0.29.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:521807963971a23996ddaf764c682b3e46459b3c58ccd79fefbe16718db43154", size = 360582, upload-time = "2025-11-16T14:48:42.423Z" },
+    { url = "https://files.pythonhosted.org/packages/92/8a/a18c2f4a61b3407e56175f6aab6deacdf9d360191a3d6f38566e1eaf7266/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a8896986efaa243ab713c69e6491a4138410f0fe36f2f4c71e18bd5501e8014", size = 391172, upload-time = "2025-11-16T14:48:43.75Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/49/e93354258508c50abc15cdcd5fcf7ac4117f67bb6233ad7859f75e7372a0/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d24564a700ef41480a984c5ebed62b74e6ce5860429b98b1fede76049e953e6", size = 409586, upload-time = "2025-11-16T14:48:45.498Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/8d/a27860dae1c19a6bdc901f90c81f0d581df1943355802961a57cdb5b6cd1/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e6596b93c010d386ae46c9fba9bfc9fc5965fa8228edeac51576299182c2e31c", size = 516339, upload-time = "2025-11-16T14:48:47.308Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/ad/a75e603161e79b7110c647163d130872b271c6b28712c803c65d492100f7/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5cc58aac218826d054c7da7f95821eba94125d88be673ff44267bb89d12a5866", size = 416201, upload-time = "2025-11-16T14:48:48.615Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/42/555b4ee17508beafac135c8b450816ace5a96194ce97fefc49d58e5652ea/rpds_py-0.29.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de73e40ebc04dd5d9556f50180395322193a78ec247e637e741c1b954810f295", size = 395095, upload-time = "2025-11-16T14:48:50.027Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/f0/c90b671b9031e800ec45112be42ea9f027f94f9ac25faaac8770596a16a1/rpds_py-0.29.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:295ce5ac7f0cf69a651ea75c8f76d02a31f98e5698e82a50a5f4d4982fbbae3b", size = 410077, upload-time = "2025-11-16T14:48:51.515Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/80/9af8b640b81fe21e6f718e9dec36c0b5f670332747243130a5490f292245/rpds_py-0.29.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1ea59b23ea931d494459c8338056fe7d93458c0bf3ecc061cd03916505369d55", size = 424548, upload-time = "2025-11-16T14:48:53.237Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/0b/b5647446e991736e6a495ef510e6710df91e880575a586e763baeb0aa770/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f49d41559cebd608042fdcf54ba597a4a7555b49ad5c1c0c03e0af82692661cd", size = 573661, upload-time = "2025-11-16T14:48:54.769Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/b3/1b1c9576839ff583d1428efbf59f9ee70498d8ce6c0b328ac02f1e470879/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:05a2bd42768ea988294ca328206efbcc66e220d2d9b7836ee5712c07ad6340ea", size = 600937, upload-time = "2025-11-16T14:48:56.247Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/7b/b6cfca2f9fee4c4494ce54f7fb1b9f578867495a9aa9fc0d44f5f735c8e0/rpds_py-0.29.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:33ca7bdfedd83339ca55da3a5e1527ee5870d4b8369456b5777b197756f3ca22", size = 564496, upload-time = "2025-11-16T14:48:57.691Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/fb/ba29ec7f0f06eb801bac5a23057a9ff7670623b5e8013bd59bec4aa09de8/rpds_py-0.29.0-cp313-cp313-win32.whl", hash = "sha256:20c51ae86a0bb9accc9ad4e6cdeec58d5ebb7f1b09dd4466331fc65e1766aae7", size = 223126, upload-time = "2025-11-16T14:48:59.058Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/6b/0229d3bed4ddaa409e6d90b0ae967ed4380e4bdd0dad6e59b92c17d42457/rpds_py-0.29.0-cp313-cp313-win_amd64.whl", hash = "sha256:6410e66f02803600edb0b1889541f4b5cc298a5ccda0ad789cc50ef23b54813e", size = 239771, upload-time = "2025-11-16T14:49:00.872Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/38/d2868f058b164f8efd89754d85d7b1c08b454f5c07ac2e6cc2e9bd4bd05b/rpds_py-0.29.0-cp313-cp313-win_arm64.whl", hash = "sha256:56838e1cd9174dc23c5691ee29f1d1be9eab357f27efef6bded1328b23e1ced2", size = 229994, upload-time = "2025-11-16T14:49:02.673Z" },
+    { url = "https://files.pythonhosted.org/packages/52/91/5de91c5ec7d41759beec9b251630824dbb8e32d20c3756da1a9a9d309709/rpds_py-0.29.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:37d94eadf764d16b9a04307f2ab1d7af6dc28774bbe0535c9323101e14877b4c", size = 365886, upload-time = "2025-11-16T14:49:04.133Z" },
+    { url = "https://files.pythonhosted.org/packages/85/7c/415d8c1b016d5f47ecec5145d9d6d21002d39dce8761b30f6c88810b455a/rpds_py-0.29.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d472cf73efe5726a067dce63eebe8215b14beabea7c12606fd9994267b3cfe2b", size = 355262, upload-time = "2025-11-16T14:49:05.543Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/14/bf83e2daa4f980e4dc848aed9299792a8b84af95e12541d9e7562f84a6ef/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:72fdfd5ff8992e4636621826371e3ac5f3e3b8323e9d0e48378e9c13c3dac9d0", size = 384826, upload-time = "2025-11-16T14:49:07.301Z" },
+    { url = "https://files.pythonhosted.org/packages/33/b8/53330c50a810ae22b4fbba5e6cf961b68b9d72d9bd6780a7c0a79b070857/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2549d833abdf8275c901313b9e8ff8fba57e50f6a495035a2a4e30621a2f7cc4", size = 394234, upload-time = "2025-11-16T14:49:08.782Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/32/01e2e9645cef0e584f518cfde4567563e57db2257244632b603f61b40e50/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4448dad428f28a6a767c3e3b80cde3446a22a0efbddaa2360f4bb4dc836d0688", size = 520008, upload-time = "2025-11-16T14:49:10.253Z" },
+    { url = "https://files.pythonhosted.org/packages/98/c3/0d1b95a81affae2b10f950782e33a1fd2edd6ce2a479966cac98c9a66f57/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:115f48170fd4296a33938d8c11f697f5f26e0472e43d28f35624764173a60e4d", size = 409569, upload-time = "2025-11-16T14:49:12.478Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/60/aa3b8678f3f009f675b99174fa2754302a7fbfe749162e8043d111de2d88/rpds_py-0.29.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e5bb73ffc029820f4348e9b66b3027493ae00bca6629129cd433fd7a76308ee", size = 385188, upload-time = "2025-11-16T14:49:13.88Z" },
+    { url = "https://files.pythonhosted.org/packages/92/02/5546c1c8aa89c18d40c1fcffdcc957ba730dee53fb7c3ca3a46f114761d2/rpds_py-0.29.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b1581fcde18fcdf42ea2403a16a6b646f8eb1e58d7f90a0ce693da441f76942e", size = 398587, upload-time = "2025-11-16T14:49:15.339Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/e0/ad6eeaf47e236eba052fa34c4073078b9e092bd44da6bbb35aaae9580669/rpds_py-0.29.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16e9da2bda9eb17ea318b4c335ec9ac1818e88922cbe03a5743ea0da9ecf74fb", size = 416641, upload-time = "2025-11-16T14:49:16.832Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/93/0acedfd50ad9cdd3879c615a6dc8c5f1ce78d2fdf8b87727468bb5bb4077/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:28fd300326dd21198f311534bdb6d7e989dd09b3418b3a91d54a0f384c700967", size = 566683, upload-time = "2025-11-16T14:49:18.342Z" },
+    { url = "https://files.pythonhosted.org/packages/62/53/8c64e0f340a9e801459fc6456821abc15b3582cb5dc3932d48705a9d9ac7/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:2aba991e041d031c7939e1358f583ae405a7bf04804ca806b97a5c0e0af1ea5e", size = 592730, upload-time = "2025-11-16T14:49:19.767Z" },
+    { url = "https://files.pythonhosted.org/packages/85/ef/3109b6584f8c4b0d2490747c916df833c127ecfa82be04d9a40a376f2090/rpds_py-0.29.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:7f437026dbbc3f08c99cc41a5b2570c6e1a1ddbe48ab19a9b814254128d4ea7a", size = 557361, upload-time = "2025-11-16T14:49:21.574Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/3b/61586475e82d57f01da2c16edb9115a618afe00ce86fe1b58936880b15af/rpds_py-0.29.0-cp313-cp313t-win32.whl", hash = "sha256:6e97846e9800a5d0fe7be4d008f0c93d0feeb2700da7b1f7528dabafb31dfadb", size = 211227, upload-time = "2025-11-16T14:49:23.03Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/3a/12dc43f13594a54ea0c9d7e9d43002116557330e3ad45bc56097ddf266e2/rpds_py-0.29.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f49196aec7c4b406495f60e6f947ad71f317a765f956d74bbd83996b9edc0352", size = 225248, upload-time = "2025-11-16T14:49:24.841Z" },
+    { url = "https://files.pythonhosted.org/packages/89/b1/0b1474e7899371d9540d3bbb2a499a3427ae1fc39c998563fe9035a1073b/rpds_py-0.29.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:394d27e4453d3b4d82bb85665dc1fcf4b0badc30fc84282defed71643b50e1a1", size = 363731, upload-time = "2025-11-16T14:49:26.683Z" },
+    { url = "https://files.pythonhosted.org/packages/28/12/3b7cf2068d0a334ed1d7b385a9c3c8509f4c2bcba3d4648ea71369de0881/rpds_py-0.29.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:55d827b2ae95425d3be9bc9a5838b6c29d664924f98146557f7715e331d06df8", size = 354343, upload-time = "2025-11-16T14:49:28.24Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/73/5afcf8924bc02a749416eda64e17ac9c9b28f825f4737385295a0e99b0c1/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc31a07ed352e5462d3ee1b22e89285f4ce97d5266f6d1169da1142e78045626", size = 385406, upload-time = "2025-11-16T14:49:29.943Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/37/5db736730662508535221737a21563591b6f43c77f2e388951c42f143242/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c4695dd224212f6105db7ea62197144230b808d6b2bba52238906a2762f1d1e7", size = 396162, upload-time = "2025-11-16T14:49:31.833Z" },
+    { url = "https://files.pythonhosted.org/packages/70/0d/491c1017d14f62ce7bac07c32768d209a50ec567d76d9f383b4cfad19b80/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcae1770b401167f8b9e1e3f566562e6966ffa9ce63639916248a9e25fa8a244", size = 517719, upload-time = "2025-11-16T14:49:33.804Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/25/b11132afcb17cd5d82db173f0c8dab270ffdfaba43e5ce7a591837ae9649/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:90f30d15f45048448b8da21c41703b31c61119c06c216a1bf8c245812a0f0c17", size = 409498, upload-time = "2025-11-16T14:49:35.222Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/7d/e6543cedfb2e6403a1845710a5ab0e0ccf8fc288e0b5af9a70bfe2c12053/rpds_py-0.29.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44a91e0ab77bdc0004b43261a4b8cd6d6b451e8d443754cfda830002b5745b32", size = 382743, upload-time = "2025-11-16T14:49:36.704Z" },
+    { url = "https://files.pythonhosted.org/packages/75/11/a4ebc9f654293ae9fefb83b2b6be7f3253e85ea42a5db2f77d50ad19aaeb/rpds_py-0.29.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:4aa195e5804d32c682e453b34474f411ca108e4291c6a0f824ebdc30a91c973c", size = 400317, upload-time = "2025-11-16T14:49:39.132Z" },
+    { url = "https://files.pythonhosted.org/packages/52/18/97677a60a81c7f0e5f64e51fb3f8271c5c8fcabf3a2df18e97af53d7c2bf/rpds_py-0.29.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7971bdb7bf4ee0f7e6f67fa4c7fbc6019d9850cc977d126904392d363f6f8318", size = 416979, upload-time = "2025-11-16T14:49:40.575Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/69/28ab391a9968f6c746b2a2db181eaa4d16afaa859fedc9c2f682d19f7e18/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8ae33ad9ce580c7a47452c3b3f7d8a9095ef6208e0a0c7e4e2384f9fc5bf8212", size = 567288, upload-time = "2025-11-16T14:49:42.24Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/d3/0c7afdcdb830eee94f5611b64e71354ffe6ac8df82d00c2faf2bfffd1d4e/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:c661132ab2fb4eeede2ef69670fd60da5235209874d001a98f1542f31f2a8a94", size = 593157, upload-time = "2025-11-16T14:49:43.782Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/ac/a0fcbc2feed4241cf26d32268c195eb88ddd4bd862adfc9d4b25edfba535/rpds_py-0.29.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bb78b3a0d31ac1bde132c67015a809948db751cb4e92cdb3f0b242e430b6ed0d", size = 554741, upload-time = "2025-11-16T14:49:45.557Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/f1/fcc24137c470df8588674a677f33719d5800ec053aaacd1de8a5d5d84d9e/rpds_py-0.29.0-cp314-cp314-win32.whl", hash = "sha256:f475f103488312e9bd4000bc890a95955a07b2d0b6e8884aef4be56132adbbf1", size = 215508, upload-time = "2025-11-16T14:49:47.562Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/c7/1d169b2045512eac019918fc1021ea07c30e84a4343f9f344e3e0aa8c788/rpds_py-0.29.0-cp314-cp314-win_amd64.whl", hash = "sha256:b9cf2359a4fca87cfb6801fae83a76aedf66ee1254a7a151f1341632acf67f1b", size = 228125, upload-time = "2025-11-16T14:49:49.064Z" },
+    { url = "https://files.pythonhosted.org/packages/be/36/0cec88aaba70ec4a6e381c444b0d916738497d27f0c30406e3d9fcbd3bc2/rpds_py-0.29.0-cp314-cp314-win_arm64.whl", hash = "sha256:9ba8028597e824854f0f1733d8b964e914ae3003b22a10c2c664cb6927e0feb9", size = 221992, upload-time = "2025-11-16T14:49:50.777Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/fa/a2e524631717c9c0eb5d90d30f648cfba6b731047821c994acacb618406c/rpds_py-0.29.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:e71136fd0612556b35c575dc2726ae04a1669e6a6c378f2240312cf5d1a2ab10", size = 366425, upload-time = "2025-11-16T14:49:52.691Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/a4/6d43ebe0746ff694a30233f63f454aed1677bd50ab7a59ff6b2bb5ac61f2/rpds_py-0.29.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:76fe96632d53f3bf0ea31ede2f53bbe3540cc2736d4aec3b3801b0458499ef3a", size = 355282, upload-time = "2025-11-16T14:49:54.292Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/a7/52fd8270e0320b09eaf295766ae81dd175f65394687906709b3e75c71d06/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9459a33f077130dbb2c7c3cea72ee9932271fb3126404ba2a2661e4fe9eb7b79", size = 384968, upload-time = "2025-11-16T14:49:55.857Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/7d/e6bc526b7a14e1ef80579a52c1d4ad39260a058a51d66c6039035d14db9d/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5c9546cfdd5d45e562cc0444b6dddc191e625c62e866bf567a2c69487c7ad28a", size = 394714, upload-time = "2025-11-16T14:49:57.343Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/3f/f0ade3954e7db95c791e7eaf978aa7e08a756d2046e8bdd04d08146ed188/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12597d11d97b8f7e376c88929a6e17acb980e234547c92992f9f7c058f1a7310", size = 520136, upload-time = "2025-11-16T14:49:59.162Z" },
+    { url = "https://files.pythonhosted.org/packages/87/b3/07122ead1b97009715ab9d4082be6d9bd9546099b2b03fae37c3116f72be/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28de03cf48b8a9e6ec10318f2197b83946ed91e2891f651a109611be4106ac4b", size = 409250, upload-time = "2025-11-16T14:50:00.698Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/c6/dcbee61fd1dc892aedcb1b489ba661313101aa82ec84b1a015d4c63ebfda/rpds_py-0.29.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd7951c964069039acc9d67a8ff1f0a7f34845ae180ca542b17dc1456b1f1808", size = 384940, upload-time = "2025-11-16T14:50:02.312Z" },
+    { url = "https://files.pythonhosted.org/packages/47/11/914ecb6f3574cf9bf8b38aced4063e0f787d6e1eb30b181a7efbc6c1da9a/rpds_py-0.29.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:c07d107b7316088f1ac0177a7661ca0c6670d443f6fe72e836069025e6266761", size = 399392, upload-time = "2025-11-16T14:50:03.829Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/fd/2f4bd9433f58f816434bb934313584caa47dbc6f03ce5484df8ac8980561/rpds_py-0.29.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1de2345af363d25696969befc0c1688a6cb5e8b1d32b515ef84fc245c6cddba3", size = 416796, upload-time = "2025-11-16T14:50:05.558Z" },
+    { url = "https://files.pythonhosted.org/packages/79/a5/449f0281af33efa29d5c71014399d74842342ae908d8cd38260320167692/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:00e56b12d2199ca96068057e1ae7f9998ab6e99cda82431afafd32f3ec98cca9", size = 566843, upload-time = "2025-11-16T14:50:07.243Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/32/0a6a1ccee2e37fcb1b7ba9afde762b77182dbb57937352a729c6cd3cf2bb/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3919a3bbecee589300ed25000b6944174e07cd20db70552159207b3f4bbb45b8", size = 593956, upload-time = "2025-11-16T14:50:09.029Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/3d/eb820f95dce4306f07a495ede02fb61bef36ea201d9137d4fcd5ab94ec1e/rpds_py-0.29.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e7fa2ccc312bbd91e43aa5e0869e46bc03278a3dddb8d58833150a18b0f0283a", size = 557288, upload-time = "2025-11-16T14:50:10.73Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/f8/b8ff786f40470462a252918e0836e0db903c28e88e3eec66bc4a7856ee5d/rpds_py-0.29.0-cp314-cp314t-win32.whl", hash = "sha256:97c817863ffc397f1e6a6e9d2d89fe5408c0a9922dac0329672fb0f35c867ea5", size = 211382, upload-time = "2025-11-16T14:50:12.827Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/7f/1a65ae870bc9d0576aebb0c501ea5dccf1ae2178fe2821042150ebd2e707/rpds_py-0.29.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2023473f444752f0f82a58dfcbee040d0a1b3d1b3c2ec40e884bd25db6d117d2", size = 225919, upload-time = "2025-11-16T14:50:14.734Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/ac/b97e80bf107159e5b9ba9c91df1ab95f69e5e41b435f27bdd737f0d583ac/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:acd82a9e39082dc5f4492d15a6b6c8599aa21db5c35aaf7d6889aea16502c07d", size = 373963, upload-time = "2025-11-16T14:50:16.205Z" },
+    { url = "https://files.pythonhosted.org/packages/40/5a/55e72962d5d29bd912f40c594e68880d3c7a52774b0f75542775f9250712/rpds_py-0.29.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:715b67eac317bf1c7657508170a3e011a1ea6ccb1c9d5f296e20ba14196be6b3", size = 364644, upload-time = "2025-11-16T14:50:18.22Z" },
+    { url = "https://files.pythonhosted.org/packages/99/2a/6b6524d0191b7fc1351c3c0840baac42250515afb48ae40c7ed15499a6a2/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3b1b87a237cb2dba4db18bcfaaa44ba4cd5936b91121b62292ff21df577fc43", size = 393847, upload-time = "2025-11-16T14:50:20.012Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/b8/c5692a7df577b3c0c7faed7ac01ee3c608b81750fc5d89f84529229b6873/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c3c3e8101bb06e337c88eb0c0ede3187131f19d97d43ea0e1c5407ea74c0cbf", size = 407281, upload-time = "2025-11-16T14:50:21.64Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/57/0546c6f84031b7ea08b76646a8e33e45607cc6bd879ff1917dc077bb881e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8e54d6e61f3ecd3abe032065ce83ea63417a24f437e4a3d73d2f85ce7b7cfe", size = 529213, upload-time = "2025-11-16T14:50:23.219Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/c1/01dd5f444233605555bc11fe5fed6a5c18f379f02013870c176c8e630a23/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3fbd4e9aebf110473a420dea85a238b254cf8a15acb04b22a5a6b5ce8925b760", size = 413808, upload-time = "2025-11-16T14:50:25.262Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/0a/60f98b06156ea2a7af849fb148e00fbcfdb540909a5174a5ed10c93745c7/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fdf53d36e6c72819993e35d1ebeeb8e8fc688d0c6c2b391b55e335b3afba5a", size = 394600, upload-time = "2025-11-16T14:50:26.956Z" },
+    { url = "https://files.pythonhosted.org/packages/37/f1/dc9312fc9bec040ece08396429f2bd9e0977924ba7a11c5ad7056428465e/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:ea7173df5d86f625f8dde6d5929629ad811ed8decda3b60ae603903839ac9ac0", size = 408634, upload-time = "2025-11-16T14:50:28.989Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/41/65024c9fd40c89bb7d604cf73beda4cbdbcebe92d8765345dd65855b6449/rpds_py-0.29.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:76054d540061eda273274f3d13a21a4abdde90e13eaefdc205db37c05230efce", size = 426064, upload-time = "2025-11-16T14:50:30.674Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/e0/cf95478881fc88ca2fdbf56381d7df36567cccc39a05394beac72182cd62/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:9f84c549746a5be3bc7415830747a3a0312573afc9f95785eb35228bb17742ec", size = 575871, upload-time = "2025-11-16T14:50:33.428Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/c0/df88097e64339a0218b57bd5f9ca49898e4c394db756c67fccc64add850a/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:0ea962671af5cb9a260489e311fa22b2e97103e3f9f0caaea6f81390af96a9ed", size = 601702, upload-time = "2025-11-16T14:50:36.051Z" },
+    { url = "https://files.pythonhosted.org/packages/87/f4/09ffb3ebd0cbb9e2c7c9b84d252557ecf434cd71584ee1e32f66013824df/rpds_py-0.29.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:f7728653900035fb7b8d06e1e5900545d8088efc9d5d4545782da7df03ec803f", size = 564054, upload-time = "2025-11-16T14:50:37.733Z" },
 ]
 
 [[package]]
@@ -4962,24 +4879,28 @@ wheels = [
 
 [[package]]
 name = "safetensors"
-version = "0.6.2"
+version = "0.7.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/ac/cc/738f3011628920e027a11754d9cae9abec1aed00f7ae860abbf843755233/safetensors-0.6.2.tar.gz", hash = "sha256:43ff2aa0e6fa2dc3ea5524ac7ad93a9839256b8703761e76e2d0b2a3fa4f15d9", size = 197968, upload-time = "2025-08-08T13:13:58.654Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4d/b1/3f5fd73c039fc87dba3ff8b5d528bfc5a32b597fea8e7a6a4800343a17c7/safetensors-0.6.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:9c85ede8ec58f120bad982ec47746981e210492a6db876882aa021446af8ffba", size = 454797, upload-time = "2025-08-08T13:13:52.066Z" },
-    { url = "https://files.pythonhosted.org/packages/8c/c9/bb114c158540ee17907ec470d01980957fdaf87b4aa07914c24eba87b9c6/safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d6675cf4b39c98dbd7d940598028f3742e0375a6b4d4277e76beb0c35f4b843b", size = 432206, upload-time = "2025-08-08T13:13:50.931Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/8e/f70c34e47df3110e8e0bb268d90db8d4be8958a54ab0336c9be4fe86dac8/safetensors-0.6.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d2d2b3ce1e2509c68932ca03ab8f20570920cd9754b05063d4368ee52833ecd", size = 473261, upload-time = "2025-08-08T13:13:41.259Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/f5/be9c6a7c7ef773e1996dc214e73485286df1836dbd063e8085ee1976f9cb/safetensors-0.6.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:93de35a18f46b0f5a6a1f9e26d91b442094f2df02e9fd7acf224cfec4238821a", size = 485117, upload-time = "2025-08-08T13:13:43.506Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/55/23f2d0a2c96ed8665bf17a30ab4ce5270413f4d74b6d87dd663258b9af31/safetensors-0.6.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:89a89b505f335640f9120fac65ddeb83e40f1fd081cb8ed88b505bdccec8d0a1", size = 616154, upload-time = "2025-08-08T13:13:45.096Z" },
-    { url = "https://files.pythonhosted.org/packages/98/c6/affb0bd9ce02aa46e7acddbe087912a04d953d7a4d74b708c91b5806ef3f/safetensors-0.6.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc4d0d0b937e04bdf2ae6f70cd3ad51328635fe0e6214aa1fc811f3b576b3bda", size = 520713, upload-time = "2025-08-08T13:13:46.25Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/5d/5a514d7b88e310c8b146e2404e0dc161282e78634d9358975fd56dfd14be/safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8045db2c872db8f4cbe3faa0495932d89c38c899c603f21e9b6486951a5ecb8f", size = 485835, upload-time = "2025-08-08T13:13:49.373Z" },
-    { url = "https://files.pythonhosted.org/packages/7a/7b/4fc3b2ba62c352b2071bea9cfbad330fadda70579f617506ae1a2f129cab/safetensors-0.6.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:81e67e8bab9878bb568cffbc5f5e655adb38d2418351dc0859ccac158f753e19", size = 521503, upload-time = "2025-08-08T13:13:47.651Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/50/0057e11fe1f3cead9254315a6c106a16dd4b1a19cd247f7cc6414f6b7866/safetensors-0.6.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b0e4d029ab0a0e0e4fdf142b194514695b1d7d3735503ba700cf36d0fc7136ce", size = 652256, upload-time = "2025-08-08T13:13:53.167Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/29/473f789e4ac242593ac1656fbece6e1ecd860bb289e635e963667807afe3/safetensors-0.6.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:fa48268185c52bfe8771e46325a1e21d317207bcabcb72e65c6e28e9ffeb29c7", size = 747281, upload-time = "2025-08-08T13:13:54.656Z" },
-    { url = "https://files.pythonhosted.org/packages/68/52/f7324aad7f2df99e05525c84d352dc217e0fa637a4f603e9f2eedfbe2c67/safetensors-0.6.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:d83c20c12c2d2f465997c51b7ecb00e407e5f94d7dec3ea0cc11d86f60d3fde5", size = 692286, upload-time = "2025-08-08T13:13:55.884Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/fe/cad1d9762868c7c5dc70c8620074df28ebb1a8e4c17d4c0cb031889c457e/safetensors-0.6.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:d944cea65fad0ead848b6ec2c37cc0b197194bec228f8020054742190e9312ac", size = 655957, upload-time = "2025-08-08T13:13:57.029Z" },
-    { url = "https://files.pythonhosted.org/packages/59/a7/e2158e17bbe57d104f0abbd95dff60dda916cf277c9f9663b4bf9bad8b6e/safetensors-0.6.2-cp38-abi3-win32.whl", hash = "sha256:cab75ca7c064d3911411461151cb69380c9225798a20e712b102edda2542ddb1", size = 308926, upload-time = "2025-08-08T13:14:01.095Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/c3/c0be1135726618dc1e28d181b8c442403d8dbb9e273fd791de2d4384bcdd/safetensors-0.6.2-cp38-abi3-win_amd64.whl", hash = "sha256:c7b214870df923cbc1593c3faee16bec59ea462758699bd3fee399d00aac072c", size = 320192, upload-time = "2025-08-08T13:13:59.467Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/29/9c/6e74567782559a63bd040a236edca26fd71bc7ba88de2ef35d75df3bca5e/safetensors-0.7.0.tar.gz", hash = "sha256:07663963b67e8bd9f0b8ad15bb9163606cd27cc5a1b96235a50d8369803b96b0", size = 200878, upload-time = "2025-11-19T15:18:43.199Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fa/47/aef6c06649039accf914afef490268e1067ed82be62bcfa5b7e886ad15e8/safetensors-0.7.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c82f4d474cf725255d9e6acf17252991c3c8aac038d6ef363a4bf8be2f6db517", size = 467781, upload-time = "2025-11-19T15:18:35.84Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/00/374c0c068e30cd31f1e1b46b4b5738168ec79e7689ca82ee93ddfea05109/safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:94fd4858284736bb67a897a41608b5b0c2496c9bdb3bf2af1fa3409127f20d57", size = 447058, upload-time = "2025-11-19T15:18:34.416Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/06/578ffed52c2296f93d7fd2d844cabfa92be51a587c38c8afbb8ae449ca89/safetensors-0.7.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e07d91d0c92a31200f25351f4acb2bc6aff7f48094e13ebb1d0fb995b54b6542", size = 491748, upload-time = "2025-11-19T15:18:09.79Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/33/1debbbb70e4791dde185edb9413d1fe01619255abb64b300157d7f15dddd/safetensors-0.7.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8469155f4cb518bafb4acf4865e8bb9d6804110d2d9bdcaa78564b9fd841e104", size = 503881, upload-time = "2025-11-19T15:18:16.145Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/1c/40c2ca924d60792c3be509833df711b553c60effbd91da6f5284a83f7122/safetensors-0.7.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54bef08bf00a2bff599982f6b08e8770e09cc012d7bba00783fc7ea38f1fb37d", size = 623463, upload-time = "2025-11-19T15:18:21.11Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/3a/13784a9364bd43b0d61eef4bea2845039bc2030458b16594a1bd787ae26e/safetensors-0.7.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:42cb091236206bb2016d245c377ed383aa7f78691748f3bb6ee1bfa51ae2ce6a", size = 532855, upload-time = "2025-11-19T15:18:25.719Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/60/429e9b1cb3fc651937727befe258ea24122d9663e4d5709a48c9cbfceecb/safetensors-0.7.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac7252938f0696ddea46f5e855dd3138444e82236e3be475f54929f0c510d48", size = 507152, upload-time = "2025-11-19T15:18:33.023Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/a8/4b45e4e059270d17af60359713ffd83f97900d45a6afa73aaa0d737d48b6/safetensors-0.7.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1d060c70284127fa805085d8f10fbd0962792aed71879d00864acda69dbab981", size = 541856, upload-time = "2025-11-19T15:18:31.075Z" },
+    { url = "https://files.pythonhosted.org/packages/06/87/d26d8407c44175d8ae164a95b5a62707fcc445f3c0c56108e37d98070a3d/safetensors-0.7.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:cdab83a366799fa730f90a4ebb563e494f28e9e92c4819e556152ad55e43591b", size = 674060, upload-time = "2025-11-19T15:18:37.211Z" },
+    { url = "https://files.pythonhosted.org/packages/11/f5/57644a2ff08dc6325816ba7217e5095f17269dada2554b658442c66aed51/safetensors-0.7.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:672132907fcad9f2aedcb705b2d7b3b93354a2aec1b2f706c4db852abe338f85", size = 771715, upload-time = "2025-11-19T15:18:38.689Z" },
+    { url = "https://files.pythonhosted.org/packages/86/31/17883e13a814bd278ae6e266b13282a01049b0c81341da7fd0e3e71a80a3/safetensors-0.7.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5d72abdb8a4d56d4020713724ba81dac065fedb7f3667151c4a637f1d3fb26c0", size = 714377, upload-time = "2025-11-19T15:18:40.162Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" },
+    { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/6a/4d08d89a6fcbe905c5ae68b8b34f0791850882fc19782d0d02c65abbdf3b/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4729811a6640d019a4b7ba8638ee2fd21fa5ca8c7e7bdf0fed62068fcaac737", size = 492430, upload-time = "2025-11-19T15:18:11.884Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/29/59ed8152b30f72c42d00d241e58eaca558ae9dbfa5695206e2e0f54c7063/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12f49080303fa6bb424b362149a12949dfbbf1e06811a88f2307276b0c131afd", size = 503977, upload-time = "2025-11-19T15:18:17.523Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/0b/4811bfec67fa260e791369b16dab105e4bae82686120554cc484064e22b4/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0071bffba4150c2f46cae1432d31995d77acfd9f8db598b5d1a2ce67e8440ad2", size = 623890, upload-time = "2025-11-19T15:18:22.666Z" },
+    { url = "https://files.pythonhosted.org/packages/58/5b/632a58724221ef03d78ab65062e82a1010e1bef8e8e0b9d7c6d7b8044841/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473b32699f4200e69801bf5abf93f1a4ecd432a70984df164fc22ccf39c4a6f3", size = 531885, upload-time = "2025-11-19T15:18:27.146Z" },
 ]
 
 [[package]]
@@ -4991,7 +4912,7 @@ resolution-markers = [
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
 dependencies = [
-    { name = "numpy", marker = "python_full_version < '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" }
 wheels = [
@@ -5047,21 +4968,17 @@ name = "scipy"
 version = "1.16.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
     "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
     "python_full_version == '3.12.*' and sys_platform != 'linux'",
     "python_full_version == '3.11.*' and sys_platform == 'linux'",
     "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
-    { name = "numpy", marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" }
 wheels = [
@@ -5193,15 +5110,15 @@ wheels = [
 
 [[package]]
 name = "sentry-sdk"
-version = "2.43.0"
+version = "2.46.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "certifi" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b3/18/09875b4323b03ca9025bae7e6539797b27e4fc032998a466b4b9c3d24653/sentry_sdk-2.43.0.tar.gz", hash = "sha256:52ed6e251c5d2c084224d73efee56b007ef5c2d408a4a071270e82131d336e20", size = 368953, upload-time = "2025-10-29T11:26:08.156Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/7c/d7/c140a5837649e2bf2ec758494fde1d9a016c76777eab64e75ef38d685bbb/sentry_sdk-2.46.0.tar.gz", hash = "sha256:91821a23460725734b7741523021601593f35731808afc0bb2ba46c27b8acd91", size = 374761, upload-time = "2025-11-24T09:34:13.932Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/69/31/8228fa962f7fd8814d634e4ebece8780e2cdcfbdf0cd2e14d4a6861a7cd5/sentry_sdk-2.43.0-py2.py3-none-any.whl", hash = "sha256:4aacafcf1756ef066d359ae35030881917160ba7f6fc3ae11e0e58b09edc2d5d", size = 400997, upload-time = "2025-10-29T11:26:05.77Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/b6/ce7c502a366f4835b1f9c057753f6989a92d3c70cbadb168193f5fb7499b/sentry_sdk-2.46.0-py2.py3-none-any.whl", hash = "sha256:4eeeb60198074dff8d066ea153fa6f241fef1668c10900ea53a4200abc8da9b1", size = 406266, upload-time = "2025-11-24T09:34:12.114Z" },
 ]
 
 [[package]]
@@ -5233,11 +5150,11 @@ wheels = [
 
 [[package]]
 name = "slack-sdk"
-version = "3.37.0"
+version = "3.39.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/8e/c2/0a174a155623d7dc3ed4d1360cdf755590acdc2c3fc9ce0d2340f468909f/slack_sdk-3.37.0.tar.gz", hash = "sha256:242d6cffbd9e843af807487ff04853189b812081aeaa22f90a8f159f20220ed9", size = 241612, upload-time = "2025-10-06T23:07:20.856Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b6/dd/645f3eb93fce38eadbb649e85684730b1fc3906c2674ca59bddc2ca2bd2e/slack_sdk-3.39.0.tar.gz", hash = "sha256:6a56be10dc155c436ff658c6b776e1c082e29eae6a771fccf8b0a235822bbcb1", size = 247207, upload-time = "2025-11-20T15:27:57.556Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/07/fd/a502ee24d8c7d12a8f749878ae0949b8eeb50aeac22dc5a613d417a256d0/slack_sdk-3.37.0-py2.py3-none-any.whl", hash = "sha256:e108a0836eafda74d8a95e76c12c2bcb010e645d504d8497451e4c7ebb229c87", size = 302751, upload-time = "2025-10-06T23:07:19.542Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/1f/32bcf088e535c1870b1a1f2e3b916129c66fdfe565a793316317241d41e5/slack_sdk-3.39.0-py2.py3-none-any.whl", hash = "sha256:b1556b2f5b8b12b94e5ea3f56c4f2c7f04462e4e1013d325c5764ff118044fa8", size = 309850, upload-time = "2025-11-20T15:27:55.729Z" },
 ]
 
 [[package]]
@@ -5282,7 +5199,8 @@ version = "0.13.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
 wheels = [
@@ -5341,44 +5259,14 @@ name = "sphinx"
 version = "8.2.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "alabaster", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
@@ -5430,44 +5318,14 @@ name = "sphinx-autobuild"
 version = "2025.8.25"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
 ]
 dependencies = [
     { name = "colorama", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
@@ -5565,15 +5423,24 @@ wheels = [
 
 [[package]]
 name = "starlette"
-version = "0.49.3"
+version = "0.50.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/de/1a/608df0b10b53b0beb96a37854ee05864d182ddd4b1156a22f1ad3860425a/starlette-0.49.3.tar.gz", hash = "sha256:1c14546f299b5901a1ea0e34410575bc33bbd741377a10484a54445588d00284", size = 2655031, upload-time = "2025-11-01T15:12:26.13Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/b8/73a0e6a6e079a9d9cfa64113d771e421640b6f679a52eeb9b32f72d871a1/starlette-0.50.0.tar.gz", hash = "sha256:a2a17b22203254bcbc2e1f926d2d55f3f9497f769416b3190768befe598fa3ca", size = 2646985, upload-time = "2025-11-01T15:25:27.516Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a3/e0/021c772d6a662f43b63044ab481dc6ac7592447605b5b35a957785363122/starlette-0.49.3-py3-none-any.whl", hash = "sha256:b579b99715fdc2980cf88c8ec96d3bf1ce16f5a8051a7c2b84ef9b1cdecaea2f", size = 74340, upload-time = "2025-11-01T15:12:24.387Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/52/1064f510b141bd54025f9b55105e26d1fa970b9be67ad766380a3c9b74b0/starlette-0.50.0-py3-none-any.whl", hash = "sha256:9e5391843ec9b6e472eed1365a78c8098cfceb7a74bfd4d6b1c0c0095efb3bca", size = 74033, upload-time = "2025-11-01T15:25:25.461Z" },
+]
+
+[[package]]
+name = "strenum"
+version = "0.4.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/85/ad/430fb60d90e1d112a62ff57bdd1f286ec73a2a0331272febfddd21f330e1/StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff", size = 23384, upload-time = "2023-06-29T22:02:58.399Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/69/297302c5f5f59c862faa31e6cb9a4cd74721cd1e052b38e464c5b402df8b/StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659", size = 8851, upload-time = "2023-06-29T22:02:56.947Z" },
 ]
 
 [[package]]
@@ -5581,7 +5448,7 @@ name = "sympy"
 version = "1.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mpmath", marker = "sys_platform != 'linux'" },
+    { name = "mpmath" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
 wheels = [
@@ -5605,7 +5472,8 @@ dependencies = [
     { name = "absl-py" },
     { name = "grpcio" },
     { name = "markdown" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "pillow" },
     { name = "protobuf" },
@@ -5627,63 +5495,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363, upload-time = "2023-10-23T21:23:35.583Z" },
 ]
 
-[[package]]
-name = "tensorstore"
-version = "0.1.74"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "numpy", marker = "python_full_version >= '3.13'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3c/b9/ea25aba62c688a87d7d7d9cc5926d602e2f9e84fa72586825486fb180b7e/tensorstore-0.1.74.tar.gz", hash = "sha256:a062875f27283d30ce4959c408c253ecb336fce8e3f9837c064e3d30cda79203", size = 6795605, upload-time = "2025-04-24T15:42:18.829Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/f2/20/1e7e776dc30f2f07416223c12f9ad244ec539af5fa1fbef9320812a9a3b6/tensorstore-0.1.74-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:edfae80aceb05640ac2209a11a4b76cecd5d9c4a95c01ede8c89c8edaa90f9d5", size = 15292660, upload-time = "2025-04-24T15:41:18.253Z" },
-    { url = "https://files.pythonhosted.org/packages/76/cc/81bf2d6a4caa239d38905b439864d3a8bf06b27d6d31bb2396e3f4f5cc55/tensorstore-0.1.74-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab985d767d53e9478987c23dc7aea8f7e8aed2ef90ec8f7f939e8b399667feb1", size = 13260438, upload-time = "2025-04-24T15:41:22.596Z" },
-    { url = "https://files.pythonhosted.org/packages/88/4c/a26c4c8b8e7573d2b552505cd46a658b9a68a80d88e9d3c68f16d10e4d62/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d16d1181c292ea065ebd203e823420c65e365d0407eea8f0a3dd82995da0cc65", size = 17041531, upload-time = "2025-04-24T15:41:25.492Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/a9/3859b1b497dacf2093e196e1d4ed3b95e8553c7d7c9fe1f88216c72253a9/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f327e813152705b5297f251824a91106e17a06fd2f6b5f6e94c6401c5937da8c", size = 18392852, upload-time = "2025-04-24T15:41:28.136Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/3b/b7494ea0a37dd4cd3721f104fc52d4c953354b801eb1adf08e40bc08aaa0/tensorstore-0.1.74-cp310-cp310-win_amd64.whl", hash = "sha256:e56e9690cc20463951a52a6908e18056a93ce5bcd4a881834e2b5962801a1125", size = 12429998, upload-time = "2025-04-24T15:41:30.794Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/3e/d67bb3d9bb7409469d15fb90ef5756e6ac8b835af7f27c02fc542c4b4059/tensorstore-0.1.74-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8353e619d9140ca50fc0cb5b846e07c68462dd5015b4714752a0a664e48a03d3", size = 15294582, upload-time = "2025-04-24T15:41:33.794Z" },
-    { url = "https://files.pythonhosted.org/packages/01/f4/49cb5ea8e63303fcb0a6ebf0ed546aaec63982a4abca0e9801da5e3a24e3/tensorstore-0.1.74-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3ad1bfbb257ab84de1a5c9b79a60cebb5fbb7a411ddb1c246c21c9795789ba1", size = 13261395, upload-time = "2025-04-24T15:41:36.372Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/7b/9c12d4687e6ff19222f12719286c13a546f1714e5dbed75d52a4267534ed/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3ad9daf4c757db41ad091a1a5502807baeb848be0937986d8766049c39c8466", size = 17042621, upload-time = "2025-04-24T15:41:39.284Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/07/cf0dc4540a78bc715fbcf4417c5dc708f3d12ed1664bf117f22463f411fc/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a35364804e7d71bf5e86d2dae4de04c90249b61ff71448b9713b4e72b2389bd", size = 18393581, upload-time = "2025-04-24T15:41:42.554Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/42/edf004c5a101e021f052ea3564250d773d7cf6458f92934456ffa967383f/tensorstore-0.1.74-cp311-cp311-win_amd64.whl", hash = "sha256:15dcb6ce282e32d005caad34d595b0be070947578448a2861c63fdd608fc7394", size = 12431849, upload-time = "2025-04-24T15:41:45.263Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/14/2e6d1cad744af9e9a1a78d881a908a859ad95b61b15de10397069f55fbd8/tensorstore-0.1.74-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:7218722ee5d74e4d01f357917d3b1b7b1d6b1c068aa73e3d801cb3d58fc45116", size = 15334307, upload-time = "2025-04-24T15:41:48.315Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/ac/8d572b8c6d689eb50db0252e9d35ee6278a6aed481b64d7e025cf51e32c4/tensorstore-0.1.74-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6926554a8633d0210bdba619d3996fff6a6af4214237fbca626e6ddfcc8ea39", size = 13288669, upload-time = "2025-04-24T15:41:50.808Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/6c/3e76d614ad70b61670686d91abaa3ddee6b01255bf2b40f050beb15b7970/tensorstore-0.1.74-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d584e468eb4ef8195f5d21a9da4780cf96c6074b87ef219b43a89efce3d503ca", size = 17031720, upload-time = "2025-04-24T15:41:55.092Z" },
-    { url = "https://files.pythonhosted.org/packages/31/f3/09d7c3ad7c9517f89b5be9b4460b83333e98dce1c9ab0a52464ded0bab67/tensorstore-0.1.74-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0af2225431d59f8a2bb4db4c1519252f10ee407e6550875d78212d3d34ee743", size = 18378829, upload-time = "2025-04-24T15:41:58.167Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/f2/45ece38705280ed9ebf4ccaf084ed1e76e35b1eeec8c510e589978ac8dcd/tensorstore-0.1.74-cp312-cp312-win_amd64.whl", hash = "sha256:4e35f3679873cdc488aae20b9ae2cea4589c7b147a80edb07eb3f09eba47d43d", size = 12432300, upload-time = "2025-04-24T15:42:00.761Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/e9/a08c6a6eb7d6b4b26053d4575196a06c6fccf4e89f9bc625f81e7c91bb5d/tensorstore-0.1.74-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:f7d2c80de9ab352ca14aeca798d6650c5670725e6f8eac73f4fcc8f3147ca614", size = 15334469, upload-time = "2025-04-24T15:42:03.731Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/a9/64b90c6e66e0b8043e641090144c6614b0c78d9a719b9110d953d13a516d/tensorstore-0.1.74-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ceef7d2dcfd1caf61356f7eeb9a37896b4825b4be2750b00615cf5fb1ae47a8b", size = 13288791, upload-time = "2025-04-24T15:42:06.145Z" },
-    { url = "https://files.pythonhosted.org/packages/62/e8/226cfc25d7eac00e783ff2ee4994830c4a42cd8690e207c4a8b93210f3d9/tensorstore-0.1.74-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e71637002a806bc1b0f0f05556d1c33493a43f3ab35f9632b3d48855677d93dc", size = 17031815, upload-time = "2025-04-24T15:42:09.239Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/09/dce8a0942d84f6bb039b5ea3e8bc6a479b1a9535cd216b0d42dd03c4f761/tensorstore-0.1.74-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c799edf9000aee68d6676e3d2f73d4e1a56fc817c47e150732f6d3bd2b1ef46d", size = 18378091, upload-time = "2025-04-24T15:42:13.546Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/23/5218575d25de9d8debfb3faf290a1e3b9a7b6be9e77ba07ff3a63a0bc899/tensorstore-0.1.74-cp313-cp313-win_amd64.whl", hash = "sha256:5da86437ffa1ee0f0c590c38daa2f4b548890ce66b1f470ac98714cb0eabdbf5", size = 12432635, upload-time = "2025-04-24T15:42:16.275Z" },
-]
-
 [[package]]
 name = "tensorstore"
 version = "0.1.78"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
     "python_full_version < '3.11' and sys_platform == 'linux'",
     "python_full_version < '3.11' and sys_platform != 'linux'",
 ]
 dependencies = [
-    { name = "ml-dtypes", version = "0.5.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.13'" },
-    { name = "numpy", marker = "python_full_version < '3.13'" },
+    { name = "ml-dtypes", marker = "python_full_version < '3.11'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/9f/ee/05eb424437f4db63331c90e4605025eedc0f71da3faff97161d5d7b405af/tensorstore-0.1.78.tar.gz", hash = "sha256:e26074ffe462394cf54197eb76d6569b500f347573cd74da3f4dd5f510a4ad7c", size = 6913502, upload-time = "2025-10-06T17:44:29.649Z" }
 wheels = [
@@ -5709,6 +5531,48 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/db/a2/dbd1af0e97d5d549051309d72c6e3f2fe81fae636f9db3692d21adc9c731/tensorstore-0.1.78-cp313-cp313-win_amd64.whl", hash = "sha256:e0073de8fa3074bc4cc92ced0210310fd89851899faf42a5ba256f0ba87d095c", size = 12711250, upload-time = "2025-10-06T17:44:27.926Z" },
 ]
 
+[[package]]
+name = "tensorstore"
+version = "0.1.79"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'linux'",
+    "python_full_version == '3.13.*' and sys_platform == 'linux'",
+    "python_full_version == '3.12.*' and sys_platform == 'linux'",
+    "python_full_version >= '3.14' and sys_platform != 'linux'",
+    "python_full_version == '3.13.*' and sys_platform != 'linux'",
+    "python_full_version == '3.12.*' and sys_platform != 'linux'",
+    "python_full_version == '3.11.*' and sys_platform == 'linux'",
+    "python_full_version == '3.11.*' and sys_platform != 'linux'",
+]
+dependencies = [
+    { name = "ml-dtypes", marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/2c/50ab489a0862ca88d2d766130a6fec45ccd5174f0e04081d8b7b07a8aedd/tensorstore-0.1.79.tar.gz", hash = "sha256:8dad44a8a7f2952a5d0030a8bd868b3cfdff048bd40ab53e7226f3d8b0881c5e", size = 7075782, upload-time = "2025-11-11T22:05:23.824Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/a9/1695d7ea197c4568c2f02f34b203eef702ec8080422331f00a65c6fb2a37/tensorstore-0.1.79-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:11a2c62694ea9c21770bc5a09938d3d15c4b9662b738ae6e1e513c26ed96251a", size = 16466511, upload-time = "2025-11-11T22:04:18.614Z" },
+    { url = "https://files.pythonhosted.org/packages/db/0e/5ce8a615c7f9ad7cf8ed4ac6e182fe0ef46fd06fef89757e49ba84a6ba9e/tensorstore-0.1.79-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e152d334bf34fbabdfe8e5bc35b87d1f9947065924ff83c29e659308b36e948", size = 14499810, upload-time = "2025-11-11T22:04:21.725Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/29/2cb9552138fe84ab29421489121350e4af0502eafff31ccd9017490be0d8/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4230b8fd29795e88e441f749d881973eca8dadf33c5262b367839fb8891f79b", size = 18937510, upload-time = "2025-11-11T22:04:24.221Z" },
+    { url = "https://files.pythonhosted.org/packages/42/70/d2a672a93faebdd176cd8541405cd5614b14d3d8dc812fbeaf2cf46d390a/tensorstore-0.1.79-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83072ee0e551d6dca582e154b64c8b8066d276ec0759784e3149c28212a61f18", size = 20910324, upload-time = "2025-11-11T22:04:26.769Z" },
+    { url = "https://files.pythonhosted.org/packages/91/d5/7958cbfb614c4ffa5070ae9575874d46937067c0d81a7739e67fb1d62de5/tensorstore-0.1.79-cp311-cp311-win_amd64.whl", hash = "sha256:6c98c6b74c00e00eba7969292144e471d5c45d67088f0dc08e3a4c60a15ee191", size = 13206191, upload-time = "2025-11-11T22:04:29.254Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/a2/a77be16b4a882ace36da0748305795f35306bdad568472f208bd89b96b9d/tensorstore-0.1.79-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:71aa9b45436d888c37b965f7b71195916d15438119b7dccb66a3b0776bfba367", size = 16485740, upload-time = "2025-11-11T22:04:33.478Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/e4/7fe268ec41aa70b71a1c56b1ec83346fbcbf12f4bfbefc79d14fb9c03408/tensorstore-0.1.79-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:108c0e867aa2c87d4982cc6325a2de0c4f5bd63c2bea18adb193a370c40594ce", size = 14508736, upload-time = "2025-11-11T22:04:38.613Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/f1/b1248dae02598ce534834413e841f915a32ab185c36ecd05e4c67bdc8d19/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:debd435042c00be68ba1fb3cf59325a7babb3f4a3cf4744c87dde346802cbbb4", size = 18947817, upload-time = "2025-11-11T22:04:40.768Z" },
+    { url = "https://files.pythonhosted.org/packages/87/4a/60e234147570e21bbab4ac70ab79dd794a5ef9a4945d36c34c1914a73205/tensorstore-0.1.79-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:608f7178ec6e4e4a3c26545b0a44f44bf83438d04bf2d960cd0e7699eaa99ef6", size = 20929832, upload-time = "2025-11-11T22:04:43.613Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/48/0531868bce12a2f520002e810d4200ec6f01ba33a2f27b6bd7289fbc197b/tensorstore-0.1.79-cp312-cp312-win_amd64.whl", hash = "sha256:a071c6c255b7e412957a6aa563bc4250242c7894edad06ae6358e3d30b7d88ce", size = 13211970, upload-time = "2025-11-11T22:04:46.179Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/0b/54a44e55836d8e8f576343134c0e3db71c6c837d39a0ac44699aba5b01df/tensorstore-0.1.79-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:1e8e2d098829919caac6a62cf568902e34789069ceddb28497d6e36ebcb95c0b", size = 16485855, upload-time = "2025-11-11T22:04:48.734Z" },
+    { url = "https://files.pythonhosted.org/packages/04/59/cadb9a45896d480882476df4759cda1659c70669aff87a4d5a4a07ded084/tensorstore-0.1.79-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:29cf4336153af136ac8ac528e2ed46df19367edae7e14e37bca1a8b7c4848ef2", size = 14508277, upload-time = "2025-11-11T22:04:50.775Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/cb/3647bdd03c7692882ebc10c19df9ede49f290c216b2906f785edbdb53ef1/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:94d8fc9df1721b0287046aca7209fd5040889cad4202e7b73a1fdb77cd9b71c6", size = 18949307, upload-time = "2025-11-11T22:04:53.145Z" },
+    { url = "https://files.pythonhosted.org/packages/20/a0/f91ac492cf2ee9f7541aefaaed4ad1258e73e33f3cd3e06cdce5859431db/tensorstore-0.1.79-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9f2dc3342e4686af98f6e259dc9fb377f1bf657b649c247bf6647bbe4f98090", size = 20930427, upload-time = "2025-11-11T22:04:55.353Z" },
+    { url = "https://files.pythonhosted.org/packages/69/a6/752fd11747eb9fead715b02d389da7fb180a56172b885de0b48b20237d1e/tensorstore-0.1.79-cp313-cp313-win_amd64.whl", hash = "sha256:0fd6165f3df49abc7c9de029b2b72d74bebd2ff2481a5ced003607eb61c56d3e", size = 13212196, upload-time = "2025-11-11T22:05:00.451Z" },
+    { url = "https://files.pythonhosted.org/packages/46/57/1649019893accb3f195780fec55b8bf6793343faf140040bc73f1c28d6a5/tensorstore-0.1.79-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:6f8f5a940eab434a951c2dadcc7c0516c7bef6d8b7a7144054f7a0c56152b5f5", size = 16488849, upload-time = "2025-11-11T22:05:03.014Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/23/2668cb120e855a6a7a8a5eb0eba30e2e7020da932a4d3fa13c6ee3c41f9f/tensorstore-0.1.79-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:97756d2cba3c5ce21e15602c2af5a02521cc0ecda7f9fb6d18da2f3bd51827f4", size = 14511448, upload-time = "2025-11-11T22:05:05.58Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/0e/c38f079f3933cc284aab53d52976f6cb4f1ad43bb6a704ac27e0b710f176/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:847982652273fb7b2d694b789205747aaf3e50ae64738c5cb7b5eb03d86a9947", size = 18949282, upload-time = "2025-11-11T22:05:07.562Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/99/03479deea5bfd27a0d8a8c75d5f1d85417a7bbc9c6c7a90fb85b4a4e347a/tensorstore-0.1.79-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7af9422269c2bfcdecf9dd55309060665ab9c2d7f6c892377ed32c032400feea", size = 20931601, upload-time = "2025-11-11T22:05:10.098Z" },
+    { url = "https://files.pythonhosted.org/packages/26/36/2617edf6c6d6fc73b3ff96d9d0b97332adf0d0c56fa2014a226bf4f7dfa6/tensorstore-0.1.79-cp314-cp314-win_amd64.whl", hash = "sha256:bbd8c1ab7d2e3c03ded3d40bb373ee9a67668e33a564484927865ce43b210386", size = 13599766, upload-time = "2025-11-11T22:05:12.265Z" },
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.12.0"
@@ -5864,48 +5728,63 @@ wheels = [
 
 [[package]]
 name = "torch"
-version = "2.9.0"
+version = "2.9.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "filelock", marker = "sys_platform != 'linux'" },
-    { name = "fsspec", marker = "sys_platform != 'linux'" },
-    { name = "jinja2", marker = "sys_platform != 'linux'" },
-    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "setuptools", marker = "python_full_version >= '3.12' and sys_platform != 'linux'" },
-    { name = "sympy", marker = "sys_platform != 'linux'" },
+    { name = "filelock" },
+    { name = "fsspec" },
+    { name = "jinja2" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvshmem-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "setuptools", marker = "python_full_version >= '3.12'" },
+    { name = "sympy" },
     { name = "triton", marker = "sys_platform == 'never'" },
-    { name = "typing-extensions", marker = "sys_platform != 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/bb/86/245c240d2138c17ed572c943c289056c2721abab70810d772c6bf5495b28/torch-2.9.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:030bbfe367379ae6a4ae4042b6c44da25383343b8b3c68abaa9c7231efbaf2dd", size = 104213554, upload-time = "2025-10-15T15:45:59.798Z" },
-    { url = "https://files.pythonhosted.org/packages/58/1d/fd1e88ae0948825efcab7dd66d12bec23f05d4d38ed81573c8d453c14c06/torch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:51cb63902182a78e90886e8068befd8ea102af4b00e420263591a3d70c7d3c6c", size = 899795167, upload-time = "2025-10-15T15:47:12.695Z" },
-    { url = "https://files.pythonhosted.org/packages/63/5a/496197b45c14982bef4e079b24c61dc108e3ab0d0cc9718dba9f54f45a46/torch-2.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:3f6aad4d2f0ee2248bac25339d74858ff846c3969b27d14ac235821f055af83d", size = 109310314, upload-time = "2025-10-15T15:46:16.633Z" },
-    { url = "https://files.pythonhosted.org/packages/58/b0/2b4e647b0fc706e88eb6c253d05511865578f5f67b55fad639bf3272a4a1/torch-2.9.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:413e1654c9203733138858780e184d9fc59442f0b3b209e16f39354eb893db9b", size = 74452019, upload-time = "2025-10-15T15:46:04.296Z" },
-    { url = "https://files.pythonhosted.org/packages/58/fe/334225e6330e672b36aef23d77451fa906ea12881570c08638a91331a212/torch-2.9.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c596708b5105d0b199215acf0c9be7c1db5f1680d88eddadf4b75a299259a677", size = 104230578, upload-time = "2025-10-15T15:46:08.182Z" },
-    { url = "https://files.pythonhosted.org/packages/05/cc/49566caaa218872ec9a2912456f470ff92649894a4bc2e5274aa9ef87c4a/torch-2.9.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:51de31219c97c51cf4bf2be94d622e3deb5dcc526c6dc00e97c17eaec0fc1d67", size = 899815990, upload-time = "2025-10-15T15:48:03.336Z" },
-    { url = "https://files.pythonhosted.org/packages/74/25/e9ab21d5925b642d008f139d4a3c9664fc9ee1faafca22913c080cc4c0a5/torch-2.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd515c70059afd95f48b8192733764c08ca37a1d19803af6401b5ecad7c8676e", size = 109313698, upload-time = "2025-10-15T15:46:12.425Z" },
-    { url = "https://files.pythonhosted.org/packages/b3/b7/205ef3e94de636feffd64b28bb59a0dfac0771221201b9871acf9236f5ca/torch-2.9.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:614a185e4986326d526a91210c8fc1397e76e8cfafa78baf6296a790e53a9eec", size = 74463678, upload-time = "2025-10-15T15:46:29.779Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/d3/3985739f3b8e88675127bf70f82b3a48ae083e39cda56305dbd90398fec0/torch-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e5f7af1dc4c0a7c4a260c2534f41ddaf209714f7c89145e644c44712fbd6b642", size = 104107898, upload-time = "2025-10-15T15:46:20.883Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/4b/f4bb2e6c25d0272f798cd6d7a04ed315da76cec68c602d87040c7847287f/torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:01cff95ecd9a212ea2f141db28acccdceb6a4c54f64e6c51091146f5e2a772c6", size = 899738273, upload-time = "2025-10-15T15:50:04.188Z" },
-    { url = "https://files.pythonhosted.org/packages/66/11/c1c5ba6691cda6279087c35bd626536e4fd29521fe740abf5008377a9a02/torch-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4582b162f541651f0cb184d3e291c05c2f556c7117c64a9873e2ee158d40062b", size = 109280887, upload-time = "2025-10-15T15:46:26.228Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" },
-    { url = "https://files.pythonhosted.org/packages/c2/1c/90eb13833cdf4969ea9707586d7b57095c3b6e2b223a7256bf111689bcb8/torch-2.9.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c30a17fc83eeab346913e237c64b15b5ba6407fff812f6c541e322e19bc9ea0e", size = 104111330, upload-time = "2025-10-15T15:46:35.238Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/21/2254c54b8d523592c25ef4434769aa23e29b1e6bf5f4c0ad9e27bf442927/torch-2.9.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8f25033b8667b57857dfd01458fbf2a9e6a6df1f8def23aef0dc46292f6aa642", size = 899750243, upload-time = "2025-10-15T15:48:57.459Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/a5/5cb94fa4fd1e78223455c23c200f30f6dc10c6d4a2bcc8f6e7f2a2588370/torch-2.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:d037f1b4ffd25013be4a7bf3651a0a910c68554956c7b2c92ebe87c76475dece", size = 109284513, upload-time = "2025-10-15T15:46:45.061Z" },
-    { url = "https://files.pythonhosted.org/packages/66/e8/fc414d8656250ee46120b44836ffbb3266343db424b3e18ca79ebbf69d4f/torch-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e4e5b5cba837a2a8d1a497ba9a58dae46fa392593eaa13b871c42f71847503a5", size = 74830362, upload-time = "2025-10-15T15:46:48.983Z" },
-    { url = "https://files.pythonhosted.org/packages/ed/5f/9474c98fc5ae0cd04b9466035428cd360e6611a86b8352a0fc2fa504acdc/torch-2.9.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:64693568f5dc4dbd5f880a478b1cea0201cc6b510d91d1bc54fea86ac5d1a637", size = 104144940, upload-time = "2025-10-15T15:47:29.076Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/5a/8e0c1cf57830172c109d4bd6be2708cabeaf550983eee7029291322447a0/torch-2.9.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:f8ed31ddd7d10bfb3fbe0b9fe01b1243577f13d75e6f4a0839a283915ce3791e", size = 899744054, upload-time = "2025-10-15T15:48:29.864Z" },
-    { url = "https://files.pythonhosted.org/packages/6d/28/82c28b30fcb4b7c9cdd995763d18bbb830d6521356712faebbad92ffa61d/torch-2.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:eff527d4e4846e6f70d2afd8058b73825761203d66576a7e04ea2ecfebcb4ab8", size = 109517546, upload-time = "2025-10-15T15:47:33.395Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/c3/a91f96ec74347fa5fd24453fa514bc61c61ecc79196fa760b012a1873d96/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:f8877779cf56d1ce431a7636703bdb13307f5960bb1af49716d8b179225e0e6a", size = 74480732, upload-time = "2025-10-15T15:47:38.002Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/73/9f70af34b334a7e0ef496ceec96b7ec767bd778ea35385ce6f77557534d1/torch-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7e614fae699838038d888729f82b687c03413c5989ce2a9481f9a7e7a396e0bb", size = 74433037, upload-time = "2025-10-15T15:47:41.894Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/84/37cf88625901934c97109e583ecc21777d21c6f54cda97a7e5bbad1ee2f2/torch-2.9.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:dfb5b8cd310ba3436c7e14e8b7833ef658cf3045e50d2bdaed23c8fc517065eb", size = 104116482, upload-time = "2025-10-15T15:47:46.266Z" },
-    { url = "https://files.pythonhosted.org/packages/56/8e/ca8b17866943a8d4f4664d402ea84210aa274588b4c5d89918f5caa24eec/torch-2.9.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b3d29524993a478e46f5d598b249cd824b7ed98d7fba538bd9c4cde6c803948f", size = 899746916, upload-time = "2025-10-15T15:50:40.294Z" },
-    { url = "https://files.pythonhosted.org/packages/43/65/3b17c0fbbdab6501c5b320a52a648628d0d44e7379f64e27d9eef701b6bf/torch-2.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:71c7578984f5ec0eb645eb4816ac8435fcf3e3e2ae1901bcd2f519a9cafb5125", size = 109275151, upload-time = "2025-10-15T15:49:20.715Z" },
-    { url = "https://files.pythonhosted.org/packages/83/36/74f8c051f785500396e42f93542422422dfd874a174f21f8d955d36e5d64/torch-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:71d9309aee457bbe0b164bce2111cd911c4ed4e847e65d5077dbbcd3aba6befc", size = 74823353, upload-time = "2025-10-15T15:49:16.59Z" },
-    { url = "https://files.pythonhosted.org/packages/62/51/dc3b4e2f9ba98ae27238f0153ca098bf9340b2dafcc67fde645d496dfc2a/torch-2.9.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c08fb654d783899e204a32cca758a7ce8a45b2d78eeb89517cc937088316f78e", size = 104140340, upload-time = "2025-10-15T15:50:19.67Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/8d/b00657f8141ac16af7bb6cda2e67de18499a3263b78d516b9a93fcbc98e3/torch-2.9.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ec8feb0099b2daa5728fbc7abb0b05730fd97e0f359ff8bda09865aaa7bd7d4b", size = 899731750, upload-time = "2025-10-15T15:49:36.673Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/29/bd361e0cbb2c79ce6450f42643aaf6919956f89923a50571b0ebfe92d142/torch-2.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:695ba920f234ad4170c9c50e28d56c848432f8f530e6bc7f88fcb15ddf338e75", size = 109503850, upload-time = "2025-10-15T15:50:24.118Z" },
+    { name = "typing-extensions" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5f/56/9577683b23072075ed2e40d725c52c2019d71a972fab8e083763da8e707e/torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e", size = 104207681, upload-time = "2025-11-12T15:19:56.48Z" },
+    { url = "https://files.pythonhosted.org/packages/38/45/be5a74f221df8f4b609b78ff79dc789b0cc9017624544ac4dd1c03973150/torch-2.9.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:9fd35c68b3679378c11f5eb73220fdcb4e6f4592295277fbb657d31fd053237c", size = 899794036, upload-time = "2025-11-12T15:21:01.886Z" },
+    { url = "https://files.pythonhosted.org/packages/67/95/a581e8a382596b69385a44bab2733f1273d45c842f5d4a504c0edc3133b6/torch-2.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:2af70e3be4a13becba4655d6cc07dcfec7ae844db6ac38d6c1dafeb245d17d65", size = 110969861, upload-time = "2025-11-12T15:21:30.145Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/51/1756dc128d2bf6ea4e0a915cb89ea5e730315ff33d60c1ff56fd626ba3eb/torch-2.9.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:a83b0e84cc375e3318a808d032510dde99d696a85fe9473fc8575612b63ae951", size = 74452222, upload-time = "2025-11-12T15:20:46.223Z" },
+    { url = "https://files.pythonhosted.org/packages/15/db/c064112ac0089af3d2f7a2b5bfbabf4aa407a78b74f87889e524b91c5402/torch-2.9.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:62b3fd888277946918cba4478cf849303da5359f0fb4e3bfb86b0533ba2eaf8d", size = 104220430, upload-time = "2025-11-12T15:20:31.705Z" },
+    { url = "https://files.pythonhosted.org/packages/56/be/76eaa36c9cd032d3b01b001e2c5a05943df75f26211f68fae79e62f87734/torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d033ff0ac3f5400df862a51bdde9bad83561f3739ea0046e68f5401ebfa67c1b", size = 899821446, upload-time = "2025-11-12T15:20:15.544Z" },
+    { url = "https://files.pythonhosted.org/packages/47/cc/7a2949e38dfe3244c4df21f0e1c27bce8aedd6c604a587dd44fc21017cb4/torch-2.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:0d06b30a9207b7c3516a9e0102114024755a07045f0c1d2f2a56b1819ac06bcb", size = 110973074, upload-time = "2025-11-12T15:21:39.958Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/ce/7d251155a783fb2c1bb6837b2b7023c622a2070a0a72726ca1df47e7ea34/torch-2.9.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:52347912d868653e1528b47cafaf79b285b98be3f4f35d5955389b1b95224475", size = 74463887, upload-time = "2025-11-12T15:20:36.611Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/27/07c645c7673e73e53ded71705045d6cb5bae94c4b021b03aa8d03eee90ab/torch-2.9.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:da5f6f4d7f4940a173e5572791af238cb0b9e21b1aab592bd8b26da4c99f1cd6", size = 104126592, upload-time = "2025-11-12T15:20:41.62Z" },
+    { url = "https://files.pythonhosted.org/packages/19/17/e377a460603132b00760511299fceba4102bd95db1a0ee788da21298ccff/torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:27331cd902fb4322252657f3902adf1c4f6acad9dcad81d8df3ae14c7c4f07c4", size = 899742281, upload-time = "2025-11-12T15:22:17.602Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/1a/64f5769025db846a82567fa5b7d21dba4558a7234ee631712ee4771c436c/torch-2.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:81a285002d7b8cfd3fdf1b98aa8df138d41f1a8334fd9ea37511517cedf43083", size = 110940568, upload-time = "2025-11-12T15:21:18.689Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/ab/07739fd776618e5882661d04c43f5b5586323e2f6a2d7d84aac20d8f20bd/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:c0d25d1d8e531b8343bea0ed811d5d528958f1dcbd37e7245bc686273177ad7e", size = 74479191, upload-time = "2025-11-12T15:21:25.816Z" },
+    { url = "https://files.pythonhosted.org/packages/20/60/8fc5e828d050bddfab469b3fe78e5ab9a7e53dda9c3bdc6a43d17ce99e63/torch-2.9.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c29455d2b910b98738131990394da3e50eea8291dfeb4b12de71ecf1fdeb21cb", size = 104135743, upload-time = "2025-11-12T15:21:34.936Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/b7/6d3f80e6918213babddb2a37b46dbb14c15b14c5f473e347869a51f40e1f/torch-2.9.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:524de44cd13931208ba2c4bde9ec7741fd4ae6bfd06409a604fc32f6520c2bc9", size = 899749493, upload-time = "2025-11-12T15:24:36.356Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/47/c7843d69d6de8938c1cbb1eba426b1d48ddf375f101473d3e31a5fc52b74/torch-2.9.1-cp313-cp313-win_amd64.whl", hash = "sha256:545844cc16b3f91e08ce3b40e9c2d77012dd33a48d505aed34b7740ed627a1b2", size = 110944162, upload-time = "2025-11-12T15:21:53.151Z" },
+    { url = "https://files.pythonhosted.org/packages/28/0e/2a37247957e72c12151b33a01e4df651d9d155dd74d8cfcbfad15a79b44a/torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5be4bf7496f1e3ffb1dd44b672adb1ac3f081f204c5ca81eba6442f5f634df8e", size = 74830751, upload-time = "2025-11-12T15:21:43.792Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/f7/7a18745edcd7b9ca2381aa03353647bca8aace91683c4975f19ac233809d/torch-2.9.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:30a3e170a84894f3652434b56d59a64a2c11366b0ed5776fab33c2439396bf9a", size = 104142929, upload-time = "2025-11-12T15:21:48.319Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/dd/f1c0d879f2863ef209e18823a988dc7a1bf40470750e3ebe927efdb9407f/torch-2.9.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:8301a7b431e51764629208d0edaa4f9e4c33e6df0f2f90b90e261d623df6a4e2", size = 899748978, upload-time = "2025-11-12T15:23:04.568Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/9f/6986b83a53b4d043e36f3f898b798ab51f7f20fdf1a9b01a2720f445043d/torch-2.9.1-cp313-cp313t-win_amd64.whl", hash = "sha256:2e1c42c0ae92bf803a4b2409fdfed85e30f9027a66887f5e7dcdbc014c7531db", size = 111176995, upload-time = "2025-11-12T15:22:01.618Z" },
+    { url = "https://files.pythonhosted.org/packages/40/60/71c698b466dd01e65d0e9514b5405faae200c52a76901baf6906856f17e4/torch-2.9.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:2c14b3da5df416cf9cb5efab83aa3056f5b8cd8620b8fde81b4987ecab730587", size = 74480347, upload-time = "2025-11-12T15:21:57.648Z" },
+    { url = "https://files.pythonhosted.org/packages/48/50/c4b5112546d0d13cc9eaa1c732b823d676a9f49ae8b6f97772f795874a03/torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1edee27a7c9897f4e0b7c14cfc2f3008c571921134522d5b9b5ec4ebbc69041a", size = 74433245, upload-time = "2025-11-12T15:22:39.027Z" },
+    { url = "https://files.pythonhosted.org/packages/81/c9/2628f408f0518b3bae49c95f5af3728b6ab498c8624ab1e03a43dd53d650/torch-2.9.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:19d144d6b3e29921f1fc70503e9f2fc572cde6a5115c0c0de2f7ca8b1483e8b6", size = 104134804, upload-time = "2025-11-12T15:22:35.222Z" },
+    { url = "https://files.pythonhosted.org/packages/28/fc/5bc91d6d831ae41bf6e9e6da6468f25330522e92347c9156eb3f1cb95956/torch-2.9.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:c432d04376f6d9767a9852ea0def7b47a7bbc8e7af3b16ac9cf9ce02b12851c9", size = 899747132, upload-time = "2025-11-12T15:23:36.068Z" },
+    { url = "https://files.pythonhosted.org/packages/63/5d/e8d4e009e52b6b2cf1684bde2a6be157b96fb873732542fb2a9a99e85a83/torch-2.9.1-cp314-cp314-win_amd64.whl", hash = "sha256:d187566a2cdc726fc80138c3cdb260970fab1c27e99f85452721f7759bbd554d", size = 110934845, upload-time = "2025-11-12T15:22:48.367Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/b2/2d15a52516b2ea3f414643b8de68fa4cb220d3877ac8b1028c83dc8ca1c4/torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:cb10896a1f7fedaddbccc2017ce6ca9ecaaf990f0973bdfcf405439750118d2c", size = 74823558, upload-time = "2025-11-12T15:22:43.392Z" },
+    { url = "https://files.pythonhosted.org/packages/86/5c/5b2e5d84f5b9850cd1e71af07524d8cbb74cba19379800f1f9f7c997fc70/torch-2.9.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0a2bd769944991c74acf0c4ef23603b9c777fdf7637f115605a4b2d8023110c7", size = 104145788, upload-time = "2025-11-12T15:23:52.109Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/8c/3da60787bcf70add986c4ad485993026ac0ca74f2fc21410bc4eb1bb7695/torch-2.9.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:07c8a9660bc9414c39cac530ac83b1fb1b679d7155824144a40a54f4a47bfa73", size = 899735500, upload-time = "2025-11-12T15:24:08.788Z" },
+    { url = "https://files.pythonhosted.org/packages/db/2b/f7818f6ec88758dfd21da46b6cd46af9d1b3433e53ddbb19ad1e0da17f9b/torch-2.9.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c88d3299ddeb2b35dcc31753305612db485ab6f1823e37fb29451c8b2732b87e", size = 111163659, upload-time = "2025-11-12T15:23:20.009Z" },
 ]
 
 [[package]]
@@ -5913,7 +5792,8 @@ name = "torchprofile"
 version = "0.0.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "torchvision", marker = "sys_platform == 'never'" },
 ]
@@ -5924,42 +5804,43 @@ wheels = [
 
 [[package]]
 name = "torchvision"
-version = "0.24.0"
+version = "0.24.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy", marker = "sys_platform != 'linux'" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'linux' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pillow", marker = "sys_platform != 'linux'" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/63/5b/1404eeab00819df71a30e916c2081654366741f7838fcc4fff86b7bd9e7e/torchvision-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e8d5e667deff87bd66d26df6d225f46224bb0782d4f3f8f5d2f3068b5fd4492", size = 1891723, upload-time = "2025-10-15T15:51:08.5Z" },
-    { url = "https://files.pythonhosted.org/packages/88/e3/1b003ecd52bd721f8304aeb66691edfbc2002747ec83d36188ad6abab506/torchvision-0.24.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:a110a51c75e89807a8382b0d8034f5e180fb9319570be3389ffd3d4ac4fd57a9", size = 2418988, upload-time = "2025-10-15T15:51:25.195Z" },
-    { url = "https://files.pythonhosted.org/packages/56/2e/3c19a35e62da0f606baf8f6e2ceeab1eb66aaa2f84c6528538b06b416d54/torchvision-0.24.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:81d5b12a6df1bb2cc8bdbad837b637d6ea446f2866e6d94f1b5d478856331be3", size = 8046769, upload-time = "2025-10-15T15:51:15.221Z" },
-    { url = "https://files.pythonhosted.org/packages/e0/1d/e7ab614a1ace820a2366eab1532679fbe81bd9501ffd6a1b7be14936366d/torchvision-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:0839dbb305d34671f5a64f558782095134b04bbeff8b90f11eb80515d7d50092", size = 3686529, upload-time = "2025-10-15T15:51:20.982Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/17/54ed2ec6944ea972b461a86424c8c7f98835982c90cbc45bf59bd962863a/torchvision-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f771cf918351ad509a28488be475f3e9cc71a750d6b1467842bfb64863a5e986", size = 1891719, upload-time = "2025-10-15T15:51:10.384Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/07/0cd6776eee784742ad3cb2bfd3295383d84cb2f9e87386119333d1587f0f/torchvision-0.24.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbd63bf4ebff84c48c50123eba90526cc9f794fe45bc9f5dd07cec19e8c62bce", size = 2420513, upload-time = "2025-10-15T15:51:18.087Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/f4/6026c08011ddcefcbc14161c5aa9dce55c35c6b045e04ef0952e88bf4594/torchvision-0.24.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:78fe414b3bb6dbf7e6f6da6f733ba96881f6b29a9b997228de7c5f603e5ed940", size = 8048018, upload-time = "2025-10-15T15:51:13.579Z" },
-    { url = "https://files.pythonhosted.org/packages/2f/b4/362b4e67ed87cee0fb4f8f0363a852eaeef527968bf62c07ed56f764d729/torchvision-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:629584b94e52f32a6278f2a35d85eeaae95fcc38730fcb765064f26c3c96df5d", size = 4027686, upload-time = "2025-10-15T15:51:19.189Z" },
-    { url = "https://files.pythonhosted.org/packages/47/ef/81e4e69e02e2c4650b30e8c11c8974f946682a30e0ab7e9803a831beff76/torchvision-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c61d40bcd2e2451e932902a702ad495ba1ec6f279e90b1e15cef2bb55dc911e2", size = 1891726, upload-time = "2025-10-15T15:51:16.977Z" },
-    { url = "https://files.pythonhosted.org/packages/00/7b/e3809b3302caea9a12c13f3adebe4fef127188438e719fd6c8dc93db1da6/torchvision-0.24.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b0531d1483fc322d7da0d83be52f0df860a75114ab87dbeeb9de765feaeda843", size = 2419495, upload-time = "2025-10-15T15:51:11.885Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/e6/7324ead6793075a8c75c56abeed1236d1750de16a5613cfe2ddad164a92a/torchvision-0.24.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:26b9dd9c083f8e5f7ac827de6d5b88c615d9c582dc87666770fbdf16887e4c25", size = 8050480, upload-time = "2025-10-15T15:51:24.012Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/ad/3c56fcd2a0d6e8afa80e115b5ade4302232ec99655220a51d05709819523/torchvision-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:060b7c50ed4b3fb0316b08e2e31bfd874ec2f63ef5ae02f81e54341ca4e88703", size = 4292225, upload-time = "2025-10-15T15:51:27.699Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/b5/b2008e4b77a8d6aada828dd0f6a438d8f94befa23fdd2d62fa0ac6e60113/torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:84d79cfc6457310107ce4d712de7a3d388b24484bc9aeded4a76d8f8e3a2813d", size = 1891722, upload-time = "2025-10-15T15:51:28.854Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/02/e2f6b0ff93ca4db5751ac9c5be43f13d5e53d9e9412324f464dca1775027/torchvision-0.24.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:fec12a269cf80f6b0b71471c8d498cd3bdd9d8e892c425bf39fecb604852c3b0", size = 2371478, upload-time = "2025-10-15T15:51:37.842Z" },
-    { url = "https://files.pythonhosted.org/packages/77/85/42e5fc4f716ec7b73cf1f32eeb5c77961be4d4054b26cd6a5ff97f20c966/torchvision-0.24.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:7323a9be5e3da695605753f501cdc87824888c5655d27735cdeaa9986b45884c", size = 8050200, upload-time = "2025-10-15T15:51:46.276Z" },
-    { url = "https://files.pythonhosted.org/packages/93/c2/48cb0b6b26276d2120b1e0dbc877579a748eae02b4091a7522ce54f6d5e1/torchvision-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:08cad8b204196e945f0b2d73adee952d433db1c03645851d52b22a45f1015b13", size = 4309939, upload-time = "2025-10-15T15:51:39.002Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/d7/3dd10830b047eeb46ae6b465474258d7b4fbb7d8872dca69bd42449f5c82/torchvision-0.24.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6ab956a6e588623353e0f20d4b03eb1656cb4a3c75ca4dd8b4e32e01bc43271a", size = 2028355, upload-time = "2025-10-15T15:51:22.384Z" },
-    { url = "https://files.pythonhosted.org/packages/f7/cf/2d7e43409089ce7070f5336161f9216d58653ee1cb26bcb5d6c84cc2de36/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:b1b3db80609c32a088554e8e94b4fc31f1033fe5bb4ac0673ec49c3eb03fb4da", size = 2374466, upload-time = "2025-10-15T15:51:35.382Z" },
-    { url = "https://files.pythonhosted.org/packages/e9/30/8f7c328fd7e0a9665da4b6b56b1c627665c18470bfe62f3729ad3eda9aec/torchvision-0.24.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:e6635f100d455c80b43f297df4b8585a76c6a2e114802f6567ddd28d7b5479b0", size = 8217068, upload-time = "2025-10-15T15:51:36.623Z" },
-    { url = "https://files.pythonhosted.org/packages/55/a2/b6f9e40e2904574c80b3bb872c66af20bbd642053e7c8e1b9e99ab396535/torchvision-0.24.0-cp313-cp313t-win_amd64.whl", hash = "sha256:4ce158bbdc3a9086034bced0b5212888bd5b251fee6d08a9eff151d30b4b228a", size = 4273912, upload-time = "2025-10-15T15:51:33.866Z" },
-    { url = "https://files.pythonhosted.org/packages/1b/24/790a39645cc8c71bf442d54a76da9bda5caeb2a44c5f7e02498649cd99d4/torchvision-0.24.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4bdfc85a5ed706421555f32cdc5e3ddb6d40bf65ef03a274ce3c176393e2904b", size = 2028335, upload-time = "2025-10-15T15:51:26.252Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/d7/69479a066ea773653e88eda99031e38681e9094046f87cb957af5036db0e/torchvision-0.24.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:73576a9c4a593223fbae85a64e8bbd77049abd1101893ecf3c5e981284fd58b4", size = 2371609, upload-time = "2025-10-15T15:51:29.859Z" },
-    { url = "https://files.pythonhosted.org/packages/46/64/3c7fdb3771ec992b9445a1f7a969466b23ce2cdb14e09303b3db351a0655/torchvision-0.24.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:dd565b1b06666ff399d0801d4d1824fa570c0167a179ca700a5be232527b3c62", size = 8214918, upload-time = "2025-10-15T15:51:41.465Z" },
-    { url = "https://files.pythonhosted.org/packages/58/51/abc416bc34d574ad479af738e413d9ebf93027ee92d0f4ae38f966b818f7/torchvision-0.24.0-cp314-cp314-win_amd64.whl", hash = "sha256:eb45d12ac48d757738788fd3fb8e88e647d6b2ab2424134ca87556efc72d81b5", size = 4257776, upload-time = "2025-10-15T15:51:42.642Z" },
-    { url = "https://files.pythonhosted.org/packages/08/f7/261d1353c611820541ecd43046b89da3f1ae998dc786e4288b890a009883/torchvision-0.24.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:68120e7e03c31900e499a10bb7fdd63cfd67f0054c9fa108e7e27f9cd372f315", size = 2028359, upload-time = "2025-10-15T15:51:32.119Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/fd/615d8a86db1578345de7fa1edaf476fbcf4f057bf7e4fd898306b620c487/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:64e54494043eecf9f57a9881c6fdea49c62282782e737c002ae8b1639e6ea80e", size = 2374469, upload-time = "2025-10-15T15:51:40.19Z" },
-    { url = "https://files.pythonhosted.org/packages/04/98/bac11e8fdbf00d6c398246ff2781370aa72c99f2ac685c01ce79354c9a32/torchvision-0.24.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:75ef9546323b321a451239d886f0cb528f7e98bb294da47a3200effd4e572064", size = 8217060, upload-time = "2025-10-15T15:51:45.033Z" },
-    { url = "https://files.pythonhosted.org/packages/47/6f/9fba8abc468c904570699eceeb51588f9622172b8fffa4ab11bcf15598c2/torchvision-0.24.0-cp314-cp314t-win_amd64.whl", hash = "sha256:2efb617667950814fc8bb9437e5893861b3616e214285be33cbc364a3f42c599", size = 4358490, upload-time = "2025-10-15T15:51:43.884Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/09/d51aadf8591138e08b74c64a6eb783630c7a31ca2634416277115a9c3a2b/torchvision-0.24.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ded5e625788572e4e1c4d155d1bbc48805c113794100d70e19c76e39e4d53465", size = 1891441, upload-time = "2025-11-12T15:25:01.687Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/49/a35df863e7c153aad82af7505abd8264a5b510306689712ef86bea862822/torchvision-0.24.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:54ed17c3d30e718e08d8da3fd5b30ea44b0311317e55647cb97077a29ecbc25b", size = 2386226, upload-time = "2025-11-12T15:25:05.449Z" },
+    { url = "https://files.pythonhosted.org/packages/49/20/f2d7cd1eea052887c1083afff0b8df5228ec93b53e03759f20b1a3c6d22a/torchvision-0.24.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:f476da4e085b7307aaab6f540219617d46d5926aeda24be33e1359771c83778f", size = 8046093, upload-time = "2025-11-12T15:25:09.425Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/cf/0ff4007c09903199307da5f53a192ff5d62b45447069e9ef3a19bdc5ff12/torchvision-0.24.1-cp310-cp310-win_amd64.whl", hash = "sha256:fbdbdae5e540b868a681240b7dbd6473986c862445ee8a138680a6a97d6c34ff", size = 3696202, upload-time = "2025-11-12T15:25:10.657Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/69/30f5f03752aa1a7c23931d2519b31e557f3f10af5089d787cddf3b903ecf/torchvision-0.24.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:056c525dc875f18fe8e9c27079ada166a7b2755cea5a2199b0bc7f1f8364e600", size = 1891436, upload-time = "2025-11-12T15:25:04.3Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/69/49aae86edb75fe16460b59a191fcc0f568c2378f780bb063850db0fe007a/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:1e39619de698e2821d71976c92c8a9e50cdfd1e993507dfb340f2688bfdd8283", size = 2387757, upload-time = "2025-11-12T15:25:06.795Z" },
+    { url = "https://files.pythonhosted.org/packages/11/c9/1dfc3db98797b326f1d0c3f3bb61c83b167a813fc7eab6fcd2edb8c7eb9d/torchvision-0.24.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a0f106663e60332aa4fcb1ca2159ef8c3f2ed266b0e6df88de261048a840e0df", size = 8047682, upload-time = "2025-11-12T15:25:21.125Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/bb/cfc6a6f6ccc84a534ed1fdf029ae5716dd6ff04e57ed9dc2dab38bf652d5/torchvision-0.24.1-cp311-cp311-win_amd64.whl", hash = "sha256:a9308cdd37d8a42e14a3e7fd9d271830c7fecb150dd929b642f3c1460514599a", size = 4037588, upload-time = "2025-11-12T15:25:14.402Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/af/18e2c6b9538a045f60718a0c5a058908ccb24f88fde8e6f0fc12d5ff7bd3/torchvision-0.24.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e48bf6a8ec95872eb45763f06499f87bd2fb246b9b96cb00aae260fda2f96193", size = 1891433, upload-time = "2025-11-12T15:25:03.232Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/43/600e5cfb0643d10d633124f5982d7abc2170dfd7ce985584ff16edab3e76/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:7fb7590c737ebe3e1c077ad60c0e5e2e56bb26e7bccc3b9d04dbfc34fd09f050", size = 2386737, upload-time = "2025-11-12T15:25:08.288Z" },
+    { url = "https://files.pythonhosted.org/packages/93/b1/db2941526ecddd84884132e2742a55c9311296a6a38627f9e2627f5ac889/torchvision-0.24.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:66a98471fc18cad9064123106d810a75f57f0838eee20edc56233fd8484b0cc7", size = 8049868, upload-time = "2025-11-12T15:25:13.058Z" },
+    { url = "https://files.pythonhosted.org/packages/69/98/16e583f59f86cd59949f59d52bfa8fc286f86341a229a9d15cbe7a694f0c/torchvision-0.24.1-cp312-cp312-win_amd64.whl", hash = "sha256:4aa6cb806eb8541e92c9b313e96192c6b826e9eb0042720e2fa250d021079952", size = 4302006, upload-time = "2025-11-12T15:25:16.184Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/97/ab40550f482577f2788304c27220e8ba02c63313bd74cf2f8920526aac20/torchvision-0.24.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:8a6696db7fb71eadb2c6a48602106e136c785642e598eb1533e0b27744f2cce6", size = 1891435, upload-time = "2025-11-12T15:25:28.642Z" },
+    { url = "https://files.pythonhosted.org/packages/30/65/ac0a3f9be6abdbe4e1d82c915d7e20de97e7fd0e9a277970508b015309f3/torchvision-0.24.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:db2125c46f9cb25dc740be831ce3ce99303cfe60439249a41b04fd9f373be671", size = 2338718, upload-time = "2025-11-12T15:25:26.19Z" },
+    { url = "https://files.pythonhosted.org/packages/10/b5/5bba24ff9d325181508501ed7f0c3de8ed3dd2edca0784d48b144b6c5252/torchvision-0.24.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:f035f0cacd1f44a8ff6cb7ca3627d84c54d685055961d73a1a9fb9827a5414c8", size = 8049661, upload-time = "2025-11-12T15:25:22.558Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/ec/54a96ae9ab6a0dd66d4bba27771f892e36478a9c3489fa56e51c70abcc4d/torchvision-0.24.1-cp313-cp313-win_amd64.whl", hash = "sha256:16274823b93048e0a29d83415166a2e9e0bf4e1b432668357b657612a4802864", size = 4319808, upload-time = "2025-11-12T15:25:17.318Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/f3/a90a389a7e547f3eb8821b13f96ea7c0563cdefbbbb60a10e08dda9720ff/torchvision-0.24.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e3f96208b4bef54cd60e415545f5200346a65024e04f29a26cd0006dbf9e8e66", size = 2005342, upload-time = "2025-11-12T15:25:11.871Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/fe/ff27d2ed1b524078164bea1062f23d2618a5fc3208e247d6153c18c91a76/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:f231f6a4f2aa6522713326d0d2563538fa72d613741ae364f9913027fa52ea35", size = 2341708, upload-time = "2025-11-12T15:25:25.08Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/b9/d6c903495cbdfd2533b3ef6f7b5643ff589ea062f8feb5c206ee79b9d9e5/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:1540a9e7f8cf55fe17554482f5a125a7e426347b71de07327d5de6bfd8d17caa", size = 8177239, upload-time = "2025-11-12T15:25:18.554Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/2b/ba02e4261369c3798310483028495cf507e6cb3f394f42e4796981ecf3a7/torchvision-0.24.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d83e16d70ea85d2f196d678bfb702c36be7a655b003abed84e465988b6128938", size = 4251604, upload-time = "2025-11-12T15:25:34.069Z" },
+    { url = "https://files.pythonhosted.org/packages/42/84/577b2cef8f32094add5f52887867da4c2a3e6b4261538447e9b48eb25812/torchvision-0.24.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cccf4b4fec7fdfcd3431b9ea75d1588c0a8596d0333245dafebee0462abe3388", size = 2005319, upload-time = "2025-11-12T15:25:23.827Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/34/ecb786bffe0159a3b49941a61caaae089853132f3cd1e8f555e3621f7e6f/torchvision-0.24.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:1b495edd3a8f9911292424117544f0b4ab780452e998649425d1f4b2bed6695f", size = 2338844, upload-time = "2025-11-12T15:25:32.625Z" },
+    { url = "https://files.pythonhosted.org/packages/51/99/a84623786a6969504c87f2dc3892200f586ee13503f519d282faab0bb4f0/torchvision-0.24.1-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ab211e1807dc3e53acf8f6638df9a7444c80c0ad050466e8d652b3e83776987b", size = 8175144, upload-time = "2025-11-12T15:25:31.355Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/ba/8fae3525b233e109317ce6a9c1de922ab2881737b029a7e88021f81e068f/torchvision-0.24.1-cp314-cp314-win_amd64.whl", hash = "sha256:18f9cb60e64b37b551cd605a3d62c15730c086362b40682d23e24b616a697d41", size = 4234459, upload-time = "2025-11-12T15:25:19.859Z" },
+    { url = "https://files.pythonhosted.org/packages/50/33/481602c1c72d0485d4b3a6b48c9534b71c2957c9d83bf860eb837bf5a620/torchvision-0.24.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ec9d7379c519428395e4ffda4dbb99ec56be64b0a75b95989e00f9ec7ae0b2d7", size = 2005336, upload-time = "2025-11-12T15:25:27.225Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/7f/372de60bf3dd8f5593bd0d03f4aecf0d1fd58f5bc6943618d9d913f5e6d5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:af9201184c2712d808bd4eb656899011afdfce1e83721c7cb08000034df353fe", size = 2341704, upload-time = "2025-11-12T15:25:29.857Z" },
+    { url = "https://files.pythonhosted.org/packages/36/9b/0f3b9ff3d0225ee2324ec663de0e7fb3eb855615ca958ac1875f22f1f8e5/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9ef95d819fd6df81bc7cc97b8f21a15d2c0d3ac5dbfaab5cbc2d2ce57114b19e", size = 8177422, upload-time = "2025-11-12T15:25:37.357Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/ab/e2bcc7c2f13d882a58f8b30ff86f794210b075736587ea50f8c545834f8a/torchvision-0.24.1-cp314-cp314t-win_amd64.whl", hash = "sha256:480b271d6edff83ac2e8d69bbb4cf2073f93366516a50d48f140ccfceedb002e", size = 4335190, upload-time = "2025-11-12T15:25:35.745Z" },
 ]
 
 [[package]]
@@ -5971,8 +5852,7 @@ dependencies = [
     { name = "docstring-parser" },
     { name = "filelock" },
     { name = "fsspec" },
-    { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "importlib-metadata", version = "8.7.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "importlib-metadata" },
     { name = "pyre-extensions" },
     { name = "pyyaml" },
     { name = "tabulate" },
@@ -5997,27 +5877,70 @@ wheels = [
 
 [[package]]
 name = "transformer-engine"
-version = "2.9.0+70f53666"
-source = { git = "https://github.com/NVIDIA/TransformerEngine.git?rev=release_v2.9#70f536662ae10a62a54f4ed1ba92e3314c5cfd69" }
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6d/5c/21152e73aa46ac7c969d694ce86cdeb199024c7810b2d700e900ea4efb1a/transformer_engine-2.9.0-py3-none-any.whl", hash = "sha256:953147ed4c490e54c9884bb0d876a1341f05c5c5b7d304bf61f4740f6faee5af", size = 662107, upload-time = "2025-11-11T15:50:49.167Z" },
+]
+
+[package.optional-dependencies]
+core-cu13 = [
+    { name = "transformer-engine-cu13" },
+]
+pytorch = [
+    { name = "transformer-engine-torch" },
+]
+
+[[package]]
+name = "transformer-engine-cu12"
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "einops" },
-    { name = "importlib-metadata", version = "8.6.1", source = { registry = "https://pypi.org/simple" } },
-    { name = "onnx" },
-    { name = "onnxscript", version = "0.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "onnxscript", version = "0.5.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "importlib-metadata" },
     { name = "packaging" },
     { name = "pydantic" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a6/af/1c449ad0c43d3d6b5c529c812a4e8338b20965ae5361a9b612c7dce21e4d/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:81162874c0618f3e62eb5ffba0bb1b608b4e56d70238205b1dced7ee965d82b3", size = 303669451, upload-time = "2025-11-11T15:54:12.008Z" },
+    { url = "https://files.pythonhosted.org/packages/82/21/aa351994d8ade95681763df2b10770c768900ecc7f1cedbfa4e89fe1935a/transformer_engine_cu12-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:ad14981cbbd964f8e4446c35199d1bc5349ea30244e76bc57c1cceb5d469dd24", size = 304164366, upload-time = "2025-11-11T15:50:22.169Z" },
+]
+
+[[package]]
+name = "transformer-engine-cu13"
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "importlib-metadata" },
+    { name = "packaging" },
+    { name = "pydantic" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ab/b9/c1c788875848bf50faa22749107d91e92e9c0c78bb1878b99939209e40f9/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:590aaeb3a4d552fe9ebc7019d43315f3e61153fcd1c5a07dc0c90bd8b278316e", size = 185010342, upload-time = "2025-11-13T22:35:04.742Z" },
+    { url = "https://files.pythonhosted.org/packages/95/7f/3019c21565f63eeb79d24fa7d3bae39b5b73f21c72d7d5123d21d7ce945a/transformer_engine_cu13-2.9.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:4e869f5a0fd74aaa05a5d801a96688ed21827d23efe9774bd3038d5f2802ef46", size = 185669069, upload-time = "2025-11-13T22:35:13.709Z" },
+]
+
+[[package]]
+name = "transformer-engine-torch"
+version = "2.9.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "einops" },
+    { name = "onnx" },
+    { name = "onnxscript" },
     { name = "torch", marker = "sys_platform == 'never'" },
+    { name = "transformer-engine-cu12" },
 ]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/a3/401d741eceb8f402595e63ee0b1828d60cae988b22f2f23c9cfcc24185bd/transformer_engine_torch-2.9.0.tar.gz", hash = "sha256:abbc59f6acf635abf865085ecdf90e7d4ca9a3782bc91a9845e38adb2655a547", size = 215138, upload-time = "2025-11-11T15:49:04.258Z" }
 
 [[package]]
 name = "transformers"
-version = "4.57.1"
+version = "4.57.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "huggingface-hub" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
@@ -6026,39 +5949,39 @@ dependencies = [
     { name = "tokenizers" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/dd/70/d42a739e8dfde3d92bb2fff5819cbf331fe9657323221e79415cd5eb65ee/transformers-4.57.3.tar.gz", hash = "sha256:df4945029aaddd7c09eec5cad851f30662f8bd1746721b34cc031d70c65afebc", size = 10139680, upload-time = "2025-11-25T15:51:30.139Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" },
 ]
 
 [[package]]
 name = "triton"
-version = "3.5.0"
+version = "3.5.1"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/dd/22/507b6f58a35e05e84381630b2dc2a3cee1a7a2a7eaf4cba857c638a18a24/triton-3.5.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f90de6a6566bb619b4c0adc9855729e1b1b5e26533fca1bf6206e96b6d277a3", size = 159827599, upload-time = "2025-10-15T19:15:43.87Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/eb/09e31d107a5d00eb281aa7e6635ca463e9bca86515944e399480eadb71f8/triton-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5d3b3d480debf24eaa739623c9a42446b0b77f95593d30eb1f64cd2278cc1f0", size = 170333110, upload-time = "2025-10-13T16:37:49.588Z" },
-    { url = "https://files.pythonhosted.org/packages/79/f9/b6f60f978397c616fd8dacca2305759fe4f80d397b20ef72534803244bd5/triton-3.5.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8457b22148defefdcb7fa8144b05ce211b9faefad650a1ce85b23df488d5549c", size = 159926731, upload-time = "2025-10-15T19:15:49.682Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/78/949a04391c21956c816523678f0e5fa308eb5b1e7622d88c4e4ef5fceca0/triton-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f34bfa21c5b3a203c0f0eab28dcc1e49bd1f67d22724e77fb6665a659200a4ec", size = 170433488, upload-time = "2025-10-13T16:37:57.132Z" },
-    { url = "https://files.pythonhosted.org/packages/87/9b/30988039e1e84df7554fba24e6a734d2d0e847af33cabdf9b532b3c51456/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da21fccceafc163e3a5e857abe34351ef76345af06cabf9637a914742671f0b", size = 159946647, upload-time = "2025-10-15T19:15:56.325Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/3a/e991574f3102147b642e49637e0281e9bb7c4ba254edb2bab78247c85e01/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9e71db82261c4ffa3921cd050cd5faa18322d2d405c30eb56084afaff3b0833", size = 170476535, upload-time = "2025-10-13T16:38:05.18Z" },
-    { url = "https://files.pythonhosted.org/packages/cd/85/e37f1197acb04c8f3d83851d23d5d6ed5060ef74580668b112e23fdfa203/triton-3.5.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:188da5b81fa2f8322c27fec1627703eac24cb9bb7ab0dfbe9925973bc1b070d3", size = 159958970, upload-time = "2025-10-15T19:16:01.717Z" },
-    { url = "https://files.pythonhosted.org/packages/6c/29/10728de8a6e932e517c10773486b8e99f85d1b1d9dd87d9a9616e1fef4a1/triton-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e6bb9aa5519c084a333acdba443789e50012a4b851cd486c54f0b8dc2a8d3a12", size = 170487289, upload-time = "2025-10-13T16:38:11.662Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/1d/38258f05010ac17a7b058c022911c9cae6526e149b7397134a048cf5a6c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03127d9b33aaf979c856676b394bc059ec1d68cb6da68ae03f62dd8ad77a04ae", size = 160073012, upload-time = "2025-10-15T19:16:07.477Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/38/db80e48b9220c9bce872b0f616ad0446cdf554a40b85c7865cbca99ab3c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c83f2343e1a220a716c7b3ab9fccfcbe3ad4020d189549200e2d2e8d5868bed9", size = 170577179, upload-time = "2025-10-13T16:38:17.865Z" },
-    { url = "https://files.pythonhosted.org/packages/91/fe/8f5771d00227f4eb1ee034f218ed427102b989366d2275fe3b3c105a3921/triton-3.5.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468936651d383f4a6d10068d34a627505e13af55be5d002b9f27b987e7a5f0ac", size = 159957460, upload-time = "2025-10-15T19:16:12.626Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/60/1810655d1d856c9a4fcc90ee8966d85f552d98c53a6589f95ab2cbe27bb8/triton-3.5.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da0fa67ccd76c3dcfb0bffe1b1c57c685136a6bd33d141c24d9655d4185b1289", size = 170487949, upload-time = "2025-10-13T16:38:24.881Z" },
-    { url = "https://files.pythonhosted.org/packages/78/59/99edd103958fe6e42b50b9ad8ce4f223ddf4ccf475259cf7d2b53381dc6c/triton-3.5.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7ceef21410229ac23173a28eee5cfc0e37c1dfdb8b4bc11ecda2e3ecec7c686", size = 160075629, upload-time = "2025-10-15T19:16:18.746Z" },
-    { url = "https://files.pythonhosted.org/packages/fb/b7/1dec8433ac604c061173d0589d99217fe7bf90a70bdc375e745d044b8aad/triton-3.5.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:317fe477ea8fd4524a6a8c499fb0a36984a56d0b75bf9c9cb6133a1c56d5a6e7", size = 170580176, upload-time = "2025-10-13T16:38:31.14Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/2e/f95e673222afa2c7f0c687d8913e98fcf2589ef0b1405de76894e37fe18f/triton-3.5.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f63e34dcb32d7bd3a1d0195f60f30d2aee8b08a69a0424189b71017e23dfc3d2", size = 159821655, upload-time = "2025-11-11T17:51:44.09Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/6e/676ab5019b4dde8b9b7bab71245102fc02778ef3df48218b298686b9ffd6/triton-3.5.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fc53d849f879911ea13f4a877243afc513187bc7ee92d1f2c0f1ba3169e3c94", size = 170320692, upload-time = "2025-11-11T17:40:46.074Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/dc/6ce44d055f2fc2403c4ec6b3cfd3a9b25f57b7d95efadccdea91497f8e81/triton-3.5.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da47169e30a779bade679ce78df4810fca6d78a955843d2ddb11f226adc517dc", size = 159928005, upload-time = "2025-11-11T17:51:50.008Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/72/ec90c3519eaf168f22cb1757ad412f3a2add4782ad3a92861c9ad135d886/triton-3.5.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61413522a48add32302353fdbaaf92daaaab06f6b5e3229940d21b5207f47579", size = 170425802, upload-time = "2025-11-11T17:40:53.209Z" },
+    { url = "https://files.pythonhosted.org/packages/db/53/2bcc46879910991f09c063eea07627baef2bc62fe725302ba8f46a2c1ae5/triton-3.5.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275a045b6ed670dd1bd005c3e6c2d61846c74c66f4512d6f33cc027b11de8fd4", size = 159940689, upload-time = "2025-11-11T17:51:55.938Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/50/9a8358d3ef58162c0a415d173cfb45b67de60176e1024f71fbc4d24c0b6d/triton-3.5.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2c6b915a03888ab931a9fd3e55ba36785e1fe70cbea0b40c6ef93b20fc85232", size = 170470207, upload-time = "2025-11-11T17:41:00.253Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/ba/805684a992ee32d486b7948d36aed2f5e3c643fc63883bf8bdca1c3f3980/triton-3.5.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:56765ffe12c554cd560698398b8a268db1f616c120007bfd8829d27139abd24a", size = 159955460, upload-time = "2025-11-11T17:52:01.861Z" },
+    { url = "https://files.pythonhosted.org/packages/27/46/8c3bbb5b0a19313f50edcaa363b599e5a1a5ac9683ead82b9b80fe497c8d/triton-3.5.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3f4346b6ebbd4fad18773f5ba839114f4826037c9f2f34e0148894cd5dd3dba", size = 170470410, upload-time = "2025-11-11T17:41:06.319Z" },
+    { url = "https://files.pythonhosted.org/packages/84/1e/7df59baef41931e21159371c481c31a517ff4c2517343b62503d0cd2be99/triton-3.5.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:02c770856f5e407d24d28ddc66e33cf026e6f4d360dcb8b2fabe6ea1fc758621", size = 160072799, upload-time = "2025-11-11T17:52:07.293Z" },
+    { url = "https://files.pythonhosted.org/packages/37/92/e97fcc6b2c27cdb87ce5ee063d77f8f26f19f06916aa680464c8104ef0f6/triton-3.5.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0b4d2c70127fca6a23e247f9348b8adde979d2e7a20391bfbabaac6aebc7e6a8", size = 170579924, upload-time = "2025-11-11T17:41:12.455Z" },
+    { url = "https://files.pythonhosted.org/packages/14/f9/0430e879c1e63a1016cb843261528fd3187c872c3a9539132efc39514753/triton-3.5.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f617aa7925f9ea9968ec2e1adaf93e87864ff51549c8f04ce658f29bbdb71e2d", size = 159956163, upload-time = "2025-11-11T17:52:12.999Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/e6/c595c35e5c50c4bc56a7bac96493dad321e9e29b953b526bbbe20f9911d0/triton-3.5.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d0637b1efb1db599a8e9dc960d53ab6e4637db7d4ab6630a0974705d77b14b60", size = 170480488, upload-time = "2025-11-11T17:41:18.222Z" },
+    { url = "https://files.pythonhosted.org/packages/41/1e/63d367c576c75919e268e4fbc33c1cb33b6dc12bb85e8bfe531c2a8bd5d3/triton-3.5.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8932391d7f93698dfe5bc9bead77c47a24f97329e9f20c10786bb230a9083f56", size = 160073620, upload-time = "2025-11-11T17:52:18.403Z" },
+    { url = "https://files.pythonhosted.org/packages/16/b5/b0d3d8b901b6a04ca38df5e24c27e53afb15b93624d7fd7d658c7cd9352a/triton-3.5.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bac7f7d959ad0f48c0e97d6643a1cc0fd5786fe61cb1f83b537c6b2d54776478", size = 170582192, upload-time = "2025-11-11T17:41:23.963Z" },
 ]
 
 [[package]]
 name = "trove-classifiers"
-version = "2025.9.11.17"
+version = "2025.11.14.15"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/ca/9a/778622bc06632529817c3c524c82749a112603ae2bbcf72ee3eb33a2c4f1/trove_classifiers-2025.9.11.17.tar.gz", hash = "sha256:931ca9841a5e9c9408bc2ae67b50d28acf85bef56219b56860876dd1f2d024dd", size = 16975, upload-time = "2025-09-11T17:07:50.97Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/a9/880cccf76af9e7b322112f52e4e2dbb3534cbe671197b8f443a42189dfc7/trove_classifiers-2025.11.14.15.tar.gz", hash = "sha256:6b60f49d40bbd895bc61d8dc414fc2f2286d70eb72ed23548db8cf94f62804ca", size = 16995, upload-time = "2025-11-14T15:23:13.78Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e1/85/a4ff8758c66f1fc32aa5e9a145908394bf9cf1c79ffd1113cfdeb77e74e4/trove_classifiers-2025.9.11.17-py3-none-any.whl", hash = "sha256:5d392f2d244deb1866556457d6f3516792124a23d1c3a463a2e8668a5d1c15dd", size = 14158, upload-time = "2025-09-11T17:07:49.886Z" },
+    { url = "https://files.pythonhosted.org/packages/49/f6/73c4aa003d1237ee9bea8a46f49dc38c45dfe95af4f0da7e60678d388011/trove_classifiers-2025.11.14.15-py3-none-any.whl", hash = "sha256:d1dac259c1e908939862e3331177931c6df0a37af2c1a8debcc603d9115fcdd9", size = 14191, upload-time = "2025-11-14T15:23:12.467Z" },
 ]
 
 [[package]]
@@ -6144,7 +6067,7 @@ wheels = [
 
 [[package]]
 name = "wandb"
-version = "0.22.3"
+version = "0.23.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
@@ -6158,17 +6081,17 @@ dependencies = [
     { name = "sentry-sdk" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/c1/d1/6b70f365ed86bd69debba8ad55dec8606fc21006e7ca703a5a091bd3b719/wandb-0.22.3.tar.gz", hash = "sha256:04468a8ab2769a46f5e384c9c4ada5da0dced005ca689a8424e4b8b5cb2a0291", size = 44337368, upload-time = "2025-10-28T23:59:10.275Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ef/8b/db2d44395c967cd452517311fd6ede5d1e07310769f448358d4874248512/wandb-0.23.0.tar.gz", hash = "sha256:e5f98c61a8acc3ee84583ca78057f64344162ce026b9f71cb06eea44aec27c93", size = 44413921, upload-time = "2025-11-11T21:06:30.737Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/23/02/87fb60f587ec249f784a40bd91c30de1b2b24d691ee72675d5b66c3d0728/wandb-0.22.3-py3-none-macosx_12_0_arm64.whl", hash = "sha256:81b3b6e405f38342b0a080898b7d00c5b9375432f5ba358942a09e65cdcfe781", size = 18758047, upload-time = "2025-10-28T23:58:46.56Z" },
-    { url = "https://files.pythonhosted.org/packages/26/88/64081740ef2b2efc7fbcb2139a07a849e42bcb09ae0c56ae50c41bd0ad63/wandb-0.22.3-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:d29c16817cca6401b4919069ec7570c781eacb67dc0b1ff2e0096a9a59581720", size = 19798011, upload-time = "2025-10-28T23:58:49.718Z" },
-    { url = "https://files.pythonhosted.org/packages/19/72/c4f922b33dbb84d1c81ee045ff8791dd14e26d79e1e9bbafff964b7043e2/wandb-0.22.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb955d73a4ba55df9adc61fafbabef5556784d33fc39c7b5c8165d2694ddeb3b", size = 18542713, upload-time = "2025-10-28T23:58:51.927Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/98/3ce5f6e2086d91b0c51b38ae7ff591109e7da2bb25fe1a12eec0cdbaa494/wandb-0.22.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23f3ebe41a26506117a098fdfd2706ed0e50b37899bfbefe3a0628fcbd70c69d", size = 19984910, upload-time = "2025-10-28T23:58:54.641Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/57/e68cb38427b60490d6ddf1b992e6c7f36be83be1079d291ce87a8d347f48/wandb-0.22.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2973462bed5d4a653b1a97cf9fc350673bb200fb356a2f4eba34beae9b87e0aa", size = 18581776, upload-time = "2025-10-28T23:58:56.975Z" },
-    { url = "https://files.pythonhosted.org/packages/66/6d/543f907ce0c6b6da13628b23d19ca7282c559fd73eb47b04977b9a61d0c6/wandb-0.22.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c5c2bd18f95c1639863c527da0a5818ac6b0e5194f9c691426b265908ddd8b2c", size = 20078800, upload-time = "2025-10-28T23:58:59.217Z" },
-    { url = "https://files.pythonhosted.org/packages/da/91/1decaf1a6ac2017481c782e0fad7f90bc9ae4057f3d76d478cb6527f3dd3/wandb-0.22.3-py3-none-win32.whl", hash = "sha256:09ca1edfe0fd6dc30447d368acddb825668e60ee705c98594a6bbfd30d34d47e", size = 19160297, upload-time = "2025-10-28T23:59:01.536Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/ba/3b092634279994b0c79fe05220532822be09f3a353ae95c54e7142769db8/wandb-0.22.3-py3-none-win_amd64.whl", hash = "sha256:55403bf93872c9978433d101324f51e43e78c70c809bf6d06ca7b2760e39f497", size = 19160300, upload-time = "2025-10-28T23:59:04.06Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/80/4662fce9eebcc8c71f5083e9152ccaf7d43d4ca9c446e1422f9aa784a51c/wandb-0.22.3-py3-none-win_arm64.whl", hash = "sha256:49f66b05882abfa53816cc8d01b3c2435a89c5a090176802fa6928b5979d34d9", size = 17461959, upload-time = "2025-10-28T23:59:07.059Z" },
+    { url = "https://files.pythonhosted.org/packages/41/61/a3220c7fa4cadfb2b2a5c09e3fa401787326584ade86d7c1f58bf1cd43bd/wandb-0.23.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:b682ec5e38fc97bd2e868ac7615a0ab4fc6a15220ee1159e87270a5ebb7a816d", size = 18992250, upload-time = "2025-11-11T21:06:03.412Z" },
+    { url = "https://files.pythonhosted.org/packages/90/16/e69333cf3d11e7847f424afc6c8ae325e1f6061b2e5118d7a17f41b6525d/wandb-0.23.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:ec094eb71b778e77db8c188da19e52c4f96cb9d5b4421d7dc05028afc66fd7e7", size = 20045616, upload-time = "2025-11-11T21:06:07.109Z" },
+    { url = "https://files.pythonhosted.org/packages/62/79/42dc6c7bb0b425775fe77f1a3f1a22d75d392841a06b43e150a3a7f2553a/wandb-0.23.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e43f1f04b98c34f407dcd2744cec0a590abce39bed14a61358287f817514a7b", size = 18758848, upload-time = "2025-11-11T21:06:09.832Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/94/d6ddb78334996ccfc1179444bfcfc0f37ffd07ee79bb98940466da6f68f8/wandb-0.23.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e5847f98cbb3175caf5291932374410141f5bb3b7c25f9c5e562c1988ce0bf5", size = 20231493, upload-time = "2025-11-11T21:06:12.323Z" },
+    { url = "https://files.pythonhosted.org/packages/52/4d/0ad6df0e750c19dabd24d2cecad0938964f69a072f05fbdab7281bec2b64/wandb-0.23.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6151355fd922539926e870be811474238c9614b96541773b990f1ce53368aef6", size = 18793473, upload-time = "2025-11-11T21:06:14.967Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/da/c2ba49c5573dff93dafc0acce691bb1c3d57361bf834b2f2c58e6193439b/wandb-0.23.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df62e426e448ebc44269140deb7240df474e743b12d4b1f53b753afde4aa06d4", size = 20332882, upload-time = "2025-11-11T21:06:17.865Z" },
+    { url = "https://files.pythonhosted.org/packages/40/65/21bfb10ee5cd93fbcaf794958863c7e05bac4bbeb1cc1b652094aa3743a5/wandb-0.23.0-py3-none-win32.whl", hash = "sha256:6c21d3eadda17aef7df6febdffdddfb0b4835c7754435fc4fe27631724269f5c", size = 19433198, upload-time = "2025-11-11T21:06:21.913Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/33/cbe79e66c171204e32cf940c7fdfb8b5f7d2af7a00f301c632f3a38aa84b/wandb-0.23.0-py3-none-win_amd64.whl", hash = "sha256:b50635fa0e16e528bde25715bf446e9153368428634ca7a5dbd7a22c8ae4e915", size = 19433201, upload-time = "2025-11-11T21:06:24.607Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/a0/5ecfae12d78ea036a746c071e4c13b54b28d641efbba61d2947c73b3e6f9/wandb-0.23.0-py3-none-win_arm64.whl", hash = "sha256:fa0181b02ce4d1993588f4a728d8b73ae487eb3cb341e6ce01c156be7a98ec72", size = 17678649, upload-time = "2025-11-11T21:06:27.289Z" },
 ]
 
 [[package]]
@@ -6301,7 +6224,8 @@ version = "1.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "braceexpand" },
-    { name = "numpy" },
+    { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pyyaml" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" }
@@ -6399,22 +6323,6 @@ wheels = [
 name = "wrapt"
 version = "1.17.3"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
 sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/23/bb82321b86411eb51e5a5db3fb8f8032fd30bd7c2d74bfe936136b2fa1d6/wrapt-1.17.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88bbae4d40d5a46142e70d58bf664a89b6b4befaea7b2ecc14e03cedb8e06c04", size = 53482, upload-time = "2025-08-12T05:51:44.467Z" },
@@ -6480,131 +6388,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
-[[package]]
-name = "wrapt"
-version = "2.0.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.12.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/49/19/5e5bcd855d808892fe02d49219f97a50f64cd6d8313d75df3494ee97b1a3/wrapt-2.0.0.tar.gz", hash = "sha256:35a542cc7a962331d0279735c30995b024e852cf40481e384fd63caaa391cbb9", size = 81722, upload-time = "2025-10-19T23:47:54.07Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ee/db/ac9546e89b645e525686727f8749847485e3b45ffc4507b61c4669358638/wrapt-2.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a7cebcee61f21b1e46aa32db8d9d93826d0fbf1ad85defc2ccfb93b4adef1435", size = 77431, upload-time = "2025-10-19T23:45:25.177Z" },
-    { url = "https://files.pythonhosted.org/packages/74/bc/3b57c8012bbd0d02eec5ae838681c1a819df6c5e765ebc897f52623b5eb1/wrapt-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:827e6e3a3a560f6ec1f5ee92d4319c21a0549384f896ec692f3201eda31ebd11", size = 60644, upload-time = "2025-10-19T23:45:27.511Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/6e/b5e7d47713e3d46c30ec6ae83fafd369bc34de8148668c6e3168d9301863/wrapt-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a91075a5383a7cbfe46aed1845ef7c3f027e8e20e7d9a8a75e36ebc9b0dd15e", size = 61526, upload-time = "2025-10-19T23:45:28.789Z" },
-    { url = "https://files.pythonhosted.org/packages/28/8d/d5df2af58ae479785473607a3b25726c295640cdcaee830847cee339eff9/wrapt-2.0.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b6a18c813196e18146b8d041e20875bdb0cb09b94ac1d1e1146e0fa87b2deb0d", size = 113638, upload-time = "2025-10-19T23:45:31.977Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/b7/9501c45ab93b4d6ba396ef02fcfb55867866bc8579fff045bb54cae58423/wrapt-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ec5028d26011a53c76bd91bb6198b30b438c6e0f7adb45f2ad84fe2655b6a104", size = 115651, upload-time = "2025-10-19T23:45:33.257Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/3a/bfebe2ba51cf98ae80c5dbb6fa5892ae75d1acf1a4c404eda88e28f5ab06/wrapt-2.0.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bed9b04900204721a24bcefc652ca267b01c1e8ad8bc8c0cff81558a45a3aadc", size = 112060, upload-time = "2025-10-19T23:45:30.298Z" },
-    { url = "https://files.pythonhosted.org/packages/00/e7/cd50a32bed022d98f61a90e57faf782aa063f7930f57eb67eb105d3189be/wrapt-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:03442f2b45fa3f2b98a94a1917f52fb34670de8f96c0a009c02dbd512d855a3d", size = 114829, upload-time = "2025-10-19T23:45:34.23Z" },
-    { url = "https://files.pythonhosted.org/packages/9d/2c/c709578271df0c70a27ab8f797c44c258650f24a32b452f03d7afedc070d/wrapt-2.0.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:17d0b5c42495ba142a1cee52b76414f9210591c84aae94dffda70240753bfb3c", size = 111249, upload-time = "2025-10-19T23:45:35.554Z" },
-    { url = "https://files.pythonhosted.org/packages/60/ef/cb58f6eea41f129600bda68d1ae4c80b14d4e0663eec1d5220cbffe50be5/wrapt-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ee44215e7d13e112a8fc74e12ed1a1f41cab2bc07b11cc703f2398cd114b261c", size = 113312, upload-time = "2025-10-19T23:45:36.66Z" },
-    { url = "https://files.pythonhosted.org/packages/59/55/97e6c4e1c175fb27f8dec717a3e36493ff0c4e50173a95f439496556910f/wrapt-2.0.0-cp310-cp310-win32.whl", hash = "sha256:fe6eafac3bc3c957ab6597a0c0654a0a308868458d00d218743e5b5fae51951c", size = 57961, upload-time = "2025-10-19T23:45:40.958Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/0a/898b1d81ae1f3dd9a79fd2e0330a7c8dd793982f815a318548777cb21ee5/wrapt-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9e070c3491397fba0445b8977900271eca9656570cca7c900d9b9352186703a0", size = 60311, upload-time = "2025-10-19T23:45:38.033Z" },
-    { url = "https://files.pythonhosted.org/packages/44/f1/e7e92f9535f5624ee22879f09456df9d1f1ae9bb338eef711077b48e456a/wrapt-2.0.0-cp310-cp310-win_arm64.whl", hash = "sha256:806e2e73186eb5e3546f39fb5d0405040e0088db0fc8b2f667fd1863de2b3c99", size = 58822, upload-time = "2025-10-19T23:45:39.785Z" },
-    { url = "https://files.pythonhosted.org/packages/12/8f/8e4c8b6da60b4205191d588cbac448fb9ff4f5ed89f4e555dc4813ab30cf/wrapt-2.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b7e221abb6c5387819db9323dac3c875b459695057449634f1111955d753c621", size = 77433, upload-time = "2025-10-19T23:45:42.543Z" },
-    { url = "https://files.pythonhosted.org/packages/22/9a/01a29ccb029aa8e78241f8b53cb89ae8826c240129abbbb6ebba3416eff9/wrapt-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1147a84c8fc852426580af8b6e33138461ddbc65aa459a25ea539374d32069fa", size = 60641, upload-time = "2025-10-19T23:45:43.866Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/ec/e058997971428b7665b5c3665a55b18bb251ea7e08d002925e3ca017c020/wrapt-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6691d4a711504a0bc10de789842ad6ac627bed22937b10f37a1211a8ab7bb3", size = 61526, upload-time = "2025-10-19T23:45:44.839Z" },
-    { url = "https://files.pythonhosted.org/packages/70/c3/c82263503f554715aa1847e85dc75a69631a54e9d7ab0f1a55e34a22d44a/wrapt-2.0.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f460e1eb8e75a17c3918c8e35ba57625721eef2439ef0bcf05304ac278a65e1d", size = 114069, upload-time = "2025-10-19T23:45:47.223Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/97/d95e88a3a1bc2890a1aa47880c2762cf0eb6d231b5a64048e351cec6f071/wrapt-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12c37784b77bf043bf65cc96c7195a5db474b8e54173208af076bdbb61df7b3e", size = 116109, upload-time = "2025-10-19T23:45:48.252Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/36/cba0bf954f2303897b80fa5342499b43f8c5201110dddf0d578d6841b149/wrapt-2.0.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:75e5c049eb583835f7a0e0e311d9dde9bfbaac723a6dd89d052540f9b2809977", size = 112500, upload-time = "2025-10-19T23:45:45.838Z" },
-    { url = "https://files.pythonhosted.org/packages/d7/2b/8cb88e63bec989f641d208acb3fd198bfdbbb4ef7dfb71f0cac3c90b07a9/wrapt-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e50bcbd5b65dac21b82319fcf18486e6ac439947e9305034b00704eb7405f553", size = 115356, upload-time = "2025-10-19T23:45:49.249Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/60/a6d5fb94648cd430648705bef9f4241bd22ead123ead552b6d2873ad5240/wrapt-2.0.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:06b78cb6b9320f57737a52fede882640d93cface98332d1a3df0c5696ec9ae9f", size = 111754, upload-time = "2025-10-19T23:45:51.21Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/44/1963854edf0592ae806307899dc7bf891e76cec19e598f55845c94603a65/wrapt-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8c8349ebfc3cd98bc9105e0112dd8c8ac1f3c7cb5601f9d02248cae83a63f748", size = 113789, upload-time = "2025-10-19T23:45:52.473Z" },
-    { url = "https://files.pythonhosted.org/packages/62/ec/4b1d76cb6d96ac511aaaa92efc57f528e57f06082a595b8b2663fcdb0f20/wrapt-2.0.0-cp311-cp311-win32.whl", hash = "sha256:028f19ec29e204fe725139d4a8b09f77ecfb64f8f02b7ab5ee822c85e330b68b", size = 57954, upload-time = "2025-10-19T23:45:57.03Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/cf/df8ff9bd64d4a75f9a9f6c1c93480a51904d0c9bd71c11994301c47d8a33/wrapt-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:c6961f05e58d919153ba311b397b7b904b907132b7b8344dde47865d4bb5ec89", size = 60308, upload-time = "2025-10-19T23:45:54.314Z" },
-    { url = "https://files.pythonhosted.org/packages/69/d8/61e245fe387d58d84b3f913d5da9d909c4f239b887db692a05105aaf2a1b/wrapt-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:be7e316c2accd5a31dbcc230de19e2a846a325f8967fdea72704d00e38e6af06", size = 58822, upload-time = "2025-10-19T23:45:55.772Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/28/7f266b5bf50c3ad0c99c524d99faa0f7d6eecb045d950e7d2c9e1f0e1338/wrapt-2.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:73c6f734aecb1a030d9a265c13a425897e1ea821b73249bb14471445467ca71c", size = 78078, upload-time = "2025-10-19T23:45:58.855Z" },
-    { url = "https://files.pythonhosted.org/packages/06/0c/bbdcad7eb535fae9d6b0fcfa3995c364797cd8e2b423bba5559ab2d88dcf/wrapt-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b4a7f8023b8ce8a36370154733c747f8d65c8697cb977d8b6efeb89291fff23e", size = 61158, upload-time = "2025-10-19T23:46:00.096Z" },
-    { url = "https://files.pythonhosted.org/packages/d3/8a/bba3e7a4ebf4d1624103ee59d97b78a1fbb08fb5753ff5d1b69f5ef5e863/wrapt-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a1cb62f686c50e9dab5983c68f6c8e9cbf14a6007935e683662898a7d892fa69", size = 61646, upload-time = "2025-10-19T23:46:01.279Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/0c/0f565294897a72493dbafe7b46229b5f09f3776795a894d6b737e98387de/wrapt-2.0.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:43dc0550ae15e33e6bb45a82a5e1b5495be2587fbaa996244b509921810ee49f", size = 121442, upload-time = "2025-10-19T23:46:04.287Z" },
-    { url = "https://files.pythonhosted.org/packages/da/80/7f03501a8a078ad79b19b1a888f9192a9494e62ddf8985267902766a4f30/wrapt-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:39c5b45b056d630545e40674d1f5e1b51864b3546f25ab6a4a331943de96262e", size = 123018, upload-time = "2025-10-19T23:46:06.052Z" },
-    { url = "https://files.pythonhosted.org/packages/37/6b/ad0e1ff98359f13b4b0c2c52848e792841146fe79ac5f56899b9a028fc0d/wrapt-2.0.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:804e88f824b76240a1b670330637ccfd2d18b9efa3bb4f02eb20b2f64880b324", size = 117369, upload-time = "2025-10-19T23:46:02.53Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/6c/a90437bba8cb1ce2ed639af979515e09784678c2a7f4ffc79f2cf7de809e/wrapt-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c2c476aa3fc2b9899c3f7b20963fac4f952e7edb74a31fc92f7745389a2e3618", size = 121453, upload-time = "2025-10-19T23:46:07.747Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/a9/b3982f9bd15bd45857a23c48b7c36e47d05db4a4dcc5061c31f169238845/wrapt-2.0.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8d851e526891216f89fcb7a1820dad9bd503ba3468fb9635ee28e93c781aa98e", size = 116250, upload-time = "2025-10-19T23:46:09.385Z" },
-    { url = "https://files.pythonhosted.org/packages/73/e2/b7a8b1afac9f791d8f5eac0d9726559f1d7ec4a2b5a6b4e67ac145b007a5/wrapt-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b95733c2360c4a8656ee93c7af78e84c0bd617da04a236d7a456c8faa34e7a2d", size = 120575, upload-time = "2025-10-19T23:46:11.882Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/0f/37920eeea96094f450ae35505d39f1135df951a2cdee0d4e01d4f843396a/wrapt-2.0.0-cp312-cp312-win32.whl", hash = "sha256:ea56817176834edf143df1109ae8fdaa087be82fdad3492648de0baa8ae82bf2", size = 58175, upload-time = "2025-10-19T23:46:15.678Z" },
-    { url = "https://files.pythonhosted.org/packages/f0/db/b395f3b0c7f2c60d9219afacc54ceb699801ccf2d3d969ba556dc6d3af20/wrapt-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:3c7d3bee7be7a2665286103f4d1f15405c8074e6e1f89dac5774f9357c9a3809", size = 60415, upload-time = "2025-10-19T23:46:12.913Z" },
-    { url = "https://files.pythonhosted.org/packages/86/22/33d660214548af47fc59d9eec8c0e0693bcedc5b3a0b52e8cbdd61f3b646/wrapt-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:680f707e1d26acbc60926659799b15659f077df5897a6791c7c598a5d4a211c4", size = 58911, upload-time = "2025-10-19T23:46:13.889Z" },
-    { url = "https://files.pythonhosted.org/packages/18/0a/dd88abfe756b1aa79f0777e5ee4ce9e4b5dc4999bd805e9b04b52efc7b18/wrapt-2.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e2ea096db28d5eb64d381af0e93464621ace38a7003a364b6b5ffb7dd713aabe", size = 78083, upload-time = "2025-10-19T23:46:16.937Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/b9/8afebc1655a863bb2178b23c2d699b8743f3a7dab466904adc6155f3c858/wrapt-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c92b5a82d28491e3f14f037e1aae99a27a5e6e0bb161e65f52c0445a3fa7c940", size = 61156, upload-time = "2025-10-19T23:46:17.927Z" },
-    { url = "https://files.pythonhosted.org/packages/bb/8b/f710a6528ccc52e21943f42c8cf64814cde90f9adbd3bcd58c7c274b4f75/wrapt-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81d234718aabe632d179fac52c7f69f0f99fbaac4d4bcd670e62462bbcbfcad7", size = 61641, upload-time = "2025-10-19T23:46:19.229Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/5f/e4eabd0cc6684c5b208c2abc5c3459449c4d15be1694a9bbcf51e0e135fd/wrapt-2.0.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:db2eea83c43f84e4e41dbbb4c1de371a53166e55f900a6b130c3ef51c6345c1a", size = 121454, upload-time = "2025-10-19T23:46:21.808Z" },
-    { url = "https://files.pythonhosted.org/packages/6f/c4/ec31ee17cc7866960d323609ba7402be786d211a6d713a59f776c4270bb3/wrapt-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:65f50e356c425c061e1e17fe687ff30e294fed9bf3441dc1f13ef73859c2a817", size = 123063, upload-time = "2025-10-19T23:46:23.545Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/2b/a4b10c3c0022e40aeae9bec009bafb049f440493f0575ebb27ecf61c32f8/wrapt-2.0.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:887f2a667e3cbfb19e204032d42ad7dedaa43972e4861dc7a3d51ae951d9b578", size = 117401, upload-time = "2025-10-19T23:46:20.433Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/4a/ade23a76967e1f148e461076a4d0e24a7950a5f18b394c9107fe60224ae2/wrapt-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9054829da4be461e3ad3192e4b6bbf1fc18af64c9975ce613aec191924e004dc", size = 121485, upload-time = "2025-10-19T23:46:24.85Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/ba/33b5f3e2edede4e1cfd259f0d9c203cf370f259bb9b215dd58fc6cbb94e9/wrapt-2.0.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:b952ffd77133a5a2798ee3feb18e51b0a299d2f440961e5bb7737dbb02e57289", size = 116276, upload-time = "2025-10-19T23:46:27.006Z" },
-    { url = "https://files.pythonhosted.org/packages/eb/bf/b7f95bb4529a35ca11eb95d48f9d1a563b495471f7cf404c644566fb4293/wrapt-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e25fde03c480061b8234d8ee4863eb5f40a9be4fb258ce105b364de38fc6bcf9", size = 120578, upload-time = "2025-10-19T23:46:28.679Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/71/984849df6f052592474a44aafd6b847e1cffad39b0debc5390a04aa46331/wrapt-2.0.0-cp313-cp313-win32.whl", hash = "sha256:49e982b7860d325094978292a49e0418833fc7fc42c0dc7cd0b7524d7d06ee74", size = 58178, upload-time = "2025-10-19T23:46:32.372Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/3b/4e1fc0f2e1355fbc55ab248311bf4c958dbbd96bd9183b9e96882cc16213/wrapt-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:6e5c86389d9964050ce50babe247d172a5e3911d59a64023b90db2b4fa00ae7c", size = 60423, upload-time = "2025-10-19T23:46:30.041Z" },
-    { url = "https://files.pythonhosted.org/packages/20/0a/9384e0551f56fe361f41bb8f209a13bb9ef689c3a18264225b249849b12c/wrapt-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:b96fdaa4611e05c7231937930567d3c16782be9dbcf03eb9f60d83e57dd2f129", size = 58918, upload-time = "2025-10-19T23:46:31.056Z" },
-    { url = "https://files.pythonhosted.org/packages/68/70/37b90d3ee5bf0d0dc4859306383da08b685c9a51abff6fd6b0a7c052e117/wrapt-2.0.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:f2c7b7fead096dbf1dcc455b7f59facb05de3f5bfb04f60a69f98cdfe6049e5f", size = 81980, upload-time = "2025-10-19T23:46:33.368Z" },
-    { url = "https://files.pythonhosted.org/packages/95/23/0ce69cc90806b90b3ee4cfd9ad8d2ee9becc3a1aab7df3c3bfc7d0904cb6/wrapt-2.0.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:04c7c8393f25b11c0faa5d907dd9eb462e87e4e7ba55e308a046d7ed37f4bbe2", size = 62900, upload-time = "2025-10-19T23:46:34.415Z" },
-    { url = "https://files.pythonhosted.org/packages/54/76/03ec08170c02f38f3be3646977920976b968e0b704a0693a98f95d02f4d2/wrapt-2.0.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a93e0f8b376c0735b2f4daf58018b4823614d2b896cb72b6641c4d3dbdca1d75", size = 63636, upload-time = "2025-10-19T23:46:35.643Z" },
-    { url = "https://files.pythonhosted.org/packages/75/c1/04ce0511e504cdcd84cdb6980bc7d4efa38ac358e8103d6dd0cd278bfc6d/wrapt-2.0.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b42d13603da4416c43c430dbc6313c8d7ff745c40942f146ed4f6dd02c7d2547", size = 152650, upload-time = "2025-10-19T23:46:38.717Z" },
-    { url = "https://files.pythonhosted.org/packages/17/06/cd2e32b5f744701189c954f9ab5eee449c86695b13f414bb8ea7a83f6d48/wrapt-2.0.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8bbd2472abf8c33480ad2314b1f8fac45d592aba6cc093e8839a7b2045660e6", size = 158811, upload-time = "2025-10-19T23:46:40.875Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/a2/a6d920695cca62563c1b969064e5cd2051344a6e330c184b6f80383d87e4/wrapt-2.0.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e64a3a1fd9a308ab9b815a2ad7a65b679730629dbf85f8fc3f7f970d634ee5df", size = 146033, upload-time = "2025-10-19T23:46:37.351Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/90/7fd2abe4ec646bc43cb6b0d05086be6fcf15e64f06f51fc4198804396d68/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d61214525eaf88e0d0edf3d1ad5b5889863c6f88e588c6cdc6aa4ee5d1f10a4a", size = 155673, upload-time = "2025-10-19T23:46:42.582Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/8d/6cce7f8c41633e677ac8aa34e84b53a22a645ec2a680deb991785ca2798d/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:04f7a5f92c5f7324a1735043cc467b1295a1c5b4e0c1395472b7c44706e3dc61", size = 144364, upload-time = "2025-10-19T23:46:44.381Z" },
-    { url = "https://files.pythonhosted.org/packages/72/42/9570349e03afa9d83daf7f33ffb17e8cdc62d7e84c0d09005d0f51912efa/wrapt-2.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2356f76cb99b3de5b4e5b8210367fbbb81c7309fe39b622f5d199dd88eb7f765", size = 150275, upload-time = "2025-10-19T23:46:45.662Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/d8/448728e6fe030e5c4f1022c82cd3af1de1c672fa53d2d5b36b32a55ce7bf/wrapt-2.0.0-cp313-cp313t-win32.whl", hash = "sha256:0a921b657a224e40e4bc161b5d33934583b34f0c9c5bdda4e6ac66f9d2fcb849", size = 59867, upload-time = "2025-10-19T23:46:49.593Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/b1/ad812b1fe1cd85f6498dc3a3c9809a1e880d6108283b1735119bec217041/wrapt-2.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:c16f6d4eea98080f6659a8a7fc559d4a0a337ee66960659265cad2c8a40f7c0f", size = 63170, upload-time = "2025-10-19T23:46:46.87Z" },
-    { url = "https://files.pythonhosted.org/packages/7f/29/c105b1e76650c82823c491952a7a8eafe09b78944f7a43f22d37ed860229/wrapt-2.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:52878edc13dc151c58a9966621d67163a80654bc6cff4b2e1c79fa62d0352b26", size = 60339, upload-time = "2025-10-19T23:46:47.862Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/38/0dd39f83163fd28326afba84e3e416656938df07e60a924ac4d992b30220/wrapt-2.0.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:79a53d86c2aff7b32cc77267e3a308365d1fcb881e74bc9cbe26f63ee90e37f0", size = 78242, upload-time = "2025-10-19T23:46:51.096Z" },
-    { url = "https://files.pythonhosted.org/packages/08/ef/fa7a5c1d73f8690c712f9d2e4615700c6809942536dd3f441b9ba650a310/wrapt-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d731a4f22ed6ffa4cb551b4d2b0c24ff940c27a88edaf8e3490a5ee3a05aef71", size = 61207, upload-time = "2025-10-19T23:46:52.558Z" },
-    { url = "https://files.pythonhosted.org/packages/23/d9/67cb93da492eb0a1cb17b7ed18220d059e58f00467ce6728b674d3441b3d/wrapt-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3e02ab8c0ac766a5a6e81cd3b6cc39200c69051826243182175555872522bd5a", size = 61748, upload-time = "2025-10-19T23:46:54.468Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/be/912bbd70cc614f491b526a1d7fe85695b283deed19287b9f32460178c54d/wrapt-2.0.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:895870602d65d7338edb3b6a717d856632ad9f14f7ff566214e4fb11f0816649", size = 120424, upload-time = "2025-10-19T23:46:57.575Z" },
-    { url = "https://files.pythonhosted.org/packages/b2/e1/10df8937e7da2aa9bc3662a4b623e51a323c68f42cad7b13f0e61a700ce2/wrapt-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b9ad4fab76a0086dc364c4f17f39ad289600e73ef5c6e9ab529aff22cac1ac3", size = 122804, upload-time = "2025-10-19T23:46:59.308Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/60/576751b1919adab9f63168e3b5fd46c0d1565871b1cc4c2569503ccf4be6/wrapt-2.0.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7ca0562606d7bad2736b2c18f61295d61f50cd3f4bfc51753df13614dbcce1b", size = 117398, upload-time = "2025-10-19T23:46:55.814Z" },
-    { url = "https://files.pythonhosted.org/packages/ec/55/243411f360cc27bae5f8e21c16f1a8d87674c5534f4558e8a97c1e0d1c6f/wrapt-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fe089d9f5a4a3dea0108a8ae34bced114d0c4cca417bada1c5e8f42d98af9050", size = 121230, upload-time = "2025-10-19T23:47:01.347Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/23/2f21f692c3b3f0857cb82708ce0c341fbac55a489d4025ae4e3fd5d5de8c/wrapt-2.0.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e761f2d2f8dbc80384af3d547b522a80e67db3e319c7b02e7fd97aded0a8a678", size = 116296, upload-time = "2025-10-19T23:47:02.659Z" },
-    { url = "https://files.pythonhosted.org/packages/bd/ed/678957fad212cfb1b65b2359d62f5619f5087d1d1cf296c6a996be45171c/wrapt-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:17ba1bdc52d0c783481850996aa26cea5237720769197335abea2ae6b4c23bc0", size = 119602, upload-time = "2025-10-19T23:47:03.775Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/e3/aeb4c3b052d3eed95e61babc20dcb1a512651e098cca4b84a6896585c06a/wrapt-2.0.0-cp314-cp314-win32.whl", hash = "sha256:f73318741b141223a4674ba96992aa2291b1b3f7a5e85cb3c2c964f86171eb45", size = 58649, upload-time = "2025-10-19T23:47:07.382Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/2a/a71c51cb211798405b59172c7df5789a5b934b18317223cf22e0c6f852de/wrapt-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:8e08d4edb13cafe7b3260f31d4de033f73d3205774540cf583bffaa4bec97db9", size = 60897, upload-time = "2025-10-19T23:47:04.862Z" },
-    { url = "https://files.pythonhosted.org/packages/f8/a5/acc5628035d06f69e9144cca543ca54c33b42a5a23b6f1e8fa131026db89/wrapt-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:af01695c2b7bbd8d67b869d8e3de2b123a7bfbee0185bdd138c2775f75373b83", size = 59306, upload-time = "2025-10-19T23:47:05.883Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/e6/1318ca07d7fcee57e4592a78dacd9d5493b8ddd971c553a62904fb2c0cf2/wrapt-2.0.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:057f02c13cce7b26c79624c06a3e1c2353e6dc9708525232232f6768118042ca", size = 81987, upload-time = "2025-10-19T23:47:08.7Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/bf/ffac358ddf61c3923d94a8b0e7620f2af1cd1b637a0fe4963a3919aa62b7/wrapt-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:79bdd84570267f3f43d609c892ae2d30b91ee4b8614c2cbfd311a2965f1c9bdb", size = 62902, upload-time = "2025-10-19T23:47:10.248Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/af/387c51f9e7b544fe95d852fc94f9f3866e3f7d7d39c2ee65041752f90bc2/wrapt-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:93c8b4f4d54fd401a817abbfc9bf482aa72fd447f8adf19ce81d035b3f5c762c", size = 63635, upload-time = "2025-10-19T23:47:11.746Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/99/d38d8c80b9cc352531d4d539a17e3674169a5cc25a7e6e5e3c27bc29893e/wrapt-2.0.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5e09ffd31001dce71c2c2a4fc201bdba9a2f9f62b23700cf24af42266e784741", size = 152659, upload-time = "2025-10-19T23:47:15.344Z" },
-    { url = "https://files.pythonhosted.org/packages/5a/2a/e154432f274e22ecf2465583386c5ceffa5e0bab3947c1c5b26cc8e7b275/wrapt-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d87c285ff04e26083c4b03546e7b74df7ba4f1f32f1dcb92e9ac13c2dbb4c379", size = 158818, upload-time = "2025-10-19T23:47:17.569Z" },
-    { url = "https://files.pythonhosted.org/packages/c5/7a/3a40c453300e2898e99c27495b8109ff7cd526997d12cfb8ebd1843199a4/wrapt-2.0.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e52e50ea0a72ea48d1291cf8b8aaedcc99072d9dc5baba6b820486dcf4c67da8", size = 146113, upload-time = "2025-10-19T23:47:13.026Z" },
-    { url = "https://files.pythonhosted.org/packages/9e/e2/3116a9eade8bea2bf5eedba3fa420e3c7d193d4b047440330d8eaf1098de/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1fd4c95536975895f32571073446e614d5e2810b666b64955586dcddfd438fd3", size = 155689, upload-time = "2025-10-19T23:47:19.397Z" },
-    { url = "https://files.pythonhosted.org/packages/43/1c/277d3fbe9d177830ab9e54fe9253f38455b75a22d639a4bd9fa092d55ae5/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d6ebfe9283209220ed9de80a3e9442aab8fc2be5a9bbf8491b99e02ca9349a89", size = 144403, upload-time = "2025-10-19T23:47:20.779Z" },
-    { url = "https://files.pythonhosted.org/packages/d8/37/ab6ddaf182248aac5ed925725ef4c69a510594764665ecbd95bdd4481f16/wrapt-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5d3ebd784804f146b7ea55359beb138e23cc18e5a5cc2cf26ad438723c00ce3a", size = 150307, upload-time = "2025-10-19T23:47:22.604Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/d7/df9e2d8040a3af618ff9496261cf90ca4f886fd226af0f4a69ac0c020c3b/wrapt-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:9b15940ae9debc8b40b15dc57e1ce4433f7fb9d3f8761c7fab1ddd94cb999d99", size = 60557, upload-time = "2025-10-19T23:47:26.73Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/c2/502bd4557a3a9199ea73cc5932cf83354bd362682162f0b14164d2e90216/wrapt-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a0efbbc06d3e2077476a04f55859819d23206600b4c33f791359a8e6fa3c362", size = 63988, upload-time = "2025-10-19T23:47:23.826Z" },
-    { url = "https://files.pythonhosted.org/packages/1f/f2/632b13942f45db7af709f346ff38b8992c8c21b004e61ab320b0dec525fe/wrapt-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:7fec8a9455c029c8cf4ff143a53b6e7c463268d42be6c17efa847ebd2f809965", size = 60584, upload-time = "2025-10-19T23:47:25.396Z" },
-    { url = "https://files.pythonhosted.org/packages/00/5c/c34575f96a0a038579683c7f10fca943c15c7946037d1d254ab9db1536ec/wrapt-2.0.0-py3-none-any.whl", hash = "sha256:02482fb0df89857e35427dfb844319417e14fae05878f295ee43fa3bf3b15502", size = 43998, upload-time = "2025-10-19T23:47:52.858Z" },
-]
-
 [[package]]
 name = "xattr"
 version = "1.3.0"
@@ -6902,55 +6685,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" },
 ]
 
-[[package]]
-name = "zarr"
-version = "2.18.3"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version < '3.11' and sys_platform == 'linux'",
-    "python_full_version < '3.11' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "asciitree", marker = "python_full_version < '3.11'" },
-    { name = "fasteners", marker = "python_full_version < '3.11' and sys_platform != 'emscripten'" },
-    { name = "numcodecs", version = "0.13.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
-    { name = "numpy", marker = "python_full_version < '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/23/c4/187a21ce7cf7c8f00c060dd0e04c2a81139bb7b1ab178bba83f2e1134ce2/zarr-2.18.3.tar.gz", hash = "sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce", size = 3603224, upload-time = "2024-09-04T23:20:16.595Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/ed/c9/142095e654c2b97133ff71df60979422717b29738b08bc8a1709a5d5e0d0/zarr-2.18.3-py3-none-any.whl", hash = "sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd", size = 210723, upload-time = "2024-09-04T23:20:14.491Z" },
-]
-
-[[package]]
-name = "zarr"
-version = "3.1.3"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform == 'linux'",
-    "python_full_version == '3.12.*' and sys_platform == 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation != 'PyPy' and sys_platform != 'linux'",
-    "python_full_version >= '3.14' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.13.*' and platform_python_implementation == 'PyPy' and sys_platform != 'linux'",
-    "python_full_version == '3.12.*' and sys_platform != 'linux'",
-    "python_full_version == '3.11.*' and sys_platform == 'linux'",
-    "python_full_version == '3.11.*' and sys_platform != 'linux'",
-]
-dependencies = [
-    { name = "donfig", marker = "python_full_version >= '3.11'" },
-    { name = "numcodecs", version = "0.16.3", source = { registry = "https://pypi.org/simple" }, extra = ["crc32c"], marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", marker = "python_full_version >= '3.11'" },
-    { name = "packaging", marker = "python_full_version >= '3.11'" },
-    { name = "typing-extensions", marker = "python_full_version >= '3.11'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/d6/67/14be68a7bad15eecda09b1e81fca2420f7533645fe187bf4d6104c1aad52/zarr-3.1.3.tar.gz", hash = "sha256:01342f3e26a02ed5670db608a5576fbdb8d76acb5c280bd2d0082454b1ba6f79", size = 349125, upload-time = "2025-09-18T19:32:41.688Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/1a/71/9de7229515a53d1cc5705ca9c411530f711a2242f962214d9dbfe2741aa4/zarr-3.1.3-py3-none-any.whl", hash = "sha256:45f67f87f65f14fa453f99dd8110a5936b7ac69f3a21981d33e90407c80c302a", size = 276427, upload-time = "2025-09-18T19:32:40.042Z" },
-]
-
 [[package]]
 name = "zipp"
 version = "3.23.0"