feat: chunked fused linear cross-entropy kernel forward #65
Open: aghilann wants to merge 4 commits into NVIDIA:main from aghilann:ce-online
+455 −0
Commits (4):
d966a0a  Add chunked fused linear cross-entropy op, tests, and benchmarks (aghilann)
71a0f61  Refactor fused CE as forward-only isolated kernel (aghilann)
a7f0010  Inline SM count query in fused CE kernel (aghilann)
4ca2ca5  Merge branch 'main' into ce-online (xjmxyt)
@@ -0,0 +1,169 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: MIT

"""Forward-only chunked fused Linear + Cross-Entropy for cuTile experiments."""

import cuda.tile as ct
import torch
import torch.nn.functional as F
from torch import Tensor

ConstInt = ct.Constant[int]

_ALIGN = 8


@ct.kernel(occupancy=1)
def _ce_online_kernel(
    logits,
    loss_out,
    target_logits,
    n_rows: ConstInt,
    vocab_size: ConstInt,
    tile_v: ConstInt,
):
    """2-pass online softmax over vocab tiles; writes loss and softmax probs in-place."""
    pid = ct.bid(0)
    num_blocks = ct.num_blocks(0)
    num_chunks = ct.cdiv(vocab_size, tile_v)
    col_base = ct.arange(tile_v, dtype=ct.int32)

    for row in range(pid, n_rows, num_blocks):
        row_max = ct.full((1,), -1e30, dtype=ct.float32)
        sum_exp = ct.full((1,), 0.0, dtype=ct.float32)

        for chunk_idx in range(num_chunks):
            cols = ct.add(ct.full((tile_v,), chunk_idx * tile_v, dtype=ct.int32), col_base)
            chunk = ct.gather(logits, (row, cols), check_bounds=True, padding_value=-1e30)
            chunk_f32 = ct.astype(chunk, ct.float32)

            chunk_max = ct.max(chunk_f32, 0, keepdims=True)
            new_max = ct.maximum(row_max, chunk_max)
            sum_exp = ct.mul(sum_exp, ct.exp(ct.sub(row_max, new_max)))
            exp_chunk = ct.exp(ct.sub(chunk_f32, new_max))
            sum_exp = ct.add(sum_exp, ct.sum(exp_chunk, 0, keepdims=True))
            row_max = new_max

        lse = ct.add(row_max, ct.log(sum_exp))
        tgt_logit = ct.load(target_logits, index=(row,), shape=(1,), padding_mode=ct.PaddingMode.ZERO)
        tgt_logit = ct.astype(tgt_logit, ct.float32)
        loss = ct.sub(ct.reshape(lse, (1,)), tgt_logit)
        ct.store(loss_out, index=(row,), tile=loss, allow_tma=False)

        inv_sum = ct.truediv(ct.full((1,), 1.0, dtype=ct.float32), sum_exp)

        for chunk_idx in range(num_chunks):
            cols = ct.add(ct.full((tile_v,), chunk_idx * tile_v, dtype=ct.int32), col_base)
            chunk = ct.gather(logits, (row, cols), check_bounds=True, padding_value=-1e30)
            chunk_f32 = ct.astype(chunk, ct.float32)
            probs = ct.mul(ct.exp(ct.sub(chunk_f32, row_max)), inv_sum)
            ct.scatter(logits, (row, cols), ct.astype(probs, logits.dtype), check_bounds=True)


def _ce_cutile(logits_chunk: Tensor, target_chunk: Tensor, loss_chunk: Tensor, ignore_index: int) -> None:
    """Compute CE loss in-place for one (chunk_size, vocab) block."""
    n_rows, _vocab_size = logits_chunk.shape
    valid = target_chunk != ignore_index
    safe_target = target_chunk.clamp(min=0)
    rows = torch.arange(n_rows, device=logits_chunk.device)

    # Gather target logits once in PyTorch so the kernel can compute loss directly.
    target_logits = logits_chunk[rows, safe_target].float()
    target_logits[~valid] = 0.0

    tile_v = 4096
    sm_count = torch.cuda.get_device_properties("cuda").multi_processor_count
    grid = (min(sm_count * 4, n_rows),)
    ct.launch(
        torch.cuda.current_stream(),
        grid,
        _ce_online_kernel,
        (logits_chunk, loss_chunk, target_logits, n_rows, logits_chunk.shape[1], tile_v),
    )

    if not valid.all():
        loss_chunk[~valid] = 0.0


def _chunked_fwd_loss(
    x: Tensor,
    weight: Tensor,
    target: Tensor,
    chunk_size: int,
    ignore_index: int,
) -> Tensor:
    bt = x.shape[0]
    vocab_size = weight.shape[0]
    num_chunks = (bt + chunk_size - 1) // chunk_size

    loss = torch.empty(bt, device=x.device, dtype=torch.float32)
    # Reuse one logits buffer per BT chunk to avoid materializing full [BT, V].
    logits_buf = torch.empty((chunk_size, vocab_size), device=x.device, dtype=x.dtype)

    for i in range(num_chunks):
        start, end = i * chunk_size, min((i + 1) * chunk_size, bt)
        clen = end - start

        x_chunk = x[start:end]
        target_chunk = target[start:end]
        loss_chunk = loss[start:end]
        logits_chunk = logits_buf[:clen]

        # GEMM 1: logits = x @ W^T for this chunk.
        torch.mm(x_chunk, weight.mT, out=logits_chunk)
        _ce_cutile(logits_chunk, target_chunk, loss_chunk, ignore_index)

    return loss


def fused_linear_cross_entropy_forward_only(
    hidden_states: Tensor,
    weight: Tensor,
    target: Tensor,
    bias: Tensor | None = None,
    ignore_index: int = -100,
    chunk_size: int = 4096,
    reduction: str = "mean",
) -> Tensor:
    """Forward-only chunked fused linear + cross entropy.

    Notes:
        - Forward-only experimental API (not backend-registered in TileGym dispatch).
        - Main tradeoff: often higher latency than dense PyTorch CE, but much lower
          peak memory on large BT because full logits [BT, V] are not materialized.
    """
    if reduction not in {"mean", "sum"}:
        raise ValueError(f"Unsupported reduction: {reduction}")

    if hidden_states.ndim == 3:
        hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1])
        target = target.reshape(-1)

    if bias is not None:
        logits = F.linear(hidden_states, weight, bias)
        return F.cross_entropy(logits, target, ignore_index=ignore_index, reduction=reduction)

    bt = hidden_states.shape[0]

    # Pad BT for TensorCore-friendly GEMM alignment.
    pad = (-bt) % _ALIGN
    if pad:
        x_flat = F.pad(hidden_states, (0, 0, 0, pad))
        target_flat = F.pad(target.reshape(-1), (0, pad), value=ignore_index)
    else:
        x_flat = hidden_states
        target_flat = target.reshape(-1)

    loss = _chunked_fwd_loss(x_flat, weight, target_flat, chunk_size, ignore_index)

    if pad:
        loss = loss[:bt]

    if reduction == "sum":
        return loss.sum()

    n_valid = (target_flat[:bt] != ignore_index).sum()
    if n_valid == 0:
        return torch.tensor(0.0, device=hidden_states.device, dtype=torch.float32)
    return loss.sum() / n_valid.float()
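For review purposes, the math `_ce_online_kernel` performs per row can be written as a minimal pure-PyTorch sketch: a streaming max and rescaled sum of exponentials over vocab tiles (pass 1), then a second pass that normalizes each tile into probabilities. The function and variable names below are illustrative only and are not part of this PR:

```python
import torch

def online_softmax_ce_reference(logits_row: torch.Tensor, target_idx: int, tile_v: int = 4096):
    """Two-pass online softmax over vocab tiles for a single row (float32 math)."""
    row = logits_row.float()
    row_max = torch.tensor(-1e30)
    sum_exp = torch.tensor(0.0)

    # Pass 1: streaming max / sum(exp) over vocab tiles.
    for start in range(0, row.numel(), tile_v):
        chunk = row[start:start + tile_v]
        new_max = torch.maximum(row_max, chunk.max())
        sum_exp = sum_exp * torch.exp(row_max - new_max) + torch.exp(chunk - new_max).sum()
        row_max = new_max

    lse = row_max + torch.log(sum_exp)   # log-sum-exp of the full row
    loss = lse - row[target_idx]         # CE loss = lse - target logit

    # Pass 2: normalized probabilities, tile by tile (kernel overwrites logits in-place).
    probs = torch.empty_like(row)
    for start in range(0, row.numel(), tile_v):
        chunk = row[start:start + tile_v]
        probs[start:start + tile_v] = torch.exp(chunk - row_max) / sum_exp

    return loss, probs
```

Against `torch.nn.functional.cross_entropy` on a single row this should agree to float32 tolerance, which is a cheap way to sanity-check the kernel's arithmetic.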
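A hedged usage sketch for the new entry point, comparing against a dense PyTorch reference; the sizes, chunk size, and tolerances below are assumptions for illustration, and the module path is taken from the benchmark's own import:

```python
import torch
import torch.nn.functional as F
from tilegym.ops.cutile.fused_linear_cross_entropy import fused_linear_cross_entropy_forward_only

# Illustrative sizes only.
BT, H, V = 8192, 1024, 32768
x = torch.randn(BT, H, device="cuda", dtype=torch.bfloat16)
w = torch.randn(V, H, device="cuda", dtype=torch.bfloat16)
t = torch.randint(0, V, (BT,), device="cuda")
t[::7] = -100  # some ignored positions

# Chunked path: never materializes the full [BT, V] logits tensor.
loss_chunked = fused_linear_cross_entropy_forward_only(x, w, t, chunk_size=1024)

# Dense reference: materializes [BT, V] logits.
loss_dense = F.cross_entropy(F.linear(x, w), t, ignore_index=-100)

torch.testing.assert_close(loss_chunked, loss_dense.float(), rtol=1e-2, atol=1e-2)
```

The benchmark file below exercises both paths with `triton.testing.perf_report`.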
@@ -0,0 +1,158 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Collaborator: And move this benchmark file to tests/benchmark/experimental.
#
# SPDX-License-Identifier: MIT

"""Benchmark forward-only fused linear cross-entropy with Triton perf_report style.

Expected behavior on large BT:
- CuTile path may be slower than PyTorch in pure latency.
- CuTile path should use much less peak memory because it chunks over BT and
  avoids materializing full [BT, V] logits.
"""

import torch
import torch.nn.functional as F
import triton

from tilegym.backend import is_backend_available
from tilegym.ops.cutile.fused_linear_cross_entropy import fused_linear_cross_entropy_forward_only

DEVICE = triton.runtime.driver.active.get_active_torch_device()

ALL_BACKENDS = [
    ("cutile", "CuTile", ("blue", "-")) if is_backend_available("cutile") else None,
    ("torch", "PyTorch", ("green", "-")),
]


def _supported_backends():
    return [b for b in ALL_BACKENDS if b is not None]


def _torch_fused_linear_cross_entropy(
    hidden_states: torch.Tensor,
    weight: torch.Tensor,
    target: torch.Tensor,
    bias: torch.Tensor | None = None,
    ignore_index: int = -100,
    reduction: str = "mean",
):
    logits = F.linear(hidden_states, weight, bias)
    if hidden_states.ndim == 3:
        logits = logits.reshape(-1, logits.shape[-1])
        target = target.reshape(-1)
    return F.cross_entropy(logits, target, ignore_index=ignore_index, reduction=reduction)


def _create_latency_config(hidden_size, vocab_size):
    available = _supported_backends()
    if not available:
        return None
    backends, names, styles = zip(*available)
    return triton.testing.Benchmark(
        x_names=["BT"],
        x_vals=[512, 1024, 2048, 4096, 8192, 16384],
        line_arg="backend",
        line_vals=list(backends),
        line_names=list(names),
        styles=list(styles),
        ylabel="Latency (ms)",
        plot_name=f"fused-lce-latency-H{hidden_size}-V{vocab_size}",
        args={
            "hidden_size": hidden_size,
            "vocab_size": vocab_size,
        },
    )


def _create_memory_config(hidden_size, vocab_size):
    available = _supported_backends()
    if not available:
        return None
    backends, names, styles = zip(*available)
    return triton.testing.Benchmark(
        x_names=["BT"],
        x_vals=[512, 1024, 2048, 4096, 8192, 16384],
        line_arg="backend",
        line_vals=list(backends),
        line_names=list(names),
        styles=list(styles),
        ylabel="Peak Memory (MB)",
        plot_name=f"fused-lce-peakmem-H{hidden_size}-V{vocab_size}",
        args={
            "hidden_size": hidden_size,
            "vocab_size": vocab_size,
        },
    )


@triton.testing.perf_report(
    [
        _create_latency_config(hidden_size=1024, vocab_size=32768),
    ]
)
def bench_fused_linear_cross_entropy_latency(BT, backend, hidden_size, vocab_size, device=DEVICE):
    dtype = torch.bfloat16

    x = torch.randn(BT, hidden_size, device=device, dtype=dtype)
    w = torch.randn(vocab_size, hidden_size, device=device, dtype=dtype)
    t = torch.randint(0, vocab_size, (BT,), device=device)

    if backend == "cutile":
        fn = lambda: fused_linear_cross_entropy_forward_only(
            x,
            w,
            t,
            ignore_index=-100,
            chunk_size=512,
            reduction="mean",
        )
    else:
        fn = lambda: _torch_fused_linear_cross_entropy(x, w, t, ignore_index=-100, reduction="mean")

    ms = triton.testing.do_bench(fn)
    return ms


@triton.testing.perf_report(
    [
        _create_memory_config(hidden_size=1024, vocab_size=32768),
    ]
)
def bench_fused_linear_cross_entropy_peak_memory(BT, backend, hidden_size, vocab_size, device=DEVICE):
    dtype = torch.bfloat16

    x = torch.randn(BT, hidden_size, device=device, dtype=dtype)
    w = torch.randn(vocab_size, hidden_size, device=device, dtype=dtype)
    t = torch.randint(0, vocab_size, (BT,), device=device)

    if backend == "cutile":
        run_once = lambda: fused_linear_cross_entropy_forward_only(
            x,
            w,
            t,
            ignore_index=-100,
            chunk_size=512,
            reduction="mean",
        )
    else:
        run_once = lambda: _torch_fused_linear_cross_entropy(x, w, t, ignore_index=-100, reduction="mean")

    for _ in range(2):
        run_once()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    run_once()
    torch.cuda.synchronize()

    return torch.cuda.max_memory_allocated() / (1024**2)


if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("CUDA is required")
    else:
        print("Note: this kernel can be slower than PyTorch but typically saves significant peak memory at large BT.")
        bench_fused_linear_cross_entropy_latency.run(print_data=True)
        bench_fused_linear_cross_entropy_peak_memory.run(print_data=True)
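A back-of-the-envelope check of the memory claim the benchmark is meant to demonstrate; the numbers below are illustrative arithmetic only, not measured results:

```python
# bf16 logits are 2 bytes per element.
BT, V, chunk_size = 16384, 32768, 512

dense_logits_mib = BT * V * 2 / 2**20            # full [BT, V] buffer
chunked_logits_mib = chunk_size * V * 2 / 2**20  # reused [chunk_size, V] buffer

print(f"dense logits:   {dense_logits_mib:.0f} MiB")    # 1024 MiB
print(f"chunked logits: {chunked_logits_mib:.0f} MiB")  # 32 MiB
```

The latency side of the tradeoff is what `do_bench` measures: the kernel gathers each logits chunk twice (once per softmax pass), so some slowdown versus a single dense cross-entropy is expected.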
Review comment: Could you please move this file to src/tilegym/ops/cutile/experimental folder since it's a newly added kernel?