Changes from all commits · 142 commits
5b3092a
Changed VERSION to 2.9.0.dev0
ptrendx Sep 19, 2025
57b4d7b
[JAX] Remove import jax.extend.ffi (#2193)
phu0ngng Sep 22, 2025
5e4e0b2
[PyTorch] Add sink attention support from cuDNN (#2148)
cyanguwa Sep 22, 2025
2db20a6
[QA] Add pytest xml report for all tests in qa folder that use pytest…
shengfangd Sep 23, 2025
a92a0ad
[JAX] Local-Amax for Current-Scaling (#2183)
mingxu1067 Sep 23, 2025
3f875fb
[JAX] Restore Shardy Rule with CompoundFactor (#2167)
phu0ngng Sep 23, 2025
afd15a1
[JAX] Update JAX version requirement in pyproject.toml (#2197)
phu0ngng Sep 24, 2025
9e72796
[PyTorch] Unpin version of onnxscript and onnxruntime (#2202)
pggPL Sep 26, 2025
4d14578
[JAX] Fix XML filename in the L0_jax_uniitest (#2205)
phu0ngng Sep 27, 2025
d75bf43
[JAX] CollectiveGemm (#2166)
phu0ngng Sep 27, 2025
a91e458
[JAX] Add xml export for `test_multiprocessing_encoder` and `test_cge…
phu0ngng Sep 29, 2025
dfeef1a
[JAX] Address tolerance check for current scaling dact dbias (#2211)
jberchtold-nvidia Sep 29, 2025
3f5b475
[Core][PyTorch] NVFP4 recipe (#2177)
ksivaman Sep 29, 2025
2354fb8
Fix the segfault in the nvfp4 quantization (#2214)
ptrendx Sep 30, 2025
25252e9
[PyTorch] Add FP8 attention with current scaling (#2012)
cyanguwa Sep 30, 2025
7fa0f55
[Pytorch] Support for Swiglu Activation used in GPT OSS (#2161)
vthumbe1503 Sep 30, 2025
ce18bee
[JAX] Load modules during initialize for Norm and Act primitives (#2219)
jberchtold-nvidia Sep 30, 2025
7022d50
[PyTorch] Quantizer as API (#2039)
negvet Oct 1, 2025
ac4e0fd
[JAX] Rework amax reduction over TPSP (#2218)
phu0ngng Oct 1, 2025
b0d562d
[JAX] Fix `rng_state` shape in fused attention (#2217)
phu0ngng Oct 1, 2025
ac886c3
[PyTorch] Fix QuantizedTensorBase -> QuantizedTensorStorage (#2226)
negvet Oct 1, 2025
f0a9404
Fix hang during debug build (#2221)
ksivaman Oct 1, 2025
90449f7
Convert `NVFP4BlockScaling` to dataclass (#2227)
ksivaman Oct 1, 2025
aee5a82
Fix the cuBLAS workspace alignment (#2223)
ptrendx Oct 1, 2025
c100318
[PyTorch] Set usages for linear op quantizers before forward (#2222)
timmoon10 Oct 2, 2025
f936c2a
[JAX] Fix code block in fp8_autocast docstring (#2228)
jberchtold-nvidia Oct 2, 2025
be7f43f
[JAX] Fix shard map issue when `get_all_mesh_axes()` is used (#2229)
jberchtold-nvidia Oct 2, 2025
e30c36a
[PyTorch] fix int32 overflow in permute kernels (#2196)
hxbai Oct 2, 2025
b840898
[JAX] Clamped Swiglu Integration (#2194)
vthumbe1503 Oct 3, 2025
dfe5b7d
[Common][Pytorch] Add support for the FP8 Block Scaling (ie. Deepseek…
janekb04 Oct 3, 2025
5be8125
Fix bug where CUTLASS kernel was not being compiled for SM90a (#2235)
timmoon10 Oct 4, 2025
08779fd
Fix FP8 current scaling attention logic (#2234)
ksivaman Oct 4, 2025
7e45be7
Added the NVFP4 section to the low precision training tutorial (#2237)
ptrendx Oct 5, 2025
0db0f4d
[JAX] Fix for GEMM + fuse bias + AllReduce (#2230)
phu0ngng Oct 6, 2025
56e2fed
[Build] fix: TE installation failed to find uv-installed cuDNN librar…
KivenChen Oct 6, 2025
9f3e79b
[PyTorch] Fix tests for 🤗 integration (#2239)
ksivaman Oct 6, 2025
127b6d3
[JAX] Activation/Normalization to output amax for later quantization …
phu0ngng Oct 7, 2025
76bced5
`NVFP4BlockScaling` recipe docs (#2241)
ksivaman Oct 7, 2025
ac5e868
Skip fp8 tests on unsupported devices (#2243)
vcherepanov-nv Oct 7, 2025
66f9b3c
[PyTorch] Unblock fused bgrad quantization path for nvfp4 (#2246)
ksivaman Oct 8, 2025
af2a0c1
[JAX] Async issuing D2H memcpy for grouped_gemm group_sizes array (#2…
huanghua1994 Oct 8, 2025
e37e33e
Disallow pure E5M2 recipe for `Float8BlockScaling` (#2251)
ksivaman Oct 9, 2025
9bf4175
[PyTorch] Deprecate old `float8_tensor.py` (#2250)
ksivaman Oct 9, 2025
e99be1b
Update minimum python version to 3.10 and add checks in CI (#2247)
ksivaman Oct 9, 2025
8a7ab3d
[JAX] NVFP4 support in TE/JAX (#2254)
jberchtold-nvidia Oct 9, 2025
dd9433e
Don't pickle an empty dict in LayerNorm and pt base modules (#2253)
pstjohn Oct 9, 2025
7ad130e
Offloading support for multiple attention layouts (#2024)
sanandaraj5597 Oct 13, 2025
8eec200
Disable torch autocast context in rope forward pass (#2240)
pstjohn Oct 13, 2025
8c364b4
[Common][JAX] Improve error message for cublas fp8 gemm with incorrec…
jberchtold-nvidia Oct 13, 2025
76e1af3
[JAX] Add assertion message to amax -> scale computation (#2263)
jberchtold-nvidia Oct 13, 2025
a3b749b
FSDP grad fusion support (#2191)
sanandaraj5597 Oct 13, 2025
5ec0f33
[JAX] Fix test path for fp8 grouped gemm ag (#2262)
KshitijLakhani Oct 14, 2025
dfacd9f
[PyTorch] Use Quantization API for reference NVFP4 recipe (#2259)
negvet Oct 14, 2025
ca6fedc
[JAX] Add BRCM support for THD (#2242)
KshitijLakhani Oct 14, 2025
85a9199
Generalize quantization APIs for FP8/FP4/.. recipes (#2256)
ksivaman Oct 14, 2025
fd2f589
[PyTorch] Bump minimum cuDNN version for fused attention with FP8 cur…
timmoon10 Oct 14, 2025
4c572f0
[PyTorch Debug] Fix issue with start_end_list logging feature (#2252)
paul-gibbons Oct 15, 2025
88564d5
README - latest news update (#2273)
sbhavani Oct 15, 2025
452c737
Added support for DistOpt with offloading with MoE's (#2264)
sanandaraj5597 Oct 16, 2025
81c363b
[PyTorch] Add record_stream and untyped_storage func op in QuantizedT…
xiaoxi-wangfj Oct 16, 2025
5624dbb
Changed VERSION to 2.10.0.dev0
ptrendx Oct 16, 2025
9dd6192
[JAX] Fix imports in test for deprecated jax.experimental.pjit (#2274)
KshitijLakhani Oct 17, 2025
05dc1e6
NVFP4 Move RHT BLAS to GPU (#2275)
kevin-tong-augment Oct 17, 2025
bd38004
fall back after failing ldconfig-based lib loading for cuDNN (#2277)
getim Oct 17, 2025
a7a69ca
Bump up FA to 2.8.3 (#2282)
Owen1B Oct 17, 2025
c593bce
Fix test of FSDP2 by correcting init logic and applying autocast (#2105)
ntenenz Oct 17, 2025
ee384ab
Make `CanonicalizeGemmInput()` support non-TN layout FP8 GEMM on Blac…
denera Oct 17, 2025
fd234d8
Wheels for cuda 13 (#2278)
ksivaman Oct 18, 2025
dd7ab71
Fix error with triton 3.5 (#2286)
fzyzcjy Oct 20, 2025
bd55e7b
[PyTorch] Fix CI failures due to deterministic attention backend (#2288)
ksivaman Oct 20, 2025
b4a1d4d
[PyTorch][MOE] Support NVFP4 Grouped Linear (#2215)
zhongbozhu Oct 21, 2025
e90582f
[Common] Removed activations from NVFP4 quantize C++ unit tests (#2289)
Oleg-Goncharov Oct 21, 2025
ce2f9fa
[JAX] HuggingFace login in JAX examples if token is available (#2290)
jberchtold-nvidia Oct 21, 2025
2712bb9
Add post-processing API for FP8 primary weights to support CUDA Graph…
kunlunl Oct 21, 2025
ce2e8bd
[PyTorch] Decouple python quantization classes and refactor custom qu…
negvet Oct 22, 2025
818b30c
[JAX] NVFP4 recipe with option to enable/disable SR, RHT, and 2D quan…
jberchtold-nvidia Oct 22, 2025
2ac3c16
[JAX] Defer TE/JAX cublas shape check on fp8 gemms until lowering (#2…
jberchtold-nvidia Oct 22, 2025
66acb8e
Include TE core headers in final build (#2291)
ksivaman Oct 23, 2025
eb34783
Overhaul the compilation for the arch-specific features (#2279)
ptrendx Oct 23, 2025
e2f2a0b
[JAX] Make SR rng state always 2D (num_devices, 4) to fix partitionin…
jberchtold-nvidia Oct 23, 2025
021e1e6
[PyTorch Debug] Fix issue with microbatching + debug value caching (#…
pggPL Oct 23, 2025
6273ced
[PyTorch] Support delay_wgrad_compute cudagraph (#1948)
buptzyb Oct 24, 2025
060811c
[Common] Fix checks in quantize_transpose_vector_blockwise_fp4 (#2299)
jberchtold-nvidia Oct 24, 2025
87cb26c
[PyTorch] Add max_logit support for MuonClip (#2195)
cyanguwa Oct 25, 2025
d2945c6
[PyTorch] Use dummy wgrad in GroupedLinear (#2305)
Autumn1998 Oct 27, 2025
d7c9777
Remove `nvidia-mathdx` dependency (#2295)
ksivaman Oct 27, 2025
a019c80
Submodule checkout during setup (#2293)
ksivaman Oct 27, 2025
4cf2f12
Change the pyTorch installation to CUDA 13 in Build All GitHub action…
ptrendx Oct 27, 2025
a8e4346
[JAX] Use TE quantization when TE fused norm is disable (#2303)
phu0ngng Oct 28, 2025
c6cbcc8
[Pytorch] Integrate GPT OSS Swiglu in TransformerLayer (#2312)
vthumbe1503 Oct 29, 2025
f0295f9
CMake to respect MAX_JOBS or NVTE_MAX_JOBS (#2319)
phu0ngng Oct 30, 2025
5e8a9a9
[JAX] Fix: Skip determinism tests for bprop for all sm >=100 (#2315)
KshitijLakhani Oct 30, 2025
490a5f4
[PyTorch] Fix attention backend and tests for `sm120` (#2320)
ksivaman Oct 30, 2025
0e80c84
[Common] Split cast/gated kernels by scaling mode (#2248)
Oleg-Goncharov Oct 30, 2025
26370b1
[PyT] Bump the min version expected to supported FP8 current scaling …
KshitijLakhani Oct 30, 2025
1269b2e
[JAX] Ensure JAX reference impl uses an accurate backend in our tests…
jberchtold-nvidia Oct 30, 2025
006670d
[JAX] Fix mesh resource requirement when no mesh (#2307)
jberchtold-nvidia Oct 31, 2025
e7227af
[Common] Deleted unused header (#2324)
Oleg-Goncharov Oct 31, 2025
c57ffc5
[JAX] L1_jax_distributed_test suit with individual executions (#2321)
phu0ngng Nov 3, 2025
3d76218
[PyTorch debug] Fixes to debug tests failures (#2268)
pggPL Nov 4, 2025
77a0063
[PyTorch Debug] Add max_blockwise_dynamic_range stats (#2137)
pggPL Nov 5, 2025
b6020e3
[JAX] Fix bug with pre scale bias (#2300)
pggPL Nov 5, 2025
dcaca2a
[JAX] Try to use pre-downloaded dataset artifacts first (#2345)
jberchtold-nvidia Nov 6, 2025
f3b97c2
Fix out of bounds access in the FP4 dequantize kernel (#2346)
ptrendx Nov 6, 2025
b14a3b6
Make FP8 weights compatible with older MCore version (#2342)
kunlunl Nov 6, 2025
4ff3eed
[JAX] Add test to check jaxpr that amax is reused for nvfp4 recipe (#…
jberchtold-nvidia Nov 7, 2025
f62cad9
Fix sharding of segment position to match id in ring attention. (#2349)
mgoldfarb-nvidia Nov 7, 2025
26aad6b
Disable cuDNN attention for known IMA and NaNs (#2344)
ksivaman Nov 7, 2025
5978f1d
[JAX] Default to fused attention in JAX DPA (#2363)
KshitijLakhani Nov 7, 2025
d20311b
Update cudnn frontend to v1.16.0 (#2362)
ksivaman Nov 7, 2025
3454f84
[common] Remove kvpacked and qkvpacked attention functions for every …
pggPL Nov 7, 2025
5ea8343
Move Triton to common (#2359)
tdophung Nov 10, 2025
7a58598
[JAX] Fused layers argument default values changed (#2347)
tdophung Nov 10, 2025
29537c9
[PyTorch] FSDP2 Support for TE (#2245)
vthumbe1503 Nov 11, 2025
f8693d2
Fix CI failure related to bug in MXFP8 copy implementation (#2369)
vthumbe1503 Nov 12, 2025
e4bfa62
[Feature] Enable rope application with offsets for training (#2188)
sudhakarsingh27 Nov 12, 2025
c544ced
[JAX] Relax tolerance for the test_multiprocessing_encoder.py with NV…
phu0ngng Nov 12, 2025
d8f1e68
fix gradient accumulation fusion for FSDP (#2371)
tomlifu Nov 13, 2025
d0d4063
[PyTorch] Fix amax computation using output_t data in normalization (…
negvet Nov 13, 2025
ef28c86
[JAX] NVFP4 scale swizzling via nvte kernel (#2350)
phu0ngng Nov 13, 2025
9440b76
[JAX] Shardy rule + QuantizeLayout Rework (#2364)
phu0ngng Nov 13, 2025
67d63d0
[JAX] Support for checkpointing quantizations (#2356)
jberchtold-nvidia Nov 13, 2025
0ded113
[JAX] XLA_FLAG to WAR the current NCCL issue with test_distributed_so…
phu0ngng Nov 13, 2025
262c184
[PyTorch] Add reset cudagraph interface (#2367)
buptzyb Nov 14, 2025
b88f727
[JAX] Make all jax attention calls use non-packed common calls (#2358)
pggPL Nov 14, 2025
a075475
[JAX] Improve support and testing for direct recipe usage without aut…
jberchtold-nvidia Nov 14, 2025
c525760
[PyTorch] Activation offloading refactor (#1762)
pggPL Nov 14, 2025
389a6ba
[JAX] Use TE quant if TE fused act is disabled (#2374)
jberchtold-nvidia Nov 14, 2025
07f3c6a
[ROCm] merge NV upstream v2.10 dev without resolving conflicts
wangye805 Jan 25, 2026
b8a4024
[ROCm] resolve the conflicts in common dir
wangye805 Feb 2, 2026
0519b4b
[ROCm] resolve the conflicts on jax side
wangye805 Feb 10, 2026
8f4b04d
[ROCm] resolve the conflicts on pytorch side
wangye805 Feb 10, 2026
e60ff21
[ROCm] resolve the conflicts in setup
wangye805 Feb 10, 2026
8bbb162
[ROCm] resolve the cpp gtest
wangye805 Feb 11, 2026
f573b40
[ROCm] resolve pytorch and jax tests
alextmagro Feb 11, 2026
eaaae94
pytest, example, wheels conflict resolution
alextmagro Feb 19, 2026
8f94cf6
jax and pytorch bugfix
alextmagro Feb 24, 2026
bac7993
copyrights and fp8_autocast->autocast fix
alextmagro Feb 24, 2026
8ae38e8
Enable test_distributed_dense.py
alextmagro Feb 24, 2026
05a977a
address IFU comments
alextmagro Mar 3, 2026
0385852
_FormatHelperFP8 and missing file add
alextmagro Mar 3, 2026
46d382d
add use_async_d2h_group_size as a test parameter
alextmagro Mar 3, 2026
4 changes: 3 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -83,7 +83,9 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install torch pybind11[global] einops onnxscript
run: |
pip install pybind11[global] einops onnxscript
pip install torch --index-url https://download.pytorch.org/whl/cu130
- name: 'Checkout'
uses: actions/checkout@v3
with:
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -38,3 +38,9 @@ repos:
entry: clang-format -i
args: ["-style=file"]
files: ^transformer_engine.*\.(c|cc|cxx|cpp|cu|cuh|h|hpp)$

- repo: https://github.com/netromdk/vermin
rev: c75aca72f4e85c6e47252139e8695f1c8b5f9ae3
hooks:
- id: vermin
args: ['-t=3.10', '--violations']
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 159 files
14 changes: 11 additions & 3 deletions README.rst
@@ -362,6 +362,14 @@ Transformer Engine

Latest News
===========

* [09/2025] `Pretraining Large Language Models with NVFP4 <https://www.arxiv.org/pdf/2509.25149>`_
* [09/2025] `Native FP8 Mixed Precision Training for Ling 2.0, Open Sourced! <https://huggingface.co/blog/im0qianqian/ling-mini-2-fp8-mixed-precision-training-solution>`_
* [09/2025] `Faster Training Throughput in FP8 Precision with NVIDIA NeMo <https://developer.nvidia.com/blog/faster-training-throughput-in-fp8-precision-with-nvidia-nemo/>`_
* [08/2025] `How we built DeepL's next-generation LLMs with FP8 for training and inference <https://www.deepl.com/en/blog/tech/next-generation-llm-fp8-training>`_
* [08/2025] `NVFP4 Trains with Precision of 16-bit and Speed and Efficiency of 4-bit <https://developer.nvidia.com/blog/nvfp4-trains-with-precision-of-16-bit-and-speed-and-efficiency-of-4-bit/>`_
* [06/2025] `Floating Point 8: An Introduction to Efficient, Lower-Precision AI Training <https://developer.nvidia.com/blog/floating-point-8-an-introduction-to-efficient-lower-precision-ai-training/>`_
* [05/2025] `Advanced Optimization Strategies for LLM Training on NVIDIA Grace Hopper <https://developer.nvidia.com/blog/advanced-optimization-strategies-for-llm-training-on-nvidia-grace-hopper/>`_
* [03/2025] `Stable and Scalable FP8 Deep Learning Training on Blackwell | GTC 2025 <https://www.nvidia.com/en-us/on-demand/session/gtc25-s72778/>`_
* [03/2025] `Measure and Improve AI Workload Performance with NVIDIA DGX Cloud Benchmarking <https://developer.nvidia.com/blog/measure-and-improve-ai-workload-performance-with-nvidia-dgx-cloud-benchmarking/>`_

@@ -436,7 +444,7 @@ PyTorch
fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.E4M3)

# Enable autocasting for the forward pass
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
with te.autocast(enabled=True, recipe=fp8_recipe):
out = model(inp)

loss = out.sum()
@@ -471,7 +479,7 @@ Flax
fp8_recipe = recipe.DelayedScaling(margin=0, fp8_format=recipe.Format.HYBRID)

# Enable autocasting for the forward pass
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
with te.autocast(enabled=True, recipe=fp8_recipe):
model = te_flax.DenseGeneral(features=HIDDEN)

def loss_fn(params, other_vars, inp):
@@ -547,7 +555,7 @@ pip Installation
**Prerequisites for pip installation:**

* A compatible C++ compiler
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) installed
* CUDA Toolkit with cuDNN and NVCC (NVIDIA CUDA Compiler) if installing from source.

To install the latest stable version with pip:

152 changes: 152 additions & 0 deletions benchmarks/benchmark_rht_cast.py
@@ -0,0 +1,152 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import argparse
import torch
import pandas as pd
import torch.utils.benchmark as benchmark

import transformer_engine.pytorch as te
import transformer_engine_torch as tex
import transformer_engine.pytorch.cpp_extensions as ext

from transformer_engine.pytorch.tensor.nvfp4_tensor import NVFP4Quantizer

scale_padding_to = 1
permute_scale = False

TORCH_TO_TE_FLOAT_MAP = {
torch.bfloat16: tex.DType.kBFloat16,
}


def run_kernel(shape, stochastic_rounding: bool, input_dtype=torch.bfloat16):
# Generate random input data
M, K = shape
x = torch.randn([M, K], dtype=input_dtype, device="cuda")

assert shape[0] % 16 == 0, "Shape must be divisible by 16"
assert shape[1] % 16 == 0, "Shape must be divisible by 16"

# Quantize
nvfp4_quantizer = NVFP4Quantizer(
fp4_dtype=tex.DType.kFloat4E2M1,
rowwise=True,
columnwise=True,
with_amax_reduction=False,
amax_reduction_group=None,
with_rht=True,
with_post_rht_amax=True,
with_random_sign_mask=True,
stochastic_rounding=stochastic_rounding,
)
x_nvfp4_sut = nvfp4_quantizer.make_empty(
(M, K), dtype=x.dtype, device=x.device, requires_grad=False
)
x_nvfp4_sut = nvfp4_quantizer.update_quantized(x, x_nvfp4_sut)

with torch.no_grad():
stmt = "kernel_func(input, output)"
globals_dict = {
"kernel_func": nvfp4_quantizer.update_quantized,
"input": x,
"output": x_nvfp4_sut,
}

timing = benchmark.Timer(
stmt=stmt,
globals=globals_dict,
num_threads=1,
).blocked_autorange(min_run_time=5)
print(timing)
timing_us = timing.median * 1e6

input_nbytes = shape[0] * shape[1] * 2 # bf16
output_nbytes = shape[0] * shape[1] // 2 # //2 for fp4
sf_nbytes = shape[0] * shape[1] // 16 # //16 for 1 byte per 16 elems

total_nbytes = (
0
+ input_nbytes
* 3 # Reading input for Amax(x)&Amax(RHT(x.T)), Reading input for Cast(x), Reading input for Cast(RHT(x.T))
+ 2 * 4 # Output 2 * float for scale & amax
+ 2 * 4 # Input 2 * float
+ output_nbytes * 2 # Output from Cast(x) and Cast(RHT(x.T))
+ sf_nbytes * 2 # Scale factor
)

throughput_GBps = total_nbytes / (1024 * 1024 * 1024) / (timing_us / 1e6)

print(
f"Stochastic rounding: {stochastic_rounding}, Total: {total_nbytes} bytes, Throughput:"
f" {throughput_GBps} GB/s"
)
return timing_us, throughput_GBps
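
The byte accounting in `run_kernel` above can be sanity-checked standalone. This sketch (plain Python; the function name is ours, not part of the benchmark) reproduces the traffic model for one of the benchmarked shapes:

```python
def nvfp4_traffic_bytes(M, K):
    """Model memory traffic for the fused NVFP4 cast (bf16 in, fp4 out).

    Mirrors the accounting in run_kernel: the bf16 input is read three
    times (the amax pass plus the two casts), two fp4 outputs are written
    (rowwise and transposed/RHT), and one scale-factor byte is stored per
    16-element block, plus a few scalar scale/amax values.
    """
    input_nbytes = M * K * 2    # bf16: 2 bytes per element
    output_nbytes = M * K // 2  # fp4: 2 elements per byte
    sf_nbytes = M * K // 16     # 1 scale byte per 16 elements
    return (
        3 * input_nbytes        # amax read + two cast reads
        + 2 * 4 + 2 * 4         # scalar scale & amax, in and out
        + 2 * output_nbytes     # Cast(x) and Cast(RHT(x.T)) outputs
        + 2 * sf_nbytes         # scale factors for both outputs
    )

# Example: the (8192, 5120) shape from the benchmark's shape list.
print(nvfp4_traffic_bytes(8192, 5120))  # → 298844176
```

Dividing this byte count by the measured kernel time gives the achieved-bandwidth figure the script prints (note it uses 1024³, i.e. GiB/s, though it is labeled GB/s).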


# Nsight Compute Profiling Command:
# ncu -f -o block_scaled_1d_cast_transpose_kernel --set=full --kernel-name "block_scaled_1d_cast_transpose_kernel" -s 5 -c 5 python benchmark_rht_cast.py --profile

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
args = parser.parse_args()

if args.profile:
print("Profiling is enabled.")
else:
print("Profiling is disabled.")

shapes = [
(8192, 5120),
(8192, 10240),
(8192, 2560),
(8192, 11328),
(8192, 512),
(8192, 3584),
(5120, 8192),
(10240, 8192),
(2560, 8192),
(11328, 8192),
(512, 8192),
(3584, 8192),
(4096, 16384),
(14336, 16384),
]

if args.profile:
shapes = [
(16384, 6144),
]

data = []
for stochastic_rounding in [True]: # , False]:
for shape in shapes:
print(
f"Running benchmark_func with shape {shape} and stochastic_rounding"
f" {stochastic_rounding}"
)
timing_us, throughput_GBps = run_kernel(shape, stochastic_rounding)
data.append(
[
"benchmark_func",
shape,
stochastic_rounding,
timing_us,
throughput_GBps,
]
)

df = pd.DataFrame(
data=data,
columns=[
"kernel",
"shape",
"stochastic_rounding",
"timing_us",
"throughput(GB/s)",
],
)
print(df)
df.to_csv("benchmark_cast_nvfp4.csv", index=False)
75 changes: 52 additions & 23 deletions benchmarks/linear/benchmark_grouped_linear.py
@@ -6,58 +6,69 @@
import torch
import torch.utils.benchmark as benchmark
import pandas as pd
import pathlib

from transformer_engine.pytorch.module import GroupedLinear
from transformer_engine.common.recipe import Float8BlockScaling, MXFP8BlockScaling
from transformer_engine.pytorch.fp8 import fp8_autocast, FP8GlobalStateManager
from transformer_engine.common.recipe import (
Float8BlockScaling,
MXFP8BlockScaling,
NVFP4BlockScaling,
)
from transformer_engine.pytorch.quantization import autocast, FP8GlobalStateManager
from contextlib import nullcontext

"""
# Profile BF16 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_bf16 \
--output=./benchmarks/linear/b200_numgemm_8_bf16 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe bf16

# Profile FP8 sub-channel recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/h100hbm_mkn_4096_4096_4096_numgemm_8_fp8_sub_channel \
--output=./benchmarks/linear/h100hbm_numgemm_8_fp8_sub_channel \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe fp8_sub_channel

# Profile MXFP8 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_mkn_4096_4096_4096_numgemm_8_mxfp8 \
--output=./benchmarks/linear/b200_numgemm_8_mxfp8 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe mxfp8

# Profile NVFP4 recipe with Nsight Systems
nsys profile \
--output=./benchmarks/linear/b200_numgemm_8_nvfp4 \
--force-overwrite true \
--trace=cuda,nvtx,cudnn,cublas \
python benchmarks/linear/benchmark_grouped_linear.py --profile --recipe nvfp4

"""

RECIPES = {
"bf16": None,
"fp8_sub_channel": Float8BlockScaling(),
"mxfp8": MXFP8BlockScaling(),
"nvfp4": NVFP4BlockScaling(),
}

mxfp8_available, reason_for_no_mxfp8 = FP8GlobalStateManager.is_mxfp8_available()
fp8_block_scaling_available, reason_for_no_fp8_block_scaling = (
FP8GlobalStateManager.is_fp8_block_scaling_available()
)
nvfp4_available, reason_for_no_nvfp4 = FP8GlobalStateManager.is_nvfp4_available()


def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None):
assert mode in ["fwd_only", "fwd_bwd"]
fp8_context = (
fp8_autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
quantization_context = (
autocast(enabled=True, recipe=recipe) if recipe is not None else nullcontext()
)
# print(f"fp8_context: {fp8_context} and is it nullcontext? {isinstance(fp8_context, nullcontext)}")

if mode == "fwd_only":
with torch.no_grad(), fp8_context:
with torch.no_grad(), quantization_context:
for i in range(run_num_steps):
y_q = layer.forward(
x,
@@ -70,7 +81,7 @@ def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=
layer.zero_grad()
x.grad = None

with fp8_context:
with quantization_context:
for i in range(run_num_steps):
label = f"step_{i}"
torch.cuda.nvtx.range_push(label)
@@ -145,7 +156,7 @@ def benchmark_linear(
"recipe": recipe,
},
num_threads=1,
).blocked_autorange(min_run_time=5)
).blocked_autorange(min_run_time=10)
print(f"{recipe_name}: {timing} \n")
timing_ms = timing.median * 1000 / num_microbatches

@@ -228,30 +239,44 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):

use_bias = False
# Set the MKN values to benchmark
# Deepseek V3 EP64, SEQ_LEN=8192, topK8
# 256 expert => 4 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 16384
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048

# Deepseek V3 EP32, SEQ_LEN=8192, topK8
# 256 expert => 8 local experts
# Avg M per expert: AvgM = SEQ_LEN * topK / localExperts = 8192
# M = AvgM * localExperts = 65536
# K = 7168
# N = 2048

# 4 or 8 local experts per rank
num_gemms_list = [4, 8]
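
The DeepSeek-V3 sizing comments above reduce to simple arithmetic; this quick check (plain Python, with all values taken from the comments) confirms that both EP configurations land on the same total M:

```python
SEQ_LEN, TOP_K, NUM_EXPERTS = 8192, 8, 256

for ep in (64, 32):                    # expert-parallel sizes from the comments
    local_experts = NUM_EXPERTS // ep  # 4 experts/rank at EP64, 8 at EP32
    avg_m = SEQ_LEN * TOP_K // local_experts
    total_m = avg_m * local_experts    # total token rows in the grouped GEMM
    print(f"EP{ep}: local_experts={local_experts}, AvgM={avg_m}, M={total_m}")
```

Either way M = 65536 with K = 7168 and N = 2048, which is why the benchmark below uses the single (65536, 7168, 2048) MKN tuple for both `num_gemms` settings.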

# MKN for group linear
mkns = []
for m in [8192]:
# for m in [4096, 8192, 16384]:
# for n in [1024, 2048, 4096, 8192, 16384]:
for n in [8192]:
for k in [4096]:
for m in [65536]:
for k in [7168]:
for n in [2048]:
mkns.append((m, k, n))

# default recipes to run if not specified
recipe_list = ["bf16"]

if args.recipe == "all":
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8"]
recipe_list = ["bf16", "fp8_sub_channel", "mxfp8", "nvfp4"]
else:
recipe_list = [args.recipe]

num_gemms_list = [8]

if args.profile:
mkns = [(4096 * 8, 4096, 4096)]
mkns = [(8192 * 8, 7168, 2048)]
# in profile mode, only run one recipe specified in args.recipe
assert args.recipe != "all", (
"In profile mode, only one recipe can be specified, please specify the recipe as"
" fp8_sub_channel, mxfp8, or bf16"
" fp8_sub_channel, mxfp8, nvfp4, or bf16"
)
recipe_list = [args.recipe]
num_gemms_list = [8]
@@ -268,13 +293,17 @@ def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
"bf16",
"fp8_sub_channel",
"mxfp8",
], "Recipe must be one of bf16, fp8_sub_channel, or mxfp8"
"nvfp4",
], "Recipe must be one of bf16, fp8_sub_channel, mxfp8, or nvfp4"
if recipe_name == "mxfp8" and not mxfp8_available:
print(f"MXFP8 is not available, skipping {recipe_name}")
continue
if recipe_name == "fp8_sub_channel" and not fp8_block_scaling_available:
print(f"FP8 block scaling is not available, skipping {recipe_name}")
continue
if recipe_name == "nvfp4" and not nvfp4_available:
print(f"NVFP4 is not available, skipping {recipe_name}")
continue

df = run_benchmark_linear(
mkns,
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
2.8.0.dev0
2.10.0.dev0