SystemPanic
diff --git a/‎.github/workflows/release-ci-docker.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/release-ci-docker.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 2 additions & 2 deletions b/‎CONTRIBUTING.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Jenkinsfile‎
Lines changed: 5 additions & 1 deletion b/‎Jenkinsfile‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎benchmarks/README.md‎
Lines changed: 1 addition & 0 deletions b/‎benchmarks/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/routines/moe.py‎
Lines changed: 45 additions & 4 deletions b/‎benchmarks/routines/moe.py‎
Lines changed: 45 additions & 4 deletions
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        cuda: [cu126, cu128, cu129]
+        cuda: [cu126, cu128, cu129, cu130]
         arch: [amd64, arm64]
     steps:
       - name: Free Disk Space
@@ -55,7 +55,7 @@ jobs:
     needs: build
     strategy:
       matrix:
-        cuda: [cu126, cu128, cu129]
+        cuda: [cu126, cu128, cu129, cu130]
     steps:
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
@@ -42,6 +42,6 @@ When incrementing a version and creating a release, follow [Semantic Versioning]
 * minor increment signals added functionality that is backwards-compatible (e.g. new kernels, new SM support, etc)
 * patch increment signals backwards-compatible bug fixes (both for functional and performance issues)
 
-Optionally, use post-releases (e.g., `X.Y.Zpost1`) for minor changes, like a documentation change.
+Optionally, use post-releases (e.g., `X.Y.Z.post1`) for minor changes, like a documentation change.
 
-[^1]: We have not followed this strictly through v0.2.13. But after v0.2.13, the versioning should follow SemVer.
+[^1]: Releases up to and including v0.2.14.post1 did not strictly follow this scheme; all releases after v0.2.14.post1 should follow SemVer.
@@ -286,6 +286,10 @@ stage('Unittest') {
     'JIT-Unittest-4-cu129': {
       run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-4-cu129',
         { node_type -> shard_run_unittest_GPU(node_type, 4, 'cu129') })
-    }
+    },
+    'JIT-Unittest-5-cu129': {
+      run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-5-cu129',
+        { node_type -> shard_run_unittest_GPU(node_type, 5, 'cu129') })
+    },
   )
 }
@@ -152,6 +152,7 @@ The output CSV will contain detailed metrics including:
 | `--ep_size`              | Expert-parallel world size                                                                                  |
 | `--ep_rank`              | Expert-parallel rank                                                                                        |
 | `--gated_act`            | Gated activation function: `swiglu` (default) or `geglu`                                                   |
+| `--autotune`             | Enable autotuner warmup for supported routines                                                              |
 
 ### MOE Routing Method Compatibility
 
 
@@ -5,6 +5,7 @@
 import torch
 
 import flashinfer
+from flashinfer.autotuner import autotune
 from flashinfer.fused_moe import (
     WeightLayout,
     trtllm_fp4_block_scale_moe,
@@ -186,6 +187,14 @@ def parse_moe_args(line, parser):
         choices=["swiglu", "geglu"],
         help="Type of gated activation function: swiglu | geglu.",
     )
+    parser.add_argument(
+        "--autotune",
+        action="store_true",
+        default=False,
+        help=(
+            "Enable autotuner warmup for supported routines (trtllm_fp4_block_scale_moe and cutlass_fused_moe)."
+        ),
+    )
 
     # CUTLASS fused MoE specific
     parser.add_argument(
@@ -604,10 +613,11 @@ def testTrtllmFp4BlockScaleMoe(args):
     hidden_states_fp4 = hidden_states_fp4_bytes.view(torch.uint8).reshape(
         hidden_states.shape[0], hidden_states.shape[1] // 2
     )
+    # Hidden-states scale for FP4 must be 2D: [num_tokens, hidden_size // 16]
     hidden_states_scale_linear_fp4 = hidden_states_scale_fp4_bytes.view(
         torch.float8_e4m3fn
-    ).reshape(-1)
-    # Ensure expected vector size (16 elements per hidden value for NvFP4)
+    )
+    # Ensure expected element count (one scale per 16 hidden elements for NvFP4)
     expected_scale_elems = (num_tokens * hidden_size) // 16
     if hidden_states_scale_linear_fp4.numel() != expected_scale_elems:
         if args.verbose >= 1:
@@ -617,6 +627,9 @@ def testTrtllmFp4BlockScaleMoe(args):
         hidden_states_scale_linear_fp4 = torch.ones(
             expected_scale_elems, device=device, dtype=torch.float8_e4m3fn
         )
+    hidden_states_scale_linear_fp4 = hidden_states_scale_linear_fp4.reshape(
+        num_tokens, hidden_size // 16
+    )
 
     # Prepare weights for kernel
     # For FP4 weights, keep them as uint8 (packed format) - don't convert to float8_e4m3fn
@@ -691,6 +704,22 @@ def run_fp4_moe():
             do_finalize=True,
         )
 
+    backend = "trtllm"
+
+    # Optional autotune warmup (supported for the FP4 TRT-LLM fused MoE)
+    if getattr(args, "autotune", False):
+        warmup_iters = (
+            args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10
+        )
+        backend = "trtllm_autotune"
+        if args.verbose >= 1:
+            print(
+                f"[INFO] Autotune warmup for FP4 block scale MoE: {warmup_iters} iters"
+            )
+        with autotune(True):
+            for _ in range(warmup_iters):
+                run_fp4_moe()
+
     # Benchmark timing
     if is_cuda_graph_compatible:
         times = bench_gpu_time_with_cudagraph(
@@ -734,7 +763,6 @@ def run_fp4_moe():
         routing_logits_dtype=routing_logits.dtype,
     )
 
-    backend = "trtllm"
     print_perf_metrics(backend, median_time, std_time, tflops, tb_per_sec)
 
     res = []
@@ -1011,6 +1039,20 @@ def run_cutlass():
     else:
         raise ValueError(f"Unknown cutlass_variant: {variant}")
 
+    backend = "cutlass"
+
+    # Optional autotune warmup (supported for CUTLASS fused MoE)
+    if getattr(args, "autotune", False):
+        warmup_iters = (
+            args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10
+        )
+        backend = "cutlass_autotune"
+        if args.verbose >= 1:
+            print(f"[INFO] Autotune warmup for CUTLASS fused MoE: {warmup_iters} iters")
+        with autotune(True):
+            for _ in range(warmup_iters):
+                run_cutlass()
+
     # Measure
     if is_cuda_graph_compatible:
         times = bench_gpu_time_with_cudagraph(
@@ -1064,7 +1106,6 @@ def run_cutlass():
         active_experts=int(selected_experts.unique().numel()),
     )
 
-    backend = "cutlass"
     print_perf_metrics(backend, median_time, std_time, tflops, tb_per_sec)
 
     res = []
Original file line number	Diff line number	Diff line change
`@@ -286,6 +286,10 @@ stage('Unittest') {`
`286`	`286`	`'JIT-Unittest-4-cu129': {`
`287`	`287`	`run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-4-cu129',`
`288`	`288`	`{ node_type -> shard_run_unittest_GPU(node_type, 4, 'cu129') })`
`289`		`- }`
	`289`	`+ },`
	`290`	`+ 'JIT-Unittest-5-cu129': {`
	`291`	`+ run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-5-cu129',`
	`292`	`+ { node_type -> shard_run_unittest_GPU(node_type, 5, 'cu129') })`
	`293`	`+ },`
`290`	`294`	`)`
`291`	`295`	`}`