Skip to content

Commit 433ac0d

Browse files
committed
v0.3.0 release
Signed-off-by: Javier <25750030+SystemPanic@users.noreply.github.com>
2 parents 702e3c1 + f131f3d commit 433ac0d

90 files changed

Lines changed: 9755 additions & 1040 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/release-ci-docker.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ jobs:
88
runs-on: ubuntu-latest
99
strategy:
1010
matrix:
11-
cuda: [cu126, cu128, cu129]
11+
cuda: [cu126, cu128, cu129, cu130]
1212
arch: [amd64, arm64]
1313
steps:
1414
- name: Free Disk Space
@@ -55,7 +55,7 @@ jobs:
5555
needs: build
5656
strategy:
5757
matrix:
58-
cuda: [cu126, cu128, cu129]
58+
cuda: [cu126, cu128, cu129, cu130]
5959
steps:
6060
- name: Set up Docker Buildx
6161
uses: docker/setup-buildx-action@v3

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,6 @@ When incrementing a version and creating a release, follow [Semantic Versioning]
4242
* minor increment signals added functionality that is backwards-compatible (e.g. new kernels, new SM support, etc)
4343
* patch increment signals backwards-compatible bug fixes (both for functional and performance issues)
4444

45-
Optionally, use post-releases (e.g., `X.Y.Zpost1`) for minor changes, like a documentation change.
45+
Optionally, use post-releases (e.g., `X.Y.Z.post1`) for minor changes, like a documentation change.
4646

47-
[^1]: We have not followed this strictly through v0.2.13. But after v0.2.13, the versioning should follow SemVer.
47+
[^1]: We have not followed this strictly through v0.2.14.post1. But after v0.2.14.post1, the versioning should follow SemVer.

Jenkinsfile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,10 @@ stage('Unittest') {
286286
'JIT-Unittest-4-cu129': {
287287
run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-4-cu129',
288288
{ node_type -> shard_run_unittest_GPU(node_type, 4, 'cu129') })
289-
}
289+
},
290+
'JIT-Unittest-5-cu129': {
291+
run_with_spot_retry('GPU-G5-SPOT', 'GPU-G5', 'JIT-Unittest-5-cu129',
292+
{ node_type -> shard_run_unittest_GPU(node_type, 5, 'cu129') })
293+
},
290294
)
291295
}

benchmarks/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ The output CSV will contain detailed metrics including:
152152
| `--ep_size` | Expert-parallel world size |
153153
| `--ep_rank` | Expert-parallel rank |
154154
| `--gated_act` | Gated activation function: `swiglu` (default) or `geglu` |
155+
| `--autotune` | Enable autotune for supported operations |
155156

156157
### MOE Routing Method Compatibility
157158

benchmarks/routines/moe.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import torch
66

77
import flashinfer
8+
from flashinfer.autotuner import autotune
89
from flashinfer.fused_moe import (
910
WeightLayout,
1011
trtllm_fp4_block_scale_moe,
@@ -186,6 +187,14 @@ def parse_moe_args(line, parser):
186187
choices=["swiglu", "geglu"],
187188
help="Type of gated activation function: swiglu | geglu.",
188189
)
190+
parser.add_argument(
191+
"--autotune",
192+
action="store_true",
193+
default=False,
194+
help=(
195+
"Enable autotuner warmup for supported routines (trtllm_fp4_block_scale_moe and cutlass_fused_moe)."
196+
),
197+
)
189198

190199
# CUTLASS fused MoE specific
191200
parser.add_argument(
@@ -604,10 +613,11 @@ def testTrtllmFp4BlockScaleMoe(args):
604613
hidden_states_fp4 = hidden_states_fp4_bytes.view(torch.uint8).reshape(
605614
hidden_states.shape[0], hidden_states.shape[1] // 2
606615
)
616+
# Hidden-states scale for FP4 must be 2D: [num_tokens, hidden_size // 16]
607617
hidden_states_scale_linear_fp4 = hidden_states_scale_fp4_bytes.view(
608618
torch.float8_e4m3fn
609-
).reshape(-1)
610-
# Ensure expected vector size (16 elements per hidden value for NvFP4)
619+
)
620+
# Ensure expected shape (16 elements per hidden value for NvFP4)
611621
expected_scale_elems = (num_tokens * hidden_size) // 16
612622
if hidden_states_scale_linear_fp4.numel() != expected_scale_elems:
613623
if args.verbose >= 1:
@@ -617,6 +627,9 @@ def testTrtllmFp4BlockScaleMoe(args):
617627
hidden_states_scale_linear_fp4 = torch.ones(
618628
expected_scale_elems, device=device, dtype=torch.float8_e4m3fn
619629
)
630+
hidden_states_scale_linear_fp4 = hidden_states_scale_linear_fp4.reshape(
631+
num_tokens, hidden_size // 16
632+
)
620633

621634
# Prepare weights for kernel
622635
# For FP4 weights, keep them as uint8 (packed format) - don't convert to float8_e4m3fn
@@ -691,6 +704,22 @@ def run_fp4_moe():
691704
do_finalize=True,
692705
)
693706

707+
backend = "trtllm"
708+
709+
# Optional autotune warmup (supported for FP4 TRTLlm fused MoE)
710+
if getattr(args, "autotune", False):
711+
warmup_iters = (
712+
args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10
713+
)
714+
backend = "trtllm_autotune"
715+
if args.verbose >= 1:
716+
print(
717+
f"[INFO] Autotune warmup for FP4 block scale MoE: {warmup_iters} iters"
718+
)
719+
with autotune(True):
720+
for _ in range(warmup_iters):
721+
run_fp4_moe()
722+
694723
# Benchmark timing
695724
if is_cuda_graph_compatible:
696725
times = bench_gpu_time_with_cudagraph(
@@ -734,7 +763,6 @@ def run_fp4_moe():
734763
routing_logits_dtype=routing_logits.dtype,
735764
)
736765

737-
backend = "trtllm"
738766
print_perf_metrics(backend, median_time, std_time, tflops, tb_per_sec)
739767

740768
res = []
@@ -1011,6 +1039,20 @@ def run_cutlass():
10111039
else:
10121040
raise ValueError(f"Unknown cutlass_variant: {variant}")
10131041

1042+
backend = "cutlass"
1043+
1044+
# Optional autotune warmup (supported for CUTLASS fused MoE)
1045+
if getattr(args, "autotune", False):
1046+
warmup_iters = (
1047+
args.dry_run_iters if args.dry_run_iters and args.dry_run_iters > 0 else 10
1048+
)
1049+
backend = "cutlass_autotune"
1050+
if args.verbose >= 1:
1051+
print(f"[INFO] Autotune warmup for CUTLASS fused MoE: {warmup_iters} iters")
1052+
with autotune(True):
1053+
for _ in range(warmup_iters):
1054+
run_cutlass()
1055+
10141056
# Measure
10151057
if is_cuda_graph_compatible:
10161058
times = bench_gpu_time_with_cudagraph(
@@ -1064,7 +1106,6 @@ def run_cutlass():
10641106
active_experts=int(selected_experts.unique().numel()),
10651107
)
10661108

1067-
backend = "cutlass"
10681109
print_perf_metrics(backend, median_time, std_time, tflops, tb_per_sec)
10691110

10701111
res = []

0 commit comments

Comments (0)