-
-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[ROCm][Bugfix] Fix MXFP4 MoE emulate fallback logic on MX-capable hardware #36422
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,141 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||
| """Unit tests for the QuarkOCP_MX_MoEMethod emulate dispatch logic. | ||
|
|
||
| The emulate flag determines whether native CK / Triton MXFP4 kernels are | ||
| used or whether the computation falls back to high-precision emulation. | ||
| A Boolean-logic regression in this flag (PR #29008) caused gibberish | ||
| output on MI350X (Issue #36337) because the fallback was silently | ||
| disabled on MX-capable hardware. | ||
|
|
||
| These tests verify the flag is set correctly for every relevant | ||
| combination of (hardware_support × scheme × aiter_enabled × backend). | ||
| No GPU is required — all platform / env-var dependencies are mocked. | ||
| """ | ||
|
|
||
| import pytest | ||
|
|
||
|
|
||
| def _compute_emulate( | ||
| supports_mx: bool, | ||
| ocp_mx_scheme: str | None, | ||
| use_rocm_aiter_moe: bool, | ||
| mxfp4_backend_available: bool, | ||
| ) -> bool: | ||
| """Mirror the emulate logic from QuarkOCP_MX_MoEMethod.__init__. | ||
|
|
||
| See vllm/model_executor/layers/quantization/quark/quark_moe.py, | ||
| around line 733. | ||
| """ | ||
| can_use_native_ck = ( | ||
| supports_mx | ||
| and ocp_mx_scheme is not None | ||
| and ocp_mx_scheme.startswith("w_mxfp4") | ||
| and use_rocm_aiter_moe | ||
| ) | ||
| can_use_mxfp4_backend = mxfp4_backend_available | ||
|
|
||
| return not (can_use_native_ck or can_use_mxfp4_backend) | ||
|
|
||
|
|
||
| # ── Native CK path tests ────────────────────────────────────────────── | ||
|
|
||
|
|
||
@pytest.mark.parametrize(
    "supports_mx, scheme, aiter_enabled, backend, expected_emulate",
    [
        # All conditions met → native CK → no emulation
        (True, "w_mxfp4_a_mxfp4", True, False, False),
        (True, "w_mxfp4_a_fp8", True, False, False),
        (True, "w_mxfp4", True, False, False),
        # AITER disabled (VLLM_ROCM_USE_AITER_MOE=0) → must emulate
        (True, "w_mxfp4_a_mxfp4", False, False, True),
        # Hardware doesn't support MX → must emulate
        (False, "w_mxfp4_a_mxfp4", True, False, True),
        (False, "w_mxfp4_a_mxfp4", False, False, True),
        # Non-mxfp4 scheme → must emulate (no backend either)
        (True, "w_mxfp6_e3m2", True, False, True),
        (True, "w_mxfp6_e3m2_a_mxfp6_e3m2", True, False, True),
        (False, "w_mxfp6_e3m2", True, False, True),
        # scheme is None → must emulate
        (True, None, True, False, True),
    ],
    ids=[
        "mi350x-w4a4-aiter_on",
        "mi350x-w4afp8-aiter_on",
        "mi350x-w4_only-aiter_on",
        "mi350x-w4a4-aiter_off",
        "no_mx-w4a4-aiter_on",
        "no_mx-w4a4-aiter_off",
        "mi350x-mxfp6-aiter_on",
        "mi350x-mxfp6_sym-aiter_on",
        "no_mx-mxfp6-aiter_on",
        "mi350x-none_scheme-aiter_on",
    ],
)
def test_emulate_native_ck_path(
    supports_mx: bool,
    scheme: str | None,
    aiter_enabled: bool,
    backend: bool,
    expected_emulate: bool,
):
    """Exercise the native-CK side of the emulate decision.

    Every case here pins the mxfp4 backend off (backend=False), so
    emulate must be True exactly when any of the three CK prerequisites
    (MX hardware, a w_mxfp4* scheme, AITER MoE enabled) is missing.
    """
    result = _compute_emulate(supports_mx, scheme, aiter_enabled, backend)
    assert result == expected_emulate, (
        f"emulate should be {expected_emulate} for "
        f"supports_mx={supports_mx}, scheme={scheme!r}, "
        f"aiter_enabled={aiter_enabled}, backend={backend}"
    )
|
|
||
|
|
||
| # ── Triton mxfp4 backend tests ──────────────────────────────────────── | ||
|
|
||
|
|
||
@pytest.mark.parametrize(
    "supports_mx, scheme, aiter_enabled, backend, expected_emulate",
    [
        # Backend available → no emulation, even without CK
        (False, "w_mxfp4", False, True, False),
        (True, "w_mxfp4", False, True, False),
        # Backend available + CK also available → still no emulation
        (True, "w_mxfp4_a_mxfp4", True, True, False),
    ],
    ids=[
        "no_mx-backend_on-aiter_off",
        "mi350x-backend_on-aiter_off",
        "mi350x-backend_on-aiter_on",
    ],
)
def test_emulate_mxfp4_backend_path(
    supports_mx: bool,
    scheme: str | None,
    aiter_enabled: bool,
    backend: bool,
    expected_emulate: bool,
):
    """Exercise the Triton mxfp4-backend side of the emulate decision.

    Whenever the backend is available (backend=True), emulate must be
    False regardless of the CK-path conditions — the backend alone is a
    sufficient native path.
    """
    result = _compute_emulate(supports_mx, scheme, aiter_enabled, backend)
    # expected_emulate is False in every case above: backend availability
    # alone disables emulation.
    assert result == expected_emulate
|
|
||
|
|
||
| # ── Regression test for Issue #36337 ────────────────────────────────── | ||
|
|
||
|
|
||
def test_regression_issue_36337_aiter_disabled_forces_emulation():
    """On MI350X (supports_mx=True) with w_mxfp4_a_mxfp4 scheme,
    setting VLLM_ROCM_USE_AITER_MOE=0 (aiter_enabled=False) MUST
    result in emulate=True so the user can fall back to the safe
    emulation path when AITER CK kernels are incompatible.

    The old logic (PR #29008) evaluated to emulate=False here because:
        (not True or not True) and (...) → (False) and (...) → False
    """
    # Exact configuration from the bug report: MX hardware, mxfp4
    # scheme, AITER disabled, no Triton backend.
    issue_config = dict(
        supports_mx=True,
        ocp_mx_scheme="w_mxfp4_a_mxfp4",
        use_rocm_aiter_moe=False,
        mxfp4_backend_available=False,
    )

    emulate = _compute_emulate(**issue_config)

    failure_msg = (
        "emulate must be True when AITER is disabled on MI350X — "
        "this is the exact regression from Issue #36337"
    )
    assert emulate is True, failure_msg
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -730,10 +730,19 @@ def __init__( | |
| get_current_vllm_config().model_config.hf_config, "model_type", None | ||
| ) | ||
|
|
||
| self.emulate = ( | ||
| not current_platform.supports_mx() | ||
| or not self.ocp_mx_scheme.startswith("w_mxfp4") | ||
| ) and (self.mxfp4_backend is None or not self.use_rocm_aiter_moe) | ||
| # Native CK path requires MX hardware + w_mxfp4 scheme + AITER MoE. | ||
| # The Triton mxfp4 backend is available for weight-only mxfp4. | ||
| # If neither path is available, fall back to emulation (dequant to | ||
| # high-precision and compute in BF16). | ||
| can_use_native_ck = ( | ||
| current_platform.supports_mx() | ||
| and self.ocp_mx_scheme is not None | ||
| and self.ocp_mx_scheme.startswith("w_mxfp4") | ||
| and self.use_rocm_aiter_moe | ||
| ) | ||
| can_use_mxfp4_backend = self.mxfp4_backend is not None | ||
|
|
||
| self.emulate = not (can_use_native_ck or can_use_mxfp4_backend) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks, fyi I plan to refactor emulate into a backend after #34285 is landed, so the logic can be merged and cleaned up. |
||
|
|
||
| # CK's pre-compiled MXFP4 MoE GEMM kernel instances have dimension | ||
| # alignment requirements. When violated (e.g. MiniMax-M2.1 with | ||
|
|
@@ -769,7 +778,9 @@ def __init__( | |
| "does not support native MXFP4/MXFP6 " | ||
| "computation. Simulated weight dequantization and activation " | ||
| "QDQ (quantize and dequantize) will be used, with the linear " | ||
| "layers computed in high precision." | ||
| "layers computed in high precision. If you see gibberish " | ||
| "output with native mode, try VLLM_ROCM_USE_AITER_MOE=0 " | ||
| "to force emulation as a workaround." | ||
| ) | ||
| else: | ||
| logger.warning_once( | ||
|
|
@@ -966,6 +977,14 @@ def process_weights_after_loading(self, layer): | |
|
|
||
| from aiter.utility.fp4_utils import e8m0_shuffle | ||
|
|
||
| try: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we move this under https://github.com/vllm-project/vllm/blob/main/vllm/_aiter_ops.py maybe? pulling the version seems like something we probably want to have more generally available? |
||
| import aiter | ||
|
|
||
| aiter_version = getattr(aiter, "__version__", "unknown") | ||
| except ImportError: | ||
| aiter_version = "unknown" | ||
| logger.info("Using AITER %s for MXFP4 MoE weight processing", aiter_version) | ||
|
|
||
| # Pre-shuffle weight scales | ||
| s0, s1, _ = layer.w13_weight_scale.shape | ||
| w13_weight_scale = layer.w13_weight_scale.view(s0 * s1, -1) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixes the same bug as #35855 (comment), introduced in #29008 (https://github.com/vllm-project/vllm/pull/29008/changes#r2877732813).