From 581a56452514ee69ea5bd6065d7ee98cea5a009d Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Fri, 19 Dec 2025 19:01:18 +0530 Subject: [PATCH 1/7] Fix QMoE blockwise quantization support for TRT-RTX execution provider --- src/python/py/models/builder.py | 9 +- src/python/py/models/builders/base.py | 65 +++++--- src/python/py/models/builders/gptoss.py | 197 ++++++++++++++++++------ 3 files changed, 204 insertions(+), 67 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index e0fa1bbb0..d3ed06642 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -63,6 +63,7 @@ def check_extra_options(kv_pairs, execution_provider): "shared_embeddings", "hf_remote", "disable_qkv_fusion", + "gpt_oss_swiglu_fusion", ] for key in bools: if key in kv_pairs: @@ -135,7 +136,7 @@ def parse_hf_token(hf_token): def set_io_dtype(precision, execution_provider, extra_options) -> ir.DataType: int4_cpu = precision == "int4" and execution_provider == "cpu" fp32_webgpu = execution_provider == "webgpu" and extra_options.get("use_webgpu_fp32", False) - bf16_cuda = precision == "int4" and execution_provider == "cuda" and extra_options.get("use_cuda_bf16", False) + bf16_cuda = precision == "int4" and execution_provider in {"cuda", "trt-rtx"} and extra_options.get("use_cuda_bf16", False) if precision in {"int8", "fp32"} or int4_cpu or fp32_webgpu: # FP32 precision @@ -405,6 +406,8 @@ def get_args(): Default is 4 for the CPU EP and 0 for non-CPU EPs. int4_block_size = 16/32/64/128/256: Specify the block size for int4 quantization. Default value is 32. + int4_qdq_block_size = 16/32/64/128/256: Specify the block size for quantize/dequantize nodes for int4 quantization. + Default value is int4_block_size. int4_is_symmetric = Quantize the weights symmetrically. Default is true. If true, quantization is done to int4. If false, quantization is done to uint4. int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization. @@ -462,6 +465,8 @@ def get_args(): Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx). use_cuda_bf16 = Use BF16 I/O precision in quantized ONNX models for CUDA EP. Use this option to create quantized ONNX models that use BF16 precision. + gpt_oss_swiglu_fusion = Fuse gate and up tensors into a single tensor. Default is true. + This is only applicable to GPT-OSS models. adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights). Use this option for LoRA models. """), @@ -469,7 +474,7 @@ def get_args(): args = parser.parse_args() print( - "Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, BF16 CUDA, FP16 TRT-RTX, INT4 CPU, INT4 CUDA, INT4 DML, INT4 WebGPU" + "Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, BF16 CUDA, FP16 TRT-RTX, BF16 TRT-RTX, INT4 CPU, INT4 CUDA, INT4 DML, INT4 WebGPU" ) return args diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index 9267a5ec5..55a3d6dc9 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -358,12 +358,16 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Quantization-specific variables (INT4, INT8, etc.) 
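For reference, the dtype selection that the builder.py hunk above extends can be read as the standalone sketch below. The FP16/BF16 branches outside the hunk are assumed rather than copied from the builder, and the sketch returns plain strings instead of ir.DataType values.

def pick_io_dtype(precision: str, ep: str, extra_options: dict) -> str:
    # Simplified stand-in for set_io_dtype(): the hunk above only widens the
    # BF16 condition so that TRT-RTX is treated like CUDA when use_cuda_bf16 is set.
    int4_cpu = precision == "int4" and ep == "cpu"
    fp32_webgpu = ep == "webgpu" and extra_options.get("use_webgpu_fp32", False)
    bf16_gpu = (
        precision == "int4"
        and ep in {"cuda", "trt-rtx"}
        and extra_options.get("use_cuda_bf16", False)
    )
    if precision in {"int8", "fp32"} or int4_cpu or fp32_webgpu:
        return "fp32"
    if bf16_gpu:
        return "bf16"  # assumed branch; the real function returns ir.DataType.BFLOAT16
    return "fp16"      # assumed default for the remaining GPU configurations

assert pick_io_dtype("int4", "trt-rtx", {"use_cuda_bf16": True}) == "bf16"
assert pick_io_dtype("int4", "cpu", {}) == "fp32"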
int4_algo_config = self.make_int4_algo_config(extra_options.get("int4_algo_config", "default")) self.int4_block_size = extra_options.get("int4_block_size", 32) + self.int4_qdq_block_size = extra_options.get("int4_qdq_block_size", self.int4_block_size) + self.gpt_oss_swiglu_fusion = extra_options.get("gpt_oss_swiglu_fusion", True) - # Validate that only CPU and WebGPU EPs support int4_block_size for QMoE - if self.ep not in ["cpu", "webgpu"] and "int4_block_size" in extra_options and moe_op_type == "QMoE": + # Validate that only supported EPs can use int4_block_size for QMoE + # CPU, WebGPU, CUDA, and TRT-RTX support block-wise quantization + supported_blockwise_eps = ["cpu", "webgpu", "cuda", "trt-rtx", "NvTensorRtRtx"] + if self.ep not in supported_blockwise_eps and "int4_block_size" in extra_options and moe_op_type == "QMoE": raise ValueError( f"The 'int4_block_size' option is not supported for {self.ep} execution provider with QMoE. " - "Block-wise quantization (block_size attribute) is only supported for CPU and WebGPU execution providers." + f"Block-wise quantization is only supported for: {', '.join(supported_blockwise_eps)}." ) self.quant_attrs = { @@ -372,6 +376,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): extra_options.get("int4_accuracy_level", 4 if self.ep in ["cpu", "webgpu"] else 0) ), "block_size": int(self.int4_block_size), + "qdq_block_size": int(self.int4_qdq_block_size), "is_symmetric": extra_options.get("int4_is_symmetric", True), "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul",)), "nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []), @@ -501,6 +506,7 @@ def is_gqa_supported(self) -> bool: ("webgpu", ir.DataType.FLOAT16), ("webgpu", ir.DataType.FLOAT), ("trt-rtx", ir.DataType.FLOAT16), + ("trt-rtx", ir.DataType.BFLOAT16), } return (self.ep, self.io_dtype) in valid_gqa_configurations @@ -703,7 +709,7 @@ def make_int4_algo_config(self, quant_method: str): def to_int4(self) -> ir.Model: quant = MatMulNBitsQuantizer( model=ir.to_proto(self.model), - block_size=self.quant_attrs["int4"]["block_size"], + block_size=self.quant_attrs["int4"]["qdq_block_size"], is_symmetric=self.quant_attrs["int4"]["is_symmetric"], accuracy_level=self.quant_attrs["int4"]["accuracy_level"], nodes_to_exclude=self.quant_attrs["int4"]["nodes_to_exclude"], @@ -712,7 +718,22 @@ def to_int4(self) -> ir.Model: algo_config=self.quant_attrs["int4"]["algo_config"], ) quant.process() - return ir.from_proto(quant.model.model) + model = ir.from_proto(quant.model.model) + + # Convert float32 scales to bfloat16 if io_dtype is bfloat16. + # MatMulNBitsQuantizer doesn't natively support bfloat16, so we saved weights as float32 + # for quantization and now convert the resulting scales to the target io_dtype. 
+ if self.io_dtype == ir.DataType.BFLOAT16: + for initializer in model.graph.initializers.values(): + # Scale tensors are named with "_scales" or "_DQ_scales" suffix + if initializer.name.endswith("_scales") or initializer.name.endswith("_DQ_scales"): + if initializer.dtype == ir.DataType.FLOAT: + # Convert float32 scales to bfloat16 + float32_data = initializer.const_value.numpy() + bfloat16_data = torch.from_numpy(float32_data).to(torch.bfloat16) + initializer.const_value = TorchTensor(bfloat16_data, name=initializer.name) + + return model def save_model(self, out_dir): print(f"Saving ONNX model in {out_dir}") @@ -1056,7 +1077,14 @@ def make_matmul_op(self, matmul, basename, root_input, **kwargs): def make_matmul_float(self, matmul, name, root_input, **kwargs): weight = name[1:].replace("/", ".") + ".weight" - self.make_initializer(matmul.weight.T, weight, to=self.io_dtype) + # When onnx_dtype is INT4/UINT4, weights will be quantized by MatMulNBitsQuantizer later. + # MatMulNBitsQuantizer doesn't properly support BFLOAT16 inputs, so we need to save + # weights as FLOAT32 to ensure correct quantization with proper scales. + if self.onnx_dtype in {ir.DataType.INT4, ir.DataType.UINT4} and self.io_dtype == ir.DataType.BFLOAT16: + weight_dtype = ir.DataType.FLOAT + else: + weight_dtype = self.io_dtype + self.make_initializer(matmul.weight.T, weight, to=weight_dtype) last_dim = matmul.weight.shape[0] output = "logits" if kwargs.get("logits", False) else f"{name}/output_0" @@ -3264,13 +3292,11 @@ def make_qmoe_weights(self, weights): dtype = torch.quint4x2 if self.moe_attrs["expert_weight_bits"] == 4 else torch.int8 qweight, scales = None, None - # For QMoE, only use block-wise quantization when explicitly requested - # via int4_block_size and when using CPU or WebGPU execution providers, since - # block_size is only supported for these EPs in the QMoE operator. 
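The scale rewrite added to to_int4 above can be exercised in isolation with plain torch; the tensor here is a hypothetical stand-in for a MatMulNBits "_scales" initializer.

import torch

# MatMulNBitsQuantizer emits float32 scales because it does not accept bfloat16
# inputs; the builder therefore quantizes against float32 weights and casts the
# resulting scales to the bfloat16 I/O dtype afterwards.
scales_f32 = torch.tensor([0.0123, 0.4567, 1.8901], dtype=torch.float32)
scales_bf16 = scales_f32.to(torch.bfloat16)

print(scales_bf16.dtype)                                # torch.bfloat16
print((scales_f32 - scales_bf16.float()).abs().max())   # rounding from the 8-bit mantissa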
- use_blockwise_quant = "int4_block_size" in self.extra_options and self.ep in ["cpu", "webgpu"] + # Get block size from quantization attributes + block_size = self.quant_attrs["int4"]["block_size"] - if use_blockwise_quant: - block_size = self.quant_attrs["int4"]["block_size"] + # Use block-wise quantization if block_size > 0 + if block_size > 0: try: qweight, scales = self._symmetric_blockwise_quantize(weights, block_size) self.moe_attrs["block_size"] = block_size @@ -3278,7 +3304,7 @@ def make_qmoe_weights(self, weights): except Exception as e: raise RuntimeError(f"Block-wise quantization failed with block_size={block_size}: {e}") - # Use tensor-level quantization (default for QMoE) + # block_size is 0, so we're using tensor-level quantization self.moe_attrs["block_size"] = 0 # Existing tensor-level quantization implementation (fallback) @@ -3354,13 +3380,11 @@ def _symmetric_blockwise_quantize(self, weights, block_size): quantized_flat = quantized_int8.view(*original_shape[:-1], num_blocks * block_size) - if pad_size > 0: - quantized_flat = quantized_flat[..., :-pad_size] - + # Keep padding in quantized weights for proper alignment quantized_uint4 = (quantized_flat + 8).to(torch.uint8) - packed_shape = list(original_shape) - packed_shape[-1] = (original_shape[-1] + 1) // 2 + packed_shape = list(quantized_uint4.shape) + packed_shape[-1] = (quantized_uint4.shape[-1] + 1) // 2 qweight = torch.zeros(packed_shape, dtype=torch.uint8, device=weights.device) # Pack two 4-bit values per byte @@ -3377,11 +3401,8 @@ def _symmetric_blockwise_quantize(self, weights, block_size): else: # 8-bit quantized_int8 = quantized.to(torch.int8) + # Keep padding in quantized weights for proper alignment qweight = quantized_int8.view(*original_shape[:-1], num_blocks * block_size) - if pad_size > 0: - qweight = qweight[..., :-pad_size] - else: - qweight = qweight.view(original_shape) return qweight.cpu(), scales.cpu() diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index 65eb7a94a..1018d8500 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -19,7 +19,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.moe_attrs["activation_beta"] = 1.0 self.moe_attrs["activation_type"] = "swiglu" self.moe_attrs["normalize_routing_weights"] = True - self.moe_attrs["swiglu_fusion"] = 1 + self.moe_attrs["swiglu_fusion"] = 1 if self.gpt_oss_swiglu_fusion else 0 def make_layer(self, layer_id, layer): # Each LLM decoder layer is typically defined as: @@ -70,7 +70,7 @@ def make_attention(self, layer_id, attention, root_input, **kwargs): self.window_size = original_window_size def make_moe(self, layer_id, mlp, root_input): - if self.ep in {"cpu", "cuda"}: + if self.ep in {"cpu", "cuda", "NvTensorRtRtx", "trt-rtx"}: self.make_moe_fused(layer_id, mlp, root_input) else: self.make_moe_decomposed(layer_id, mlp, root_input) @@ -592,6 +592,8 @@ def make_moe_fused(self, layer_id, mlp, root_input): gate_up_proj_layout = mlp.experts.gate_up_proj.transpose(-1, -2) down_proj_layout = mlp.experts.down_proj.transpose(-1, -2) + moe_name = f"{basename}/{op_type}" + if op_type == "MoE" and not has_quark_experts: # Save non-quantized MoE weights as initializers self.make_initializer( @@ -604,6 +606,22 @@ def make_moe_fused(self, layer_id, mlp, root_input): down_proj_weight, to=self.io_dtype, ) + + # Save MoE biases as initializers + self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, 
to=self.io_dtype) + self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) + + self.make_moe_op( + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + ) else: if has_quark_experts: # Use pre-quantized Quark experts @@ -639,48 +657,141 @@ def make_moe_fused(self, layer_id, mlp, root_input): down_proj_qweight_tensor = torch.stack(down_proj_qweight_list, dim=0).to(torch.uint8) down_proj_scales_tensor = torch.stack(down_proj_scales_list, dim=0) - # qweight tensors always use the same shape regardless of quantization method - pack_size = 8 // self.moe_attrs["expert_weight_bits"] - self.make_initializer( - gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, self.hidden_size // pack_size), - gate_up_proj_weight, - ) - self.make_initializer( - down_proj_qweight_tensor.view( - self.moe_attrs["num_experts"], self.hidden_size, self.intermediate_size // pack_size - ), - down_proj_weight, - ) - - # scales tensors have different shapes depending on quantization method - self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) - self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) - - # Save MoE biases as initializers - if has_quark_experts: - gate_up_bias = self.combine_quark_gate_up_biases_from_experts(mlp.experts) - down_bias = self.combine_quark_down_biases_from_experts(mlp.experts) - else: - gate_up_bias = mlp.experts.gate_up_proj_bias - down_bias = mlp.experts.down_proj_bias - - self.make_initializer(gate_up_bias, gate_up_proj_bias, to=self.io_dtype) - self.make_initializer(down_bias, down_proj_bias, to=self.io_dtype) + if has_quark_experts: + # Quark experts: use original sizes (no padding) + pack_size = 8 // self.moe_attrs["expert_weight_bits"] + hidden_size_padded = self.hidden_size + intermediate_size_padded = self.intermediate_size + + # Save Quark qweight tensors + self.make_initializer( + gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + gate_up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size + ), + down_proj_weight, + ) - moe_name = f"{basename}/{op_type}" - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, - scales1=gate_up_proj_scales, - bias1=gate_up_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - zero_points1=gate_up_proj_zero_points if has_quark_experts else "", - zero_points2=down_proj_zero_points if has_quark_experts else "", - ) + # Save Quark scales tensors + self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) + self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) + + # Save Quark biases + gate_up_bias = self.combine_quark_gate_up_biases_from_experts(mlp.experts) + down_bias = self.combine_quark_down_biases_from_experts(mlp.experts) + self.make_initializer(gate_up_bias, gate_up_proj_bias, to=self.io_dtype) + self.make_initializer(down_bias, down_proj_bias, to=self.io_dtype) + + # Quark always uses fused path with zero_points + self.make_moe_op( + moe_name, + root_input=root_input, + 
router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + zero_points1=gate_up_proj_zero_points, + zero_points2=down_proj_zero_points, + ) + else: + # Non-Quark QMoE: use quantized weights with optional padding + pack_size = 8 // self.moe_attrs["expert_weight_bits"] + hidden_size_padded = gate_up_proj_qweight_list[0].shape[-1] * pack_size + intermediate_size_padded = down_proj_qweight_list[0].shape[-1] * pack_size + + if self.moe_attrs["swiglu_fusion"] == 0: + # UNFUSED: split gate/up projections into separate tensors (for TRT-RTX) + gate_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_proj.{moe_weight_type}" + gate_proj_scales = f"model.layers.{layer_id}.moe.experts.gate_proj.scales" + gate_proj_bias = f"model.layers.{layer_id}.moe.experts.gate_proj.bias" + up_proj_weight = f"model.layers.{layer_id}.moe.experts.up_proj.{moe_weight_type}" + up_proj_scales = f"model.layers.{layer_id}.moe.experts.up_proj.scales" + up_proj_bias = f"model.layers.{layer_id}.moe.experts.up_proj.bias" + + # Split gate_up into gate (even indices) and up (odd indices) + gate_proj_qweight_tensor = gate_up_proj_qweight_tensor[:, ::2, :] + up_proj_qweight_tensor = gate_up_proj_qweight_tensor[:, 1::2, :] + gate_proj_scales_tensor = gate_up_proj_scales_tensor[:, ::2] + up_proj_scales_tensor = gate_up_proj_scales_tensor[:, 1::2] + + # Save qweight tensors + self.make_initializer( + gate_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + gate_proj_weight, + ) + self.make_initializer( + up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size + ), + down_proj_weight, + ) + + # Save scales tensors + self.make_initializer(gate_proj_scales_tensor, gate_proj_scales, to=self.io_dtype) + self.make_initializer(up_proj_scales_tensor, up_proj_scales, to=self.io_dtype) + self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) + + # Save biases (split) + self.make_initializer(mlp.experts.gate_up_proj_bias[:, ::2], gate_proj_bias, to=self.io_dtype) + self.make_initializer(mlp.experts.gate_up_proj_bias[:, 1::2], up_proj_bias, to=self.io_dtype) + self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) + + self.make_moe_op( + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_proj_weight, + scales1=gate_proj_scales, + bias1=gate_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + weight3=up_proj_weight, + scales3=up_proj_scales, + bias3=up_proj_bias, + ) + else: + # FUSED: keep gate and up combined (default CUDA path) + self.make_initializer( + gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + gate_up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size + ), + down_proj_weight, + ) + + # Save scales tensors + self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) + self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) + + # Save biases + 
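A minimal standalone sketch of the symmetric block-wise int4 scheme that make_qmoe_weights delegates to above. The scale formula (max-abs mapped onto the [-8, 7] range) and the nibble order are assumptions, since only part of _symmetric_blockwise_quantize is visible in the hunks; the builder's helpers are not used.

import torch
import torch.nn.functional as F

def blockwise_int4_quantize(weights: torch.Tensor, block_size: int):
    # Quantize each block of `block_size` values along the last dim with one
    # symmetric scale, then pack two 4-bit values per uint8 byte.
    k = weights.shape[-1]
    pad = (-k) % block_size
    if pad:
        weights = F.pad(weights, (0, pad))
    num_blocks = weights.shape[-1] // block_size
    blocks = weights.view(*weights.shape[:-1], num_blocks, block_size)

    scales = blocks.abs().amax(dim=-1, keepdim=True) / 7.0      # assumed scale formula
    scales = torch.where(scales == 0, torch.ones_like(scales), scales)
    q = torch.clamp(torch.round(blocks / scales), -8, 7).to(torch.int8)

    # Shift int4 [-8, 7] to uint4 [0, 15] and pack two nibbles per byte (order illustrative).
    q = (q.view(*weights.shape[:-1], num_blocks * block_size) + 8).to(torch.uint8)
    packed = q[..., 0::2] | (q[..., 1::2] << 4)
    return packed, scales.squeeze(-1)

w = torch.randn(2, 3, 64)           # e.g. [num_experts, rows, columns]
packed, scales = blockwise_int4_quantize(w, block_size=32)
print(packed.shape, scales.shape)   # (2, 3, 32) packed bytes, (2, 3, 2) per-block scales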
self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype) + self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) + + self.make_moe_op( + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{moe_name}/output_0" From 9f88bcd94ba55372573762261193af4b32e69880 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Wed, 24 Dec 2025 17:31:03 +0530 Subject: [PATCH 2/7] remvoed madding --- src/python/py/models/builders/base.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index 55a3d6dc9..f64265f3c 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -3380,11 +3380,14 @@ def _symmetric_blockwise_quantize(self, weights, block_size): quantized_flat = quantized_int8.view(*original_shape[:-1], num_blocks * block_size) - # Keep padding in quantized weights for proper alignment + # remove padding + if pad_size > 0: + quantized_flat = quantized_flat[..., :-pad_size] + quantized_uint4 = (quantized_flat + 8).to(torch.uint8) - packed_shape = list(quantized_uint4.shape) - packed_shape[-1] = (quantized_uint4.shape[-1] + 1) // 2 + packed_shape = list(original_shape) + packed_shape[-1] = (original_shape[-1] + 1) // 2 qweight = torch.zeros(packed_shape, dtype=torch.uint8, device=weights.device) # Pack two 4-bit values per byte @@ -3401,8 +3404,11 @@ def _symmetric_blockwise_quantize(self, weights, block_size): else: # 8-bit quantized_int8 = quantized.to(torch.int8) - # Keep padding in quantized weights for proper alignment qweight = quantized_int8.view(*original_shape[:-1], num_blocks * block_size) + if pad_size > 0: + qweight = qweight[..., :-pad_size] + else: + qweight = qweight.view(original_shape) return qweight.cpu(), scales.cpu() From 9ae34f6357bd4fe4f7f0a09b61fcb830c887e123 Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Tue, 6 Jan 2026 17:39:05 +0530 Subject: [PATCH 3/7] trt-rtx guarg --- src/python/py/models/builders/gptoss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index 1018d8500..469bbdb69 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -705,8 +705,8 @@ def make_moe_fused(self, layer_id, mlp, root_input): hidden_size_padded = gate_up_proj_qweight_list[0].shape[-1] * pack_size intermediate_size_padded = down_proj_qweight_list[0].shape[-1] * pack_size - if self.moe_attrs["swiglu_fusion"] == 0: - # UNFUSED: split gate/up projections into separate tensors (for TRT-RTX) + if self.moe_attrs["swiglu_fusion"] == 0 and self.ep in {"NvTensorRtRtx", "trt-rtx"}: + # UNFUSED: split gate/up projections into separate tensors (TRT-RTX only) gate_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_proj.{moe_weight_type}" gate_proj_scales = f"model.layers.{layer_id}.moe.experts.gate_proj.scales" gate_proj_bias = f"model.layers.{layer_id}.moe.experts.gate_proj.bias" @@ -761,7 +761,7 @@ def make_moe_fused(self, layer_id, mlp, root_input): bias3=up_proj_bias, ) else: - # FUSED: keep gate and up combined (default CUDA path) + # 
FUSED: keep gate and up combined self.make_initializer( gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), gate_up_proj_weight, From 6d4ebca97422369a3b50a7e947149506d4789cef Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Tue, 6 Jan 2026 18:43:07 +0530 Subject: [PATCH 4/7] Only add zero_points inputs to QMoE when needed for Quark asymmetric quantization --- src/python/py/models/builders/base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index f64265f3c..3e1db4c13 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -3255,11 +3255,15 @@ def make_qmoe_op(self, name, **kwargs): kwargs.get("weight3", ""), kwargs.get("scales3", ""), kwargs.get("bias3", ""), - kwargs.get("zero_points1", ""), - kwargs.get("zero_points2", ""), - kwargs.get("zero_points3", ""), ] + # Only add zero_points inputs if they are provided (for Quark asymmetric quantization) + zero_points1 = kwargs.get("zero_points1", "") + zero_points2 = kwargs.get("zero_points2", "") + zero_points3 = kwargs.get("zero_points3", "") + if zero_points1 or zero_points2 or zero_points3: + inputs.extend([zero_points1, zero_points2, zero_points3]) + output = f"{name}/output_0" extra_kwargs = ( From 60dfcdf8bacb0aa572087766efe02492d37d1aaf Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Wed, 7 Jan 2026 17:54:41 +0530 Subject: [PATCH 5/7] Remove unfused SwiGLU, int4_block_size for MatMulNBits, int4_qmoe_block_size for QMoE (default 128) --- src/python/py/models/builder.py | 9 +- src/python/py/models/builders/base.py | 15 ++- src/python/py/models/builders/gptoss.py | 120 +++++++----------------- 3 files changed, 42 insertions(+), 102 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index d3ed06642..3550b415f 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -63,7 +63,6 @@ def check_extra_options(kv_pairs, execution_provider): "shared_embeddings", "hf_remote", "disable_qkv_fusion", - "gpt_oss_swiglu_fusion", ] for key in bools: if key in kv_pairs: @@ -404,10 +403,10 @@ def get_args(): 2 is fp16. 1 is fp32. Default is 4 for the CPU EP and 0 for non-CPU EPs. - int4_block_size = 16/32/64/128/256: Specify the block size for int4 quantization. + int4_block_size = 16/32/64/128/256: Specify the block size for int4 quantization (MatMulNBits). Default value is 32. - int4_qdq_block_size = 16/32/64/128/256: Specify the block size for quantize/dequantize nodes for int4 quantization. - Default value is int4_block_size. + int4_qmoe_block_size = 16/32/64/128/256: Specify the block size for QMoE expert weights quantization. + Default value is 128. int4_is_symmetric = Quantize the weights symmetrically. Default is true. If true, quantization is done to int4. If false, quantization is done to uint4. int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization. @@ -465,8 +464,6 @@ def get_args(): Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx). use_cuda_bf16 = Use BF16 I/O precision in quantized ONNX models for CUDA EP. Use this option to create quantized ONNX models that use BF16 precision. - gpt_oss_swiglu_fusion = Fuse gate and up tensors into a single tensor. Default is true. - This is only applicable to GPT-OSS models. 
adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights). Use this option for LoRA models. """), diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index 3e1db4c13..3114a7200 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -358,15 +358,14 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Quantization-specific variables (INT4, INT8, etc.) int4_algo_config = self.make_int4_algo_config(extra_options.get("int4_algo_config", "default")) self.int4_block_size = extra_options.get("int4_block_size", 32) - self.int4_qdq_block_size = extra_options.get("int4_qdq_block_size", self.int4_block_size) - self.gpt_oss_swiglu_fusion = extra_options.get("gpt_oss_swiglu_fusion", True) + self.int4_qmoe_block_size = extra_options.get("int4_qmoe_block_size", 128) - # Validate that only supported EPs can use int4_block_size for QMoE + # Validate that only supported EPs can use int4_qmoe_block_size for QMoE # CPU, WebGPU, CUDA, and TRT-RTX support block-wise quantization supported_blockwise_eps = ["cpu", "webgpu", "cuda", "trt-rtx", "NvTensorRtRtx"] - if self.ep not in supported_blockwise_eps and "int4_block_size" in extra_options and moe_op_type == "QMoE": + if self.ep not in supported_blockwise_eps and "int4_qmoe_block_size" in extra_options and moe_op_type == "QMoE": raise ValueError( - f"The 'int4_block_size' option is not supported for {self.ep} execution provider with QMoE. " + f"The 'int4_qmoe_block_size' option is not supported for {self.ep} execution provider with QMoE. " f"Block-wise quantization is only supported for: {', '.join(supported_blockwise_eps)}." ) @@ -375,8 +374,8 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "accuracy_level": int( extra_options.get("int4_accuracy_level", 4 if self.ep in ["cpu", "webgpu"] else 0) ), - "block_size": int(self.int4_block_size), - "qdq_block_size": int(self.int4_qdq_block_size), + "block_size": int(self.int4_qmoe_block_size), + "qdq_block_size": int(self.int4_block_size), "is_symmetric": extra_options.get("int4_is_symmetric", True), "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul",)), "nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []), @@ -389,7 +388,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # QMoE on CPU/WebGPU supports block-wise quantization via the 'block_size' attribute. # Ensure the attribute is set on the MoE op so runtime kernels can honor it. 
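The zero_points handling introduced in PATCH 4/7 above reduces to appending the three trailing optional inputs only when at least one is non-empty. A standalone sketch with hypothetical tensor names:

def build_qmoe_inputs(required, zero_points1="", zero_points2="", zero_points3=""):
    # ONNX optional inputs are positional: trailing optional inputs may be omitted
    # entirely, while a skipped input in the middle must stay as "".  The QMoE
    # zero_points are the last three inputs, so they are only appended for Quark
    # asymmetric quantization; symmetric models leave them off the node.
    inputs = list(required)
    if zero_points1 or zero_points2 or zero_points3:
        inputs.extend([zero_points1, zero_points2, zero_points3])
    return inputs

base = ["hidden", "router_probs", "w1_q", "w1_scales", "w1_bias",
        "w2_q", "w2_scales", "w2_bias", "", "", ""]   # hypothetical names; "" = unused weight3 slots
print(len(build_qmoe_inputs(base)))                                              # 11: no zero_points
print(len(build_qmoe_inputs(base, zero_points1="w1_zp", zero_points2="w2_zp")))  # 14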
if self.moe_attrs.get("op_type") == "QMoE" and self.ep in ["cpu", "webgpu"]: - self.moe_attrs["block_size"] = int(self.int4_block_size) + self.moe_attrs["block_size"] = int(self.int4_qmoe_block_size) if self.quant_type is not None: # Create quantized attributes from quantization config self.quant_attrs["config"] = config.quantization_config diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index 469bbdb69..38d1906ed 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -19,7 +19,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.moe_attrs["activation_beta"] = 1.0 self.moe_attrs["activation_type"] = "swiglu" self.moe_attrs["normalize_routing_weights"] = True - self.moe_attrs["swiglu_fusion"] = 1 if self.gpt_oss_swiglu_fusion else 0 + self.moe_attrs["swiglu_fusion"] = 1 def make_layer(self, layer_id, layer): # Each LLM decoder layer is typically defined as: @@ -705,93 +705,37 @@ def make_moe_fused(self, layer_id, mlp, root_input): hidden_size_padded = gate_up_proj_qweight_list[0].shape[-1] * pack_size intermediate_size_padded = down_proj_qweight_list[0].shape[-1] * pack_size - if self.moe_attrs["swiglu_fusion"] == 0 and self.ep in {"NvTensorRtRtx", "trt-rtx"}: - # UNFUSED: split gate/up projections into separate tensors (TRT-RTX only) - gate_proj_weight = f"model.layers.{layer_id}.moe.experts.gate_proj.{moe_weight_type}" - gate_proj_scales = f"model.layers.{layer_id}.moe.experts.gate_proj.scales" - gate_proj_bias = f"model.layers.{layer_id}.moe.experts.gate_proj.bias" - up_proj_weight = f"model.layers.{layer_id}.moe.experts.up_proj.{moe_weight_type}" - up_proj_scales = f"model.layers.{layer_id}.moe.experts.up_proj.scales" - up_proj_bias = f"model.layers.{layer_id}.moe.experts.up_proj.bias" - - # Split gate_up into gate (even indices) and up (odd indices) - gate_proj_qweight_tensor = gate_up_proj_qweight_tensor[:, ::2, :] - up_proj_qweight_tensor = gate_up_proj_qweight_tensor[:, 1::2, :] - gate_proj_scales_tensor = gate_up_proj_scales_tensor[:, ::2] - up_proj_scales_tensor = gate_up_proj_scales_tensor[:, 1::2] - - # Save qweight tensors - self.make_initializer( - gate_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), - gate_proj_weight, - ) - self.make_initializer( - up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), - up_proj_weight, - ) - self.make_initializer( - down_proj_qweight_tensor.view( - self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size - ), - down_proj_weight, - ) - - # Save scales tensors - self.make_initializer(gate_proj_scales_tensor, gate_proj_scales, to=self.io_dtype) - self.make_initializer(up_proj_scales_tensor, up_proj_scales, to=self.io_dtype) - self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) - - # Save biases (split) - self.make_initializer(mlp.experts.gate_up_proj_bias[:, ::2], gate_proj_bias, to=self.io_dtype) - self.make_initializer(mlp.experts.gate_up_proj_bias[:, 1::2], up_proj_bias, to=self.io_dtype) - self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) - - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_proj_weight, - scales1=gate_proj_scales, - bias1=gate_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - weight3=up_proj_weight, - 
scales3=up_proj_scales, - bias3=up_proj_bias, - ) - else: - # FUSED: keep gate and up combined - self.make_initializer( - gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), - gate_up_proj_weight, - ) - self.make_initializer( - down_proj_qweight_tensor.view( - self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size - ), - down_proj_weight, - ) - - # Save scales tensors - self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) - self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) - - # Save biases - self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype) - self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) - - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, - scales1=gate_up_proj_scales, - bias1=gate_up_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - ) + # FUSED: keep gate and up combined + self.make_initializer( + gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + gate_up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size + ), + down_proj_weight, + ) + + # Save scales tensors + self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) + self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) + + # Save biases + self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype) + self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) + + self.make_moe_op( + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{moe_name}/output_0" From 18edebb739c3bb93d7d7531a2e3f7c3716945d7f Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Wed, 7 Jan 2026 18:19:33 +0530 Subject: [PATCH 6/7] cuda dont support block size qnat for MOE --- src/python/py/models/builders/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index 3114a7200..c6c46fbe7 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -361,8 +361,8 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): self.int4_qmoe_block_size = extra_options.get("int4_qmoe_block_size", 128) # Validate that only supported EPs can use int4_qmoe_block_size for QMoE - # CPU, WebGPU, CUDA, and TRT-RTX support block-wise quantization - supported_blockwise_eps = ["cpu", "webgpu", "cuda", "trt-rtx", "NvTensorRtRtx"] + # CPU, WebGPU, and TRT-RTX support block-wise quantization + supported_blockwise_eps = ["cpu", "webgpu", "trt-rtx", "NvTensorRtRtx"] if self.ep not in supported_blockwise_eps and "int4_qmoe_block_size" in extra_options and moe_op_type == "QMoE": raise ValueError( f"The 'int4_qmoe_block_size' option is not supported for {self.ep} execution provider with QMoE. 
" From 73c67edef8a4ce9422a9d6f84ffdd4d5b5b4894d Mon Sep 17 00:00:00 2001 From: Anuj Jalota Date: Wed, 7 Jan 2026 22:40:17 +0530 Subject: [PATCH 7/7] minor fixes --- src/python/py/models/builder.py | 2 +- src/python/py/models/builders/base.py | 15 +-- src/python/py/models/builders/gptoss.py | 135 ++++++++---------------- 3 files changed, 56 insertions(+), 96 deletions(-) diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py index 3550b415f..8b86651b7 100644 --- a/src/python/py/models/builder.py +++ b/src/python/py/models/builder.py @@ -406,7 +406,7 @@ def get_args(): int4_block_size = 16/32/64/128/256: Specify the block size for int4 quantization (MatMulNBits). Default value is 32. int4_qmoe_block_size = 16/32/64/128/256: Specify the block size for QMoE expert weights quantization. - Default value is 128. + Default is 128 for trt-rtx, 0 (tensor-level) for others. Supported EPs: cpu, webgpu, trt-rtx. int4_is_symmetric = Quantize the weights symmetrically. Default is true. If true, quantization is done to int4. If false, quantization is done to uint4. int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization. diff --git a/src/python/py/models/builders/base.py b/src/python/py/models/builders/base.py index c6c46fbe7..849c425dc 100644 --- a/src/python/py/models/builders/base.py +++ b/src/python/py/models/builders/base.py @@ -358,11 +358,14 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): # Quantization-specific variables (INT4, INT8, etc.) int4_algo_config = self.make_int4_algo_config(extra_options.get("int4_algo_config", "default")) self.int4_block_size = extra_options.get("int4_block_size", 32) - self.int4_qmoe_block_size = extra_options.get("int4_qmoe_block_size", 128) - # Validate that only supported EPs can use int4_qmoe_block_size for QMoE - # CPU, WebGPU, and TRT-RTX support block-wise quantization + # CPU, WebGPU, and TRT-RTX support block-wise quantization for QMoE. + # TRT-RTX defaults to 128; CPU/WebGPU default to 0 (tensor-level) for backward compatibility. supported_blockwise_eps = ["cpu", "webgpu", "trt-rtx", "NvTensorRtRtx"] + default_qmoe_block_size = 128 if self.ep in ["trt-rtx", "NvTensorRtRtx"] else 0 + self.int4_qmoe_block_size = extra_options.get("int4_qmoe_block_size", default_qmoe_block_size) + + # Validate that unsupported EPs don't explicitly request block-wise quantization if self.ep not in supported_blockwise_eps and "int4_qmoe_block_size" in extra_options and moe_op_type == "QMoE": raise ValueError( f"The 'int4_qmoe_block_size' option is not supported for {self.ep} execution provider with QMoE. " @@ -384,10 +387,10 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options): "use_qdq": extra_options.get("use_qdq", False), } - # Propagate block_size to MoE/QMoE op when supported and requested. - # QMoE on CPU/WebGPU supports block-wise quantization via the 'block_size' attribute. + # Propagate block_size to MoE/QMoE op when supported. + # QMoE on supported EPs uses block-wise quantization via the 'block_size' attribute. # Ensure the attribute is set on the MoE op so runtime kernels can honor it. 
- if self.moe_attrs.get("op_type") == "QMoE" and self.ep in ["cpu", "webgpu"]: + if self.moe_attrs.get("op_type") == "QMoE" and self.ep in supported_blockwise_eps: self.moe_attrs["block_size"] = int(self.int4_qmoe_block_size) if self.quant_type is not None: # Create quantized attributes from quantization config diff --git a/src/python/py/models/builders/gptoss.py b/src/python/py/models/builders/gptoss.py index 38d1906ed..324caf160 100644 --- a/src/python/py/models/builders/gptoss.py +++ b/src/python/py/models/builders/gptoss.py @@ -592,8 +592,6 @@ def make_moe_fused(self, layer_id, mlp, root_input): gate_up_proj_layout = mlp.experts.gate_up_proj.transpose(-1, -2) down_proj_layout = mlp.experts.down_proj.transpose(-1, -2) - moe_name = f"{basename}/{op_type}" - if op_type == "MoE" and not has_quark_experts: # Save non-quantized MoE weights as initializers self.make_initializer( @@ -606,22 +604,6 @@ def make_moe_fused(self, layer_id, mlp, root_input): down_proj_weight, to=self.io_dtype, ) - - # Save MoE biases as initializers - self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype) - self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) - - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, - scales1=gate_up_proj_scales, - bias1=gate_up_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - ) else: if has_quark_experts: # Use pre-quantized Quark experts @@ -657,85 +639,60 @@ def make_moe_fused(self, layer_id, mlp, root_input): down_proj_qweight_tensor = torch.stack(down_proj_qweight_list, dim=0).to(torch.uint8) down_proj_scales_tensor = torch.stack(down_proj_scales_list, dim=0) + # Determine shape based on Quark vs non-Quark + pack_size = 8 // self.moe_attrs["expert_weight_bits"] if has_quark_experts: - # Quark experts: use original sizes (no padding) - pack_size = 8 // self.moe_attrs["expert_weight_bits"] hidden_size_padded = self.hidden_size intermediate_size_padded = self.intermediate_size - - # Save Quark qweight tensors - self.make_initializer( - gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), - gate_up_proj_weight, - ) - self.make_initializer( - down_proj_qweight_tensor.view( - self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size - ), - down_proj_weight, - ) - - # Save Quark scales tensors - self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) - self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) - - # Save Quark biases - gate_up_bias = self.combine_quark_gate_up_biases_from_experts(mlp.experts) - down_bias = self.combine_quark_down_biases_from_experts(mlp.experts) - self.make_initializer(gate_up_bias, gate_up_proj_bias, to=self.io_dtype) - self.make_initializer(down_bias, down_proj_bias, to=self.io_dtype) - - # Quark always uses fused path with zero_points - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, - scales1=gate_up_proj_scales, - bias1=gate_up_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - zero_points1=gate_up_proj_zero_points, - zero_points2=down_proj_zero_points, - ) else: - # Non-Quark QMoE: use quantized weights with optional padding - pack_size = 8 // self.moe_attrs["expert_weight_bits"] 
hidden_size_padded = gate_up_proj_qweight_list[0].shape[-1] * pack_size intermediate_size_padded = down_proj_qweight_list[0].shape[-1] * pack_size - # FUSED: keep gate and up combined - self.make_initializer( - gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), - gate_up_proj_weight, - ) - self.make_initializer( - down_proj_qweight_tensor.view( - self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size - ), - down_proj_weight, - ) + # Save qweight tensors + self.make_initializer( + gate_up_proj_qweight_tensor.view(self.moe_attrs["num_experts"], -1, hidden_size_padded // pack_size), + gate_up_proj_weight, + ) + self.make_initializer( + down_proj_qweight_tensor.view( + self.moe_attrs["num_experts"], self.hidden_size, intermediate_size_padded // pack_size + ), + down_proj_weight, + ) - # Save scales tensors - self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) - self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) - - # Save biases - self.make_initializer(mlp.experts.gate_up_proj_bias, gate_up_proj_bias, to=self.io_dtype) - self.make_initializer(mlp.experts.down_proj_bias, down_proj_bias, to=self.io_dtype) - - self.make_moe_op( - moe_name, - root_input=root_input, - router_probs=f"{router_reshape_name}/output_0", - weight1=gate_up_proj_weight, - scales1=gate_up_proj_scales, - bias1=gate_up_proj_bias, - weight2=down_proj_weight, - scales2=down_proj_scales, - bias2=down_proj_bias, - ) + # Save scales tensors + self.make_initializer(gate_up_proj_scales_tensor, gate_up_proj_scales, to=self.io_dtype) + self.make_initializer(down_proj_scales_tensor, down_proj_scales, to=self.io_dtype) + + # Save biases (shared for all paths) + if has_quark_experts: + gate_up_bias = self.combine_quark_gate_up_biases_from_experts(mlp.experts) + down_bias = self.combine_quark_down_biases_from_experts(mlp.experts) + else: + gate_up_bias = mlp.experts.gate_up_proj_bias + down_bias = mlp.experts.down_proj_bias + + self.make_initializer(gate_up_bias, gate_up_proj_bias, to=self.io_dtype) + self.make_initializer(down_bias, down_proj_bias, to=self.io_dtype) + + # Single make_moe_op call with EP-based zero_points + # TRT-RTX doesn't support zero_points inputs + moe_name = f"{basename}/{op_type}" + use_zero_points = has_quark_experts and self.ep not in {"NvTensorRtRtx", "trt-rtx"} + + self.make_moe_op( + moe_name, + root_input=root_input, + router_probs=f"{router_reshape_name}/output_0", + weight1=gate_up_proj_weight, + scales1=gate_up_proj_scales, + bias1=gate_up_proj_bias, + weight2=down_proj_weight, + scales2=down_proj_scales, + bias2=down_proj_bias, + zero_points1=gate_up_proj_zero_points if use_zero_points else "", + zero_points2=down_proj_zero_points if use_zero_points else "", + ) # Assign output 0 of previous MoE as root input to next SkipLayerNorm self.layernorm_attrs["skip_input"] = f"{moe_name}/output_0"
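Taken together, the series leaves the QMoE construction with two EP-dependent decisions: the block_size that reaches the op and whether zero_points inputs are emitted. A compact sketch of that final behavior (names are illustrative, not the builder's API):

def qmoe_plan(ep: str, has_quark_experts: bool, extra_options: dict) -> dict:
    # Final behavior after PATCH 7/7: block-wise quantization is attached to the
    # QMoE op only for EPs that support the block_size attribute, and zero_points
    # inputs are emitted only for Quark asymmetric models on EPs other than TRT-RTX.
    blockwise_eps = {"cpu", "webgpu", "trt-rtx", "NvTensorRtRtx"}
    default_block = 128 if ep in {"trt-rtx", "NvTensorRtRtx"} else 0
    block_size = int(extra_options.get("int4_qmoe_block_size", default_block)) if ep in blockwise_eps else 0
    use_zero_points = has_quark_experts and ep not in {"NvTensorRtRtx", "trt-rtx"}
    return {"block_size": block_size, "use_zero_points": use_zero_points}

print(qmoe_plan("trt-rtx", False, {}))                        # {'block_size': 128, 'use_zero_points': False}
print(qmoe_plan("cuda", True, {}))                            # {'block_size': 0, 'use_zero_points': True}
print(qmoe_plan("cpu", False, {"int4_qmoe_block_size": 64}))  # {'block_size': 64, 'use_zero_points': False}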