From c795b2d336a3a08c3a128f2358b6de95604c1d98 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 13:18:33 -0500 Subject: [PATCH 1/5] Initial Qwen3MoE adapter setup --- .../model_bridge/test_qwen3_moe_bridge.py | 198 ++++++++++++++++ .../model_bridge/test_qwen3_moe_adapter.py | 218 ++++++++++++++++++ .../factories/architecture_adapter_factory.py | 2 + .../supported_architectures/__init__.py | 4 + .../supported_architectures/qwen3_moe.py | 164 +++++++++++++ .../model_registry/data/supported_models.json | 60 ++++- 6 files changed, 644 insertions(+), 2 deletions(-) create mode 100644 tests/integration/model_bridge/test_qwen3_moe_bridge.py create mode 100644 tests/unit/model_bridge/test_qwen3_moe_adapter.py create mode 100644 transformer_lens/model_bridge/supported_architectures/qwen3_moe.py diff --git a/tests/integration/model_bridge/test_qwen3_moe_bridge.py b/tests/integration/model_bridge/test_qwen3_moe_bridge.py new file mode 100644 index 000000000..906f1ab17 --- /dev/null +++ b/tests/integration/model_bridge/test_qwen3_moe_bridge.py @@ -0,0 +1,198 @@ +"""Integration tests for the Qwen3MoE TransformerBridge. + +All tests use a tiny programmatic Qwen3MoE config on the meta device — no +network access and no actual weights are downloaded. The meta device means +tensor operations cannot execute, so forward-pass tests are explicitly skipped +and marked for manual execution during verification. + +Fixture pattern mirrors tests/unit/model_bridge/test_gpt_oss_moe.py. 
+""" + +import pytest +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from transformer_lens.config import TransformerBridgeConfig +from transformer_lens.model_bridge.bridge import TransformerBridge +from transformer_lens.model_bridge.generalized_components import MoEBridge +from transformer_lens.model_bridge.sources.transformers import ( + map_default_transformer_lens_config, +) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) + +# --------------------------------------------------------------------------- +# Tiny programmatic model fixture (meta device, no weights) +# --------------------------------------------------------------------------- + + +class _MockTokenizer: + """Minimal stand-in so TransformerBridge(tokenizer=...) is satisfied.""" + + pass + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_config(): + """Return a small Qwen3MoeConfig (2 layers, 4 heads, 4 experts).""" + return AutoConfig.for_model( + "qwen3_moe", + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=16, + moe_intermediate_size=32, + num_experts=4, + num_experts_per_tok=2, + vocab_size=256, + max_position_embeddings=128, + decoder_sparse_step=1, + mlp_only_layers=[], + ) + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_model_meta(tiny_qwen3moe_config): + """Create a Qwen3MoE model structure on meta device (no weights loaded).""" + with torch.device("meta"): + model = AutoModelForCausalLM.from_config(tiny_qwen3moe_config) + return model + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_bridge(tiny_qwen3moe_config, tiny_qwen3moe_model_meta): + """Create a TransformerBridge wrapping the tiny meta-device Qwen3MoE model.""" + tl_config = map_default_transformer_lens_config(tiny_qwen3moe_config) + + bridge_config = TransformerBridgeConfig( + d_model=tl_config.d_model, + d_head=tl_config.d_head, + n_layers=tl_config.n_layers, + 
n_ctx=tl_config.n_ctx, + n_heads=tl_config.n_heads, + n_key_value_heads=tl_config.n_key_value_heads, + d_vocab=tl_config.d_vocab, + architecture="Qwen3MoeForCausalLM", + ) + + adapter = Qwen3MoeArchitectureAdapter(bridge_config) + + return TransformerBridge( + model=tiny_qwen3moe_model_meta, + adapter=adapter, + tokenizer=_MockTokenizer(), + ) + + +# --------------------------------------------------------------------------- +# HF model structure +# --------------------------------------------------------------------------- + + +class TestQwen3MoeModelStructure: + def test_model_has_layers(self, tiny_qwen3moe_model_meta) -> None: + assert hasattr(tiny_qwen3moe_model_meta, "model") + assert hasattr(tiny_qwen3moe_model_meta.model, "layers") + assert len(tiny_qwen3moe_model_meta.model.layers) == 2 + + def test_layer_has_sparse_moe_block(self, tiny_qwen3moe_model_meta) -> None: + layer0_mlp = tiny_qwen3moe_model_meta.model.layers[0].mlp + # Qwen3MoeSparseMoeBlock uses batched expert parameters (not a ModuleList) + assert hasattr(layer0_mlp, "experts") + experts = layer0_mlp.experts + assert hasattr(experts, "gate_up_proj") + assert hasattr(experts, "down_proj") + # Experts are NOT iterable — stored as batched 3D tensors + assert not hasattr(experts, "__iter__") + + def test_layer_has_gate_router(self, tiny_qwen3moe_model_meta) -> None: + layer0_mlp = tiny_qwen3moe_model_meta.model.layers[0].mlp + assert hasattr(layer0_mlp, "gate") + + def test_attention_has_q_norm_k_norm(self, tiny_qwen3moe_model_meta) -> None: + attn = tiny_qwen3moe_model_meta.model.layers[0].self_attn + assert hasattr(attn, "q_norm") + assert hasattr(attn, "k_norm") + + +# --------------------------------------------------------------------------- +# Bridge structure +# --------------------------------------------------------------------------- + + +class TestQwen3MoeBridgeStructure: + def test_block_count(self, tiny_qwen3moe_bridge) -> None: + assert len(tiny_qwen3moe_bridge.blocks) == 2 + + def 
test_has_core_components(self, tiny_qwen3moe_bridge) -> None: + assert hasattr(tiny_qwen3moe_bridge, "embed") + assert hasattr(tiny_qwen3moe_bridge, "unembed") + assert hasattr(tiny_qwen3moe_bridge, "ln_final") + + def test_cfg_final_rms_is_true(self, tiny_qwen3moe_bridge) -> None: + """Critical Qwen3MoE config flag — differs from OLMoE which uses False.""" + assert tiny_qwen3moe_bridge.cfg.final_rms is True + + def test_cfg_n_kv_heads(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.n_key_value_heads == 2 + + def test_cfg_positional_embedding_type(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.positional_embedding_type == "rotary" + + def test_cfg_normalization_type(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.normalization_type == "RMS" + + def test_mlp_blocks_are_moe_bridge(self, tiny_qwen3moe_bridge) -> None: + for i, block in enumerate(tiny_qwen3moe_bridge.blocks): + assert isinstance( + block.mlp, MoEBridge + ), f"Block {i} mlp is {type(block.mlp).__name__}, expected MoEBridge" + + def test_moe_bridge_has_router_scores_hook(self, tiny_qwen3moe_bridge) -> None: + mlp = tiny_qwen3moe_bridge.blocks[0].mlp + assert hasattr(mlp, "hook_router_scores") + + def test_block_has_ln1_and_ln2(self, tiny_qwen3moe_bridge) -> None: + block = tiny_qwen3moe_bridge.blocks[0] + assert hasattr(block, "ln1") + assert hasattr(block, "ln2") + + def test_block_attn_has_q_norm_k_norm(self, tiny_qwen3moe_bridge) -> None: + attn = tiny_qwen3moe_bridge.blocks[0].attn + assert hasattr(attn, "q_norm") + assert hasattr(attn, "k_norm") + + +# --------------------------------------------------------------------------- +# Forward-pass tests — skipped on meta device, run manually during verification +# --------------------------------------------------------------------------- + + +@pytest.mark.skip(reason="Requires real weights — run manually during verification") +def test_forward_pass_matches_hf(tiny_qwen3moe_bridge) -> 
None: + """Bridge forward should produce logits identical to the HF model. + + Run this test manually with a real (non-meta) model during Step 3 + verification. On meta device, tensor operations raise NotImplementedError. + """ + tokens = torch.tensor([[1, 2, 3, 4]]) + with torch.no_grad(): + bridge_out = tiny_qwen3moe_bridge(tokens) + hf_out = tiny_qwen3moe_bridge.original_model(tokens).logits + max_diff = (bridge_out - hf_out).abs().max().item() + assert max_diff < 1e-4, f"Bridge vs HF max diff = {max_diff}" + + +@pytest.mark.skip(reason="Requires real weights — run manually during verification") +def test_run_with_cache_captures_moe_router_scores(tiny_qwen3moe_bridge) -> None: + """MoEBridge should capture router scores in the activation cache. + + Run manually with real weights during Step 3 verification. + """ + tiny_qwen3moe_bridge.enable_compatibility_mode(no_processing=True) + tokens = torch.tensor([[1, 2, 3, 4]]) + _, cache = tiny_qwen3moe_bridge.run_with_cache(tokens) + for i in range(len(tiny_qwen3moe_bridge.blocks)): + assert f"blocks.{i}.mlp.hook_router_scores" in cache, f"Missing router scores for block {i}" diff --git a/tests/unit/model_bridge/test_qwen3_moe_adapter.py b/tests/unit/model_bridge/test_qwen3_moe_adapter.py new file mode 100644 index 000000000..df741a977 --- /dev/null +++ b/tests/unit/model_bridge/test_qwen3_moe_adapter.py @@ -0,0 +1,218 @@ +"""Unit tests for the Qwen3MoeArchitectureAdapter. + +No network access and no model downloads — all tests use programmatic +TransformerBridgeConfig instances. 
+""" + +import pytest + +from transformer_lens.config import TransformerBridgeConfig +from transformer_lens.conversion_utils.conversion_steps.rearrange_tensor_conversion import ( + RearrangeTensorConversion, +) +from transformer_lens.conversion_utils.param_processing_conversion import ( + ParamProcessingConversion, +) +from transformer_lens.factories.architecture_adapter_factory import ( + SUPPORTED_ARCHITECTURES, +) +from transformer_lens.model_bridge.generalized_components import ( + MoEBridge, + RMSNormalizationBridge, +) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) + +# --------------------------------------------------------------------------- +# Shared fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture +def cfg() -> TransformerBridgeConfig: + return TransformerBridgeConfig( + d_model=64, + d_head=16, + n_layers=2, + n_ctx=128, + n_heads=4, + n_key_value_heads=2, + d_vocab=256, + architecture="Qwen3MoeForCausalLM", + ) + + +@pytest.fixture +def adapter(cfg: TransformerBridgeConfig) -> Qwen3MoeArchitectureAdapter: + return Qwen3MoeArchitectureAdapter(cfg) + + +# --------------------------------------------------------------------------- +# Config attribute correctness +# --------------------------------------------------------------------------- + + +class TestQwen3MoeAdapterConfig: + def test_normalization_type_is_rms(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.normalization_type == "RMS" + + def test_positional_embedding_type_is_rotary( + self, adapter: Qwen3MoeArchitectureAdapter + ) -> None: + assert adapter.cfg.positional_embedding_type == "rotary" + + def test_final_rms_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """Critical: Qwen3MoE uses final_rms=True; OLMoE uses False.""" + assert adapter.cfg.final_rms is True + + def test_gated_mlp_is_true(self, adapter: 
Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.gated_mlp is True + + def test_uses_rms_norm_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.uses_rms_norm is True + + def test_attn_implementation_is_eager(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.attn_implementation == "eager" + + def test_default_prepend_bos_is_false(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.default_prepend_bos is False + + def test_n_kv_heads_propagated(self) -> None: + """n_key_value_heads from the loaded config must be preserved.""" + cfg = TransformerBridgeConfig( + d_model=64, + d_head=16, + n_layers=2, + n_ctx=128, + n_heads=4, + n_key_value_heads=2, + d_vocab=256, + architecture="Qwen3MoeForCausalLM", + ) + adapter = Qwen3MoeArchitectureAdapter(cfg) + assert adapter.cfg.n_key_value_heads == 2 + + +# --------------------------------------------------------------------------- +# Weight processing conversions +# --------------------------------------------------------------------------- + + +class TestQwen3MoeWeightConversions: + def test_has_qkvo_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + convs = adapter.weight_processing_conversions + assert convs is not None + assert "blocks.{i}.attn.q.weight" in convs + assert "blocks.{i}.attn.k.weight" in convs + assert "blocks.{i}.attn.v.weight" in convs + assert "blocks.{i}.attn.o.weight" in convs + + def test_q_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """Q weight conversion must use n_heads (4) for the rearrange.""" + convs = adapter.weight_processing_conversions + assert convs is not None + q_conv = convs["blocks.{i}.attn.q.weight"] + assert isinstance(q_conv, ParamProcessingConversion) + assert isinstance(q_conv.tensor_conversion, RearrangeTensorConversion) + axes = q_conv.tensor_conversion.axes_lengths + assert axes.get("n") == 4 + + def test_kv_rearrange_uses_n_kv_heads(self, adapter: 
Qwen3MoeArchitectureAdapter) -> None: + """K/V weight conversions must use n_key_value_heads (2) for GQA.""" + convs = adapter.weight_processing_conversions + assert convs is not None + k_conv = convs["blocks.{i}.attn.k.weight"] + v_conv = convs["blocks.{i}.attn.v.weight"] + assert isinstance(k_conv, ParamProcessingConversion) + assert isinstance(v_conv, ParamProcessingConversion) + assert isinstance(k_conv.tensor_conversion, RearrangeTensorConversion) + assert isinstance(v_conv.tensor_conversion, RearrangeTensorConversion) + assert k_conv.tensor_conversion.axes_lengths.get("n") == 2 + assert v_conv.tensor_conversion.axes_lengths.get("n") == 2 + + def test_o_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """O weight conversion must use n_heads (4).""" + convs = adapter.weight_processing_conversions + assert convs is not None + o_conv = convs["blocks.{i}.attn.o.weight"] + assert isinstance(o_conv, ParamProcessingConversion) + assert isinstance(o_conv.tensor_conversion, RearrangeTensorConversion) + assert o_conv.tensor_conversion.axes_lengths.get("n") == 4 + + +# --------------------------------------------------------------------------- +# Component mapping structure +# --------------------------------------------------------------------------- + + +class TestQwen3MoeComponentMapping: + def test_has_required_top_level_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + for key in ("embed", "rotary_emb", "blocks", "ln_final", "unembed"): + assert key in mapping, f"Missing top-level key: {key!r}" + + def test_blocks_has_required_submodules(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + blocks = mapping["blocks"] + for key in ("ln1", "ln2", "attn", "mlp"): + assert key in blocks.submodules, f"Missing blocks submodule: {key!r}" + + def test_attn_has_all_submodules(self, adapter: 
Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + attn = mapping["blocks"].submodules["attn"] + for key in ("q", "k", "v", "o", "q_norm", "k_norm"): + assert key in attn.submodules, f"Missing attn submodule: {key!r}" + + def test_ln1_ln2_are_rms_norm_bridges(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + subs = mapping["blocks"].submodules + assert isinstance(subs["ln1"], RMSNormalizationBridge) + assert isinstance(subs["ln2"], RMSNormalizationBridge) + + def test_mlp_is_moe_bridge(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + mlp = mapping["blocks"].submodules["mlp"] + assert isinstance(mlp, MoEBridge) + + def test_mlp_has_gate_submodule(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + mlp = mapping["blocks"].submodules["mlp"] + assert "gate" in mlp.submodules + + def test_q_norm_k_norm_are_rms_norm_bridges(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + attn_subs = mapping["blocks"].submodules["attn"].submodules + assert isinstance(attn_subs["q_norm"], RMSNormalizationBridge) + assert isinstance(attn_subs["k_norm"], RMSNormalizationBridge) + + def test_hf_module_paths(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """Verify key HF module path names are correctly mapped.""" + mapping = adapter.component_mapping + assert mapping is not None + assert mapping["embed"].name == "model.embed_tokens" + assert mapping["ln_final"].name == "model.norm" + assert mapping["unembed"].name == "lm_head" + assert mapping["blocks"].name == "model.layers" + subs = mapping["blocks"].submodules + assert subs["ln1"].name == "input_layernorm" + assert subs["ln2"].name == "post_attention_layernorm" + assert subs["attn"].name == 
"self_attn" + assert subs["mlp"].name == "mlp" + + +# --------------------------------------------------------------------------- +# Factory registration +# --------------------------------------------------------------------------- + + +class TestQwen3MoeFactoryRegistration: + def test_factory_lookup_returns_adapter_class(self) -> None: + assert SUPPORTED_ARCHITECTURES["Qwen3MoeForCausalLM"] is Qwen3MoeArchitectureAdapter diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 458d1b073..2766d4439 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -43,6 +43,7 @@ PhiArchitectureAdapter, Qwen2ArchitectureAdapter, Qwen3ArchitectureAdapter, + Qwen3MoeArchitectureAdapter, QwenArchitectureAdapter, StableLmArchitectureAdapter, T5ArchitectureAdapter, @@ -88,6 +89,7 @@ "QwenForCausalLM": QwenArchitectureAdapter, "Qwen2ForCausalLM": Qwen2ArchitectureAdapter, "Qwen3ForCausalLM": Qwen3ArchitectureAdapter, + "Qwen3MoeForCausalLM": Qwen3MoeArchitectureAdapter, "StableLmForCausalLM": StableLmArchitectureAdapter, "T5ForConditionalGeneration": T5ArchitectureAdapter, "NanoGPTForCausalLM": NanogptArchitectureAdapter, diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index 2c32f6b38..f371567c8 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -120,6 +120,9 @@ from transformer_lens.model_bridge.supported_architectures.qwen3 import ( Qwen3ArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.stablelm import ( StableLmArchitectureAdapter, ) @@ -167,6 +170,7 @@ 
"QwenArchitectureAdapter", "Qwen2ArchitectureAdapter", "Qwen3ArchitectureAdapter", + "Qwen3MoeArchitectureAdapter", "StableLmArchitectureAdapter", "T5ArchitectureAdapter", ] diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py new file mode 100644 index 000000000..2d8cff3f5 --- /dev/null +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py @@ -0,0 +1,164 @@ +"""Qwen3MoE (Mixture of Experts) architecture adapter.""" + +from typing import Any + +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + EmbeddingBridge, + LinearBridge, + MoEBridge, + PositionEmbeddingsAttentionBridge, + RMSNormalizationBridge, + RotaryEmbeddingBridge, + UnembeddingBridge, +) + + +class Qwen3MoeArchitectureAdapter(ArchitectureAdapter): + """Architecture adapter for Qwen3MoE (Mixture of Experts) models. + + Qwen3MoE is a sparse Mixture-of-Experts decoder-only Transformer that closely + mirrors OLMoE in structure. Key architectural features: + + - Pre-norm: RMSNorm applied BEFORE attention (input_layernorm) and BEFORE MLP + (post_attention_layernorm). + - Q/K normalization: RMSNorm applied to queries and keys after projection and + before rotary embedding application. + - Sparse MoE: 128 experts with top-8 routing (in the public 30B-A3B checkpoints). + - Batched expert parameters: gate_up_proj [num_experts, 2*moe_intermediate_size, + hidden_size] and down_proj [num_experts, hidden_size, moe_intermediate_size] are + stored as single 3D tensors rather than a ModuleList. + - final_rms=True (Qwen3-style; differs from OLMoE which uses False). + - No biases on any projections (attention_bias=False in all public checkpoints). + - GQA: num_key_value_heads < num_attention_heads in all public models. 
+ + Limitation — all-MoE configuration only: + All public Qwen3MoE models have decoder_sparse_step=1 and mlp_only_layers=[] + (every decoder layer is a sparse MoE block). This adapter supports only that + all-MoE configuration. Models with a non-empty mlp_only_layers list are NOT + supported because MoEBridge cannot handle the dense Qwen3MoeMLP fallback layers. + + Optional Parameters (may not exist in state_dict): + ------------------------------------------------- + - blocks.{i}.attn.b_Q - No bias on query projection (attention_bias=False) + - blocks.{i}.attn.b_K - No bias on key projection (attention_bias=False) + - blocks.{i}.attn.b_V - No bias on value projection (attention_bias=False) + - blocks.{i}.attn.b_O - No bias on output projection (attention_bias=False) + - blocks.{i}.ln1.b - RMSNorm has no additive bias + - blocks.{i}.ln2.b - RMSNorm has no additive bias + - ln_final.b - RMSNorm has no additive bias + """ + + def __init__(self, cfg: Any) -> None: + """Initialize the Qwen3MoE architecture adapter.""" + super().__init__(cfg) + + # ------------------------------------------------------------------ # + # Config attributes + # ------------------------------------------------------------------ # + self.cfg.normalization_type = "RMS" + self.cfg.positional_embedding_type = "rotary" + self.cfg.final_rms = True # Qwen3-style; OLMoE uses False + self.cfg.gated_mlp = True # SwiGLU-style gate in every MoE expert + self.cfg.attn_only = False + self.cfg.uses_rms_norm = True + # Force eager attention for output_attentions hook support + self.cfg.attn_implementation = "eager" + self.cfg.default_prepend_bos = False # Qwen3 family convention + + # GQA: propagate n_key_value_heads when provided by the loaded config. + # map_default_transformer_lens_config() sets this from num_key_value_heads + # in the HF checkpoint config; we do not hard-code a fallback value. 
+ if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads + + # ------------------------------------------------------------------ # + # Weight processing conversions + # ------------------------------------------------------------------ # + # Standard QKVO rearrangements; _qkvo_weight_conversions() resolves + # n_kv_heads from self.cfg.n_key_value_heads automatically. + self.weight_processing_conversions = { + **self._qkvo_weight_conversions(), + } + # MoE expert weights (gate_up_proj, down_proj) and gate router weights + # (gate.weight) pass through unchanged — HF's native forward handles them. + + # ------------------------------------------------------------------ # + # Component mapping — pre-norm architecture + # ------------------------------------------------------------------ # + # ln1 = input_layernorm (applied BEFORE attention) + # ln2 = post_attention_layernorm (applied BEFORE MLP) + self.component_mapping = { + "embed": EmbeddingBridge(name="model.embed_tokens"), + "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg), + "blocks": BlockBridge( + name="model.layers", + submodules={ + "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), + "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": PositionEmbeddingsAttentionBridge( + name="self_attn", + config=self.cfg, + submodules={ + "q": LinearBridge(name="q_proj"), + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg), + "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg), + }, + requires_attention_mask=True, + requires_position_embeddings=True, + ), + # Qwen3MoeSparseMoeBlock uses batched expert parameters + # (gate_up_proj / down_proj as 3D tensors) rather than a + # ModuleList. 
MoEBridge wraps the entire block and delegates + # to HF's native forward. The gate (Qwen3MoeTopKRouter) is + # mapped as a submodule via LinearBridge for hook access — + # same pattern as OLMoE. + "mlp": MoEBridge( + name="mlp", + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate"), + }, + ), + }, + ), + "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), + } + + def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: + """Set up rotary embedding references for Qwen3MoE component testing. + + Qwen3MoE uses RoPE (Rotary Position Embeddings) stored at model.rotary_emb. + We retrieve the rotary_emb instance from the HF model and attach it to all + attention bridge instances so that component-level tests can run the full + attention forward pass correctly. + + Args: + hf_model: The HuggingFace Qwen3MoeForCausalLM model instance. + bridge_model: The TransformerBridge model (if available). 
+ """ + rotary_emb = hf_model.model.rotary_emb + + # Force eager attention on the HF model to match bridge implementation + if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"): + hf_model.config._attn_implementation = "eager" + + if hasattr(hf_model, "model") and hasattr(hf_model.model, "layers"): + for layer in hf_model.model.layers: + if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"): + layer.self_attn.config._attn_implementation = "eager" + + # Attach rotary_emb to each block's attention bridge + if bridge_model is not None and hasattr(bridge_model, "blocks"): + for block in bridge_model.blocks: + if hasattr(block, "attn"): + block.attn.set_rotary_emb(rotary_emb) + + # Also set on the template bridge for get_generalized_component() calls + attn_bridge = self.get_generalized_component("blocks.0.attn") + attn_bridge.set_rotary_emb(rotary_emb) diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 1c8d879d0..6d65aeacd 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -6,8 +6,8 @@ "min_downloads": 500, "scan_duration_seconds": 3.2 }, - "total_architectures": 36, - "total_models": 5553, + "total_architectures": 37, + "total_models": 5557, "total_verified": 690, "models": [ { @@ -77668,6 +77668,62 @@ "phase3_score": 100.0, "phase4_score": 89.9, "phase7_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": 
null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null } ] } From 7ab7e9780205c7bcc4b2d7be006fdfaca4d9a4b0 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 13:18:59 -0500 Subject: [PATCH 2/5] MLP handling in verify models --- .../tools/model_registry/verify_models.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py index a31e91a63..570e8101d 100644 --- a/transformer_lens/tools/model_registry/verify_models.py +++ b/transformer_lens/tools/model_registry/verify_models.py @@ -249,6 +249,7 @@ def estimate_model_params(model_id: str) -> int: "mixtral", "qwen2", "qwen3", + "qwen3_moe", "phi3", "stablelm", ) @@ -262,8 +263,13 @@ def estimate_model_params(model_id: str) -> int: lang_config, "num_experts", None ) if num_experts and num_experts > 1: + # For architectures like Qwen3MoE, the per-expert MLP hidden size is stored + # in moe_intermediate_size rather than intermediate_size (which may refer to + # a separate dense MLP used in non-MoE layers). Use moe_intermediate_size + # when present. 
+ moe_d_mlp = getattr(lang_config, "moe_intermediate_size", None) or d_mlp # For MoE, MLP params are multiplied by num_experts + gate params - mlp_per_layer = d_model * d_mlp * mlp_multiplier + mlp_per_layer = d_model * moe_d_mlp * mlp_multiplier moe_per_layer = (mlp_per_layer + d_model) * num_experts # Replace the non-MoE MLP contribution n_params -= n_layers * (d_model * d_mlp * mlp_multiplier) @@ -279,13 +285,15 @@ def estimate_benchmark_memory_gb( n_params: int, dtype: str = "float32", phases: Optional[list[int]] = None, + use_hf_reference: bool = True, ) -> float: """Estimate peak memory needed for benchmark suite. Phases run sequentially, so peak memory is the maximum of any single phase, not the sum. The multiplier represents how many model copies exist at peak: - Phase 1: Briefly loads HF ref + Bridge → 2.0x peak + Phase 1 (with HF ref): Briefly loads HF ref + Bridge → 2.0x peak + Phase 1 (no HF ref): Bridge only → 1.0x peak Phase 2: Bridge + HookedTransformer (separate copy) → 2.0x model + overhead Phase 3: Same as Phase 2 (processed versions) → 2.0x model + overhead Phase 4: Bridge + GPT-2 scorer (~500MB) → ~1.0x model + 0.5 GB @@ -294,6 +302,9 @@ def estimate_benchmark_memory_gb( n_params: Number of model parameters dtype: Data type for memory calculation phases: Which phases will be run (None = all phases) + use_hf_reference: Whether Phase 1 loads an HF reference model alongside + the Bridge. When False, Phase 1 only needs 1x model memory instead + of 2x. This matches the ``--no-hf-reference`` CLI flag. 
Returns: Estimated peak memory in GB @@ -315,8 +326,14 @@ def estimate_benchmark_memory_gb( phases = [1, 2, 3, 4] for p in phases: - if p in (1, 2, 3): - # Phase 1: HF ref + Bridge = 2 copies briefly + if p == 1: + if use_hf_reference: + # Phase 1: HF ref + Bridge = 2 copies briefly + phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) + else: + # No HF reference: Bridge alone + phase_peaks.append(model_size_gb * (1 + overhead_fraction)) + elif p in (2, 3): # Phase 2/3: Bridge + HookedTransformer = 2 copies phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) elif p == 4: @@ -781,7 +798,9 @@ def verify_models( continue # Step 2: Check memory - estimated_mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) + estimated_mem = estimate_benchmark_memory_gb( + n_params, dtype, phases=phases, use_hf_reference=use_hf_reference + ) candidate.estimated_memory_gb = estimated_mem if not quiet: print( @@ -1087,6 +1106,7 @@ def _print_dry_run( dtype: str, max_memory_gb: float, phases: Optional[list[int]] = None, + use_hf_reference: bool = True, ) -> None: """Print what would be tested in a dry run.""" print(f"\nDry run: {len(candidates)} models would be tested") @@ -1107,7 +1127,9 @@ def _print_dry_run( for c in models: try: n_params = estimate_model_params(c.model_id) - mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) + mem = estimate_benchmark_memory_gb( + n_params, dtype, phases=phases, use_hf_reference=use_hf_reference + ) status = "OK" if mem <= max_memory_gb else "SKIP (too large)" if mem > max_memory_gb: skippable += 1 @@ -1339,6 +1361,7 @@ def main() -> None: args.dtype, max_memory_gb, phases=args.phases, + use_hf_reference=not args.no_hf_reference, ) return From e76b6f88f7168e4158ed6944d85a0fd42671ef10 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 15:22:49 -0500 Subject: [PATCH 3/5] verification of models --- .../model_registry/data/supported_models.json | 2165 +++++++++++++++- 
.../data/supported_models_qwen3_moe.json | 2200 +++++++++++++++++ .../data/verification_history.json | 92 +- 3 files changed, 4453 insertions(+), 4 deletions(-) create mode 100644 transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index b8550359f..7d57d1acc 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -6,9 +6,9 @@ "min_downloads": 500, "scan_duration_seconds": 12.1 }, - "total_architectures": 37, - "total_models": 6691, - "total_verified": 690, + "total_architectures": 38, + "total_models": 6822, + "total_verified": 698, "models": [ { "architecture_id": "Qwen3ForCausalLM", @@ -93656,6 +93656,2165 @@ "phase4_score": 67.5, "phase7_score": null, "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 594767, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 370376, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 354610, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": 
null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 293539, + "total_params": 9298753536 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 235228, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 212096, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 209682, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "status": 3, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 207171, + "total_params": 2574656 + }, + "note": "Below threshold: P3=89.5% but required tests failed: logits_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 558.8) 
\u2014 generated text may be incoherent", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 89.5, + "phase4_score": 72.2, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 206997, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 196979, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 194457, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 183785, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 139859, + "total_params": 30532122624 + }, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 90099, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-30B-A3B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 88254, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 86609, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 83207, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 81021, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null 
+ }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 75715, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 74870, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 64513, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 49515, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 46285, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Base", + 
"status": 0, + "verified_date": null, + "metadata": { + "downloads": 45932, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-30B-A3B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 44268, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42807, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "typhoon-ai/typhoon2.5-qwen3-30b-a3b", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42205, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 35168, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 
34301, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34117, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33498, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "JunHowie/Qwen3-30B-A3B-Instruct-2507-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33311, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 27742, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 26590, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, 
+ "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w8a8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 23560, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-Coder-30B-A3B-Instruct-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 18001, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "np-cr/testing-qwen3-moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 17816, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16886, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Alibaba-NLP/Tongyi-DeepResearch-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16568, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + 
{ + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16377, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-block", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15451, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15251, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Huihui-Qwen3-30B-A3B-Instruct-2507-abliterated-dwq4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15234, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 14566, + "total_params": 4770822144 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"lmstudio-community/Qwen3-30B-A3B-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 13999, + "total_params": 8587311104 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "BCCard/Qwen3-30B-A3B-FP8-Dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9881, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9718, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9521, + "total_params": 132806618624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7615, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-30B-A3B-Thinking-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + 
"downloads": 7283, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 6725, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5776, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5510, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5349, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5217, + "total_params": 
480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5150, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4870, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4650, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4326, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "yujiepan/qwen3-moe-tiny-random", + "status": 2, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 4314, + "total_params": 9970624 + }, + "note": "Unsupported: decoder_sparse_step=2 creates mixed dense/sparse layers; adapter only supports all-MoE config (decoder_sparse_step=1)", + "phase1_score": 0.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4284, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Applied-Innovation-Center/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4268, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "status": 2, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 4265, + "total_params": 9970624 + }, + "note": "Unsupported: decoder_sparse_step=2 creates mixed dense/sparse layers; adapter only supports all-MoE config (decoder_sparse_step=1)", + "phase1_score": 0.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "llm-jp/llm-jp-4-32b-a3b-thinking", + "status": 0, + "verified_date": null, 
+ "metadata": { + "downloads": 4061, + "total_params": 32139028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4060, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3960, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3797, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3667, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3656, + "total_params": null + }, 
+ "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3608, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3584, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3443, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3029, + "total_params": 235181131264 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "MerbAI/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3013, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-v1.5-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2963, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "ig1/Qwen3-Coder-30B-A3B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2685, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2655, + "total_params": 119968378368 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2600, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"Ravi07bec/SQL-v2-oct-16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2548, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7-mini", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2495, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-GPTQ-Int8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2274, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2152, + "total_params": 241041798656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1959, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + 
"downloads": 1888, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM-2603", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1785, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "koushd/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1697, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1550, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1525, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlabonne/Qwen3-30B-A3B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1522, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, 
+ "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1499, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Aratako/Qwen3-30B-A3B-NSFW-JP", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1462, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1452, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1142, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1108, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": 
null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1081, + "total_params": 118194187776 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Sophia-AI/Qwen3-30B-A3B-Instruct-2507-AWQ-W4A16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1061, + "total_params": 5419812864 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-MLX-bf16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1051, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-235B-A22B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1046, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1004, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": 
"Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 879, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit-dwq-v2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 878, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 868, + "total_params": 860348416 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 91.9, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-CPT-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 855, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 854, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 845, + "total_params": 31310690304 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 823, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Thinking-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 819, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "OpenMOSS-Team/SciThinker-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 788, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-Instruct-2507-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 763, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"nightmedia/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER-qx4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 758, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Thinking-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 743, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "baichuan-inc/Baichuan-M3-235B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 737, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MiroThinker-v1.5-30B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 728, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 726, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Jebadiah/Qwen3-30B-A3B-seed-0", + 
"status": 0, + "verified_date": null, + "metadata": { + "downloads": 719, + "total_params": 30531028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "DavidAU/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 717, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 691, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-RL-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 677, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 673, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"nightmedia/Qwen3-30B-A3B-Thinking-2507-Claude-4.5-Sonnet-High-Reasoning-Distill-mxfp4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 669, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-Instruct-2507-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 663, + "total_params": 4594944480 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 658, + "total_params": 135739033600 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 647, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-1B-A0.6B", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 223, + "total_params": 1120000000 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 97.6, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"imdatta0/tiny_qwen3_moe_2.8B_0.7B", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 218, + "total_params": 2800000000 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 70.4, + "phase7_score": null, + "phase8_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json b/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json new file mode 100644 index 000000000..cd5b8e104 --- /dev/null +++ b/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json @@ -0,0 +1,2200 @@ +{ + "architecture_id": "Qwen3MoeForCausalLM", + "total_models": 129, + "scanned": 10000, + "models": [ + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1577944, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1177329, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1071372, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B", + "status": 0, + 
"verified_date": null, + "metadata": { + "downloads": 594767, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 370376, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 354610, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 293539, + "total_params": 9298753536 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 235228, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 212096, 
+ "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 209682, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 207171, + "total_params": 2574656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 206997, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 196979, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 194457, + "total_params": 30532122624 + }, + "note": 
null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 190710, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 183785, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 139859, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 90099, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-30B-A3B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 88254, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 86609, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 83207, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 81021, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 75715, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 74870, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 64513, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 49515, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 46285, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 45932, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-30B-A3B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 44268, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42807, + "total_params": 
30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "typhoon-ai/typhoon2.5-qwen3-30b-a3b", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42205, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 35168, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34301, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34117, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33498, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": 
null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "JunHowie/Qwen3-30B-A3B-Instruct-2507-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33311, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 27742, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 26590, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w8a8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 23560, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-Coder-30B-A3B-Instruct-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 18001, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, 
+ "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "np-cr/testing-qwen3-moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 17816, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16886, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Alibaba-NLP/Tongyi-DeepResearch-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16568, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16377, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-block", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15451, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM", + 
"status": 0, + "verified_date": null, + "metadata": { + "downloads": 15251, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Huihui-Qwen3-30B-A3B-Instruct-2507-abliterated-dwq4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15234, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 14566, + "total_params": 4770822144 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 13999, + "total_params": 8587311104 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "BCCard/Qwen3-30B-A3B-FP8-Dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9881, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 
9718, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9521, + "total_params": 132806618624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7615, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-30B-A3B-Thinking-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7283, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 6725, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5776, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + 
"phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5510, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5349, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5217, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5150, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4870, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + 
"phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4650, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4326, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "yujiepan/qwen3-moe-tiny-random", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4314, + "total_params": 9970624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4284, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"Applied-Innovation-Center/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4268, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4265, + "total_params": 9970624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "llm-jp/llm-jp-4-32b-a3b-thinking", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4061, + "total_params": 32139028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4060, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3960, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + 
"downloads": 3797, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3667, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3656, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3608, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3584, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3443, + "total_params": 4605856128 + }, + "note": 
null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3029, + "total_params": 235181131264 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "MerbAI/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3013, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-v1.5-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2963, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "ig1/Qwen3-Coder-30B-A3B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2685, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + 
"phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2655, + "total_params": 119968378368 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2600, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Ravi07bec/SQL-v2-oct-16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2548, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7-mini", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2495, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-GPTQ-Int8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2274, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": 
"nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2152, + "total_params": 241041798656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1959, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1888, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM-2603", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1785, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "koushd/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1697, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ", + "status": 0, + "verified_date": null, + 
"metadata": { + "downloads": 1550, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1525, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlabonne/Qwen3-30B-A3B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1522, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1499, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Aratako/Qwen3-30B-A3B-NSFW-JP", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1462, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1452, + "total_params": 30532122624 + }, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1142, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1108, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1081, + "total_params": 118194187776 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Sophia-AI/Qwen3-30B-A3B-Instruct-2507-AWQ-W4A16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1061, + "total_params": 5419812864 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-MLX-bf16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1051, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + 
"phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-235B-A22B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1046, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1004, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 879, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit-dwq-v2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 878, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 868, + "total_params": 860348416 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": 
"Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-CPT-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 855, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 854, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 845, + "total_params": 31310690304 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 823, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Thinking-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 819, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "OpenMOSS-Team/SciThinker-30B", + "status": 
0, + "verified_date": null, + "metadata": { + "downloads": 788, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-Instruct-2507-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 763, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER-qx4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 758, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Thinking-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 743, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "baichuan-inc/Baichuan-M3-235B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 737, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MiroThinker-v1.5-30B-abliterated", + "status": 0, + "verified_date": 
null, + "metadata": { + "downloads": 728, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 726, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Jebadiah/Qwen3-30B-A3B-seed-0", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 719, + "total_params": 30531028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "DavidAU/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 717, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 691, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-RL-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 677, + "total_params": 30532122624 + }, + 
"note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 673, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-30B-A3B-Thinking-2507-Claude-4.5-Sonnet-High-Reasoning-Distill-mxfp4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 669, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-Instruct-2507-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 663, + "total_params": 4594944480 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 658, + "total_params": 135739033600 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 647, + "total_params": null + }, + "note": null, + 
"phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + } + ] +} \ No newline at end of file diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 9eb2e7648..3564204e8 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-09T16:34:36.818082", + "last_updated": "2026-04-10T15:08:21.930188", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11200,6 +11200,96 @@ "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null + }, + { + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 558.8) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 
Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "huihui-ai/Huihui-MoE-1B-A0.6B", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "imdatta0/tiny_qwen3_moe_2.8B_0.7B", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "yujiepan/qwen3-moe-tiny-random", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3MoeMLP' object has no attribute 'gate'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": 
"2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3MoeMLP' object has no attribute 'gate'", + "invalidated": false, + "invalidation_reason": null } ] } From 24c4a4c1ddb90ec94f3025b5883bc01200df4546 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 15:35:51 -0500 Subject: [PATCH 4/5] Comment cleanup --- .../model_bridge/test_qwen3_moe_bridge.py | 53 +++------- .../model_bridge/test_qwen3_moe_adapter.py | 40 ++----- .../supported_architectures/qwen3_moe.py | 100 +++++++----------- .../tools/model_registry/verify_models.py | 28 ++--- 4 files changed, 73 insertions(+), 148 deletions(-) diff --git a/tests/integration/model_bridge/test_qwen3_moe_bridge.py b/tests/integration/model_bridge/test_qwen3_moe_bridge.py index 906f1ab17..e952f6fb6 100644 --- a/tests/integration/model_bridge/test_qwen3_moe_bridge.py +++ b/tests/integration/model_bridge/test_qwen3_moe_bridge.py @@ -1,11 +1,9 @@ """Integration tests for the Qwen3MoE TransformerBridge. -All tests use a tiny programmatic Qwen3MoE config on the meta device — no -network access and no actual weights are downloaded. The meta device means -tensor operations cannot execute, so forward-pass tests are explicitly skipped -and marked for manual execution during verification. - -Fixture pattern mirrors tests/unit/model_bridge/test_gpt_oss_moe.py. +Uses a tiny programmatic config on the meta device — no network access or +weight downloads. Tensor ops can't execute on meta, so forward-pass tests are +skipped and run manually during verification. Fixture pattern mirrors +tests/unit/model_bridge/test_gpt_oss_moe.py. 
""" import pytest @@ -22,20 +20,16 @@ Qwen3MoeArchitectureAdapter, ) -# --------------------------------------------------------------------------- -# Tiny programmatic model fixture (meta device, no weights) -# --------------------------------------------------------------------------- - class _MockTokenizer: - """Minimal stand-in so TransformerBridge(tokenizer=...) is satisfied.""" + """Stand-in to satisfy TransformerBridge(tokenizer=...).""" pass @pytest.fixture(scope="module") def tiny_qwen3moe_config(): - """Return a small Qwen3MoeConfig (2 layers, 4 heads, 4 experts).""" + """Small Qwen3MoeConfig: 2 layers, 4 heads, 4 experts.""" return AutoConfig.for_model( "qwen3_moe", hidden_size=64, @@ -55,7 +49,7 @@ def tiny_qwen3moe_config(): @pytest.fixture(scope="module") def tiny_qwen3moe_model_meta(tiny_qwen3moe_config): - """Create a Qwen3MoE model structure on meta device (no weights loaded).""" + """Qwen3MoE model on meta device (no weights loaded).""" with torch.device("meta"): model = AutoModelForCausalLM.from_config(tiny_qwen3moe_config) return model @@ -63,7 +57,7 @@ def tiny_qwen3moe_model_meta(tiny_qwen3moe_config): @pytest.fixture(scope="module") def tiny_qwen3moe_bridge(tiny_qwen3moe_config, tiny_qwen3moe_model_meta): - """Create a TransformerBridge wrapping the tiny meta-device Qwen3MoE model.""" + """TransformerBridge wrapping the tiny meta-device Qwen3MoE model.""" tl_config = map_default_transformer_lens_config(tiny_qwen3moe_config) bridge_config = TransformerBridgeConfig( @@ -86,11 +80,6 @@ def tiny_qwen3moe_bridge(tiny_qwen3moe_config, tiny_qwen3moe_model_meta): ) -# --------------------------------------------------------------------------- -# HF model structure -# --------------------------------------------------------------------------- - - class TestQwen3MoeModelStructure: def test_model_has_layers(self, tiny_qwen3moe_model_meta) -> None: assert hasattr(tiny_qwen3moe_model_meta, "model") @@ -98,13 +87,12 @@ def test_model_has_layers(self, 
tiny_qwen3moe_model_meta) -> None: assert len(tiny_qwen3moe_model_meta.model.layers) == 2 def test_layer_has_sparse_moe_block(self, tiny_qwen3moe_model_meta) -> None: + # Qwen3MoeSparseMoeBlock stores experts as batched 3D tensors, not a ModuleList layer0_mlp = tiny_qwen3moe_model_meta.model.layers[0].mlp - # Qwen3MoeSparseMoeBlock uses batched expert parameters (not a ModuleList) assert hasattr(layer0_mlp, "experts") experts = layer0_mlp.experts assert hasattr(experts, "gate_up_proj") assert hasattr(experts, "down_proj") - # Experts are NOT iterable — stored as batched 3D tensors assert not hasattr(experts, "__iter__") def test_layer_has_gate_router(self, tiny_qwen3moe_model_meta) -> None: @@ -117,11 +105,6 @@ def test_attention_has_q_norm_k_norm(self, tiny_qwen3moe_model_meta) -> None: assert hasattr(attn, "k_norm") -# --------------------------------------------------------------------------- -# Bridge structure -# --------------------------------------------------------------------------- - - class TestQwen3MoeBridgeStructure: def test_block_count(self, tiny_qwen3moe_bridge) -> None: assert len(tiny_qwen3moe_bridge.blocks) == 2 @@ -132,7 +115,7 @@ def test_has_core_components(self, tiny_qwen3moe_bridge) -> None: assert hasattr(tiny_qwen3moe_bridge, "ln_final") def test_cfg_final_rms_is_true(self, tiny_qwen3moe_bridge) -> None: - """Critical Qwen3MoE config flag — differs from OLMoE which uses False.""" + """Qwen3MoE uses final_rms=True; OLMoE uses False.""" assert tiny_qwen3moe_bridge.cfg.final_rms is True def test_cfg_n_kv_heads(self, tiny_qwen3moe_bridge) -> None: @@ -165,18 +148,13 @@ def test_block_attn_has_q_norm_k_norm(self, tiny_qwen3moe_bridge) -> None: assert hasattr(attn, "k_norm") -# --------------------------------------------------------------------------- -# Forward-pass tests — skipped on meta device, run manually during verification -# --------------------------------------------------------------------------- +# Forward-pass tests require real 
weights — meta-device tensor ops raise +# NotImplementedError. Run these manually during Step 3 verification. @pytest.mark.skip(reason="Requires real weights — run manually during verification") def test_forward_pass_matches_hf(tiny_qwen3moe_bridge) -> None: - """Bridge forward should produce logits identical to the HF model. - - Run this test manually with a real (non-meta) model during Step 3 - verification. On meta device, tensor operations raise NotImplementedError. - """ + """Bridge logits match the HF model.""" tokens = torch.tensor([[1, 2, 3, 4]]) with torch.no_grad(): bridge_out = tiny_qwen3moe_bridge(tokens) @@ -187,10 +165,7 @@ def test_forward_pass_matches_hf(tiny_qwen3moe_bridge) -> None: @pytest.mark.skip(reason="Requires real weights — run manually during verification") def test_run_with_cache_captures_moe_router_scores(tiny_qwen3moe_bridge) -> None: - """MoEBridge should capture router scores in the activation cache. - - Run manually with real weights during Step 3 verification. - """ + """MoEBridge captures router scores in the activation cache.""" tiny_qwen3moe_bridge.enable_compatibility_mode(no_processing=True) tokens = torch.tensor([[1, 2, 3, 4]]) _, cache = tiny_qwen3moe_bridge.run_with_cache(tokens) diff --git a/tests/unit/model_bridge/test_qwen3_moe_adapter.py b/tests/unit/model_bridge/test_qwen3_moe_adapter.py index df741a977..af6a0155c 100644 --- a/tests/unit/model_bridge/test_qwen3_moe_adapter.py +++ b/tests/unit/model_bridge/test_qwen3_moe_adapter.py @@ -1,7 +1,7 @@ """Unit tests for the Qwen3MoeArchitectureAdapter. -No network access and no model downloads — all tests use programmatic -TransformerBridgeConfig instances. +All tests use programmatic TransformerBridgeConfig instances — no network access +or model downloads. 
""" import pytest @@ -24,10 +24,6 @@ Qwen3MoeArchitectureAdapter, ) -# --------------------------------------------------------------------------- -# Shared fixture -# --------------------------------------------------------------------------- - @pytest.fixture def cfg() -> TransformerBridgeConfig: @@ -48,11 +44,6 @@ def adapter(cfg: TransformerBridgeConfig) -> Qwen3MoeArchitectureAdapter: return Qwen3MoeArchitectureAdapter(cfg) -# --------------------------------------------------------------------------- -# Config attribute correctness -# --------------------------------------------------------------------------- - - class TestQwen3MoeAdapterConfig: def test_normalization_type_is_rms(self, adapter: Qwen3MoeArchitectureAdapter) -> None: assert adapter.cfg.normalization_type == "RMS" @@ -63,7 +54,7 @@ def test_positional_embedding_type_is_rotary( assert adapter.cfg.positional_embedding_type == "rotary" def test_final_rms_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: - """Critical: Qwen3MoE uses final_rms=True; OLMoE uses False.""" + """Qwen3MoE uses final_rms=True; OLMoE uses False.""" assert adapter.cfg.final_rms is True def test_gated_mlp_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: @@ -79,7 +70,7 @@ def test_default_prepend_bos_is_false(self, adapter: Qwen3MoeArchitectureAdapter assert adapter.cfg.default_prepend_bos is False def test_n_kv_heads_propagated(self) -> None: - """n_key_value_heads from the loaded config must be preserved.""" + """n_key_value_heads from the loaded config is preserved.""" cfg = TransformerBridgeConfig( d_model=64, d_head=16, @@ -94,11 +85,6 @@ def test_n_kv_heads_propagated(self) -> None: assert adapter.cfg.n_key_value_heads == 2 -# --------------------------------------------------------------------------- -# Weight processing conversions -# --------------------------------------------------------------------------- - - class TestQwen3MoeWeightConversions: def test_has_qkvo_keys(self, adapter: 
Qwen3MoeArchitectureAdapter) -> None: convs = adapter.weight_processing_conversions @@ -109,7 +95,7 @@ def test_has_qkvo_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: assert "blocks.{i}.attn.o.weight" in convs def test_q_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: - """Q weight conversion must use n_heads (4) for the rearrange.""" + """Q rearrange uses n_heads (4).""" convs = adapter.weight_processing_conversions assert convs is not None q_conv = convs["blocks.{i}.attn.q.weight"] @@ -119,7 +105,7 @@ def test_q_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> assert axes.get("n") == 4 def test_kv_rearrange_uses_n_kv_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: - """K/V weight conversions must use n_key_value_heads (2) for GQA.""" + """K/V rearrange uses n_key_value_heads (2) for GQA.""" convs = adapter.weight_processing_conversions assert convs is not None k_conv = convs["blocks.{i}.attn.k.weight"] @@ -132,7 +118,7 @@ def test_kv_rearrange_uses_n_kv_heads(self, adapter: Qwen3MoeArchitectureAdapter assert v_conv.tensor_conversion.axes_lengths.get("n") == 2 def test_o_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: - """O weight conversion must use n_heads (4).""" + """O rearrange uses n_heads (4).""" convs = adapter.weight_processing_conversions assert convs is not None o_conv = convs["blocks.{i}.attn.o.weight"] @@ -141,11 +127,6 @@ def test_o_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> assert o_conv.tensor_conversion.axes_lengths.get("n") == 4 -# --------------------------------------------------------------------------- -# Component mapping structure -# --------------------------------------------------------------------------- - - class TestQwen3MoeComponentMapping: def test_has_required_top_level_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: mapping = adapter.component_mapping @@ -194,7 +175,7 @@ def 
test_q_norm_k_norm_are_rms_norm_bridges(self, adapter: Qwen3MoeArchitectureA assert isinstance(attn_subs["k_norm"], RMSNormalizationBridge) def test_hf_module_paths(self, adapter: Qwen3MoeArchitectureAdapter) -> None: - """Verify key HF module path names are correctly mapped.""" + """HF module path names are mapped correctly.""" mapping = adapter.component_mapping assert mapping is not None assert mapping["embed"].name == "model.embed_tokens" @@ -208,11 +189,6 @@ def test_hf_module_paths(self, adapter: Qwen3MoeArchitectureAdapter) -> None: assert subs["mlp"].name == "mlp" -# --------------------------------------------------------------------------- -# Factory registration -# --------------------------------------------------------------------------- - - class TestQwen3MoeFactoryRegistration: def test_factory_lookup_returns_adapter_class(self) -> None: assert SUPPORTED_ARCHITECTURES["Qwen3MoeForCausalLM"] is Qwen3MoeArchitectureAdapter diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py index 2d8cff3f5..643892cee 100644 --- a/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py @@ -18,76 +18,59 @@ class Qwen3MoeArchitectureAdapter(ArchitectureAdapter): """Architecture adapter for Qwen3MoE (Mixture of Experts) models. - Qwen3MoE is a sparse Mixture-of-Experts decoder-only Transformer that closely - mirrors OLMoE in structure. Key architectural features: - - - Pre-norm: RMSNorm applied BEFORE attention (input_layernorm) and BEFORE MLP - (post_attention_layernorm). - - Q/K normalization: RMSNorm applied to queries and keys after projection and - before rotary embedding application. - - Sparse MoE: 128 experts with top-8 routing (in the public 30B-A3B checkpoints). 
- - Batched expert parameters: gate_up_proj [num_experts, 2*moe_intermediate_size, - hidden_size] and down_proj [num_experts, hidden_size, moe_intermediate_size] are - stored as single 3D tensors rather than a ModuleList. - - final_rms=True (Qwen3-style; differs from OLMoE which uses False). - - No biases on any projections (attention_bias=False in all public checkpoints). - - GQA: num_key_value_heads < num_attention_heads in all public models. - - Limitation — all-MoE configuration only: - All public Qwen3MoE models have decoder_sparse_step=1 and mlp_only_layers=[] - (every decoder layer is a sparse MoE block). This adapter supports only that - all-MoE configuration. Models with a non-empty mlp_only_layers list are NOT - supported because MoEBridge cannot handle the dense Qwen3MoeMLP fallback layers. + Qwen3MoE is a sparse MoE decoder-only Transformer, structurally close to OLMoE. + Key features: + + - Pre-norm: RMSNorm applied BEFORE attention and BEFORE MLP. + - Q/K normalization: RMSNorm applied to queries and keys after projection. + - Sparse MoE: 128 experts with top-8 routing (public 30B-A3B checkpoints). + - Batched expert parameters: gate_up_proj and down_proj as single 3D tensors, + not a ModuleList. + - final_rms=True (Qwen3-style; OLMoE uses False). + - No biases on any projections. + - GQA: n_key_value_heads < n_heads in all public checkpoints. + + Only the all-MoE configuration is supported (decoder_sparse_step=1, + mlp_only_layers=[]). Models with dense fallback layers cannot be wrapped + because MoEBridge does not handle the dense Qwen3MoeMLP path. 
Optional Parameters (may not exist in state_dict): ------------------------------------------------- - - blocks.{i}.attn.b_Q - No bias on query projection (attention_bias=False) - - blocks.{i}.attn.b_K - No bias on key projection (attention_bias=False) - - blocks.{i}.attn.b_V - No bias on value projection (attention_bias=False) - - blocks.{i}.attn.b_O - No bias on output projection (attention_bias=False) - - blocks.{i}.ln1.b - RMSNorm has no additive bias - - blocks.{i}.ln2.b - RMSNorm has no additive bias - - ln_final.b - RMSNorm has no additive bias + - blocks.{i}.attn.b_Q - No bias on query projection + - blocks.{i}.attn.b_K - No bias on key projection + - blocks.{i}.attn.b_V - No bias on value projection + - blocks.{i}.attn.b_O - No bias on output projection + - blocks.{i}.ln1.b - RMSNorm has no bias + - blocks.{i}.ln2.b - RMSNorm has no bias + - ln_final.b - RMSNorm has no bias """ def __init__(self, cfg: Any) -> None: """Initialize the Qwen3MoE architecture adapter.""" super().__init__(cfg) - # ------------------------------------------------------------------ # - # Config attributes - # ------------------------------------------------------------------ # + # Set config variables for weight processing self.cfg.normalization_type = "RMS" self.cfg.positional_embedding_type = "rotary" self.cfg.final_rms = True # Qwen3-style; OLMoE uses False - self.cfg.gated_mlp = True # SwiGLU-style gate in every MoE expert + self.cfg.gated_mlp = True self.cfg.attn_only = False self.cfg.uses_rms_norm = True # Force eager attention for output_attentions hook support self.cfg.attn_implementation = "eager" self.cfg.default_prepend_bos = False # Qwen3 family convention - # GQA: propagate n_key_value_heads when provided by the loaded config. - # map_default_transformer_lens_config() sets this from num_key_value_heads - # in the HF checkpoint config; we do not hard-code a fallback value. 
+ # GQA support if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: self.cfg.n_key_value_heads = cfg.n_key_value_heads - # ------------------------------------------------------------------ # - # Weight processing conversions - # ------------------------------------------------------------------ # - # Standard QKVO rearrangements; _qkvo_weight_conversions() resolves - # n_kv_heads from self.cfg.n_key_value_heads automatically. + # QKVO rearrangements; MoE expert and gate weights pass through unchanged self.weight_processing_conversions = { **self._qkvo_weight_conversions(), } - # MoE expert weights (gate_up_proj, down_proj) and gate router weights - # (gate.weight) pass through unchanged — HF's native forward handles them. - # ------------------------------------------------------------------ # - # Component mapping — pre-norm architecture - # ------------------------------------------------------------------ # - # ln1 = input_layernorm (applied BEFORE attention) + # Component mapping — PRE-NORM architecture: + # ln1 = input_layernorm (applied BEFORE attention) # ln2 = post_attention_layernorm (applied BEFORE MLP) self.component_mapping = { "embed": EmbeddingBridge(name="model.embed_tokens"), @@ -111,12 +94,10 @@ def __init__(self, cfg: Any) -> None: requires_attention_mask=True, requires_position_embeddings=True, ), - # Qwen3MoeSparseMoeBlock uses batched expert parameters - # (gate_up_proj / down_proj as 3D tensors) rather than a - # ModuleList. MoEBridge wraps the entire block and delegates - # to HF's native forward. The gate (Qwen3MoeTopKRouter) is - # mapped as a submodule via LinearBridge for hook access — - # same pattern as OLMoE. + # Qwen3MoeSparseMoeBlock stores experts as batched 3D tensors + # rather than a ModuleList. MoEBridge wraps the entire block and + # delegates to HF's native forward. The gate (router) is mapped + # as a submodule for hook access — same pattern as OLMoE. 
"mlp": MoEBridge( name="mlp", config=self.cfg, @@ -133,18 +114,17 @@ def __init__(self, cfg: Any) -> None: def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: """Set up rotary embedding references for Qwen3MoE component testing. - Qwen3MoE uses RoPE (Rotary Position Embeddings) stored at model.rotary_emb. - We retrieve the rotary_emb instance from the HF model and attach it to all - attention bridge instances so that component-level tests can run the full - attention forward pass correctly. + Qwen3MoE uses RoPE (Rotary Position Embeddings). We set the rotary_emb + reference on all attention bridge instances for component testing. Args: - hf_model: The HuggingFace Qwen3MoeForCausalLM model instance. - bridge_model: The TransformerBridge model (if available). + hf_model: The HuggingFace Qwen3MoE model instance + bridge_model: The TransformerBridge model (if available) """ + # Get rotary embedding instance from the model rotary_emb = hf_model.model.rotary_emb - # Force eager attention on the HF model to match bridge implementation + # Force HF model to use "eager" attention to match bridge implementation if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"): hf_model.config._attn_implementation = "eager" @@ -153,12 +133,12 @@ def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> No if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"): layer.self_attn.config._attn_implementation = "eager" - # Attach rotary_emb to each block's attention bridge + # Set rotary_emb on actual bridge instances in bridge_model if available if bridge_model is not None and hasattr(bridge_model, "blocks"): for block in bridge_model.blocks: if hasattr(block, "attn"): block.attn.set_rotary_emb(rotary_emb) - # Also set on the template bridge for get_generalized_component() calls + # Also set on the template for get_generalized_component() calls attn_bridge = 
self.get_generalized_component("blocks.0.attn") attn_bridge.set_rotary_emb(rotary_emb) diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py index 570e8101d..0aecc8eba 100644 --- a/transformer_lens/tools/model_registry/verify_models.py +++ b/transformer_lens/tools/model_registry/verify_models.py @@ -263,12 +263,10 @@ def estimate_model_params(model_id: str) -> int: lang_config, "num_experts", None ) if num_experts and num_experts > 1: - # For architectures like Qwen3MoE, the per-expert MLP hidden size is stored - # in moe_intermediate_size rather than intermediate_size (which may refer to - # a separate dense MLP used in non-MoE layers). Use moe_intermediate_size - # when present. + # Qwen3MoE and similar store per-expert hidden size in moe_intermediate_size; + # intermediate_size refers to a dense fallback MLP that we don't use here. moe_d_mlp = getattr(lang_config, "moe_intermediate_size", None) or d_mlp - # For MoE, MLP params are multiplied by num_experts + gate params + # MLP params scale with num_experts; add gate params per expert mlp_per_layer = d_model * moe_d_mlp * mlp_multiplier moe_per_layer = (mlp_per_layer + d_model) * num_experts # Replace the non-MoE MLP contribution @@ -292,8 +290,8 @@ def estimate_benchmark_memory_gb( Phases run sequentially, so peak memory is the maximum of any single phase, not the sum. 
The multiplier represents how many model copies exist at peak: - Phase 1 (with HF ref): Briefly loads HF ref + Bridge → 2.0x peak - Phase 1 (no HF ref): Bridge only → 1.0x peak + Phase 1 (HF ref on): HF ref + Bridge → 2.0x peak + Phase 1 (HF ref off): Bridge only → 1.0x peak Phase 2: Bridge + HookedTransformer (separate copy) → 2.0x model + overhead Phase 3: Same as Phase 2 (processed versions) → 2.0x model + overhead Phase 4: Bridge + GPT-2 scorer (~500MB) → ~1.0x model + 0.5 GB @@ -302,9 +300,8 @@ def estimate_benchmark_memory_gb( n_params: Number of model parameters dtype: Data type for memory calculation phases: Which phases will be run (None = all phases) - use_hf_reference: Whether Phase 1 loads an HF reference model alongside - the Bridge. When False, Phase 1 only needs 1x model memory instead - of 2x. This matches the ``--no-hf-reference`` CLI flag. + use_hf_reference: Whether Phase 1 loads an HF reference alongside the + Bridge. Mirrors the ``--no-hf-reference`` CLI flag. Returns: Estimated peak memory in GB @@ -327,14 +324,11 @@ def estimate_benchmark_memory_gb( for p in phases: if p == 1: - if use_hf_reference: - # Phase 1: HF ref + Bridge = 2 copies briefly - phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) - else: - # No HF reference: Bridge alone - phase_peaks.append(model_size_gb * (1 + overhead_fraction)) + # HF ref + Bridge (2 copies) or Bridge alone + multiplier = 2.0 if use_hf_reference else 1.0 + phase_peaks.append(model_size_gb * multiplier * (1 + overhead_fraction)) elif p in (2, 3): - # Phase 2/3: Bridge + HookedTransformer = 2 copies + # Bridge + HookedTransformer = 2 copies phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) elif p == 4: # Bridge + GPT-2 scorer From 4d94fd9b81a32f16c5ccd7c1090a530a1fca237a Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Fri, 10 Apr 2026 15:50:40 -0500 Subject: [PATCH 5/5] Fixed missing closing brace --- .../model_bridge/supported_architectures/__init__.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index c415c959d..a63f82abc 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -125,6 +125,7 @@ ) from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( Qwen3MoeArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( Qwen3NextArchitectureAdapter, )