diff --git a/tests/integration/model_bridge/test_qwen3_moe_bridge.py b/tests/integration/model_bridge/test_qwen3_moe_bridge.py new file mode 100644 index 000000000..e952f6fb6 --- /dev/null +++ b/tests/integration/model_bridge/test_qwen3_moe_bridge.py @@ -0,0 +1,173 @@ +"""Integration tests for the Qwen3MoE TransformerBridge. + +Uses a tiny programmatic config on the meta device — no network access or +weight downloads. Tensor ops can't execute on meta, so forward-pass tests are +skipped and run manually during verification. Fixture pattern mirrors +tests/unit/model_bridge/test_gpt_oss_moe.py. +""" + +import pytest +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from transformer_lens.config import TransformerBridgeConfig +from transformer_lens.model_bridge.bridge import TransformerBridge +from transformer_lens.model_bridge.generalized_components import MoEBridge +from transformer_lens.model_bridge.sources.transformers import ( + map_default_transformer_lens_config, +) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) + + +class _MockTokenizer: + """Stand-in to satisfy TransformerBridge(tokenizer=...).""" + + pass + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_config(): + """Small Qwen3MoeConfig: 2 layers, 4 heads, 4 experts.""" + return AutoConfig.for_model( + "qwen3_moe", + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=16, + moe_intermediate_size=32, + num_experts=4, + num_experts_per_tok=2, + vocab_size=256, + max_position_embeddings=128, + decoder_sparse_step=1, + mlp_only_layers=[], + ) + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_model_meta(tiny_qwen3moe_config): + """Qwen3MoE model on meta device (no weights loaded).""" + with torch.device("meta"): + model = AutoModelForCausalLM.from_config(tiny_qwen3moe_config) + return model + + +@pytest.fixture(scope="module") +def tiny_qwen3moe_bridge(tiny_qwen3moe_config, tiny_qwen3moe_model_meta): + """TransformerBridge wrapping the tiny meta-device Qwen3MoE model.""" + tl_config = map_default_transformer_lens_config(tiny_qwen3moe_config) + + bridge_config = TransformerBridgeConfig( + d_model=tl_config.d_model, + d_head=tl_config.d_head, + n_layers=tl_config.n_layers, + n_ctx=tl_config.n_ctx, + n_heads=tl_config.n_heads, + n_key_value_heads=tl_config.n_key_value_heads, + d_vocab=tl_config.d_vocab, + architecture="Qwen3MoeForCausalLM", + ) + + adapter = Qwen3MoeArchitectureAdapter(bridge_config) + + return TransformerBridge( + model=tiny_qwen3moe_model_meta, + adapter=adapter, + tokenizer=_MockTokenizer(), + ) + + +class TestQwen3MoeModelStructure: + def test_model_has_layers(self, tiny_qwen3moe_model_meta) -> None: + assert hasattr(tiny_qwen3moe_model_meta, "model") + assert hasattr(tiny_qwen3moe_model_meta.model, "layers") + assert len(tiny_qwen3moe_model_meta.model.layers) == 2 + + def test_layer_has_sparse_moe_block(self, tiny_qwen3moe_model_meta) -> None: + # Qwen3MoeSparseMoeBlock stores experts as batched 3D tensors, not a ModuleList + layer0_mlp = tiny_qwen3moe_model_meta.model.layers[0].mlp + assert hasattr(layer0_mlp, "experts") + experts = layer0_mlp.experts + assert hasattr(experts, "gate_up_proj") + assert hasattr(experts, "down_proj") + assert not hasattr(experts, "__iter__") + + def test_layer_has_gate_router(self, tiny_qwen3moe_model_meta) -> None: + layer0_mlp = tiny_qwen3moe_model_meta.model.layers[0].mlp + assert hasattr(layer0_mlp, "gate") + + def test_attention_has_q_norm_k_norm(self, tiny_qwen3moe_model_meta) -> None: + attn = tiny_qwen3moe_model_meta.model.layers[0].self_attn + assert hasattr(attn, "q_norm") + assert hasattr(attn, "k_norm") + + +class TestQwen3MoeBridgeStructure: + def test_block_count(self, tiny_qwen3moe_bridge) -> None: + assert len(tiny_qwen3moe_bridge.blocks) == 2 + + def test_has_core_components(self, tiny_qwen3moe_bridge) -> None: + assert hasattr(tiny_qwen3moe_bridge, "embed") + assert hasattr(tiny_qwen3moe_bridge, "unembed") + assert hasattr(tiny_qwen3moe_bridge, "ln_final") + + def test_cfg_final_rms_is_true(self, tiny_qwen3moe_bridge) -> None: + """Qwen3MoE uses final_rms=True; OLMoE uses False.""" + assert tiny_qwen3moe_bridge.cfg.final_rms is True + + def test_cfg_n_kv_heads(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.n_key_value_heads == 2 + + def test_cfg_positional_embedding_type(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.positional_embedding_type == "rotary" + + def test_cfg_normalization_type(self, tiny_qwen3moe_bridge) -> None: + assert tiny_qwen3moe_bridge.cfg.normalization_type == "RMS" + + def test_mlp_blocks_are_moe_bridge(self, tiny_qwen3moe_bridge) -> None: + for i, block in enumerate(tiny_qwen3moe_bridge.blocks): + assert isinstance( + block.mlp, MoEBridge + ), f"Block {i} mlp is {type(block.mlp).__name__}, expected MoEBridge" + + def test_moe_bridge_has_router_scores_hook(self, tiny_qwen3moe_bridge) -> None: + mlp = tiny_qwen3moe_bridge.blocks[0].mlp + assert hasattr(mlp, "hook_router_scores") + + def test_block_has_ln1_and_ln2(self, tiny_qwen3moe_bridge) -> None: + block = tiny_qwen3moe_bridge.blocks[0] + assert hasattr(block, "ln1") + assert hasattr(block, "ln2") + + def test_block_attn_has_q_norm_k_norm(self, tiny_qwen3moe_bridge) -> None: + attn = tiny_qwen3moe_bridge.blocks[0].attn + assert hasattr(attn, "q_norm") + assert hasattr(attn, "k_norm") + + +# Forward-pass tests require real weights — meta-device tensor ops raise +# NotImplementedError. Run these manually during Step 3 verification. + + +@pytest.mark.skip(reason="Requires real weights — run manually during verification") +def test_forward_pass_matches_hf(tiny_qwen3moe_bridge) -> None: + """Bridge logits match the HF model.""" + tokens = torch.tensor([[1, 2, 3, 4]]) + with torch.no_grad(): + bridge_out = tiny_qwen3moe_bridge(tokens) + hf_out = tiny_qwen3moe_bridge.original_model(tokens).logits + max_diff = (bridge_out - hf_out).abs().max().item() + assert max_diff < 1e-4, f"Bridge vs HF max diff = {max_diff}" + + +@pytest.mark.skip(reason="Requires real weights — run manually during verification") +def test_run_with_cache_captures_moe_router_scores(tiny_qwen3moe_bridge) -> None: + """MoEBridge captures router scores in the activation cache.""" + tiny_qwen3moe_bridge.enable_compatibility_mode(no_processing=True) + tokens = torch.tensor([[1, 2, 3, 4]]) + _, cache = tiny_qwen3moe_bridge.run_with_cache(tokens) + for i in range(len(tiny_qwen3moe_bridge.blocks)): + assert f"blocks.{i}.mlp.hook_router_scores" in cache, f"Missing router scores for block {i}" diff --git a/tests/unit/model_bridge/test_qwen3_moe_adapter.py b/tests/unit/model_bridge/test_qwen3_moe_adapter.py new file mode 100644 index 000000000..af6a0155c --- /dev/null +++ b/tests/unit/model_bridge/test_qwen3_moe_adapter.py @@ -0,0 +1,194 @@ +"""Unit tests for the Qwen3MoeArchitectureAdapter. + +All tests use programmatic TransformerBridgeConfig instances — no network access +or model downloads. +""" + +import pytest + +from transformer_lens.config import TransformerBridgeConfig +from transformer_lens.conversion_utils.conversion_steps.rearrange_tensor_conversion import ( + RearrangeTensorConversion, +) +from transformer_lens.conversion_utils.param_processing_conversion import ( + ParamProcessingConversion, +) +from transformer_lens.factories.architecture_adapter_factory import ( + SUPPORTED_ARCHITECTURES, +) +from transformer_lens.model_bridge.generalized_components import ( + MoEBridge, + RMSNormalizationBridge, +) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) + + +@pytest.fixture +def cfg() -> TransformerBridgeConfig: + return TransformerBridgeConfig( + d_model=64, + d_head=16, + n_layers=2, + n_ctx=128, + n_heads=4, + n_key_value_heads=2, + d_vocab=256, + architecture="Qwen3MoeForCausalLM", + ) + + +@pytest.fixture +def adapter(cfg: TransformerBridgeConfig) -> Qwen3MoeArchitectureAdapter: + return Qwen3MoeArchitectureAdapter(cfg) + + +class TestQwen3MoeAdapterConfig: + def test_normalization_type_is_rms(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.normalization_type == "RMS" + + def test_positional_embedding_type_is_rotary( + self, adapter: Qwen3MoeArchitectureAdapter + ) -> None: + assert adapter.cfg.positional_embedding_type == "rotary" + + def test_final_rms_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """Qwen3MoE uses final_rms=True; OLMoE uses False.""" + assert adapter.cfg.final_rms is True + + def test_gated_mlp_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.gated_mlp is True + + def test_uses_rms_norm_is_true(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.uses_rms_norm is True + + def test_attn_implementation_is_eager(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.attn_implementation == "eager" + + def test_default_prepend_bos_is_false(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + assert adapter.cfg.default_prepend_bos is False + + def test_n_kv_heads_propagated(self) -> None: + """n_key_value_heads from the loaded config is preserved.""" + cfg = TransformerBridgeConfig( + d_model=64, + d_head=16, + n_layers=2, + n_ctx=128, + n_heads=4, + n_key_value_heads=2, + d_vocab=256, + architecture="Qwen3MoeForCausalLM", + ) + adapter = Qwen3MoeArchitectureAdapter(cfg) + assert adapter.cfg.n_key_value_heads == 2 + + +class TestQwen3MoeWeightConversions: + def test_has_qkvo_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + convs = adapter.weight_processing_conversions + assert convs is not None + assert "blocks.{i}.attn.q.weight" in convs + assert "blocks.{i}.attn.k.weight" in convs + assert "blocks.{i}.attn.v.weight" in convs + assert "blocks.{i}.attn.o.weight" in convs + + def test_q_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """Q rearrange uses n_heads (4).""" + convs = adapter.weight_processing_conversions + assert convs is not None + q_conv = convs["blocks.{i}.attn.q.weight"] + assert isinstance(q_conv, ParamProcessingConversion) + assert isinstance(q_conv.tensor_conversion, RearrangeTensorConversion) + axes = q_conv.tensor_conversion.axes_lengths + assert axes.get("n") == 4 + + def test_kv_rearrange_uses_n_kv_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """K/V rearrange uses n_key_value_heads (2) for GQA.""" + convs = adapter.weight_processing_conversions + assert convs is not None + k_conv = convs["blocks.{i}.attn.k.weight"] + v_conv = convs["blocks.{i}.attn.v.weight"] + assert isinstance(k_conv, ParamProcessingConversion) + assert isinstance(v_conv, ParamProcessingConversion) + assert isinstance(k_conv.tensor_conversion, RearrangeTensorConversion) + assert isinstance(v_conv.tensor_conversion, RearrangeTensorConversion) + assert k_conv.tensor_conversion.axes_lengths.get("n") == 2 + assert v_conv.tensor_conversion.axes_lengths.get("n") == 2 + + def test_o_rearrange_uses_n_heads(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """O rearrange uses n_heads (4).""" + convs = adapter.weight_processing_conversions + assert convs is not None + o_conv = convs["blocks.{i}.attn.o.weight"] + assert isinstance(o_conv, ParamProcessingConversion) + assert isinstance(o_conv.tensor_conversion, RearrangeTensorConversion) + assert o_conv.tensor_conversion.axes_lengths.get("n") == 4 + + +class TestQwen3MoeComponentMapping: + def test_has_required_top_level_keys(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + for key in ("embed", "rotary_emb", "blocks", "ln_final", "unembed"): + assert key in mapping, f"Missing top-level key: {key!r}" + + def test_blocks_has_required_submodules(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + blocks = mapping["blocks"] + for key in ("ln1", "ln2", "attn", "mlp"): + assert key in blocks.submodules, f"Missing blocks submodule: {key!r}" + + def test_attn_has_all_submodules(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + attn = mapping["blocks"].submodules["attn"] + for key in ("q", "k", "v", "o", "q_norm", "k_norm"): + assert key in attn.submodules, f"Missing attn submodule: {key!r}" + + def test_ln1_ln2_are_rms_norm_bridges(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + subs = mapping["blocks"].submodules + assert isinstance(subs["ln1"], RMSNormalizationBridge) + assert isinstance(subs["ln2"], RMSNormalizationBridge) + + def test_mlp_is_moe_bridge(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + mlp = mapping["blocks"].submodules["mlp"] + assert isinstance(mlp, MoEBridge) + + def test_mlp_has_gate_submodule(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + mlp = mapping["blocks"].submodules["mlp"] + assert "gate" in mlp.submodules + + def test_q_norm_k_norm_are_rms_norm_bridges(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + mapping = adapter.component_mapping + assert mapping is not None + attn_subs = mapping["blocks"].submodules["attn"].submodules + assert isinstance(attn_subs["q_norm"], RMSNormalizationBridge) + assert isinstance(attn_subs["k_norm"], RMSNormalizationBridge) + + def test_hf_module_paths(self, adapter: Qwen3MoeArchitectureAdapter) -> None: + """HF module path names are mapped correctly.""" + mapping = adapter.component_mapping + assert mapping is not None + assert mapping["embed"].name == "model.embed_tokens" + assert mapping["ln_final"].name == "model.norm" + assert mapping["unembed"].name == "lm_head" + assert mapping["blocks"].name == "model.layers" + subs = mapping["blocks"].submodules + assert subs["ln1"].name == "input_layernorm" + assert subs["ln2"].name == "post_attention_layernorm" + assert subs["attn"].name == "self_attn" + assert subs["mlp"].name == "mlp" + + +class TestQwen3MoeFactoryRegistration: + def test_factory_lookup_returns_adapter_class(self) -> None: + assert SUPPORTED_ARCHITECTURES["Qwen3MoeForCausalLM"] is Qwen3MoeArchitectureAdapter diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 01e0c59b7..eb62e4f0d 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -44,6 +44,7 @@ PhiArchitectureAdapter, Qwen2ArchitectureAdapter, Qwen3ArchitectureAdapter, + Qwen3MoeArchitectureAdapter, Qwen3NextArchitectureAdapter, QwenArchitectureAdapter, StableLmArchitectureAdapter, @@ -91,6 +92,7 @@ "QwenForCausalLM": QwenArchitectureAdapter, "Qwen2ForCausalLM": Qwen2ArchitectureAdapter, "Qwen3ForCausalLM": Qwen3ArchitectureAdapter, + "Qwen3MoeForCausalLM": Qwen3MoeArchitectureAdapter, "Qwen3NextForCausalLM": Qwen3NextArchitectureAdapter, "StableLmForCausalLM": StableLmArchitectureAdapter, "T5ForConditionalGeneration": T5ArchitectureAdapter, diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index c2d5510ad..a63f82abc 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -123,6 +123,9 @@ from transformer_lens.model_bridge.supported_architectures.qwen3 import ( Qwen3ArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.qwen3_moe import ( + Qwen3MoeArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.qwen3_next import ( Qwen3NextArchitectureAdapter, ) @@ -174,6 +177,7 @@ "QwenArchitectureAdapter", "Qwen2ArchitectureAdapter", "Qwen3ArchitectureAdapter", + "Qwen3MoeArchitectureAdapter", "Qwen3NextArchitectureAdapter", "StableLmArchitectureAdapter", "T5ArchitectureAdapter", diff --git a/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py new file mode 100644 index 000000000..643892cee --- /dev/null +++ b/transformer_lens/model_bridge/supported_architectures/qwen3_moe.py @@ -0,0 +1,144 @@ +"""Qwen3MoE (Mixture of Experts) architecture adapter.""" + +from typing import Any + +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + EmbeddingBridge, + LinearBridge, + MoEBridge, + PositionEmbeddingsAttentionBridge, + RMSNormalizationBridge, + RotaryEmbeddingBridge, + UnembeddingBridge, +) + + +class Qwen3MoeArchitectureAdapter(ArchitectureAdapter): + """Architecture adapter for Qwen3MoE (Mixture of Experts) models. + + Qwen3MoE is a sparse MoE decoder-only Transformer, structurally close to OLMoE. + Key features: + + - Pre-norm: RMSNorm applied BEFORE attention and BEFORE MLP. + - Q/K normalization: RMSNorm applied to queries and keys after projection. + - Sparse MoE: 128 experts with top-8 routing (public 30B-A3B checkpoints). + - Batched expert parameters: gate_up_proj and down_proj as single 3D tensors, + not a ModuleList. + - final_rms=True (Qwen3-style; OLMoE uses False). + - No biases on any projections. + - GQA: n_key_value_heads < n_heads in all public checkpoints. + + Only the all-MoE configuration is supported (decoder_sparse_step=1, + mlp_only_layers=[]). Models with dense fallback layers cannot be wrapped + because MoEBridge does not handle the dense Qwen3MoeMLP path. + + Optional Parameters (may not exist in state_dict): + ------------------------------------------------- + - blocks.{i}.attn.b_Q - No bias on query projection + - blocks.{i}.attn.b_K - No bias on key projection + - blocks.{i}.attn.b_V - No bias on value projection + - blocks.{i}.attn.b_O - No bias on output projection + - blocks.{i}.ln1.b - RMSNorm has no bias + - blocks.{i}.ln2.b - RMSNorm has no bias + - ln_final.b - RMSNorm has no bias + """ + + def __init__(self, cfg: Any) -> None: + """Initialize the Qwen3MoE architecture adapter.""" + super().__init__(cfg) + + # Set config variables for weight processing + self.cfg.normalization_type = "RMS" + self.cfg.positional_embedding_type = "rotary" + self.cfg.final_rms = True # Qwen3-style; OLMoE uses False + self.cfg.gated_mlp = True + self.cfg.attn_only = False + self.cfg.uses_rms_norm = True + # Force eager attention for output_attentions hook support + self.cfg.attn_implementation = "eager" + self.cfg.default_prepend_bos = False # Qwen3 family convention + + # GQA support + if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: + self.cfg.n_key_value_heads = cfg.n_key_value_heads + + # QKVO rearrangements; MoE expert and gate weights pass through unchanged + self.weight_processing_conversions = { + **self._qkvo_weight_conversions(), + } + + # Component mapping — PRE-NORM architecture: + # ln1 = input_layernorm (applied BEFORE attention) + # ln2 = post_attention_layernorm (applied BEFORE MLP) + self.component_mapping = { + "embed": EmbeddingBridge(name="model.embed_tokens"), + "rotary_emb": RotaryEmbeddingBridge(name="model.rotary_emb", config=self.cfg), + "blocks": BlockBridge( + name="model.layers", + submodules={ + "ln1": RMSNormalizationBridge(name="input_layernorm", config=self.cfg), + "ln2": RMSNormalizationBridge(name="post_attention_layernorm", config=self.cfg), + "attn": PositionEmbeddingsAttentionBridge( + name="self_attn", + config=self.cfg, + submodules={ + "q": LinearBridge(name="q_proj"), + "k": LinearBridge(name="k_proj"), + "v": LinearBridge(name="v_proj"), + "o": LinearBridge(name="o_proj"), + "q_norm": RMSNormalizationBridge(name="q_norm", config=self.cfg), + "k_norm": RMSNormalizationBridge(name="k_norm", config=self.cfg), + }, + requires_attention_mask=True, + requires_position_embeddings=True, + ), + # Qwen3MoeSparseMoeBlock stores experts as batched 3D tensors + # rather than a ModuleList. MoEBridge wraps the entire block and + # delegates to HF's native forward. The gate (router) is mapped + # as a submodule for hook access — same pattern as OLMoE. + "mlp": MoEBridge( + name="mlp", + config=self.cfg, + submodules={ + "gate": LinearBridge(name="gate"), + }, + ), + }, + ), + "ln_final": RMSNormalizationBridge(name="model.norm", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head", config=self.cfg), + } + + def setup_component_testing(self, hf_model: Any, bridge_model: Any = None) -> None: + """Set up rotary embedding references for Qwen3MoE component testing. + + Qwen3MoE uses RoPE (Rotary Position Embeddings). We set the rotary_emb + reference on all attention bridge instances for component testing. + + Args: + hf_model: The HuggingFace Qwen3MoE model instance + bridge_model: The TransformerBridge model (if available) + """ + # Get rotary embedding instance from the model + rotary_emb = hf_model.model.rotary_emb + + # Force HF model to use "eager" attention to match bridge implementation + if hasattr(hf_model, "config") and hasattr(hf_model.config, "_attn_implementation"): + hf_model.config._attn_implementation = "eager" + + if hasattr(hf_model, "model") and hasattr(hf_model.model, "layers"): + for layer in hf_model.model.layers: + if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "config"): + layer.self_attn.config._attn_implementation = "eager" + + # Set rotary_emb on actual bridge instances in bridge_model if available + if bridge_model is not None and hasattr(bridge_model, "blocks"): + for block in bridge_model.blocks: + if hasattr(block, "attn"): + block.attn.set_rotary_emb(rotary_emb) + + # Also set on the template for get_generalized_component() calls + attn_bridge = self.get_generalized_component("blocks.0.attn") + attn_bridge.set_rotary_emb(rotary_emb) diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 2d8a67999..531437391 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -6,9 +6,9 @@ "min_downloads": 500, "scan_duration_seconds": 0.0 }, - "total_architectures": 37, - "total_models": 5563, - "total_verified": 693, + "total_architectures": 38, + "total_models": 6822, + "total_verified": 698, "models": [ { "architecture_id": "Qwen3NextForCausalLM", @@ -77820,6 +77820,20 @@ "phase4_score": 89.9, "phase7_score": null }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, { "architecture_id": "Qwen2ForCausalLM", "model_id": "xw1234gan/Extended_GRPO_KL_Qwen2.5-3B-Instruct_MATH_beta0_lr1e-05_mb2_ga128_n2048_seed42", @@ -77848,6 +77862,34 @@ "phase7_score": null, "phase8_score": null }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, { "architecture_id": "LlamaForCausalLM", "model_id": "Yukang/Llama-2-13b-longlora-16k-ft", @@ -77862,6 +77904,20 @@ "phase7_score": null, "phase8_score": null }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": null, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, { "architecture_id": "LlamaForCausalLM", "model_id": "bhenrym14/airoboros-3_1-yi-34b-200k", @@ -93751,6 +93807,2165 @@ "phase4_score": 67.5, "phase7_score": null, "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 594767, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 370376, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 354610, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 293539, + "total_params": 9298753536 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 235228, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 212096, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 209682, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "status": 3, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 207171, + "total_params": 2574656 + }, + "note": "Below threshold: P3=89.5% but required tests failed: logits_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 558.8) \u2014 generated text may be incoherent", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 89.5, + "phase4_score": 72.2, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 206997, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 196979, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 194457, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 183785, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 139859, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 90099, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-30B-A3B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 88254, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 86609, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 83207, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 81021, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 75715, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 74870, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 64513, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 49515, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 46285, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 45932, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-30B-A3B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 44268, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42807, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "typhoon-ai/typhoon2.5-qwen3-30b-a3b", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42205, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 35168, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34301, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34117, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33498, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "JunHowie/Qwen3-30B-A3B-Instruct-2507-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33311, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 27742, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 26590, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w8a8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 23560, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-Coder-30B-A3B-Instruct-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 18001, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "np-cr/testing-qwen3-moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 17816, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16886, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Alibaba-NLP/Tongyi-DeepResearch-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16568, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16377, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-block", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15451, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15251, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Huihui-Qwen3-30B-A3B-Instruct-2507-abliterated-dwq4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15234, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 14566, + "total_params": 4770822144 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 13999, + "total_params": 8587311104 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "BCCard/Qwen3-30B-A3B-FP8-Dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9881, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9718, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9521, + "total_params": 132806618624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7615, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-30B-A3B-Thinking-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7283, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 6725, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5776, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5510, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5349, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5217, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5150, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4870, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4650, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4326, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "yujiepan/qwen3-moe-tiny-random", + "status": 2, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 4314, + "total_params": 9970624 + }, + "note": "Unsupported: decoder_sparse_step=2 creates mixed dense/sparse layers; adapter only supports all-MoE config (decoder_sparse_step=1)", + "phase1_score": 0.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4284, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Applied-Innovation-Center/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4268, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "status": 2, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 4265, + "total_params": 9970624 + }, + "note": "Unsupported: decoder_sparse_step=2 creates mixed dense/sparse layers; adapter only supports all-MoE config (decoder_sparse_step=1)", + "phase1_score": 0.0, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "llm-jp/llm-jp-4-32b-a3b-thinking", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4061, + "total_params": 32139028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4060, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3960, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3797, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3667, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3656, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3608, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3584, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3443, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3029, + "total_params": 235181131264 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "MerbAI/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3013, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-v1.5-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2963, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "ig1/Qwen3-Coder-30B-A3B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2685, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2655, + "total_params": 119968378368 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2600, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Ravi07bec/SQL-v2-oct-16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2548, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7-mini", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2495, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-GPTQ-Int8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2274, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2152, + "total_params": 241041798656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1959, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1888, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM-2603", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1785, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "koushd/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1697, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1550, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1525, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlabonne/Qwen3-30B-A3B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1522, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1499, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Aratako/Qwen3-30B-A3B-NSFW-JP", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1462, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1452, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1142, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1108, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1081, + "total_params": 118194187776 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Sophia-AI/Qwen3-30B-A3B-Instruct-2507-AWQ-W4A16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1061, + "total_params": 5419812864 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-MLX-bf16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1051, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-235B-A22B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1046, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1004, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 879, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit-dwq-v2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 878, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 868, + "total_params": 860348416 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 91.9, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-CPT-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 855, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 854, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 845, + "total_params": 31310690304 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 823, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Thinking-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 819, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "OpenMOSS-Team/SciThinker-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 788, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-Instruct-2507-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 763, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER-qx4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 758, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Thinking-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 743, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "baichuan-inc/Baichuan-M3-235B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 737, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MiroThinker-v1.5-30B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 728, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 726, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Jebadiah/Qwen3-30B-A3B-seed-0", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 719, + "total_params": 30531028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "DavidAU/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 717, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 691, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-RL-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 677, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 673, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-30B-A3B-Thinking-2507-Claude-4.5-Sonnet-High-Reasoning-Distill-mxfp4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 669, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-Instruct-2507-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 663, + "total_params": 4594944480 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 658, + "total_params": 135739033600 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 647, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-1B-A0.6B", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 223, + "total_params": 1120000000 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 97.6, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "imdatta0/tiny_qwen3_moe_2.8B_0.7B", + "status": 1, + "verified_date": "2026-04-10", + "metadata": { + "downloads": 218, + "total_params": 2800000000 + }, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 70.4, + "phase7_score": null, + "phase8_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json b/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json new file mode 100644 index 000000000..cd5b8e104 --- /dev/null +++ b/transformer_lens/tools/model_registry/data/supported_models_qwen3_moe.json @@ -0,0 +1,2200 @@ +{ + "architecture_id": "Qwen3MoeForCausalLM", + "total_models": 129, + "scanned": 10000, + "models": [ + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1577944, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1177329, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1071372, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 594767, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 370376, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 354610, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 293539, + "total_params": 9298753536 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 235228, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 212096, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 209682, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 207171, + "total_params": 2574656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 206997, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 196979, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 194457, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 190710, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 183785, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 139859, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 90099, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-30B-A3B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 88254, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 86609, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 83207, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 81021, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "btbtyler09/Qwen3-Coder-30B-A3B-Instruct-gptq-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 75715, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-Coder-30B-A3B-Instruct-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 74870, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 64513, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 49515, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 46285, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 45932, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-30B-A3B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 44268, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42807, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "typhoon-ai/typhoon2.5-qwen3-30b-a3b", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 42205, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 35168, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34301, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 34117, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33498, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "JunHowie/Qwen3-30B-A3B-Instruct-2507-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 33311, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 27742, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 26590, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w8a8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 23560, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "NVFP4/Qwen3-Coder-30B-A3B-Instruct-FP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 18001, + "total_params": 15583623168 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "np-cr/testing-qwen3-moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 17816, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16886, + "total_params": 235107904512 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Alibaba-NLP/Tongyi-DeepResearch-30B-A3B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16568, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 16377, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-block", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15451, + "total_params": 30533947392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15251, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Huihui-Qwen3-30B-A3B-Instruct-2507-abliterated-dwq4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 15234, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 14566, + "total_params": 4770822144 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 13999, + "total_params": 8587311104 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "BCCard/Qwen3-30B-A3B-FP8-Dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9881, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "stelterlab/Qwen3-Coder-30B-A3B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9718, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 9521, + "total_params": 132806618624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-GPTQ-Int4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7615, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-30B-A3B-Thinking-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 7283, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 6725, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5776, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5510, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5349, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-Coder-480B-A35B-Instruct-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5217, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 5150, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-Coder-30B-A3B-Instruct", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4870, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4650, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4326, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "yujiepan/qwen3-moe-tiny-random", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4314, + "total_params": 9970624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4284, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Applied-Innovation-Center/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4268, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4265, + "total_params": 9970624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "llm-jp/llm-jp-4-32b-a3b-thinking", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4061, + "total_params": 32139028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 4060, + "total_params": 30554486784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3960, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3797, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3667, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3656, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Instruct-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3608, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3584, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3443, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-FP8-dynamic", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3029, + "total_params": 235181131264 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "MerbAI/Karnak", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 3013, + "total_params": 40669136896 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-v1.5-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2963, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2742, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "ig1/Qwen3-Coder-30B-A3B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2685, + "total_params": 17452222848 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2655, + "total_params": 119968378368 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2600, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Ravi07bec/SQL-v2-oct-16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2548, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "miromind-ai/MiroThinker-1.7-mini", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2495, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-GPTQ-Int8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2274, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 2152, + "total_params": 241041798656 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "cyankiwi/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1959, + "total_params": 5306567040 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1888, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-Nemotron-235B-A22B-GenRM-2603", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1785, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "koushd/Qwen3-235B-A22B-Instruct-2507-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1697, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1550, + "total_params": 480154875392 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-5bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1525, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlabonne/Qwen3-30B-A3B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1522, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1499, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Aratako/Qwen3-30B-A3B-NSFW-JP", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1462, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1452, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1142, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-4bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1108, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "amd/Qwen3-235B-A22B-Instruct-2507-MXFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1081, + "total_params": 118194187776 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Sophia-AI/Qwen3-30B-A3B-Instruct-2507-AWQ-W4A16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1061, + "total_params": 5419812864 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Qwen/Qwen3-235B-A22B-MLX-bf16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1051, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "QuixiAI/Qwen3-235B-A22B-AWQ", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1046, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 1004, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 879, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-Coder-30B-A3B-Instruct-4bit-dwq-v2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 878, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 868, + "total_params": 860348416 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-CPT-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 855, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-235B-A22B-128K-GGUF", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 854, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "zianglih/Qwen3-30B-A3B-Instruct-2507-MXFP8-last-8-BF16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 845, + "total_params": 31310690304 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-30B-A3B-Thinking-2507-MLX-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 823, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nvidia/Qwen3-235B-A22B-Thinking-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 819, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "OpenMOSS-Team/SciThinker-30B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 788, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-30B-A3B-Instruct-2507-6bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 763, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER-qx4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 758, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "mlx-community/Qwen3-235B-A22B-Thinking-2507-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 743, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "baichuan-inc/Baichuan-M3-235B", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 737, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "huihui-ai/Huihui-MiroThinker-v1.5-30B-abliterated", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 728, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Base", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 726, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "Jebadiah/Qwen3-30B-A3B-seed-0", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 719, + "total_params": 30531028992 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "DavidAU/Qwen3-42B-A3B-2507-Thinking-Abliterated-uncensored-TOTAL-RECALL-v2-Medium-MASTER-CODER", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 717, + "total_params": 42371414784 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nytopop/Qwen3-30B-A3B.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 691, + "total_params": 4605856128 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "tokyotech-llm/Qwen3-Swallow-30B-A3B-RL-v0.2", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 677, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "lmstudio-community/Qwen3-235B-A22B-Thinking-2507-MLX-8bit", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 673, + "total_params": 235093634560 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "nightmedia/Qwen3-30B-A3B-Thinking-2507-Claude-4.5-Sonnet-High-Reasoning-Distill-mxfp4-mlx", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 669, + "total_params": 30532122624 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-30B-A3B-Instruct-2507-quantized.w4a16", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 663, + "total_params": 4594944480 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "RedHatAI/Qwen3-235B-A22B-Instruct-2507-NVFP4", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 658, + "total_params": 135739033600 + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "Qwen3MoeForCausalLM", + "model_id": "unsloth/Qwen3-30B-A3B-Instruct-2507-FP8", + "status": 0, + "verified_date": null, + "metadata": { + "downloads": 647, + "total_params": null + }, + "note": null, + "phase1_score": null, + "phase2_score": null, + "phase3_score": null, + "phase4_score": null, + "phase7_score": null, + "phase8_score": null + } + ] +} \ No newline at end of file diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 920affc5c..a701f2d19 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-10T00:51:34.188066", + "last_updated": "2026-04-10T15:08:21.930188", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11201,6 +11201,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "trl-internal-testing/tiny-Qwen3MoeForCausalLM", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 558.8) \u2014 generated text may be incoherent", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "tiny-random/qwen3-next-moe", "architecture_id": "Qwen3NextForCausalLM", @@ -11211,6 +11221,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", "architecture_id": "Qwen3NextForCausalLM", @@ -11221,6 +11241,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "yujiepan/qwen3-next-moe-tiny-random", "architecture_id": "Qwen3NextForCausalLM", @@ -11231,6 +11261,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Tensors differ: max_diff=0.625000, mean_rel=0.002930", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "tiny-random/qwen3-next-moe", "architecture_id": "Qwen3NextForCausalLM", @@ -11241,6 +11281,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "huihui-ai/Huihui-MoE-0.8B-2E", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", "architecture_id": "Qwen3NextForCausalLM", @@ -11271,6 +11321,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "huihui-ai/Huihui-MoE-1B-A0.6B", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "tiny-random/qwen3-next-moe", "architecture_id": "Qwen3NextForCausalLM", @@ -11281,6 +11341,16 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "imdatta0/tiny_qwen3_moe_2.8B_0.7B", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Full verification completed", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "optimum-intel-internal-testing/tiny-random-qwen3-next", "architecture_id": "Qwen3NextForCausalLM", @@ -11291,6 +11361,26 @@ "invalidated": false, "invalidation_reason": null }, + { + "model_id": "yujiepan/qwen3-moe-tiny-random", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3MoeMLP' object has no attribute 'gate'", + "invalidated": false, + "invalidation_reason": null + }, + { + "model_id": "optimum-internal-testing/tiny-random-qwen3_moe", + "architecture_id": "Qwen3MoeForCausalLM", + "verified_date": "2026-04-10", + "verified_by": "verify_models", + "transformerlens_version": null, + "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: 'Qwen3MoeMLP' object has no attribute 'gate'", + "invalidated": false, + "invalidation_reason": null + }, { "model_id": "yujiepan/qwen3-next-moe-tiny-random", "architecture_id": "Qwen3NextForCausalLM", diff --git a/transformer_lens/tools/model_registry/verify_models.py b/transformer_lens/tools/model_registry/verify_models.py index a31e91a63..0aecc8eba 100644 --- a/transformer_lens/tools/model_registry/verify_models.py +++ b/transformer_lens/tools/model_registry/verify_models.py @@ -249,6 +249,7 @@ def estimate_model_params(model_id: str) -> int: "mixtral", "qwen2", "qwen3", + "qwen3_moe", "phi3", "stablelm", ) @@ -262,8 +263,11 @@ def estimate_model_params(model_id: str) -> int: lang_config, "num_experts", None ) if num_experts and num_experts > 1: - # For MoE, MLP params are multiplied by num_experts + gate params - mlp_per_layer = d_model * d_mlp * mlp_multiplier + # Qwen3MoE and similar store per-expert hidden size in moe_intermediate_size; + # intermediate_size refers to a dense fallback MLP that we don't use here. + moe_d_mlp = getattr(lang_config, "moe_intermediate_size", None) or d_mlp + # MLP params scale with num_experts; add gate params per expert + mlp_per_layer = d_model * moe_d_mlp * mlp_multiplier moe_per_layer = (mlp_per_layer + d_model) * num_experts # Replace the non-MoE MLP contribution n_params -= n_layers * (d_model * d_mlp * mlp_multiplier) @@ -279,13 +283,15 @@ def estimate_benchmark_memory_gb( n_params: int, dtype: str = "float32", phases: Optional[list[int]] = None, + use_hf_reference: bool = True, ) -> float: """Estimate peak memory needed for benchmark suite. Phases run sequentially, so peak memory is the maximum of any single phase, not the sum. The multiplier represents how many model copies exist at peak: - Phase 1: Briefly loads HF ref + Bridge → 2.0x peak + Phase 1 (HF ref on): HF ref + Bridge → 2.0x peak + Phase 1 (HF ref off): Bridge only → 1.0x peak Phase 2: Bridge + HookedTransformer (separate copy) → 2.0x model + overhead Phase 3: Same as Phase 2 (processed versions) → 2.0x model + overhead Phase 4: Bridge + GPT-2 scorer (~500MB) → ~1.0x model + 0.5 GB @@ -294,6 +300,8 @@ def estimate_benchmark_memory_gb( n_params: Number of model parameters dtype: Data type for memory calculation phases: Which phases will be run (None = all phases) + use_hf_reference: Whether Phase 1 loads an HF reference alongside the + Bridge. Mirrors the ``--no-hf-reference`` CLI flag. Returns: Estimated peak memory in GB @@ -315,9 +323,12 @@ def estimate_benchmark_memory_gb( phases = [1, 2, 3, 4] for p in phases: - if p in (1, 2, 3): - # Phase 1: HF ref + Bridge = 2 copies briefly - # Phase 2/3: Bridge + HookedTransformer = 2 copies + if p == 1: + # HF ref + Bridge (2 copies) or Bridge alone + multiplier = 2.0 if use_hf_reference else 1.0 + phase_peaks.append(model_size_gb * multiplier * (1 + overhead_fraction)) + elif p in (2, 3): + # Bridge + HookedTransformer = 2 copies phase_peaks.append(model_size_gb * 2.0 * (1 + overhead_fraction)) elif p == 4: # Bridge + GPT-2 scorer @@ -781,7 +792,9 @@ def verify_models( continue # Step 2: Check memory - estimated_mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) + estimated_mem = estimate_benchmark_memory_gb( + n_params, dtype, phases=phases, use_hf_reference=use_hf_reference + ) candidate.estimated_memory_gb = estimated_mem if not quiet: print( @@ -1087,6 +1100,7 @@ def _print_dry_run( dtype: str, max_memory_gb: float, phases: Optional[list[int]] = None, + use_hf_reference: bool = True, ) -> None: """Print what would be tested in a dry run.""" print(f"\nDry run: {len(candidates)} models would be tested") @@ -1107,7 +1121,9 @@ def _print_dry_run( for c in models: try: n_params = estimate_model_params(c.model_id) - mem = estimate_benchmark_memory_gb(n_params, dtype, phases=phases) + mem = estimate_benchmark_memory_gb( + n_params, dtype, phases=phases, use_hf_reference=use_hf_reference + ) status = "OK" if mem <= max_memory_gb else "SKIP (too large)" if mem > max_memory_gb: skippable += 1 @@ -1339,6 +1355,7 @@ def main() -> None: args.dtype, max_memory_gb, phases=args.phases, + use_hf_reference=not args.no_hf_reference, ) return