From e3585a40923b02b9120f9134a3329326f86d4aa0 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Thu, 9 Apr 2026 16:53:16 -0500 Subject: [PATCH 1/2] Initial CodeGen setup --- .../test_codegen_attention_bridge.py | 545 +++ .../supported_architectures/__init__.py | 0 .../test_codegen_adapter.py | 331 ++ .../factories/architecture_adapter_factory.py | 2 + .../generalized_components/__init__.py | 4 + .../codegen_attention.py | 372 ++ .../model_bridge/sources/transformers.py | 5 +- .../supported_architectures/__init__.py | 4 + .../supported_architectures/codegen.py | 150 + .../tools/model_registry/__init__.py | 1 + .../data/architecture_gaps.json | 3032 ++++++----------- .../model_registry/data/supported_models.json | 70 + .../data/verification_history.json | 152 +- 13 files changed, 2537 insertions(+), 2131 deletions(-) create mode 100644 tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py create mode 100644 tests/unit/model_bridge/supported_architectures/__init__.py create mode 100644 tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py create mode 100644 transformer_lens/model_bridge/generalized_components/codegen_attention.py create mode 100644 transformer_lens/model_bridge/supported_architectures/codegen.py diff --git a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py new file mode 100644 index 000000000..5814b1418 --- /dev/null +++ b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py @@ -0,0 +1,545 @@ +"""Unit tests for CodeGenAttentionBridge. + +Tests cover: +- RoPE helper functions (_rotate_every_two, _apply_rotary_pos_emb) +- CodeGenAttentionBridge initialisation and out_proj wiring +- Forward pass: all hooks fire (hook_q, hook_k, hook_v, hook_attn_scores, + hook_pattern, hook_z, hook_result) +- RoPE is applied to Q and K (partial rotary_dim path and full-dim path) +- Causal masking is applied correctly +- KV cache is passed through to _update_kv_cache +""" + +from typing import Any +from unittest.mock import MagicMock, patch + +import torch + +from transformer_lens.model_bridge.generalized_components.codegen_attention import ( + CodeGenAttentionBridge, + _apply_rotary_pos_emb, + _rotate_every_two, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_config( + n_heads: int = 4, + d_model: int = 64, + rotary_dim: int = 8, # must be <= head_dim = d_model // n_heads = 16 +): + """Return a minimal config namespace for CodeGenAttentionBridge tests.""" + + class Config: + pass + + cfg = Config() + cfg.n_heads = n_heads + cfg.d_model = d_model + cfg.d_head = d_model // n_heads + cfg.positional_embedding_type = "rotary" + cfg.rotary_dim = rotary_dim + return cfg + + +def _make_original_attention( + d_model: int = 64, + n_heads: int = 4, + rotary_dim: int = 8, # must be <= head_dim = d_model // n_heads = 16 + max_positions: int = 512, +): + """Create a minimal stand-in for a CodeGenAttention module.""" + head_dim = d_model // n_heads + pos_embd_dim = rotary_dim if rotary_dim else d_model + + # Sinusoidal positions buffer: shape [max_positions, pos_embd_dim] + inv_freq = 1.0 / (10000 ** (torch.arange(0, pos_embd_dim, 2, dtype=torch.int64) / pos_embd_dim)) + sinusoid_inp = torch.einsum( + "i , j -> i j", + torch.arange(max_positions, dtype=torch.int64).float(), + inv_freq, + ).float() + embed_positions = torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1) + + attn = MagicMock(spec=torch.nn.Module) + attn.embed_positions = embed_positions + attn.rotary_dim = rotary_dim + attn.scale_attn = float(head_dim) ** 0.5 + attn.layer_idx = 0 + + # out_proj + out_proj = torch.nn.Linear(d_model, d_model, bias=False) + attn.out_proj = out_proj + + # qkv_proj — fused weight [3*d_model, d_model] (no bias) + qkv_proj = torch.nn.Linear(d_model, d_model * 3, bias=False) + attn.qkv_proj = qkv_proj + + return attn + + +def _make_split_qkv(d_model: int = 64): + """Return a split_qkv_matrix callable producing three independent Linears.""" + q_lin = torch.nn.Linear(d_model, d_model, bias=False) + k_lin = torch.nn.Linear(d_model, d_model, bias=False) + v_lin = torch.nn.Linear(d_model, d_model, bias=False) + + def split_qkv(_component): + return q_lin, k_lin, v_lin + + return split_qkv, q_lin, k_lin, v_lin + + +def _make_bridge(config=None, split_qkv=None): + """Construct a CodeGenAttentionBridge ready for unit testing. + + The bridge is constructed with an ``o`` LinearBridge submodule (matching + how the adapter passes ``"o": LinearBridge(name="out_proj")``). + """ + from transformer_lens.model_bridge.generalized_components.linear import LinearBridge + + if config is None: + config = _make_config() + if split_qkv is None: + split_qkv, _, _, _ = _make_split_qkv(config.d_model) + + bridge = CodeGenAttentionBridge( + name="attn", + config=config, + split_qkv_matrix=split_qkv, + submodules={"o": LinearBridge(name="out_proj")}, + ) + original = _make_original_attention( + d_model=config.d_model, + n_heads=config.n_heads, + rotary_dim=config.rotary_dim, + ) + bridge.set_original_component(original) + return bridge, original + + +# --------------------------------------------------------------------------- +# Rotary helper tests +# --------------------------------------------------------------------------- + + +class TestRotateEveryTwo: + """Tests for the _rotate_every_two function.""" + + def test_output_shape_matches_input(self): + """rotate_every_two must return a tensor of the same shape.""" + x = torch.randn(2, 4, 8, 16) + out = _rotate_every_two(x) + assert out.shape == x.shape + + def test_even_odd_rotation(self): + """Verify the rotation formula: (x0, x1) -> (-x1, x0).""" + # Use a simple 4-element last dimension so we can check by hand. + x = torch.tensor([[[[1.0, 2.0, 3.0, 4.0]]]]) # [1, 1, 1, 4] + out = _rotate_every_two(x) + # Even indices 0, 2 → x1 = [2, 4], so output at even positions = -x1 = [-2, -4] + # Odd indices 1, 3 → x0 = [1, 3], so output at odd positions = x0 = [ 1, 3] + # interleaved: [-2, 1, -4, 3] + expected = torch.tensor([[[[-2.0, 1.0, -4.0, 3.0]]]]) + assert torch.allclose(out, expected) + + def test_double_rotation_is_negation(self): + """Applying rotate_every_two twice should return the negation of the input.""" + x = torch.randn(1, 2, 5, 8) + out = _rotate_every_two(_rotate_every_two(x)) + assert torch.allclose(out, -x, atol=1e-6) + + +class TestApplyRotaryPosEmb: + """Tests for the _apply_rotary_pos_emb function.""" + + def test_identity_with_zero_sin_unit_cos(self): + """With sin=0 and cos=1, RoPE should be an identity transform.""" + b, h, s, d = 1, 2, 4, 8 + tensor = torch.randn(b, h, s, d) + sin = torch.zeros(b, s, d // 2) + cos = torch.ones(b, s, d // 2) + out = _apply_rotary_pos_emb(tensor, sin, cos) + assert torch.allclose(out, tensor, atol=1e-6) + + def test_output_shape_matches_input(self): + """Output shape must equal input shape.""" + b, h, s, d = 2, 4, 6, 16 + tensor = torch.randn(b, h, s, d) + sin = torch.randn(b, s, d // 2) + cos = torch.randn(b, s, d // 2) + out = _apply_rotary_pos_emb(tensor, sin, cos) + assert out.shape == tensor.shape + + def test_rope_modifies_tensor(self): + """With non-trivial sin/cos, the output must differ from the input.""" + b, h, s, d = 1, 1, 3, 8 + tensor = torch.randn(b, h, s, d) + sin = torch.randn(b, s, d // 2) + cos = torch.randn(b, s, d // 2) + out = _apply_rotary_pos_emb(tensor, sin, cos) + assert not torch.allclose(out, tensor) + + +# --------------------------------------------------------------------------- +# Initialisation tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAttentionBridgeInit: + """Tests for CodeGenAttentionBridge initialisation.""" + + def test_out_proj_is_wired_after_set_original_component(self): + """out_proj should be linked to self.o after set_original_component.""" + bridge, original = _make_bridge() + assert bridge.o.original_component is original.out_proj + + def test_q_k_v_projections_are_set(self): + """Q, K, V LinearBridges must have their original_component set.""" + bridge, _ = _make_bridge() + assert bridge.q.original_component is not None + assert bridge.k.original_component is not None + assert bridge.v.original_component is not None + + def test_no_c_proj_attribute_needed(self): + """Construction must succeed when the original component has no c_proj.""" + from transformer_lens.model_bridge.generalized_components.linear import LinearBridge + + config = _make_config() + split_qkv, _, _, _ = _make_split_qkv(config.d_model) + bridge = CodeGenAttentionBridge( + name="attn", + config=config, + split_qkv_matrix=split_qkv, + submodules={"o": LinearBridge(name="out_proj")}, + ) + original = _make_original_attention() + # Ensure original has no c_proj + if hasattr(original, "c_proj"): + del original.c_proj + bridge.set_original_component(original) # Must not raise + assert bridge.o.original_component is original.out_proj + + def test_inherits_from_joint_qkv_attention_bridge(self): + """CodeGenAttentionBridge must subclass JointQKVAttentionBridge.""" + from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import ( + JointQKVAttentionBridge, + ) + + bridge, _ = _make_bridge() + assert isinstance(bridge, JointQKVAttentionBridge) + + +# --------------------------------------------------------------------------- +# Forward pass / hooks tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAttentionBridgeForward: + """Tests for the CodeGenAttentionBridge forward pass.""" + + def _position_ids(self, batch: int, seq: int) -> torch.Tensor: + return torch.arange(seq).unsqueeze(0).expand(batch, -1) + + def test_forward_returns_tuple(self): + """forward() must return a tuple (attn_output, attn_weights).""" + bridge, _ = _make_bridge() + B, S, D = 1, 6, 64 + hs = torch.randn(B, S, D) + pos_ids = self._position_ids(B, S) + out = bridge(hs, position_ids=pos_ids) + assert isinstance(out, tuple) and len(out) == 2 + + def test_output_shape(self): + """attn_output must have shape [batch, seq, d_model].""" + bridge, _ = _make_bridge() + B, S, D = 2, 8, 64 + hs = torch.randn(B, S, D) + pos_ids = self._position_ids(B, S) + attn_out, _ = bridge(hs, position_ids=pos_ids) + assert attn_out.shape == (B, S, D) + + def test_attn_weights_shape(self): + """attn_weights must have shape [batch, n_heads, seq, seq].""" + config = _make_config(n_heads=4, d_model=64) + bridge, _ = _make_bridge(config=config) + B, S = 1, 6 + hs = torch.randn(B, S, config.d_model) + pos_ids = self._position_ids(B, S) + _, attn_weights = bridge(hs, position_ids=pos_ids) + assert attn_weights.shape == (B, config.n_heads, S, S) + + def test_hook_q_fires(self): + """hook_q (q.hook_out) must be called during the forward pass.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.q.hook_out.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_q (q.hook_out) did not fire" + + def test_hook_k_fires(self): + """hook_k (k.hook_out) must be called during the forward pass.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.k.hook_out.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_k (k.hook_out) did not fire" + + def test_hook_v_fires(self): + """hook_v (v.hook_out) must be called during the forward pass.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.v.hook_out.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_v (v.hook_out) did not fire" + + def test_hook_attn_scores_fires(self): + """hook_attn_scores must be called during _reconstruct_attention.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.hook_attn_scores.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_attn_scores did not fire" + + def test_hook_pattern_fires(self): + """hook_pattern must be called during _reconstruct_attention.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.hook_pattern.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_pattern did not fire" + + def test_hook_z_fires(self): + """hook_z (o.hook_in) must be called during the forward pass.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.o.hook_in.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_z (o.hook_in) did not fire" + + def test_hook_result_fires(self): + """hook_result (hook_out) must be called after the output projection.""" + bridge, _ = _make_bridge() + fired = [] + + def hook_fn(tensor, hook): + fired.append(True) + return tensor + + bridge.hook_out.add_hook(hook_fn) + B, S, D = 1, 4, 64 + bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S)) + assert fired, "hook_result (hook_out) did not fire" + + def test_hook_q_mutation_affects_output(self): + """A mutation in hook_q must propagate to the final attention output.""" + bridge, _ = _make_bridge() + B, S, D = 1, 4, 64 + hs = torch.randn(B, S, D) + pos_ids = self._position_ids(B, S) + + baseline_out, _ = bridge(hs.clone(), position_ids=pos_ids) + + def zeroing_hook(tensor, hook): + return torch.zeros_like(tensor) + + bridge.q.hook_out.add_hook(zeroing_hook) + zeroed_out, _ = bridge(hs.clone(), position_ids=pos_ids) + + assert not torch.allclose(baseline_out, zeroed_out), ( + "Zeroing hook_q should change the attention output" + ) + + +# --------------------------------------------------------------------------- +# RoPE application tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAttentionBridgeRoPE: + """Tests verifying RoPE is correctly applied in the forward pass.""" + + def _position_ids(self, batch: int, seq: int) -> torch.Tensor: + return torch.arange(seq).unsqueeze(0).expand(batch, -1) + + def test_rope_changes_q_and_k(self): + """RoPE must change the Q and K tensors compared to the raw projection.""" + config = _make_config(n_heads=4, d_model=64, rotary_dim=16) + split_qkv, q_lin, k_lin, v_lin = _make_split_qkv(config.d_model) + bridge, _ = _make_bridge(config=config, split_qkv=split_qkv) + + B, S = 1, 6 + hs = torch.randn(B, S, config.d_model) + pos_ids = self._position_ids(B, S) + + raw_q_values = [] + rope_q_values = [] + + def capture_raw_q(tensor, hook): + raw_q_values.append(tensor.clone()) + return tensor + + def capture_rope_q(tensor, hook): + rope_q_values.append(tensor.clone()) + return tensor + + # Capture Q before RoPE (at q.hook_out, before _reconstruct_attention) + bridge.q.hook_out.add_hook(capture_raw_q) + + # We intercept hook_attn_scores to verify Q was modified. + # Instead, we verify by comparing raw projection output vs scores difference. + # A simpler check: scores with RoPE ≠ scores computed from raw Q*K^T. + attn_scores_with_rope = [] + + def capture_scores(tensor, hook): + attn_scores_with_rope.append(tensor.clone()) + return tensor + + bridge.hook_attn_scores.add_hook(capture_scores) + bridge(hs, position_ids=pos_ids) + + assert raw_q_values, "q.hook_out did not fire" + assert attn_scores_with_rope, "hook_attn_scores did not fire" + + # Compute what scores would be WITHOUT RoPE + raw_q = raw_q_values[0] # [B, S, D] + raw_k = k_lin(hs) # [B, S, D] + n_heads = config.n_heads + head_dim = config.d_model // n_heads + q_plain = raw_q.view(B, S, n_heads, head_dim).transpose(1, 2).to(torch.float32) + k_plain = raw_k.view(B, S, n_heads, head_dim).transpose(1, 2).to(torch.float32) + scores_no_rope = torch.matmul(q_plain, k_plain.transpose(-2, -1)) + + actual_scores = attn_scores_with_rope[0] + + # The scores MUST differ because RoPE was applied + assert not torch.allclose(actual_scores, scores_no_rope, atol=1e-4), ( + "Attention scores with and without RoPE should differ" + ) + + def test_partial_rotary_dim_leaves_pass_through_unchanged(self): + """The head-dim slice beyond rotary_dim should not be rotated. + + We verify this by checking that the last (head_dim - rotary_dim) dimensions + of Q are identical before and after RoPE. + """ + config = _make_config(n_heads=2, d_model=16, rotary_dim=4) + split_qkv, q_lin, k_lin, v_lin = _make_split_qkv(config.d_model) + bridge, original = _make_bridge(config=config, split_qkv=split_qkv) + + B, S = 1, 4 + hs = torch.randn(B, S, config.d_model) + pos_ids = torch.arange(S).unsqueeze(0).expand(B, -1) + + n_heads = config.n_heads + head_dim = config.d_model // n_heads + rotary_dim = config.rotary_dim + + # Compute raw Q projection + raw_q = q_lin(hs) # [B, S, D] + raw_q_heads = raw_q.view(B, S, n_heads, head_dim) # [B, S, H, head_dim] + pass_through_raw = raw_q_heads[:, :, :, rotary_dim:] # the un-rotated slice + + # Now run the full forward to extract the Q passed into attn scores. + # We capture K just before the matmul by patching _apply_rotary_pos_emb. + q_after_rope = [] + + def capture_q_after_rope(tensor, hook): + q_after_rope.append(tensor.clone()) + return tensor + + # We patch _reconstruct_attention to intercept Q after RoPE. + # Simpler: capture attn_scores and back-compute is complex. + # Instead, we patch the module-level function with a wrapper. + import transformer_lens.model_bridge.generalized_components.codegen_attention as codegen_attn_mod + + original_fn = codegen_attn_mod._apply_rotary_pos_emb + q_passed = [] + k_passed = [] + + def patched_apply_rope(tensor, sin, cos): + # Record the first call (Q), second call (K) + if len(q_passed) == 0: + q_passed.append(tensor.clone()) + else: + k_passed.append(tensor.clone()) + return original_fn(tensor, sin, cos) + + codegen_attn_mod._apply_rotary_pos_emb = patched_apply_rope # type: ignore[attr-defined] + try: + bridge(hs, position_ids=pos_ids) + finally: + codegen_attn_mod._apply_rotary_pos_emb = original_fn # type: ignore[attr-defined] + + assert q_passed, "RoPE was not applied to Q" + + # The slice sent into RoPE must equal the raw_q rotary slice + q_rot_slice = q_passed[0] # [B, H, S, rotary_dim] + raw_q_rot_slice = raw_q_heads.transpose(1, 2)[:, :, :, :rotary_dim] + assert torch.allclose(q_rot_slice, raw_q_rot_slice, atol=1e-5), ( + "Q slice sent to RoPE must equal the raw projection (pre-rotation)" + ) + + +# --------------------------------------------------------------------------- +# Causal masking test +# --------------------------------------------------------------------------- + + +class TestCodeGenAttentionBridgeCausalMask: + """Test causal masking in _reconstruct_attention.""" + + def test_future_positions_have_zero_attention_weight(self): + """Attention pattern must be lower-triangular (causal).""" + bridge, _ = _make_bridge() + B, S, D = 1, 6, 64 + hs = torch.randn(B, S, D) + pos_ids = torch.arange(S).unsqueeze(0).expand(B, -1) + + _, attn_weights = bridge(hs, position_ids=pos_ids) + # attn_weights: [B, H, S, S]; upper triangle (future) must be ~0 + for i in range(S): + for j in range(i + 1, S): + assert torch.all(attn_weights[:, :, i, j].abs() < 1e-5), ( + f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)" + ) diff --git a/tests/unit/model_bridge/supported_architectures/__init__.py b/tests/unit/model_bridge/supported_architectures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py new file mode 100644 index 000000000..b76f36cce --- /dev/null +++ b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py @@ -0,0 +1,331 @@ +"""Unit tests for CodeGenArchitectureAdapter. + +Tests cover: +- Config attribute validation (all required attributes are set correctly) +- Component mapping structure (correct bridge types, no ln2) +- Weight conversion keys and structure +- split_qkv_matrix correctness (numerical test with known weights) +- Factory registration (CodeGenForCausalLM maps to the right adapter) +""" + +from types import SimpleNamespace +from typing import Any + +import pytest +import torch +import torch.nn as nn + +from transformer_lens.config import TransformerBridgeConfig +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + CodeGenAttentionBridge, + EmbeddingBridge, + MLPBridge, + NormalizationBridge, + UnembeddingBridge, +) +from transformer_lens.model_bridge.supported_architectures.codegen import ( + CodeGenArchitectureAdapter, +) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _make_cfg( + n_heads: int = 4, + d_model: int = 64, + n_layers: int = 2, + d_mlp: int = 256, + d_vocab: int = 1000, + n_ctx: int = 512, +) -> TransformerBridgeConfig: + """Return a minimal TransformerBridgeConfig for CodeGen adapter tests.""" + return TransformerBridgeConfig( + d_model=d_model, + d_head=d_model // n_heads, + n_layers=n_layers, + n_ctx=n_ctx, + n_heads=n_heads, + d_vocab=d_vocab, + d_mlp=d_mlp, + default_prepend_bos=True, + architecture="CodeGenForCausalLM", + ) + + +@pytest.fixture +def cfg() -> TransformerBridgeConfig: + return _make_cfg() + + +@pytest.fixture +def adapter(cfg: TransformerBridgeConfig) -> CodeGenArchitectureAdapter: + return CodeGenArchitectureAdapter(cfg) + + +# --------------------------------------------------------------------------- +# Config attribute tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAdapterConfig: + """Tests that the adapter sets required config attributes correctly.""" + + def test_normalization_type_is_ln(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.normalization_type == "LN" + + def test_positional_embedding_type_is_rotary(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.positional_embedding_type == "rotary" + + def test_final_rms_is_false(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.final_rms is False + + def test_gated_mlp_is_false(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.gated_mlp is False + + def test_attn_only_is_false(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.attn_only is False + + def test_parallel_attn_mlp_is_true(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.cfg.parallel_attn_mlp is True + + +# --------------------------------------------------------------------------- +# Component mapping structure tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAdapterComponentMapping: + """Tests that component_mapping has the correct bridge types and structure.""" + + def test_embed_is_embedding_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + assert isinstance(adapter.component_mapping["embed"], EmbeddingBridge) + + def test_embed_name(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.component_mapping["embed"].name == "transformer.wte" + + def test_blocks_is_block_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + assert isinstance(adapter.component_mapping["blocks"], BlockBridge) + + def test_blocks_name(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.component_mapping["blocks"].name == "transformer.h" + + def test_ln_final_is_normalization_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + assert isinstance(adapter.component_mapping["ln_final"], NormalizationBridge) + + def test_ln_final_name(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.component_mapping["ln_final"].name == "transformer.ln_f" + + def test_unembed_is_unembedding_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + assert isinstance(adapter.component_mapping["unembed"], UnembeddingBridge) + + def test_unembed_name(self, adapter: CodeGenArchitectureAdapter) -> None: + assert adapter.component_mapping["unembed"].name == "lm_head" + + def test_blocks_ln1_is_normalization_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert isinstance(blocks.submodules["ln1"], NormalizationBridge) + + def test_blocks_ln1_name(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert blocks.submodules["ln1"].name == "ln_1" + + def test_no_ln2_in_blocks(self, adapter: CodeGenArchitectureAdapter) -> None: + """CodeGen uses parallel attn+MLP sharing ln_1 — there must be no ln2.""" + blocks = adapter.component_mapping["blocks"] + assert "ln2" not in blocks.submodules, ( + "CodeGen parallel block must not have ln2" + ) + + def test_attn_is_codegen_attention_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert isinstance(blocks.submodules["attn"], CodeGenAttentionBridge) + + def test_attn_name(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert blocks.submodules["attn"].name == "attn" + + def test_mlp_is_mlp_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert isinstance(blocks.submodules["mlp"], MLPBridge) + + def test_mlp_name(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert blocks.submodules["mlp"].name == "mlp" + + def test_mlp_in_name(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert blocks.submodules["mlp"].submodules["in"].name == "fc_in" + + def test_mlp_out_name(self, adapter: CodeGenArchitectureAdapter) -> None: + blocks = adapter.component_mapping["blocks"] + assert blocks.submodules["mlp"].submodules["out"].name == "fc_out" + + +# --------------------------------------------------------------------------- +# Weight processing conversion tests +# --------------------------------------------------------------------------- + + +class TestCodeGenAdapterWeightConversions: + """Tests that weight_processing_conversions has the expected keys.""" + + def test_q_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None: + assert "blocks.{i}.attn.q.weight" in adapter.weight_processing_conversions + + def test_k_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None: + assert "blocks.{i}.attn.k.weight" in adapter.weight_processing_conversions + + def test_v_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None: + assert "blocks.{i}.attn.v.weight" in adapter.weight_processing_conversions + + def test_o_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None: + assert "blocks.{i}.attn.o.weight" in adapter.weight_processing_conversions + + def test_exactly_four_conversion_keys(self, adapter: CodeGenArchitectureAdapter) -> None: + assert len(adapter.weight_processing_conversions) == 4 + + +# --------------------------------------------------------------------------- +# split_qkv_matrix numerical correctness tests +# --------------------------------------------------------------------------- + + +class TestCodeGenSplitQKVMatrix: + """Numerical tests verifying the mp_num=4 QKV split logic.""" + + def _make_adapter_with_dmodel(self, d_model: int, n_heads: int) -> CodeGenArchitectureAdapter: + cfg = _make_cfg(d_model=d_model, n_heads=n_heads) + return CodeGenArchitectureAdapter(cfg) + + def _make_attn_component(self, d_model: int) -> Any: + """Create a minimal attn component with a qkv_proj linear.""" + attn = SimpleNamespace() + attn.qkv_proj = nn.Linear(d_model, d_model * 3, bias=False) + return attn + + def test_returns_three_linear_modules(self) -> None: + """split_qkv_matrix must return exactly three nn.Linear modules.""" + adapter = self._make_adapter_with_dmodel(64, 4) + attn = self._make_attn_component(64) + q, k, v = adapter.split_qkv_matrix(attn) + assert isinstance(q, nn.Linear) + assert isinstance(k, nn.Linear) + assert isinstance(v, nn.Linear) + + def test_output_shapes_are_correct(self) -> None: + """Each of Q, K, V must have weight shape [n_embd, n_embd].""" + d_model = 64 + adapter = self._make_adapter_with_dmodel(d_model, 4) + attn = self._make_attn_component(d_model) + q, k, v = adapter.split_qkv_matrix(attn) + assert q.weight.shape == (d_model, d_model) + assert k.weight.shape == (d_model, d_model) + assert v.weight.shape == (d_model, d_model) + + def test_no_bias_on_outputs(self) -> None: + """The split linears must have no bias, matching qkv_proj.""" + adapter = self._make_adapter_with_dmodel(64, 4) + attn = self._make_attn_component(64) + q, k, v = adapter.split_qkv_matrix(attn) + assert q.bias is None + assert k.bias is None + assert v.bias is None + + def test_q_k_v_are_distinct(self) -> None: + """With a non-trivial weight, Q, K, V must differ from each other.""" + adapter = self._make_adapter_with_dmodel(64, 4) + attn = self._make_attn_component(64) + # Fill qkv_proj with distinct values per row + nn.init.normal_(attn.qkv_proj.weight) + q, k, v = adapter.split_qkv_matrix(attn) + # All three must differ + assert not torch.allclose(q.weight, k.weight), "Q and K weights must differ" + assert not torch.allclose(q.weight, v.weight), "Q and V weights must differ" + assert not torch.allclose(k.weight, v.weight), "K and V weights must differ" + + def test_known_partition_ordering(self) -> None: + """Verify the mp_num=4 partition layout: within each partition [Q_part, V_part, K_part]. + + We construct a weight where partition index and slot index are embedded + in the values, then verify that Q, K, V extract the correct slices. + """ + mp_num = 4 + d_model = 64 + n_heads = 4 + local_dim = d_model // mp_num # 16 + + adapter = self._make_adapter_with_dmodel(d_model, n_heads) + attn = self._make_attn_component(d_model) + + # Build a structured weight: rows are indexed 0..3*d_model-1. + # Reshape as [mp_num=4, 3, local_dim=16, d_model=64], set each slice + # to a unique constant so we can track which slot goes where. + w = torch.zeros(mp_num, 3, local_dim, d_model) + # slot 0 = Q_part → fill with 1.0 + w[:, 0, :, :] = 1.0 + # slot 1 = V_part → fill with 2.0 + w[:, 1, :, :] = 2.0 + # slot 2 = K_part → fill with 3.0 + w[:, 2, :, :] = 3.0 + + # Flatten back to [3*d_model, d_model] as qkv_proj expects + attn.qkv_proj.weight = nn.Parameter(w.reshape(3 * d_model, d_model)) + + q, k, v = adapter.split_qkv_matrix(attn) + + assert torch.all(q.weight == 1.0), "Q should come from slot 0 (Q_part)" + assert torch.all(k.weight == 3.0), "K should come from slot 2 (K_part)" + assert torch.all(v.weight == 2.0), "V should come from slot 1 (V_part)" + + def test_forward_output_shape_with_split(self) -> None: + """After split, Q/K/V linears should produce correct output shapes.""" + d_model = 64 + adapter = self._make_adapter_with_dmodel(d_model, 4) + attn = self._make_attn_component(d_model) + q_lin, k_lin, v_lin = adapter.split_qkv_matrix(attn) + + batch, seq = 2, 10 + x = torch.randn(batch, seq, d_model) + assert q_lin(x).shape == (batch, seq, d_model) + assert k_lin(x).shape == (batch, seq, d_model) + assert v_lin(x).shape == (batch, seq, d_model) + + +# --------------------------------------------------------------------------- +# Factory registration test +# --------------------------------------------------------------------------- + + +class TestCodeGenFactoryRegistration: + """Tests that the factory maps CodeGenForCausalLM to the correct adapter. + + Note: Phase D (registration) is required for these tests to pass. They + are included here so that registration is verified as part of the Phase D + commit rather than needing a separate test file. + """ + + def test_factory_returns_codegen_adapter(self) -> None: + """ArchitectureAdapterFactory must return a CodeGenArchitectureAdapter.""" + from transformer_lens.factories.architecture_adapter_factory import ( + ArchitectureAdapterFactory, + ) + + cfg = _make_cfg() + adapter = ArchitectureAdapterFactory.select_architecture_adapter(cfg) + assert isinstance(adapter, CodeGenArchitectureAdapter), ( + f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}" + ) + + def test_factory_key_is_codegen_for_causal_lm(self) -> None: + """SUPPORTED_ARCHITECTURES must have a 'CodeGenForCausalLM' key.""" + from transformer_lens.factories.architecture_adapter_factory import ( + SUPPORTED_ARCHITECTURES, + ) + + assert "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES, ( + "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES" + ) diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py index 458d1b073..1c6462cad 100644 --- a/transformer_lens/factories/architecture_adapter_factory.py +++ b/transformer_lens/factories/architecture_adapter_factory.py @@ -9,6 +9,7 @@ ApertusArchitectureAdapter, BertArchitectureAdapter, BloomArchitectureAdapter, + CodeGenArchitectureAdapter, FalconArchitectureAdapter, Gemma1ArchitectureAdapter, Gemma2ArchitectureAdapter, @@ -53,6 +54,7 @@ "ApertusForCausalLM": ApertusArchitectureAdapter, "BertForMaskedLM": BertArchitectureAdapter, "BloomForCausalLM": BloomArchitectureAdapter, + "CodeGenForCausalLM": CodeGenArchitectureAdapter, "FalconForCausalLM": FalconArchitectureAdapter, "GemmaForCausalLM": Gemma1ArchitectureAdapter, # Default to Gemma1 as it's the original version "Gemma1ForCausalLM": Gemma1ArchitectureAdapter, diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py index 334b262c0..3bbe8e356 100644 --- a/transformer_lens/model_bridge/generalized_components/__init__.py +++ b/transformer_lens/model_bridge/generalized_components/__init__.py @@ -9,6 +9,9 @@ from transformer_lens.model_bridge.generalized_components.bloom_attention import ( BloomAttentionBridge, ) +from transformer_lens.model_bridge.generalized_components.codegen_attention import ( + CodeGenAttentionBridge, +) from transformer_lens.model_bridge.generalized_components.bloom_block import ( BloomBlockBridge, ) @@ -78,6 +81,7 @@ "BlockBridge", "BloomBlockBridge", "BloomAttentionBridge", + "CodeGenAttentionBridge", "BloomMLPBridge", "CLIPVisionEncoderBridge", "CLIPVisionEncoderLayerBridge", diff --git a/transformer_lens/model_bridge/generalized_components/codegen_attention.py b/transformer_lens/model_bridge/generalized_components/codegen_attention.py new file mode 100644 index 000000000..a4df9c170 --- /dev/null +++ b/transformer_lens/model_bridge/generalized_components/codegen_attention.py @@ -0,0 +1,372 @@ +"""CodeGen-specific attention bridge component. + +CodeGen attention uses a fused QKV projection (qkv_proj) with a GPT-J-style +``rotate_every_two`` rotary positional encoding applied to Q and K before the +attention matmul. The rotary embeddings are stored as a sinusoidal buffer +(``embed_positions``) on the original ``CodeGenAttention`` module and are +indexed by ``position_ids``. + +Optional parameters (may be absent in some CodeGen checkpoints): + - rotary_dim: if None, RoPE is applied to the full head dimension. +""" + +from typing import Any, Callable, Dict, Optional + +import torch + +from transformer_lens.conversion_utils.conversion_steps.base_tensor_conversion import ( + BaseTensorConversion, +) +from transformer_lens.model_bridge.generalized_components.base import ( + GeneralizedComponent, +) +from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import ( + JointQKVAttentionBridge, +) + + +# --------------------------------------------------------------------------- +# Rotary helpers — GPT-J / CodeGen style ("rotate_every_two") +# --------------------------------------------------------------------------- + + +def _rotate_every_two(x: torch.Tensor) -> torch.Tensor: + """Rotate every pair of elements (GPT-J / CodeGen style). + + Mirrors ``rotate_every_two`` from + ``transformers.models.codegen.modeling_codegen`` (line 56-60). + + Args: + x: Tensor of shape ``[batch, heads, seq, head_dim]``. + + Returns: + Tensor of the same shape with even/odd pairs rotated. + """ + x1 = x[:, :, :, ::2] # even-indexed dims + x2 = x[:, :, :, 1::2] # odd-indexed dims + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def _apply_rotary_pos_emb( + tensor: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, +) -> torch.Tensor: + """Apply rotary positional embeddings (GPT-J / CodeGen style). + + Adapted from ``apply_rotary_pos_emb`` in + ``transformers.models.codegen.modeling_codegen`` (line 64-67) to work + with tensors in the TransformerLens ``[batch, heads, seq, head_dim]`` + layout (heads and seq are swapped relative to HuggingFace). + + Args: + tensor: ``[batch, heads, seq, rotary_dim]`` — the slice of Q or K that + will be rotated. + sin: ``[batch, seq, rotary_dim // 2]`` — the sin half of the sinusoidal + embedding (before ``repeat_interleave``). + cos: ``[batch, seq, rotary_dim // 2]`` — the cos half. + + Returns: + Rotated tensor with the same shape as *tensor*. + """ + # Expand sin/cos from [batch, seq, rotary_dim//2] + # to [batch, 1, seq, rotary_dim] so they broadcast with + # tensor of shape [batch, heads, seq, rotary_dim]. + sin = torch.repeat_interleave(sin[:, None, :, :], 2, 3) # [B, 1, seq, rot_dim] + cos = torch.repeat_interleave(cos[:, None, :, :], 2, 3) # [B, 1, seq, rot_dim] + return (tensor * cos) + (_rotate_every_two(tensor) * sin) + + +class CodeGenAttentionBridge(JointQKVAttentionBridge): + """Attention bridge for CodeGen models. + + CodeGen uses: + - A fused ``qkv_proj`` linear (no bias). + - GPT-J-style ``rotate_every_two`` RoPE applied to Q and K before the + attention matmul. Rotary embeddings are stored in the + ``embed_positions`` buffer of the original ``CodeGenAttention`` module + and indexed by ``position_ids``. + - Only the first ``rotary_dim`` dimensions of each head are rotated. + When ``rotary_dim`` is None the full head dimension is rotated. + - An ``out_proj`` linear output projection (no bias). + + All TransformerLens hooks fire in the forward pass: + ``hook_q``, ``hook_k``, ``hook_v``, ``hook_attn_scores``, + ``hook_pattern``, ``hook_z`` (via ``o.hook_in``), ``hook_result`` + (via ``hook_out``). + """ + + def __init__( + self, + name: str, + config: Any, + split_qkv_matrix: Optional[Callable] = None, + submodules: Optional[Dict[str, GeneralizedComponent]] = None, + qkv_conversion_rule: Optional[BaseTensorConversion] = None, + attn_conversion_rule: Optional[BaseTensorConversion] = None, + pattern_conversion_rule: Optional[BaseTensorConversion] = None, + ) -> None: + """Initialise the CodeGen attention bridge. + + Args: + name: The name of this component. + config: Model configuration (must have ``n_heads``, ``d_head``, + and optionally ``rotary_dim``). + split_qkv_matrix: Callable that splits the fused QKV weight into + three ``nn.Linear`` modules for Q, K, and V. Required — there + is no sensible default for CodeGen's mp_num=4 split logic. + submodules: Optional extra submodules to register. + qkv_conversion_rule: Optional conversion rule for Q/K/V outputs. + attn_conversion_rule: Optional conversion rule for the attention + output. + pattern_conversion_rule: Optional conversion rule for attention + patterns. + """ + super().__init__( + name=name, + config=config, + split_qkv_matrix=split_qkv_matrix, + submodules=submodules, + qkv_conversion_rule=qkv_conversion_rule, + attn_conversion_rule=attn_conversion_rule, + pattern_conversion_rule=pattern_conversion_rule, + requires_position_embeddings=False, + requires_attention_mask=False, + ) + + # ------------------------------------------------------------------ + # Component testing inputs + # ------------------------------------------------------------------ + + def get_random_inputs( + self, + batch_size: int = 2, + seq_len: int = 8, + device=None, + dtype=None, + ): + """Return random inputs for isolated component testing. + + CodeGen attention requires ``position_ids`` (to index into + ``embed_positions``) and a HuggingFace-style 4D causal attention mask. + The mask is provided so that both the bridge and the HF component + apply identical causal masking during the ``all_components`` benchmark. + + Args: + batch_size: Batch size. + seq_len: Sequence length. + device: Target device (defaults to CPU). + dtype: Tensor dtype (defaults to float32). + + Returns: + Dict with ``hidden_states``, ``position_ids``, and + ``attention_mask`` suitable for both bridge and HF forward calls. + """ + import torch + + if device is None: + device = torch.device("cpu") + if dtype is None: + dtype = torch.float32 + + d_model = ( + self.config.d_model + if self.config and hasattr(self.config, "d_model") + else 768 + ) + + # Build the HF-style 4D causal mask: 0 where attended, -inf where masked. + # Shape: [batch, 1, seq_len, seq_len] + min_val = torch.finfo(dtype).min + causal = torch.zeros(batch_size, 1, seq_len, seq_len, device=device, dtype=dtype) + mask_upper = torch.triu( + torch.ones(seq_len, seq_len, device=device, dtype=torch.bool), diagonal=1 + ) + causal[:, 0] = causal[:, 0].masked_fill(mask_upper, min_val) + + return { + "hidden_states": torch.randn( + batch_size, seq_len, d_model, device=device, dtype=dtype + ), + "position_ids": torch.arange(seq_len, device=device) + .unsqueeze(0) + .expand(batch_size, -1), + "attention_mask": causal, + } + + # ------------------------------------------------------------------ + # Component wiring + # ------------------------------------------------------------------ + + def set_original_component(self, original_component: torch.nn.Module) -> None: + """Wire the original CodeGenAttention and set up the output projection. + + The base ``JointQKVAttentionBridge.set_original_component`` hardcodes + ``c_proj`` for the output projection wiring. CodeGen uses ``out_proj`` + instead, so we override here to wire it correctly after calling super. + + Args: + original_component: The original ``CodeGenAttention`` layer. + """ + # Let the base class split QKV; it will attempt (and fail-silently) the + # c_proj wiring because CodeGen has no c_proj attribute. + super().set_original_component(original_component) + + # Wire out_proj explicitly. + if hasattr(self, "o") and hasattr(original_component, "out_proj"): + self.o.set_original_component(original_component.out_proj) + + # ------------------------------------------------------------------ + # Forward pass + # ------------------------------------------------------------------ + + def forward(self, *args: Any, **kwargs: Any) -> Any: + """Forward pass through CodeGen attention with all hooks firing. + + Manually reconstructs attention so that all TransformerLens hooks + (hook_q, hook_k, hook_v, hook_attn_scores, hook_pattern, hook_z, + hook_result) fire correctly. + + CodeGen passes ``position_ids`` as a keyword argument; these are used + to index into the ``embed_positions`` sinusoidal buffer stored on the + original ``CodeGenAttention`` module. + + Args: + *args: Positional arguments; the first must be ``hidden_states``. + **kwargs: Keyword arguments including ``position_ids`` (required + for RoPE), ``attention_mask`` (optional), ``layer_past`` + (optional KV cache), and ``cache_position`` (optional). + + Returns: + Tuple of ``(attn_output, attn_weights)``. + """ + if self.original_component is None: + raise RuntimeError( + f"Original component not set for {self.name}. " + "Call set_original_component() first." + ) + + # ---- 1. Extract hidden_states ---- + if len(args) > 0 and isinstance(args[0], torch.Tensor): + hidden_states = args[0] + elif "hidden_states" in kwargs and isinstance(kwargs["hidden_states"], torch.Tensor): + hidden_states = kwargs["hidden_states"] + else: + raise ValueError("Could not find hidden_states in args or kwargs.") + + # ---- 2. Input hook ---- + hooked_input = self.hook_in(hidden_states) + + # ---- 3. Q / K / V projections (fires hook_q, hook_k, hook_v) ---- + q_output = self.q(hooked_input) + k_output = self.k(hooked_input) + v_output = self.v(hooked_input) + + # ---- 4. Reconstruct attention with RoPE ---- + attn_output, attn_weights = self._reconstruct_attention( + q_output, k_output, v_output, **kwargs + ) + + # ---- 5. Output hooks (fires hook_z via o.hook_in, hook_result via hook_out) ---- + output = (attn_output, attn_weights) + output = self._process_output(output) + return output + + def _reconstruct_attention( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + **kwargs: Any, + ) -> tuple: + """Reconstruct attention with CodeGen's rotate_every_two RoPE. + + This method: + 1. Reshapes Q/K/V to ``[batch, heads, seq, head_dim]``. + 2. Applies ``rotate_every_two`` RoPE to Q and K (first ``rotary_dim`` + dimensions only when ``rotary_dim`` is set). + 3. Runs scaled dot-product attention (fp32, matching HF CodeGen). + 4. Fires ``hook_attn_scores`` and ``hook_pattern``. + 5. Applies the output projection via ``self.o``. + + Args: + q: Q tensor from the Q LinearBridge. + k: K tensor from the K LinearBridge. + v: V tensor from the V LinearBridge. + **kwargs: Forwarded kwargs; must include ``position_ids``. + + Returns: + ``(attn_output, attn_weights)`` tuple. + """ + assert self.original_component is not None + assert self.config is not None + + num_heads: int = self.config.n_heads + + # Reshape to [batch, heads, seq, head_dim] + q, k, v, batch_size, seq_len, head_dim = self._reshape_qkv_to_heads(q, k, v, num_heads) + + # ---- RoPE ---- + position_ids: Optional[torch.Tensor] = kwargs.get("position_ids", None) + if position_ids is not None: + embed_positions: torch.Tensor = self.original_component.embed_positions # type: ignore[union-attr] + # Move buffer to the right device if needed (mirrors HF forward) + if embed_positions.device != position_ids.device: + embed_positions = embed_positions.to(position_ids.device) + + # sincos: [batch, seq, rotary_dim] (full dim = sin_half + cos_half) + sincos = embed_positions[position_ids] + half = sincos.shape[-1] // 2 + sin, cos = sincos[:, :, :half], sincos[:, :, half:] + + rotary_dim: Optional[int] = getattr(self.original_component, "rotary_dim", None) + if rotary_dim is not None: + # Only rotate the first rotary_dim dimensions; pass the rest through. + q_rot = _apply_rotary_pos_emb(q[:, :, :, :rotary_dim], sin, cos) + k_rot = _apply_rotary_pos_emb(k[:, :, :, :rotary_dim], sin, cos) + q = torch.cat([q_rot, q[:, :, :, rotary_dim:]], dim=-1) + k = torch.cat([k_rot, k[:, :, :, rotary_dim:]], dim=-1) + else: + q = _apply_rotary_pos_emb(q, sin, cos) + k = _apply_rotary_pos_emb(k, sin, cos) + + # ---- KV cache ---- + k, v = self._update_kv_cache(k, v, **kwargs) + kv_seq_len = k.shape[-2] + + # ---- Scaled dot-product (fp32, matching HF CodeGen._attn) ---- + scale = self.original_component.scale_attn # type: ignore[union-attr] + q_f32 = q.to(torch.float32) + k_f32 = k.to(torch.float32) + + attn_scores = torch.matmul(q_f32, k_f32.transpose(-2, -1)) + + attention_mask: Optional[torch.Tensor] = kwargs.get("attention_mask", None) + attn_scores = self._apply_reconstruct_attention_mask( + attn_scores=attn_scores, + attention_mask=attention_mask, + seq_len=kv_seq_len, + q_seq_len=seq_len, + ) + + # Divide by scale_attn (CodeGen divides *after* the mask, not before) + attn_scores = attn_scores / scale + + attn_scores = self.hook_attn_scores(attn_scores) + + # Softmax + dropout + hook_pattern + attn_weights = self._softmax_dropout_pattern( + attn_scores, + target_dtype=v.dtype, + ) + + attn_output = torch.matmul(attn_weights, v) + + # Reshape [batch, heads, seq, head_dim] → [batch, seq, hidden] + attn_output = self._reshape_attn_output(attn_output, batch_size, seq_len, num_heads, head_dim) + + # Output projection (fires hook_z via o.hook_in) + attn_output = self._apply_output_projection(attn_output) + + return (attn_output, attn_weights) diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py index 3f189e841..0dbf6ee70 100644 --- a/transformer_lens/model_bridge/sources/transformers.py +++ b/transformer_lens/model_bridge/sources/transformers.py @@ -166,9 +166,9 @@ def map_default_transformer_lens_config(hf_config): tl_config.sliding_window = source_config.sliding_window if getattr(hf_config, "use_parallel_residual", False): tl_config.parallel_attn_mlp = True - # GPT-J: parallel attn+MLP but missing use_parallel_residual in HF config + # GPT-J and CodeGen: parallel attn+MLP but missing use_parallel_residual in HF config arch_classes = getattr(hf_config, "architectures", []) or [] - if any(a in ("GPTJForCausalLM",) for a in arch_classes): + if any(a in ("GPTJForCausalLM", "CodeGenForCausalLM") for a in arch_classes): tl_config.parallel_attn_mlp = True tl_config.default_prepend_bos = True return tl_config @@ -205,6 +205,7 @@ def determine_architecture_from_hf_config(hf_config): "gemma3": "Gemma3ForCausalLM", "bert": "BertForMaskedLM", "bloom": "BloomForCausalLM", + "codegen": "CodeGenForCausalLM", "gptj": "GPTJForCausalLM", "gpt_neo": "GPTNeoForCausalLM", "gpt_neox": "GPTNeoXForCausalLM", diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py index 2c32f6b38..1b24f3741 100644 --- a/transformer_lens/model_bridge/supported_architectures/__init__.py +++ b/transformer_lens/model_bridge/supported_architectures/__init__.py @@ -12,6 +12,9 @@ from transformer_lens.model_bridge.supported_architectures.bloom import ( BloomArchitectureAdapter, ) +from transformer_lens.model_bridge.supported_architectures.codegen import ( + CodeGenArchitectureAdapter, +) from transformer_lens.model_bridge.supported_architectures.falcon import ( FalconArchitectureAdapter, ) @@ -131,6 +134,7 @@ "ApertusArchitectureAdapter", "BertArchitectureAdapter", "BloomArchitectureAdapter", + "CodeGenArchitectureAdapter", "FalconArchitectureAdapter", "Gemma1ArchitectureAdapter", "Gemma2ArchitectureAdapter", diff --git a/transformer_lens/model_bridge/supported_architectures/codegen.py b/transformer_lens/model_bridge/supported_architectures/codegen.py new file mode 100644 index 000000000..ee19a109c --- /dev/null +++ b/transformer_lens/model_bridge/supported_architectures/codegen.py @@ -0,0 +1,150 @@ +"""CodeGen architecture adapter.""" + +from typing import Any + +import torch +import torch.nn as nn + +from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion +from transformer_lens.conversion_utils.param_processing_conversion import ( + ParamProcessingConversion, +) +from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter +from transformer_lens.model_bridge.generalized_components import ( + BlockBridge, + CodeGenAttentionBridge, + EmbeddingBridge, + LinearBridge, + MLPBridge, + NormalizationBridge, + UnembeddingBridge, +) + + +class CodeGenArchitectureAdapter(ArchitectureAdapter): + """Architecture adapter for CodeGen models. + + CodeGen uses a parallel attention+MLP block (attn and MLP share the same + LayerNorm input and their outputs are summed). The attention layer uses a + fused ``qkv_proj`` weight whose layout follows GPT-J's ``mp_num=4`` + tensor-parallel partitioning: the rows are interleaved as + ``[Q_part, V_part, K_part]`` within each of the 4 MP partitions. + + Optional Parameters (may be absent in some CodeGen checkpoints): + --------------------------------------------------------------- + - No bias on qkv_proj (fused QKV has no bias) + - No bias on out_proj + - No bias on mlp.fc_in or mlp.fc_out + """ + + def __init__(self, cfg: Any) -> None: + """Initialize the CodeGen architecture adapter.""" + super().__init__(cfg) + + # Config attributes + self.cfg.normalization_type = "LN" + self.cfg.positional_embedding_type = "rotary" + self.cfg.final_rms = False + self.cfg.gated_mlp = False + self.cfg.attn_only = False + self.cfg.parallel_attn_mlp = True + + # After split_qkv_matrix the individual Q/K/V weights have shape + # [n_embd, n_embd]. The conversions below rearrange them to the + # TransformerLens format [n_heads, d_model, d_head]. + self.weight_processing_conversions = { + "blocks.{i}.attn.q.weight": ParamProcessingConversion( + tensor_conversion=RearrangeTensorConversion( + "(n h) m -> n m h", n=self.cfg.n_heads + ), + ), + "blocks.{i}.attn.k.weight": ParamProcessingConversion( + tensor_conversion=RearrangeTensorConversion( + "(n h) m -> n m h", n=self.cfg.n_heads + ), + ), + "blocks.{i}.attn.v.weight": ParamProcessingConversion( + tensor_conversion=RearrangeTensorConversion( + "(n h) m -> n m h", n=self.cfg.n_heads + ), + ), + "blocks.{i}.attn.o.weight": ParamProcessingConversion( + tensor_conversion=RearrangeTensorConversion( + "m (n h) -> n h m", n=self.cfg.n_heads + ), + ), + } + + self.component_mapping = { + "embed": EmbeddingBridge(name="transformer.wte"), + "blocks": BlockBridge( + name="transformer.h", + submodules={ + "ln1": NormalizationBridge(name="ln_1", config=self.cfg), + # No ln2: CodeGen uses parallel attn+MLP that both read from ln_1 + "attn": CodeGenAttentionBridge( + name="attn", + config=self.cfg, + split_qkv_matrix=self.split_qkv_matrix, + submodules={ + "qkv": LinearBridge(name="qkv_proj"), + "o": LinearBridge(name="out_proj"), + }, + ), + "mlp": MLPBridge( + name="mlp", + submodules={ + "in": LinearBridge(name="fc_in"), + "out": LinearBridge(name="fc_out"), + }, + ), + }, + ), + "ln_final": NormalizationBridge(name="transformer.ln_f", config=self.cfg), + "unembed": UnembeddingBridge(name="lm_head"), + } + + def split_qkv_matrix( + self, attn_component: Any + ) -> tuple[nn.Linear, nn.Linear, nn.Linear]: + """Split the fused QKV weight into separate Q, K, V linear modules. + + CodeGen uses GPT-J-style tensor-parallel partitioning with ``mp_num=4`` + partitions. Within each partition the row order is + ``[Q_part, V_part, K_part]``, i.e. **not** the conventional Q/K/V order. + + The fused weight has shape ``[3 * n_embd, n_embd]``. We reshape to + ``[mp_num, 3, local_dim, n_embd]``, extract the three slices, then + flatten back to ``[n_embd, n_embd]`` for each of Q, K, V. + + Args: + attn_component: The original ``CodeGenAttention`` module. + + Returns: + Tuple of ``(q_linear, k_linear, v_linear)`` — three ``nn.Linear`` + modules with no bias and weight shape ``[n_embd, n_embd]``. + """ + mp_num = 4 + n_embd = self.cfg.d_model + + weight = attn_component.qkv_proj.weight # [3*n_embd, n_embd] + + # Partition into mp_num slices; within each: [Q_part, V_part, K_part] + local_dim = n_embd // mp_num + w = weight.reshape(mp_num, 3, local_dim, n_embd) + + # Index 0 = Q, 1 = V, 2 = K (CodeGen partition ordering) + W_Q = w[:, 0, :, :].reshape(n_embd, n_embd) + W_V = w[:, 1, :, :].reshape(n_embd, n_embd) + W_K = w[:, 2, :, :].reshape(n_embd, n_embd) + + q_linear = nn.Linear(n_embd, n_embd, bias=False) + q_linear.weight = nn.Parameter(W_Q) + + k_linear = nn.Linear(n_embd, n_embd, bias=False) + k_linear.weight = nn.Parameter(W_K) + + v_linear = nn.Linear(n_embd, n_embd, bias=False) + v_linear.weight = nn.Parameter(W_V) + + return q_linear, k_linear, v_linear diff --git a/transformer_lens/tools/model_registry/__init__.py b/transformer_lens/tools/model_registry/__init__.py index 409c3dc3f..e85aef2bc 100644 --- a/transformer_lens/tools/model_registry/__init__.py +++ b/transformer_lens/tools/model_registry/__init__.py @@ -45,6 +45,7 @@ "ApertusForCausalLM", "BertForMaskedLM", "BloomForCausalLM", + "CodeGenForCausalLM", "FalconForCausalLM", "GemmaForCausalLM", "Gemma2ForCausalLM", diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json index 68ef2bda5..7344d18b3 100644 --- a/transformer_lens/tools/model_registry/data/architecture_gaps.json +++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json @@ -1,177 +1,129 @@ { - "generated_at": "2026-04-09", + "generated_at": "2026-03-19", "scan_info": { - "total_scanned": 10000, + "total_scanned": 3517, "task_filter": "text-generation", "min_downloads": 500, - "scan_duration_seconds": 3.2 + "scan_duration_seconds": 2.7 }, - "total_unsupported_architectures": 372, - "total_unsupported_models": 1416, + "total_unsupported_architectures": 258, + "total_unsupported_models": 1031, "gaps": [ - { - "architecture_id": "Qwen3_5ForConditionalGeneration", - "total_models": 66, - "sample_models": [ - "Tesslate/OmniCoder-9B", - "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4", - "croll83/Qwopus3.5-27B-v3-Abliterated", - "osoleve/Qwen3.5-27B-Text-NVFP4-MTP", - "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx", - "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-v2-NVFP4", - "Brooooooklyn/Qwen3.5-27B-unsloth-mlx", - "ShinePixelOrg/Qwopus3.5-27B-v3-NVFP4", - "aifeifei798/Qwen3.5-Queen-27B", - "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled" - ] - }, { "architecture_id": "Qwen3MoeForCausalLM", - "total_models": 55, + "total_models": 68, "sample_models": [ "Qwen/Qwen3-30B-A3B", "Qwen/Qwen3-30B-A3B-Instruct-2507", + "Qwen/Qwen3-30B-A3B-Thinking-2507", "Qwen/Qwen3-Coder-30B-A3B-Instruct", "Qwen/Qwen3-235B-A22B", - "nvidia/Qwen3-30B-A3B-NVFP4", - "Qwen/Qwen3-30B-A3B-Thinking-2507", "trl-internal-testing/tiny-Qwen3MoeForCausalLM", "Qwen/Qwen3-235B-A22B-Instruct-2507", "Qwen/Qwen3-Coder-480B-A35B-Instruct", - "Qwen/Qwen3-235B-A22B-Thinking-2507" + "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4", + "nvidia/Qwen3-30B-A3B-NVFP4" ] }, { "architecture_id": "DeepseekV3ForCausalLM", - "total_models": 51, + "total_models": 53, "sample_models": [ "deepseek-ai/DeepSeek-R1", "deepseek-ai/DeepSeek-R1-0528", "deepseek-ai/DeepSeek-V3", - "nvidia/DeepSeek-R1-0528-NVFP4-v2", "deepseek-ai/DeepSeek-V3-0324", - "ai-sage/GigaChat3-10B-A1.8B", + "nvidia/DeepSeek-R1-0528-NVFP4-v2", "deepseek-ai/DeepSeek-V3.1", - "moonshotai/Kimi-K2-Instruct-0905", - "moonshotai/Kimi-K2-Instruct", - "moonshotai/Moonlight-16B-A3B-Instruct" + "ai-sage/GigaChat3-10B-A1.8B", + "trl-internal-testing/tiny-DeepseekV3ForCausalLM", + "nvidia/DeepSeek-V3-0324-NVFP4", + "moonshotai/Kimi-K2-Instruct" ] }, { - "architecture_id": "NemotronHForCausalLM", - "total_models": 50, + "architecture_id": "Qwen3_5ForConditionalGeneration", + "total_models": 46, "sample_models": [ - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", - "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", - "nvidia/NVIDIA-Nemotron-Nano-9B-v2", - "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", - "nvidia/Nemotron-Cascade-2-30B-A3B", - "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16", - "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", - "unsloth/NVIDIA-Nemotron-3-Nano-4B" + "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled", + "osoleve/Qwen3.5-27B-Text-NVFP4-MTP", + "Tesslate/OmniCoder-9B", + "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx", + "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled", + "txn545/Qwen3.5-27B-NVFP4", + "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4", + "Jackrong/Qwen3.5-4B-Claude-4.6-Opus-Reasoning-Distilled", + "EganAI/qwen3.5-9b-terminal-merge", + "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled" ] }, { - "architecture_id": "Lfm2ForCausalLM", - "total_models": 34, + "architecture_id": "Qwen3NextForCausalLM", + "total_models": 35, "sample_models": [ - "farbodtavakkoli/OTel-LLM-1.2B-IT", - "LiquidAI/LFM2.5-1.2B-Instruct", - "LiquidAI/LFM2-1.2B", - "LiquidAI/LFM2-350M", - "LiquidAI/LFM2.5-1.2B-Thinking", - "LiquidAI/LFM2.5-350M", - "LiquidAI/LFM2-2.6B-Exp", - "LiquidAI/LFM2.5-1.2B-Base", - "LiquidAI/LFM2-700M", - "unsloth/LFM2.5-1.2B-Instruct" + "Qwen/Qwen3-Coder-Next", + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "GadflyII/Qwen3-Coder-Next-NVFP4", + "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4", + "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4", + "Qwen/Qwen3-Next-80B-A3B-Thinking", + "tiny-random/qwen3-next-moe", + "unsloth/Qwen3-Coder-Next", + "yujiepan/qwen3-next-moe-tiny-random", + "RedHatAI/Qwen3-Coder-Next-NVFP4" ] }, { - "architecture_id": "Qwen3_5ForCausalLM", + "architecture_id": "FalconForCausalLM", "total_models": 32, "sample_models": [ - "lukey03/Qwen3.5-9B-abliterated", - "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b", - "aifeifei798/Darkidol-Ballad-27B", - "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4", - "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1", - "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2", - "Phonsiri/Qwen3.5-9B-Thai-Law-Base", - "kai-os/Carnice-9b", - "aifeifei798/Darkidol-Ballad-9B", - "continuum-ai/qwen3.5-4b-code-forged" - ] - }, - { - "architecture_id": "Gemma4ForConditionalGeneration", - "total_models": 30, - "sample_models": [ - "nvidia/Gemma-4-31B-IT-NVFP4", - "dealignai/Gemma-4-31B-JANG_4M-CRACK", - "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4", - "bg-digitalservices/Gemma-4-E2B-NVFP4A16", - "dealignai/Gemma-4-31B-JANG_4M-Uncensored", - "bg-digitalservices/Gemma-4-E2B-it-NVFP4", - "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16", - "0xSero/gemma-4-21b-a4b-it-REAP", - "InfinimindCreations/gemma-4-E4B-it-uncensored", - "EganAI/gemma-4-31B-Claude-4.6-Opus-Reasoning-Distilled" - ] - }, - { - "architecture_id": "CodeGenForCausalLM", - "total_models": 29, - "sample_models": [ - "Salesforce/codegen-350M-mono", - "Salesforce/codegen-350M-multi", - "Salesforce/codegen-2B-mono", - "Salesforce/codegen-6B-multi", - "Salesforce/codegen-16B-nl", - "Salesforce/codegen-6B-nl", - "Salesforce/codegen-350M-nl", - "Salesforce/codegen-6B-mono", - "Salesforce/codegen-2B-multi", - "Salesforce/codegen-16B-mono" - ] - }, - { - "architecture_id": "MPTForCausalLM", - "total_models": 24, - "sample_models": [ - "vinai/PhoGPT-4B", - "anas-awadalla/mpt-7b", - "gl198976/mpt-7b-instruct", - "replit/replit-code-v1-3b", - "vinai/PhoGPT-4B-Chat", - "wtang06/mpt-125m-c4", - "echarlaix/tiny-mpt-random-remote-code", - "lightblue/japanese-mpt-7b", - "gl198976/mpt-7b", - "TehVenom/MPT-7b-InstructAndStorywriting-50_50-Merge" + "tiiuae/falcon-7b", + "tiiuae/falcon-7b-instruct", + "tiiuae/falcon-40b-instruct", + "tiiuae/falcon-40b", + "tiiuae/falcon-rw-1b", + "fxmarty/really-tiny-falcon-testing", + "vilsonrodrigues/falcon-7b-instruct-sharded", + "tiiuae/falcon-11B", + "euclaise/falcon_1b_stage2", + "explosion-testing/falcon-test" ] }, { "architecture_id": "Qwen3_5MoeForConditionalGeneration", - "total_models": 23, + "total_models": 28, "sample_models": [ - "nvidia/Qwen3.5-397B-A17B-NVFP4", "txn545/Qwen3.5-122B-A10B-NVFP4", - "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled", - "lukealonso/Qwen3.5-397B-A17B-NVFP4", + "nvidia/Qwen3.5-397B-A17B-NVFP4", "txn545/Qwen3.5-35B-A3B-NVFP4", + "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4", + "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx", + "lukealonso/Qwen3.5-397B-A17B-NVFP4", "nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx", "olka-fi/Qwen3.5-122B-A10B-MXFP4", - "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx", - "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4", - "bjk110/Qwen3.5-122B-A10B-abliterated-NVFP4" + "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled", + "NexVeridian/Qwen3.5-35B-A3B-3bit" + ] + }, + { + "architecture_id": "Lfm2ForCausalLM", + "total_models": 21, + "sample_models": [ + "LiquidAI/LFM2-1.2B", + "LiquidAI/LFM2.5-1.2B-Instruct", + "LiquidAI/LFM2.5-1.2B-Base", + "LiquidAI/LFM2-350M", + "LiquidAI/LFM2.5-1.2B-Thinking", + "LiquidAI/LFM2-2.6B", + "LiquidAI/LFM2-2.6B-Exp", + "LiquidAI/LFM2-700M", + "unsloth/LFM2.5-1.2B-Instruct", + "LiquidAI/LFM2.5-1.2B-Thinking-ONNX" ] }, { "architecture_id": "InternLM2ForCausalLM", - "total_models": 23, + "total_models": 19, "sample_models": [ "internlm/internlm2-chat-7b", "internlm/internlm2_5-7b-chat", @@ -182,87 +134,103 @@ "internlm/internlm2-base-20b", "chujiezheng/internlm2-chat-20b-ExPO", "chujiezheng/internlm2-chat-7b-ExPO", - "internlm/internlm2-1_8b" + "AI4Chem/ChemLLM-7B-Chat-1_5-DPO" ] }, { - "architecture_id": "Qwen3NextForCausalLM", - "total_models": 21, + "architecture_id": "Glm4MoeForCausalLM", + "total_models": 18, "sample_models": [ - "Qwen/Qwen3-Coder-Next", - "Qwen/Qwen3-Next-80B-A3B-Instruct", - "GadflyII/Qwen3-Coder-Next-NVFP4", - "unsloth/Qwen3-Coder-Next", - "Qwen/Qwen3-Next-80B-A3B-Thinking", - "tiny-random/qwen3-next-moe", - "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4", - "RedHatAI/Qwen3-Coder-Next-NVFP4", - "yujiepan/qwen3-next-moe-tiny-random", - "saricles/Qwen3-Coder-Next-NVFP4-GB10" + "zai-org/GLM-4.5-Air", + "zai-org/GLM-4.7", + "trl-internal-testing/tiny-Glm4MoeForCausalLM", + "zai-org/GLM-4.5", + "zai-org/GLM-4.6", + "Tengyunw/GLM-4.7-NVFP4", + "Salyut1/GLM-4.7-NVFP4", + "np-cr/testing-glm4-moe", + "ArliAI/GLM-4.6-Derestricted-v3", + "zai-org/GLM-4.5-Air-Base" ] }, { "architecture_id": "JambaForCausalLM", - "total_models": 21, + "total_models": 17, "sample_models": [ "ai21labs/AI21-Jamba-Mini-1.5", "ai21labs/Jamba-tiny-random", - "ai21labs/AI21-Jamba-Mini-1.6", - "ai21labs/AI21-Jamba-Large-1.5", "ai21labs/AI21-Jamba2-3B", + "ai21labs/AI21-Jamba-Reasoning-3B", + "ai21labs/AI21-Jamba-Large-1.5", + "ai21labs/AI21-Jamba-Mini-1.6", "ai21labs/AI21-Jamba-Large-1.6", + "microsoft/Dayhoff-170m-GR", "ai21labs/Jamba-v0.1", - "ai21labs/AI21-Jamba2-Mini", - "ai21labs/AI21-Jamba-Reasoning-3B", - "microsoft/Dayhoff-170m-GR" + "microsoft/Dayhoff-170M-GRS-112000" ] }, { "architecture_id": "QWenLMHeadModel", - "total_models": 20, + "total_models": 16, "sample_models": [ - "cckevinn/SeeClick", - "Qwen/Qwen-7B-Chat", "Qwen/Qwen-7B", + "Qwen/Qwen-7B-Chat", "Qwen/Qwen-VL-Chat", "Qwen/Qwen-VL", - "Qwen/Qwen-1_8B-Chat", + "Qwen/Qwen-14B-Chat-Int4", "Qwen/Qwen-14B-Chat", + "Qwen/Qwen-1_8B-Chat", + "Qwen/Qwen-72B", "Qwen/Qwen-14B", - "Xingyu-Zheng/Qwen-VL-Chat", - "Qwen/Qwen-72B" + "Qwen/Qwen-Audio-Chat" + ] + }, + { + "architecture_id": "FalconH1ForCausalLM", + "total_models": 16, + "sample_models": [ + "tiiuae/Falcon-H1-Tiny-90M-Instruct", + "tiiuae/Falcon-H1-0.5B-Base", + "tiiuae/Falcon-H1R-7B", + "tiiuae/Falcon-H1-7B-Instruct", + "tiiuae/Falcon-H1-34B-Base", + "tiiuae/Falcon-H1-34B-Instruct", + "tiiuae/Falcon-H1-1.5B-Base", + "tiiuae/Falcon-H1-7B-Base", + "tiiuae/Falcon-H1-3B-Base", + "tiiuae/Falcon-H1-1.5B-Deep-Base" + ] + }, + { + "architecture_id": "NemotronHForCausalLM", + "total_models": 15, + "sample_models": [ + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", + "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16", + "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4", + "OpenResearcher/OpenResearcher-30B-A3B", + "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4", + "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16" ] }, { "architecture_id": "GPTBigCodeForCausalLM", - "total_models": 20, + "total_models": 15, "sample_models": [ "bigcode/gpt_bigcode-santacoder", "bigcode/tiny_starcoder_py", "bigcode/starcoder", "bigcode/starcoderbase-1b", "ibm-granite/granite-20b-code-base-8k", - "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct", - "HuggingFaceH4/starchat-alpha", - "defog/sqlcoder2", + "ibm-granite/granite-20b-code-instruct-8k", "HuggingFaceH4/starchat-beta", - "LoupGarou/WizardCoder-Guanaco-15B-V1.0" - ] - }, - { - "architecture_id": "XGLMForCausalLM", - "total_models": 18, - "sample_models": [ - "facebook/xglm-564M", - "facebook/incoder-1B", - "facebook/xglm-7.5B", - "facebook/xglm-4.5B", - "facebook/xglm-1.7B", - "KoboldAI/fairseq-dense-2.7B", - "KoboldAI/fairseq-dense-125M", - "KoboldAI/fairseq-dense-355M", - "KoboldAI/fairseq-dense-13B", - "KoboldAI/fairseq-dense-1.3B" + "HuggingFaceH4/starchat-alpha", + "LoupGarou/WizardCoder-Guanaco-15B-V1.1", + "Danielbrdz/CodeBarcenas-1b" ] }, { @@ -273,376 +241,248 @@ "cerebras/MiniMax-M2.1-REAP-139B-A10B", "MiniMaxAI/MiniMax-M2", "MiniMaxAI/MiniMax-M2.1", - "nvidia/MiniMax-M2.5-NVFP4", "cerebras/MiniMax-M2.5-REAP-139B-A10B", - "amd/MiniMax-M2.5-MXFP4", + "PrimeIntellect/MiniMax-M2.5-bf16", + "cerebras/MiniMax-M2.5-REAP-172B-A10B", "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10", "aspctu/MiniMax-M2.5", "amd/MiniMax-M2.1-MXFP4" ] }, { - "architecture_id": "DeciLMForCausalLM", + "architecture_id": "XGLMForCausalLM", "total_models": 14, "sample_models": [ - "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", - "nvidia/Llama-3_3-Nemotron-Super-49B-v1", - "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-NVFP4", - "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", - "ConicCat/Llama3_3-Nemo-Super-Writer-49B", - "nvidia/Llama-3_1-Nemotron-51B-Instruct", - "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5", - "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1", - "NewstaR/Porpoise-6b-instruct", - "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1" + "facebook/xglm-564M", + "facebook/xglm-7.5B", + "facebook/xglm-1.7B", + "KoboldAI/fairseq-dense-13B", + "facebook/xglm-4.5B", + "KoboldAI/fairseq-dense-125M", + "KoboldAI/fairseq-dense-2.7B", + "KoboldAI/fairseq-dense-355M", + "KoboldAI/fairseq-dense-1.3B", + "KoboldAI/fairseq-dense-6.7B" ] }, { - "architecture_id": "FalconH1ForCausalLM", - "total_models": 14, + "architecture_id": "Glm4MoeLiteForCausalLM", + "total_models": 13, "sample_models": [ - "tiiuae/Falcon-H1-0.5B-Base", - "tiiuae/Falcon-H1-3B-Base", - "tiiuae/Falcon-H1-7B-Base", - "tiiuae/Falcon-H1-1.5B-Deep-Base", - "tiiuae/Falcon-H1-34B-Base", - "tiiuae/Falcon-H1R-7B", - "tiiuae/Falcon-H1-1.5B-Base", - "tiiuae/Falcon-H1-Tiny-90M-Instruct", - "tiiuae/Falcon-H1-1.5B-Deep-Instruct", - "tiiuae/Falcon-H1-3B-Instruct" + "zai-org/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-NVFP4", + "unsloth/GLM-4.7-Flash", + "GadflyII/GLM-4.7-Flash-MTP-NVFP4", + "Olafangensan/GLM-4.7-Flash-heretic", + "cerebras/GLM-4.7-Flash-REAP-23B-A3B", + "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", + "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill", + "Ex0bit/GLM-4.7-Flash-PRISM", + "MuXodious/GLM-4.7-Flash-absolute-heresy" + ] + }, + { + "architecture_id": "CodeGenForCausalLM", + "total_models": 13, + "sample_models": [ + "Salesforce/codegen-350M-mono", + "Salesforce/codegen-350M-multi", + "Salesforce/codegen-2B-mono", + "hf-tiny-model-private/tiny-random-CodeGenForCausalLM", + "Salesforce/codegen-6B-multi", + "shailja/fine-tuned-codegen-16B-Verilog", + "katuni4ka/tiny-random-codegen2", + "Salesforce/codegen-2B-multi", + "Salesforce/codegen-6B-mono", + "Salesforce/codegen-6B-nl" ] }, { "architecture_id": "RwkvForCausalLM", - "total_models": 14, + "total_models": 13, "sample_models": [ "RWKV/v5-Eagle-7B-HF", "RWKV/rwkv-4-169m-pile", "beomi/KoRWKV-6B", - "RWKV/rwkv-4-430m-pile", "RWKV/rwkv-4-1b5-pile", + "RWKV/rwkv-4-430m-pile", "RWKV/rwkv-4-3b-pile", - "RWKV/rwkv-raven-1b5", "RWKV/rwkv-4-7b-pile", - "RWKV/rwkv-raven-3b", - "RWKV/rwkv-raven-14b" + "RWKV/rwkv-raven-1b5", + "RWKV/rwkv-4-14b-pile", + "RWKV/rwkv-raven-7b" ] }, { "architecture_id": "DeepseekV2ForCausalLM", - "total_models": 13, + "total_models": 11, "sample_models": [ "deepseek-ai/DeepSeek-V2-Lite-Chat", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", "deepseek-ai/DeepSeek-V2-Lite", - "deepseek-ai/DeepSeek-V2", "deepseek-ai/DeepSeek-V2-Chat", - "deepseek-ai/DeepSeek-Coder-V2-Instruct", + "deepseek-ai/DeepSeek-Coder-V2-Instruct-0724", + "deepseek-ai/DeepSeek-V2", "deepseek-ai/DeepSeek-V2.5", + "deepseek-ai/DeepSeek-Coder-V2-Instruct", "deepseek-ai/DeepSeek-V2-Chat-0628", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Base", - "Kwaipilot/KwaiCoder-DS-V2-Lite-Base" + "deepseek-ai/DeepSeek-Coder-V2-Lite-Base" ] }, { - "architecture_id": "Glm4MoeForCausalLM", - "total_models": 13, + "architecture_id": "CohereForCausalLM", + "total_models": 10, "sample_models": [ - "zai-org/GLM-4.5-Air", - "zai-org/GLM-4.7", - "trl-internal-testing/tiny-Glm4MoeForCausalLM", - "zai-org/GLM-4.5", - "zai-org/GLM-4.6", - "Tengyunw/GLM-4.7-NVFP4", - "np-cr/testing-glm4-moe", - "nvidia/GLM-4.7-NVFP4", - "Salyut1/GLM-4.7-NVFP4", - "ArliAI/GLM-4.6-Derestricted-v3" + "trl-internal-testing/tiny-CohereForCausalLM", + "CohereLabs/aya-23-8B", + "CohereLabs/aya-expanse-8b", + "CohereLabs/c4ai-command-r-v01", + "CohereLabs/aya-expanse-32b", + "NLPark/AnFeng_v3_Avocet", + "CohereLabs/aya-23-35B", + "CohereLabs/c4ai-command-r-plus-08-2024", + "CohereLabs/c4ai-command-r-08-2024", + "CohereLabs/c4ai-command-r-plus" ] }, { - "architecture_id": "BaichuanForCausalLM", - "total_models": 13, - "sample_models": [ - "baichuan-inc/Baichuan2-7B-Chat", - "baichuan-inc/Baichuan2-13B-Chat", - "baichuan-inc/Baichuan-13B-Chat", - "baichuan-inc/Baichuan2-7B-Base", - "baichuan-inc/Baichuan2-13B-Base", - "katuni4ka/tiny-random-baichuan2", - "sakuraumi/Sakura-13B-Galgame", - "zxbsmk/NSFW_13B_sft", - "katuni4ka/tiny-random-baichuan2-13b", - "baichuan-inc/Baichuan-13B-Base" - ] - }, - { - "architecture_id": "LlavaLlamaForCausalLM", - "total_models": 13, - "sample_models": [ - "LanguageBind/Video-LLaVA-7B", - "wisdomik/Quilt-Llava-v1.5-7b", - "liuhaotian/llava-llama-2-13b-chat-lightning-preview", - "lmms-lab/llama3-llava-next-8b", - "mmaaz60/LLaVA-7B-Lightening-v1-1", - "microsoft/llava-med-7b-delta", - "deepcs233/VisCoT-7b-336", - "ManishThota/Ollama_Video_llama_7B", - "EricPolaris/Quilt-Llava-v1.5-7b", - "liuhaotian/LLaVA-Lightning-7B-delta-v1-1" - ] - }, - { - "architecture_id": "T5GemmaForConditionalGeneration", - "total_models": 12, + "architecture_id": "T5GemmaForConditionalGeneration", + "total_models": 10, "sample_models": [ "google/t5gemma-s-s-prefixlm", "google/t5gemma-9b-9b-ul2", "google/t5gemma-b-b-ul2", - "google/t5gemma-2b-2b-prefixlm", "google/t5gemma-2b-2b-ul2", - "google/t5gemma-l-l-ul2-it", - "google/t5gemma-ml-ml-ul2-it", "google/t5gemma-b-b-prefixlm", - "google/t5gemma-s-s-prefixlm-it", - "google/t5gemma-s-s-ul2" - ] - }, - { - "architecture_id": "MT5ForConditionalGeneration", - "total_models": 12, - "sample_models": [ - "knowledgator/IUPAC2SMILES-canonical-base", - "knowledgator/SMILES2IUPAC-canonical-base", - "bigscience/mt0-small", - "bigscience/mt0-base", - "bigscience/mt0-large", - "bigscience/mt0-xl", - "bigscience/mt0-xxl", - "intelia-lab-uah/mt0-base_QG_SQAC", - "intelia-lab-uah/mt0-base_AE_SQAC", - "UBC-NLP/toucan-1.2B" - ] - }, - { - "architecture_id": "LLaMAForCausalLM", - "total_models": 12, - "sample_models": [ - "maicomputer/alpaca-13b", - "Enoch/llama-65b-hf", - "mncai/chatdoctor", - "AdaptLLM/law-LLM", - "Nitish-Garikoti/finance-LLM", - "boboto/LLaMA-65B-HF", - "AdaptLLM/finance-LLM", - "AdaptLLM/medicine-LLM", - "Rardilit/Panther_v1", - "James-WYang/BigTranslate" - ] - }, - { - "architecture_id": "MiniCPMForCausalLM", - "total_models": 11, - "sample_models": [ - "openbmb/MiniCPM-2B-sft-bf16", - "openbmb/MiniCPM4.1-8B", - "openbmb/MiniCPM-1B-sft-bf16", - "openbmb/MiniCPM4-0.5B", - "openbmb/MiniCPM-MoE-8x2B", - "katuni4ka/tiny-random-minicpm", - "openbmb/MiniCPM-S-1B-sft", - "openbmb/MiniCPM-2B-sft-fp32", - "openbmb/MiniCPM-2B-dpo-bf16", - "openbmb/MiniCPM4-8B" + "google/t5gemma-9b-9b-ul2-it", + "google/t5gemma-2b-2b-prefixlm", + "google/t5gemma-9b-2b-ul2-it", + "google/t5gemma-l-l-prefixlm", + "harshaljanjani/tiny-t5gemma-test" ] }, { - "architecture_id": "Glm4MoeLiteForCausalLM", + "architecture_id": "Cohere2ForCausalLM", "total_models": 10, "sample_models": [ - "zai-org/GLM-4.7-Flash", - "unsloth/GLM-4.7-Flash", - "cerebras/GLM-4.7-Flash-REAP-23B-A3B", - "GadflyII/GLM-4.7-Flash-NVFP4", - "huihui-ai/Huihui-GLM-4.7-Flash-abliterated", - "Olafangensan/GLM-4.7-Flash-heretic", - "Ex0bit/GLM-4.7-Flash-PRISM", - "jerrycheng233/model5_sft_16bit", - "aaravriyer193/chimpgpt-coder-elite", - "GadflyII/GLM-4.7-Flash-MTP-NVFP4" + "trl-internal-testing/tiny-Cohere2ForCausalLM", + "CohereLabs/tiny-aya-global", + "CohereLabs/c4ai-command-r7b-12-2024", + "CohereLabs/tiny-aya-base", + "CohereLabs/c4ai-command-a-03-2025", + "CohereLabs/c4ai-command-r7b-arabic-02-2025", + "CohereLabs/tiny-aya-water", + "CohereLabs/tiny-aya-fire", + "CohereLabs/command-a-reasoning-08-2025", + "CohereLabs/tiny-aya-earth" ] }, { - "architecture_id": "CohereForCausalLM", - "total_models": 10, + "architecture_id": "DeciLMForCausalLM", + "total_models": 9, "sample_models": [ - "trl-internal-testing/tiny-CohereForCausalLM", - "CohereLabs/aya-expanse-8b", - "CohereLabs/c4ai-command-r-v01", - "CohereLabs/aya-23-8B", - "NLPark/AnFeng_v3_Avocet", - "CohereLabs/aya-expanse-32b", - "CohereLabs/aya-23-35B", - "CohereLabs/c4ai-command-r-plus-08-2024", - "CohereLabs/c4ai-command-r-08-2024", - "CohereLabs/c4ai-command-r-plus" + "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5", + "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-NVFP4", + "Deci/DeciLM-7B-instruct", + "Deci/DeciLM-7B", + "NewstaR/Porpoise-6b-instruct", + "Danielbrdz/Barcenas-6b", + "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1", + "nvidia/Llama-3_1-Nemotron-51B-Instruct" ] }, { "architecture_id": "DFlashDraftModel", - "total_models": 10, + "total_models": 9, "sample_models": [ "z-lab/Qwen3-4B-DFlash-b16", "z-lab/Qwen3-8B-DFlash-b16", "z-lab/Qwen3.5-9B-DFlash", - "z-lab/Qwen3.5-4B-DFlash", - "z-lab/Qwen3.5-27B-DFlash", "z-lab/gpt-oss-20b-DFlash", "z-lab/gpt-oss-120b-DFlash", - "z-lab/Qwen3.5-35B-A3B-DFlash", "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat", - "z-lab/Qwen3-Coder-30B-A3B-DFlash" - ] - }, - { - "architecture_id": "RWForCausalLM", - "total_models": 10, - "sample_models": [ - "projecte-aina/aguila-7b", - "lightonai/alfred-40b-1023", - "explosion-testing/refined-web-model-test", - "vilm/vulture-40b", - "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2", - "nomic-ai/gpt4all-falcon", - "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3", - "OpenAssistant/falcon-40b-sft-top1-560", - "QuixiAI/WizardLM-Uncensored-Falcon-40b", - "mrm8488/falcoder-7b" - ] - }, - { - "architecture_id": "DeepseekV32ForCausalLM", - "total_models": 9, - "sample_models": [ - "deepseek-ai/DeepSeek-V3.2", - "deepseek-ai/DeepSeek-V3.2-Exp", - "nvidia/DeepSeek-V3.2-NVFP4", - "deepseek-ai/DeepSeek-V3.2-Speciale", - "deepseek-ai/DeepSeek-Math-V2", - "exolabs/DeepSeek-V3.2_bf16", - "deepseek-ai/DeepSeek-V3.2-Exp-Base", - "hyper-accel/tiny-random-deepseek-v32", - "cs2764/DeepSeek-V3.2_dq4-mlx" - ] - }, - { - "architecture_id": "Cohere2ForCausalLM", - "total_models": 9, - "sample_models": [ - "trl-internal-testing/tiny-Cohere2ForCausalLM", - "CohereLabs/tiny-aya-global", - "CohereLabs/c4ai-command-r7b-12-2024", - "CohereLabs/tiny-aya-base", - "CohereLabs/c4ai-command-r7b-arabic-02-2025", - "CohereLabs/c4ai-command-a-03-2025", - "CohereLabs/tiny-aya-water", - "CohereLabs/tiny-aya-fire", - "CohereLabs/tiny-aya-earth" - ] - }, - { - "architecture_id": "HunYuanDenseV1ForCausalLM", - "total_models": 9, - "sample_models": [ - "tencent/Hunyuan-7B-Instruct", - "tencent/Hunyuan-0.5B-Pretrain", - "tencent/Hunyuan-1.8B-Pretrain", - "tencent/Hunyuan-4B-Pretrain", - "tencent/Hunyuan-7B-Instruct-0124", - "tencent/Hunyuan-7B-Pretrain", - "tencent/Hunyuan-1.8B-Instruct", - "tencent/Hunyuan-0.5B-Instruct", - "tencent/Hunyuan-4B-Instruct" + "z-lab/Qwen3.5-35B-A3B-DFlash", + "z-lab/Qwen3-Coder-30B-A3B-DFlash", + "z-lab/Qwen3.5-4B-DFlash" ] }, { - "architecture_id": "HybridQwen3ForCausalLM", - "total_models": 9, + "architecture_id": "LlavaQwenForCausalLM", + "total_models": 8, "sample_models": [ - "amazon/GKA-primed-HQwen3-8B-Instruct", - "amazon/Mamba2-primed-HQwen3-8B-Instruct", - "amazon/GDN-primed-HQwen3-8B-Instruct", - "amazon/GDN-primed-HQwen3-32B-Instruct", - "amazon/GKA-primed-HQwen3-32B-Instruct", - "amazon/BMOJOF-primed-HQwen3-8B-Instruct", - "amazon/GKA-primed-HQwen3-8B-Reasoner", - "amazon/GDN-primed-HQwen3-8B-Reasoner", - "amazon/GKA-primed-HQwen3-32B-Reasoner" + "lmms-lab/llava-onevision-qwen2-7b-ov", + "lmms-lab/llava-onevision-qwen2-0.5b-ov", + "lmms-lab/llava-onevision-qwen2-7b-si", + "lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only", + "lmms-lab/llava-onevision-qwen2-7b-ov-chat", + "lmms-lab/llava-next-interleave-qwen-7b", + "lmms-lab/llava-onevision-qwen2-0.5b-si", + "lmms-lab/LongVA-7B" ] }, { - "architecture_id": "BartForConditionalGeneration", + "architecture_id": "MiniCPMForCausalLM", "total_models": 8, "sample_models": [ - "KomeijiForce/bart-large-emojilm", - "antalvdb/bart-base-spelling-nl", - "lmqg/bart-large-squad-qg", - "kengurukleo/deutsch_a2_transformer", - "shibing624/bart4csc-base-chinese", - "SkitCon/gec-spanish-BARTO-SYNTHETIC", - "Nargizi/screeve-lemmatizer", - "Tianlin668/MentalBART" + "openbmb/MiniCPM4.1-8B", + "openbmb/MiniCPM-2B-sft-bf16", + "openbmb/MiniCPM4-0.5B", + "openbmb/MiniCPM-1B-sft-bf16", + "openbmb/MiniCPM-MoE-8x2B", + "katuni4ka/tiny-random-minicpm", + "openbmb/MiniCPM4-8B", + "openbmb/MiniCPM-S-1B-sft" ] }, { - "architecture_id": "MambaForCausalLM", + "architecture_id": "MT5ForConditionalGeneration", "total_models": 8, "sample_models": [ - "state-spaces/mamba-130m-hf", - "state-spaces/mamba-2.8b-hf", - "state-spaces/mamba-1.4b-hf", - "state-spaces/mamba-370m-hf", - "state-spaces/mamba-790m-hf", - "NYTK/PULI-HuBA-mamba-130M", - "EchoLabs33/mamba-130m-hxq", - "TRI-ML/mamba-7b-rw" + "knowledgator/IUPAC2SMILES-canonical-base", + "knowledgator/SMILES2IUPAC-canonical-base", + "knowledgator/SMILES2IUPAC-canonical-small", + "bigscience/mt0-base", + "bigscience/mt0-small", + "HiTZ/Medical-mT5-large", + "bigscience/mt0-large", + "dreuxx26/Multilingual-grammar-Corrector-using-mT5-small" ] }, { - "architecture_id": "Lfm2MoeForCausalLM", + "architecture_id": "Qwen3_5ForCausalLM", "total_models": 8, "sample_models": [ - "farbodtavakkoli/OTel-LLM-24B-IT", - "LiquidAI/LFM2-8B-A1B", - "LiquidAI/LFM2-24B-A2B", - "LiquidAI/LFM2-8B-A1B-ONNX", - "LiquidAI/LFM2-24B-A2B-ONNX", - "unsloth/LFM2-8B-A1B", - "huihui-ai/Huihui-LFM2-24B-A2B-abliterated", - "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA" + "lukey03/Qwen3.5-9B-abliterated", + "osoleve/Qwen3.5-9B-Base-Text-NVFP4", + "Phonsiri/Qwen3.5-9B-Thai-Law-Base", + "Green-eyedDevil/Monika-9B", + "eerwitt/qwen-h-neurons-honest", + "rahul7star/albeit", + "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO", + "nahidstaq/html-section-retriever" ] }, { - "architecture_id": "BloomModel", + "architecture_id": "MPTForCausalLM", "total_models": 8, "sample_models": [ - "bigscience/bigscience-small-testing", - "TurkuNLP/gpt3-finnish-small", - "TurkuNLP/gpt3-finnish-large", - "TurkuNLP/gpt3-finnish-13B", - "BelleGroup/BELLE-7B-2M", - "norallm/norbloom-7b-scratch", - "Muennighoff/bloom-tiny-random", - "TurkuNLP/gpt3-finnish-xl" + "anas-awadalla/mpt-7b", + "wtang06/mpt-125m-c4", + "echarlaix/tiny-mpt-random-remote-code", + "lightblue/japanese-mpt-7b", + "vinai/PhoGPT-4B", + "Nethermind/Mpt-Instruct-DotNet-S", + "replit/replit-code-v1-3b", + "vinai/PhoGPT-4B-Chat" ] }, { "architecture_id": "ExaoneForCausalLM", "total_models": 7, "sample_models": [ - "LGAI-EXAONE/EXAONE-Deep-7.8B", "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", + "LGAI-EXAONE/EXAONE-Deep-7.8B", "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct", "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", "LGAI-EXAONE/EXAONE-3.5-32B-Instruct", @@ -651,67 +491,66 @@ ] }, { - "architecture_id": "Zamba2ForCausalLM", + "architecture_id": "BaichuanForCausalLM", "total_models": 7, "sample_models": [ - "Zyphra/Zamba2-1.2B-instruct", - "Zyphra/Zamba2-7B-Instruct", - "Zyphra/Zamba2-2.7B", - "EchoLabs33/zamba2-1.2b-hxq", - "Zyphra/Zamba2-2.7B-instruct", - "EchoLabs33/zamba2-2.7b-instruct-hxq", - "EchoLabs33/zamba2-7b-instruct-hxq" + "baichuan-inc/Baichuan2-7B-Chat", + "baichuan-inc/Baichuan2-13B-Chat", + "baichuan-inc/Baichuan-13B-Chat", + "katuni4ka/tiny-random-baichuan2", + "baichuan-inc/Baichuan2-7B-Base", + "katuni4ka/tiny-random-baichuan2-13b", + "baichuan-inc/Baichuan2-13B-Base" ] }, { - "architecture_id": "LlamaForCausalLMEagle3", + "architecture_id": "SmolLM3ForCausalLM", "total_models": 7, "sample_models": [ - "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", - "nvidia/gpt-oss-120b-Eagle3-long-context", - "nvidia/gpt-oss-120b-Eagle3-short-context", - "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh", - "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh", - "nvidia/gpt-oss-120b-Eagle3-throughput", - "chankhavu/c2.eagle3-test" + "HuggingFaceTB/SmolLM3-3B", + "HuggingFaceTB/SmolLM3-3B-Base", + "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM", + "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM", + "HuggingFaceTB/SmolLM3-3B-ONNX", + "N-Bot-Int/SmolSam3-MEMGRPO", + "toroe/SmolLM-3B-Science-ES" ] }, { - "architecture_id": "T5WithLMHeadModel", + "architecture_id": "ProGenForCausalLM", "total_models": 7, "sample_models": [ - "Salesforce/codet5-large", - "unicamp-dl/ptt5-base-portuguese-vocab", - "Salesforce/codet5-large-ntp-py", - "Rostlab/prot_t5_xl_bfd", - "unicamp-dl/ptt5-small-portuguese-vocab", - "gagan3012/k2t", - "unicamp-dl/ptt5-large-portuguese-vocab" + "hugohrban/progen2-base", + "hugohrban/progen2-small", + "hugohrban/progen2-medium", + "hugohrban/progen2-oas", + "hugohrban/progen2-small-mix7", + "hugohrban/progen2-large", + "hugohrban/progen2-xlarge" ] }, { - "architecture_id": "Rwkv6ForCausalLM", - "total_models": 7, + "architecture_id": "DeepseekV32ForCausalLM", + "total_models": 6, "sample_models": [ - "RWKV/v6-Finch-1B6-HF", - "RWKV/v6-Finch-7B-HF", - "RWKV/rwkv-6-world-1b6", - "RWKV/v6-Finch-14B-HF", - "RWKV/v6-Finch-3B-HF", - "RWKV/rwkv-6-world-7b", - "RWKV/rwkv-6-world-3b-v2.1" + "deepseek-ai/DeepSeek-V3.2", + "deepseek-ai/DeepSeek-V3.2-Exp", + "nvidia/DeepSeek-V3.2-NVFP4", + "deepseek-ai/DeepSeek-V3.2-Speciale", + "deepseek-ai/DeepSeek-Math-V2", + "cerebras/DeepSeek-V3.2-REAP-508B-A37B" ] }, { - "architecture_id": "GlmMoeDsaForCausalLM", + "architecture_id": "MambaForCausalLM", "total_models": 6, "sample_models": [ - "zai-org/GLM-5", - "nvidia/GLM-5-NVFP4", - "zai-org/GLM-5.1", - "cs2764/GLM-5-abliterated-dq4-mlx", - "0xSero/GLM-5-REAP-381B", - "cs2764/GLM-5-abliterated-dq3-mlx" + "state-spaces/mamba-130m-hf", + "state-spaces/mamba-2.8b-hf", + "state-spaces/mamba-370m-hf", + "state-spaces/mamba-1.4b-hf", + "state-spaces/mamba-790m-hf", + "TRI-ML/mamba-7b-rw" ] }, { @@ -727,154 +566,74 @@ ] }, { - "architecture_id": "DreamModel", + "architecture_id": "NemotronForCausalLM", "total_models": 6, "sample_models": [ - "Dream-org/Dream-v0-Instruct-7B", - "Dream-org/Dream-v0-Base-7B", - "Dream-org/Dream-Coder-v0-Instruct-7B", - "Zigeng/dParallel_Dream_7B_Instruct", - "Dream-org/Dream-Coder-v0-Base-7B", - "Dream-org/DreamOn-v0-7B" + "nvidia/Nemotron-Mini-4B-Instruct", + "nvidia/Minitron-8B-Base", + "badaoui/tiny-random-NemotronForCausalLM", + "nvidia/Minitron-4B-Base", + "thhaus/nemotron3-8b", + "dmvevents/Nemotron-Mini-4B-Instruct" ] }, { - "architecture_id": "Phi3VForCausalLM", + "architecture_id": "HyenaDNAForCausalLM", "total_models": 6, "sample_models": [ - "microsoft/Phi-3-vision-128k-instruct", - "TIGER-Lab/VLM2Vec-Full", - "yujiepan/phi-3-vision-tiny-random", - "furonghuang-lab/tracevla_phi3v", - "Desm0nt/Phi-3-HornyVision-128k-instruct", - "failspy/Phi-3-vision-128k-instruct-abliterated-alpha" + "LongSafari/hyenadna-small-32k-seqlen-hf", + "LongSafari/hyenadna-tiny-1k-seqlen-hf", + "LongSafari/hyenadna-large-1m-seqlen-hf", + "LongSafari/hyenadna-medium-450k-seqlen-hf", + "LongSafari/hyenadna-medium-160k-seqlen-hf", + "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf" ] }, { - "architecture_id": "SDARForCausalLM", + "architecture_id": "LlavaLlamaForCausalLM", "total_models": 6, "sample_models": [ - "JetLM/SDAR-1.7B-Chat", - "JetLM/SDAR-8B-Chat-b32", - "JetLM/SDAR-8B-Chat", - "JetLM/SDAR-1.7B-Chat-b32", - "JetLM/SDAR-4B-Chat", - "JetLM/SDAR-4B-Chat-b32" - ] - }, - { - "architecture_id": "HyenaDNAForCausalLM", - "total_models": 6, - "sample_models": [ - "LongSafari/hyenadna-small-32k-seqlen-hf", - "LongSafari/hyenadna-medium-450k-seqlen-hf", - "LongSafari/hyenadna-large-1m-seqlen-hf", - "LongSafari/hyenadna-tiny-1k-seqlen-hf", - "LongSafari/hyenadna-medium-160k-seqlen-hf", - "LongSafari/hyenadna-tiny-16k-seqlen-d128-hf" - ] - }, - { - "architecture_id": "AfmoeForCausalLM", - "total_models": 6, - "sample_models": [ - "arcee-ai/Trinity-Nano-Preview", - "arcee-ai/Trinity-Large-Thinking", - "arcee-ai/Trinity-Mini", - "arcee-ai/Trinity-Nano-Base", - "arcee-ai/Trinity-Mini-Base", - "arcee-ai/Trinity-Large-Preview" - ] - }, - { - "architecture_id": "AquilaForCausalLM", - "total_models": 6, - "sample_models": [ - "BAAI/AquilaChat2-7B", - "katuni4ka/tiny-random-aquilachat", - "katuni4ka/tiny-random-aquila2", - "BAAI/Aquila2-34B", - "BAAI/AquilaChat2-34B", - "BAAI/AquilaChat2-34B-16K" + "LanguageBind/Video-LLaVA-7B", + "lmms-lab/llama3-llava-next-8b", + "liuhaotian/llava-llama-2-13b-chat-lightning-preview", + "wisdomik/Quilt-Llava-v1.5-7b", + "ManishThota/Ollama_Video_llama_7B", + "ShareGPTVideo/LLaVA-Hound-Pretrain" ] }, { - "architecture_id": "OLMoForCausalLM", + "architecture_id": "LlavaLlamaModel", "total_models": 6, "sample_models": [ - "allenai/OLMo-7B-Instruct", - "allenai/OLMo-7B", - "allenai/OLMo-1B", - "allenai/OLMo-7B-0424", - "allenai/OLMo-7B-Twin-2T", - "allenai/OLMo-7B-SFT" + "Efficient-Large-Model/VILA1.5-3b", + "Efficient-Large-Model/NVILA-Lite-8B", + "Efficient-Large-Model/NVILA-8B", + "Efficient-Large-Model/NVILA-15B", + "Efficient-Large-Model/VILA1.5-13b", + "Efficient-Large-Model/Llama-3-VILA1.5-8B" ] }, { - "architecture_id": "DogeForCausalLM", + "architecture_id": "LLaMAForCausalLM", "total_models": 6, "sample_models": [ - "SmallDoge/Doge-320M", - "SmallDoge/Doge-20M", - "SmallDoge/Doge-160M", - "SmallDoge/Doge-60M", - "SmallDoge/Doge-120M-MoE", - "SmallDoge/Doge-20M-MoE" - ] - }, - { - "architecture_id": "SmolLM3ForCausalLM", - "total_models": 5, - "sample_models": [ - "HuggingFaceTB/SmolLM3-3B", - "HuggingFaceTB/SmolLM3-3B-Base", - "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM", - "unsloth/SmolLM3-3B", - "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM" - ] - }, - { - "architecture_id": "XLNetLMHeadModel", - "total_models": 5, - "sample_models": [ - "xlnet/xlnet-base-cased", - "xlnet/xlnet-large-cased", - "hfl/chinese-xlnet-base", - "sshleifer/tiny-xlnet-base-cased", - "textattack/xlnet-base-cased-imdb" - ] - }, - { - "architecture_id": "BioGptForCausalLM", - "total_models": 5, - "sample_models": [ - "microsoft/biogpt", - "microsoft/BioGPT-Large", - "microsoft/BioGPT-Large-PubMedQA", - "hf-tiny-model-private/tiny-random-BioGptForCausalLM", - "zequnl/molxpt" - ] - }, - { - "architecture_id": "Mistral3ForConditionalGeneration", - "total_models": 5, - "sample_models": [ - "farbodtavakkoli/OTel-LLM-3B-IT", - "RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4", - "ArmGPT/ArmenianGPT-1.0-3B", - "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L", - "odytrice/kenichi-flash" + "Enoch/llama-65b-hf", + "Rardilit/Panther_v1", + "James-WYang/BigTranslate", + "mncai/chatdoctor", + "heegyu/LIMA-13b", + "maicomputer/alpaca-13b" ] }, { - "architecture_id": "LLaDA2MoeModelLM", + "architecture_id": "LLaDAModelLM", "total_models": 5, "sample_models": [ - "inclusionAI/LLaDA2.1-flash", - "inclusionAI/LLaDA2.0-mini", - "inclusionAI/LLaDA2.1-mini", - "inclusionAI/LLaDA2.0-mini-CAP", - "inclusionAI/LLaDA2.0-flash" + "GSAI-ML/LLaDA-8B-Instruct", + "GSAI-ML/LLaDA-8B-Base", + "GSAI-ML/LLaDA-1.5", + "d3LLM/d3LLM_LLaDA", + "Fraser/LLaDA-8B-Base-gg2m" ] }, { @@ -882,2387 +641,1674 @@ "total_models": 5, "sample_models": [ "trl-internal-testing/tiny-FalconMambaForCausalLM", - "tiiuae/falcon-mamba-7b-instruct", - "tiiuae/falcon-mamba-7b", "tiiuae/falcon-mamba-tiny-dev", + "tiiuae/falcon-mamba-7b", + "tiiuae/falcon-mamba-7b-instruct", "tiiuae/Falcon3-Mamba-7B-Instruct" ] }, { - "architecture_id": "Eagle3Speculator", - "total_models": 5, - "sample_models": [ - "RedHatAI/Qwen3-8B-speculator.eagle3", - "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3", - "RedHatAI/Llama-3.3-70B-Instruct-speculator.eagle3", - "RedHatAI/Qwen3-32B-speculator.eagle3", - "RedHatAI/Qwen3-14B-speculator.eagle3" - ] - }, - { - "architecture_id": "NemotronForCausalLM", - "total_models": 5, - "sample_models": [ - "nvidia/Nemotron-Mini-4B-Instruct", - "nvidia/Minitron-8B-Base", - "nvidia/Minitron-4B-Base", - "badaoui/tiny-random-NemotronForCausalLM", - "thhaus/nemotron3-8b" - ] - }, - { - "architecture_id": "ProGenForCausalLM", - "total_models": 5, - "sample_models": [ - "hugohrban/progen2-base", - "hugohrban/progen2-small", - "hugohrban/progen2-medium", - "hugohrban/progen2-large", - "hugohrban/progen2-small-mix7" - ] - }, - { - "architecture_id": "Glm4ForCausalLM", + "architecture_id": "DreamModel", "total_models": 5, "sample_models": [ - "zai-org/GLM-4-9B-0414", - "zai-org/GLM-Z1-32B-0414", - "zai-org/GLM-Z1-9B-0414", - "zai-org/GLM-4-32B-0414", - "zai-org/GLM-4-32B-Base-0414" + "Dream-org/Dream-v0-Instruct-7B", + "Dream-org/Dream-v0-Base-7B", + "Dream-org/Dream-Coder-v0-Instruct-7B", + "d3LLM/d3LLM_Dream", + "Dream-org/Dream-Coder-v0-Base-7B" ] }, { - "architecture_id": "Eagle3DraftModel", + "architecture_id": "Eagle3Speculator", "total_models": 5, "sample_models": [ + "RedHatAI/Qwen3-8B-speculator.eagle3", "RedHatAI/gpt-oss-20b-speculator.eagle3", - "RedHatAI/gpt-oss-120b-speculator.eagle3", - "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3", - "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3", - "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3" - ] - }, - { - "architecture_id": "LlavaQwen2ForCausalLM", - "total_models": 5, - "sample_models": [ - "qnguyen3/nanoLLaVA", - "apple/FastVLM-0.5B", - "apple/FastVLM-1.5B", - "apple/FastVLM-7B", - "FreedomIntelligence/HuatuoGPT-Vision-7B" - ] - }, - { - "architecture_id": "JAISLMHeadModel", - "total_models": 5, - "sample_models": [ - "inceptionai/jais-13b-chat", - "katuni4ka/tiny-random-jais", - "inceptionai/jais-family-30b-8k", - "inceptionai/jais-13b", - "inceptionai/jais-family-13b-chat" - ] - }, - { - "architecture_id": "MoAMetricLM", - "total_models": 5, - "sample_models": [ - "reaperdoesntknow/MoA-150M", - "reaperdoesntknow/MoA-400M", - "reaperdoesntknow/MoA-155M", - "reaperdoesntknow/MoA-100M", - "reaperdoesntknow/DiscoverLM-70M" - ] - }, - { - "architecture_id": "PldrllmForCausalLM", - "total_models": 5, - "sample_models": [ - "fromthesky/PLDR-LLM-v51-SOC-110M-5", - "fromthesky/PLDR-LLM-v51-SOC-110M-2", - "fromthesky/PLDR-LLM-v51-SOC-110M-4", - "fromthesky/PLDR-LLM-v51-SOC-110M-3", - "fromthesky/PLDR-LLM-v51-SOC-110M-1" - ] - }, - { - "architecture_id": "MBartForConditionalGeneration", - "total_models": 5, - "sample_models": [ - "Pravopysnyk/best-unlp", - "DeepPavlov/mbart-large-50-ru-persona-chat", - "sn4kebyt3/ru-bart-large", - "MRNH/mbart-italian-grammar-corrector", - "MRNH/mbart-german-grammar-corrector" - ] - }, - { - "architecture_id": "PhiMoEForCausalLM", - "total_models": 4, - "sample_models": [ - "microsoft/Phi-tiny-MoE-instruct", - "microsoft/Phi-mini-MoE-instruct", - "microsoft/Phi-3.5-MoE-instruct", - "optimum-intel-internal-testing/phi-3.5-moe-tiny-random" - ] - }, - { - "architecture_id": "LlavaQwenForCausalLM", - "total_models": 4, - "sample_models": [ - "lmms-lab/llava-onevision-qwen2-7b-ov", - "lmms-lab/llava-onevision-qwen2-0.5b-ov", - "lmms-lab/llava-onevision-qwen2-0.5b-si", - "lmms-lab/llava-onevision-qwen2-7b-si" - ] - }, - { - "architecture_id": "Starcoder2ForCausalLM", - "total_models": 4, - "sample_models": [ - "bigcode/starcoder2-3b", - "bigcode/starcoder2-7b", - "bigcode/starcoder2-15b", - "bigcode/starcoder2-15b-instruct-v0.1" - ] - }, - { - "architecture_id": "GlmForCausalLM", - "total_models": 4, - "sample_models": [ - "zai-org/glm-4-9b-chat-hf", - "zai-org/glm-4-9b-hf", - "zai-org/glm-edge-4b-chat", - "zai-org/glm-edge-1.5b-chat" - ] - }, - { - "architecture_id": "OuroForCausalLM", - "total_models": 4, - "sample_models": [ - "ByteDance/Ouro-1.4B", - "ByteDance/Ouro-2.6B-Thinking", - "ByteDance/Ouro-2.6B", - "ByteDance/Ouro-1.4B-Thinking" + "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3", + "RedHatAI/Qwen3-32B-speculator.eagle3", + "RedHatAI/Qwen3-14B-speculator.eagle3" ] }, { "architecture_id": "SeedOssForCausalLM", - "total_models": 4, + "total_models": 5, "sample_models": [ "ByteDance-Seed/Seed-OSS-36B-Instruct", - "NousResearch/Hermes-4.3-36B", "ByteDance-Seed/Seed-OSS-36B-Base", - "mratsim/Seed-OSS-36B-Instruct-NVFP4" - ] - }, - { - "architecture_id": "ArceeForCausalLM", - "total_models": 4, - "sample_models": [ - "arcee-ai/AFM-4.5B-Base", - "optimum-intel-internal-testing/tiny-random-ArceeForCausalLM", - "onnx-internal-testing/tiny-random-ArceeForCausalLM", - "arcee-ai/AFM-4.5B" - ] - }, - { - "architecture_id": "BailingMoeV2ForCausalLM", - "total_models": 4, - "sample_models": [ - "inclusionAI/Ling-mini-2.0", - "inclusionAI/Ling-1T", - "inclusionAI/Ring-mini-2.0", - "inclusionAI/Ling-flash-2.0" - ] - }, - { - "architecture_id": "MobilintLlamaForCausalLM", - "total_models": 4, - "sample_models": [ - "mobilint/Llama-3.2-3B-Instruct", - "mobilint/Llama-3.2-1B-Instruct", - "mobilint/Llama-3.1-8B-Instruct", - "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B" - ] - }, - { - "architecture_id": "MobilintQwen2ForCausalLM", - "total_models": 4, - "sample_models": [ - "mobilint/Qwen2.5-1.5B-Instruct", - "mobilint/Qwen2.5-0.5B-Instruct", - "mobilint/Qwen2.5-3B-Instruct", - "mobilint/Qwen2.5-7B-Instruct" - ] - }, - { - "architecture_id": "MobilintQwen3ForCausalLM", - "total_models": 4, - "sample_models": [ - "mobilint/Qwen3-4B", - "mobilint/Qwen3-0.6B", - "mobilint/Qwen3-1.7B", - "mobilint/Qwen3-8B" + "NousResearch/Hermes-4.3-36B", + "mratsim/Seed-OSS-36B-Instruct-NVFP4", + "YanLabs/Seed-OSS-36B-Instruct-MPOA" ] }, { - "architecture_id": "MobilintExaoneForCausalLM", - "total_models": 4, + "architecture_id": "Ernie4_5_MoeForCausalLM", + "total_models": 5, "sample_models": [ - "mobilint/EXAONE-3.5-2.4B-Instruct", - "mobilint/EXAONE-Deep-2.4B", - "mobilint/EXAONE-3.5-7.8B-Instruct", - "mobilint/EXAONE-Deep-7.8B" + "baidu/ERNIE-4.5-21B-A3B-PT", + "baidu/ERNIE-4.5-21B-A3B-Base-PT", + "baidu/ERNIE-4.5-21B-A3B-Thinking", + "baidu/ERNIE-4.5-300B-A47B-PT", + "baidu/ERNIE-4.5-300B-A47B-Paddle" ] }, { - "architecture_id": "Qwen2_5_VLForConditionalGeneration", - "total_models": 4, - "sample_models": [ - "nvidia/Qwen2.5-VL-7B-Instruct-NVFP4", - "OmniSVG/OmniSVG1.1_4B", - "OmniSVG/OmniSVG1.1_8B", - "OmniSVG/OmniSVG" - ] - }, - { - "architecture_id": "IdeficsForVisionText2Text", - "total_models": 4, - "sample_models": [ - "HuggingFaceM4/idefics-80b-instruct", - "HuggingFaceM4/idefics-9b", - "HuggingFaceM4/idefics-9b-instruct", - "HuggingFaceM4/idefics-80b" - ] - }, - { - "architecture_id": "LISAForCausalLM", - "total_models": 4, - "sample_models": [ - "xinlai/LISA-13B-llama2-v1", - "xinlai/LISA-7B-v1", - "xinlai/LISA-7B-v1-explanatory", - "xinlai/LISA-13B-llama2-v1-explanatory" - ] - }, - { - "architecture_id": "LLaDAModelLM", - "total_models": 3, - "sample_models": [ - "GSAI-ML/LLaDA-8B-Instruct", - "GSAI-ML/LLaDA-8B-Base", - "GSAI-ML/LLaDA-1.5" - ] - }, - { - "architecture_id": "BambaForCausalLM", - "total_models": 3, - "sample_models": [ - "hmellor/tiny-random-BambaForCausalLM", - "ibm-ai-platform/Bamba-9B-v1", - "ibm-ai-platform/Bamba-9B-v2" - ] - }, - { - "architecture_id": "InternLMForCausalLM", - "total_models": 3, - "sample_models": [ - "internlm/internlm-chat-7b", - "internlm/internlm-20b", - "internlm/internlm-7b" - ] - }, - { - "architecture_id": "Ernie4_5_MoeForCausalLM", - "total_models": 3, - "sample_models": [ - "baidu/ERNIE-4.5-21B-A3B-PT", - "baidu/ERNIE-4.5-21B-A3B-Base-PT", - "baidu/ERNIE-4.5-21B-A3B-Thinking" - ] - }, - { - "architecture_id": "Exaone4ForCausalLM", - "total_models": 3, - "sample_models": [ - "LGAI-EXAONE/EXAONE-4.0-1.2B", - "LGAI-EXAONE/EXAONE-4.0.1-32B", - "LGAI-EXAONE/EXAONE-4.0-32B" - ] - }, - { - "architecture_id": "OlmoHybridForCausalLM", - "total_models": 3, - "sample_models": [ - "allenai/Olmo-Hybrid-7B", - "allenai/Olmo-Hybrid-Instruct-DPO-7B", - "allenai/Olmo-Hybrid-Instruct-SFT-7B" - ] - }, - { - "architecture_id": "Llama4ForCausalLM", - "total_models": 3, - "sample_models": [ - "trl-internal-testing/tiny-Llama4ForCausalLM", - "pruna-test/test-save-tiny-random-llama4-smashed", - "facebook/MobileLLM-R1.5-360M" - ] - }, - { - "architecture_id": "BitNetForCausalLM", - "total_models": 3, - "sample_models": [ - "microsoft/bitnet-b1.58-2B-4T", - "microsoft/bitnet-b1.58-2B-4T-bf16", - "iSolver-AI/FEnet" - ] - }, - { - "architecture_id": "IQuestCoderForCausalLM", - "total_models": 3, - "sample_models": [ - "IQuestLab/IQuest-Coder-V1-40B-Instruct", - "IQuestLab/IQuest-Coder-V1-7B-Instruct", - "Multilingual-Multimodal-NLP/IndustrialCoder" - ] - }, - { - "architecture_id": "XverseForCausalLM", - "total_models": 3, - "sample_models": [ - "xverse/XVERSE-7B-Chat", - "katuni4ka/tiny-random-xverse", - "xverse/XVERSE-13B-256K" - ] - }, - { - "architecture_id": "PersimmonForCausalLM", - "total_models": 3, - "sample_models": [ - "adept/persimmon-8b-chat", - "adept/persimmon-8b-base", - "pszemraj/perSLIMmon-8b-base" - ] - }, - { - "architecture_id": "RecurrentGemmaForCausalLM", - "total_models": 3, - "sample_models": [ - "google/recurrentgemma-2b", - "google/recurrentgemma-2b-it", - "google/recurrentgemma-9b" - ] - }, - { - "architecture_id": "Llama4ForConditionalGeneration", - "total_models": 3, - "sample_models": [ - "yujiepan/llama-4-tiny-random", - "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4", - "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4" - ] - }, - { - "architecture_id": "LlavaLlamaModel", - "total_models": 3, - "sample_models": [ - "Efficient-Large-Model/VILA1.5-3b", - "Efficient-Large-Model/NVILA-Lite-8B", - "Efficient-Large-Model/NVILA-8B" - ] - }, - { - "architecture_id": "AraGPT2LMHeadModel", - "total_models": 3, - "sample_models": [ - "QCRI/Fanar-2-Diwan", - "aubmindlab/aragpt2-mega", - "aubmindlab/aragpt2-large" - ] - }, - { - "architecture_id": "RITAModelForCausalLM", - "total_models": 3, - "sample_models": [ - "lightonai/RITA_s", - "lightonai/RITA_xl", - "lightonai/RITA_l" - ] - }, - { - "architecture_id": "NanoChatForCausalLM", - "total_models": 3, - "sample_models": [ - "Twobombs/nanochat-d34-sft-hf", - "pankajmathur/nanochat-d34-sft-hf", - "Nekochu/nanochat-d24" - ] - }, - { - "architecture_id": "MobileLlamaForCausalLM", - "total_models": 3, - "sample_models": [ - "mtgv/MobileVLM_V2-1.7B", - "mtgv/MobileVLM_V2-7B", - "mtgv/MobileVLM_V2-3B" - ] - }, - { - "architecture_id": "ParamBharatGenForCausalLM", - "total_models": 3, - "sample_models": [ - "bharatgenai/Param-1-5B", - "bharatgenai/AyurParam", - "bharatgenai/Param-1-2.9B-Instruct" - ] - }, - { - "architecture_id": "modeling_camelidae.LlamaForCausalLM", - "total_models": 3, - "sample_models": [ - "hywu/Camelidae-8x34B", - "hywu/Camelidae-8x7B", - "hywu/Camelidae-8x13B" - ] - }, - { - "architecture_id": "MptForCausalLM", - "total_models": 3, - "sample_models": [ - "yujiepan/mpt-tiny-random", - "explosion-testing/mpt-test", - "team-lucid/mptk-1b" - ] - }, - { - "architecture_id": "BlueLMForCausalLM", - "total_models": 3, - "sample_models": [ - "vivo-ai/BlueLM-7B-Chat", - "vivo-ai/BlueLM-7B-Base", - "vivo-ai/BlueLM-7B-Chat-32K" - ] - }, - { - "architecture_id": "LlamaMoEForCausalLM", - "total_models": 3, - "sample_models": [ - "llama-moe/LLaMA-MoE-v1-3_5B-2_8", - "llama-moe/LLaMA-MoE-v1-3_0B-2_16", - "llama-moe/LLaMA-MoE-v1-3_5B-4_16" - ] - }, - { - "architecture_id": "H2OVLChatModel", - "total_models": 2, - "sample_models": [ - "h2oai/h2ovl-mississippi-800m", - "h2oai/h2ovl-mississippi-2b" - ] - }, - { - "architecture_id": "KimiK25ForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "nvidia/Kimi-K2.5-NVFP4", - "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B" - ] - }, - { - "architecture_id": "HCXVisionV2ForCausalLM", - "total_models": 2, - "sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", - "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" - ] - }, - { - "architecture_id": "SolarOpenForCausalLM", - "total_models": 2, - "sample_models": [ - "upstage/Solar-Open-100B", - "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4" - ] - }, - { - "architecture_id": "OpenAIGPTLMHeadModel", - "total_models": 2, - "sample_models": [ - "openai-community/openai-gpt", - "lgaalves/gpt1" - ] - }, - { - "architecture_id": "MoshiForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "kmhf/hf-moshiko", - "kmhf/hf-moshika" - ] - }, - { - "architecture_id": "SarvamMLAForCausalLM", - "total_models": 2, - "sample_models": [ - "aoxo/sarvam-105b-uncensored", - "sarvamai/sarvam-105b" - ] - }, - { - "architecture_id": "ReformerModelWithLMHead", - "total_models": 2, - "sample_models": [ - "google/reformer-crime-and-punishment", - "google/reformer-enwik8" - ] - }, - { - "architecture_id": "GPTNeoXJapaneseForCausalLM", - "total_models": 2, - "sample_models": [ - "abeja/gpt-neox-japanese-2.7b", - "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM" - ] - }, - { - "architecture_id": "SarvamMoEForCausalLM", - "total_models": 2, - "sample_models": [ - "aoxo/sarvam-30b-uncensored", - "sarvamai/sarvam-30b" - ] - }, - { - "architecture_id": "MiMoForCausalLM", - "total_models": 2, - "sample_models": [ - "XiaomiMiMo/MiMo-7B-Base", - "XiaomiMiMo/MiMo-7B-RL" - ] - }, - { - "architecture_id": "StarVectorForCausalLM", - "total_models": 2, - "sample_models": [ - "starvector/starvector-1b-im2svg", - "starvector/starvector-8b-im2svg" - ] - }, - { - "architecture_id": "BaiChuanForCausalLM", - "total_models": 2, - "sample_models": [ - "baichuan-inc/Baichuan-7B", - "FreedomIntelligence/HuatuoGPT-7B" - ] - }, - { - "architecture_id": "MiniMaxM1ForCausalLM", - "total_models": 2, - "sample_models": [ - "MiniMaxAI/MiniMax-M1-40k", - "MiniMaxAI/MiniMax-M1-80k" - ] - }, - { - "architecture_id": "DeepseekForCausalLM", - "total_models": 2, - "sample_models": [ - "deepseek-ai/deepseek-moe-16b-base", - "deepseek-ai/deepseek-moe-16b-chat" - ] - }, - { - "architecture_id": "Phi3SmallForCausalLM", - "total_models": 2, - "sample_models": [ - "microsoft/Phi-3-small-8k-instruct", - "microsoft/Phi-3-small-128k-instruct" - ] - }, - { - "architecture_id": "EchoForCausalLM", - "total_models": 2, - "sample_models": [ - "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT", - "ethicalabs/Echo-DSRN-114M-Base" - ] - }, - { - "architecture_id": "Ernie4_5ForCausalLM", - "total_models": 2, - "sample_models": [ - "baidu/ERNIE-4.5-0.3B-PT", - "baidu/ERNIE-4.5-0.3B-Base-PT" - ] - }, - { - "architecture_id": "OrionForCausalLM", - "total_models": 2, - "sample_models": [ - "OrionStarAI/Orion-14B-Chat", - "OrionStarAI/Orion-14B-Base" - ] - }, - { - "architecture_id": "NemotronFlashForCausalLM", - "total_models": 2, - "sample_models": [ - "nvidia/Nemotron-Flash-3B", - "nvidia/Nemotron-Flash-1B" - ] - }, - { - "architecture_id": "AXK1ForCausalLM", - "total_models": 2, - "sample_models": [ - "skt/A.X-K1", - "thkim93/axk1-2layers" - ] - }, - { - "architecture_id": "DbrxForCausalLM", - "total_models": 2, - "sample_models": [ - "trl-internal-testing/tiny-DbrxForCausalLM", - "katuni4ka/tiny-random-dbrx" - ] - }, - { - "architecture_id": "Dots1ForCausalLM", - "total_models": 2, - "sample_models": [ - "rednote-hilab/dots.llm1.inst", - "rednote-hilab/dots.llm1.base" - ] - }, - { - "architecture_id": "FlexOlmoForCausalLM", - "total_models": 2, - "sample_models": [ - "allenai/Flex-reddit-2x7B-1T", - "allenai/FlexOlmo-7x7B-1T-RT" - ] - }, - { - "architecture_id": "ChatGLMModel", - "total_models": 2, - "sample_models": [ - "zai-org/codegeex4-all-9b", - "zai-org/glm-4-9b" - ] - }, - { - "architecture_id": "CLIPT5ForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "zhiqiulin/clip-flant5-xl", - "zhiqiulin/clip-flant5-xxl" - ] - }, - { - "architecture_id": "PenguinVLQwen3ForCausalLM", - "total_models": 2, - "sample_models": [ - "tencent/Penguin-VL-8B", - "tencent/Penguin-VL-2B" - ] - }, - { - "architecture_id": "StripedHyenaModelForCausalLM", - "total_models": 2, - "sample_models": [ - "togethercomputer/evo-1-131k-base", - "togethercomputer/evo-1-8k-base" - ] - }, - { - "architecture_id": "CrystalCoderLMHeadModel", - "total_models": 2, - "sample_models": [ - "LLM360/Crystal", - "LLM360/CrystalChat" - ] - }, - { - "architecture_id": "JetNemotronForCausalLM", - "total_models": 2, - "sample_models": [ - "jet-ai/Jet-Nemotron-2B", - "jet-ai/Jet-Nemotron-4B" - ] - }, - { - "architecture_id": "Mamba2ForCausalLM", - "total_models": 2, - "sample_models": [ - "deqing/mamba2-300M-v5-mamba2", - "EchoLabs33/mamba2-1.3b-hxq" - ] - }, - { - "architecture_id": "MolformerForCausalLM", - "total_models": 2, - "sample_models": [ - "ibm-research/GP-MoLFormer-Uniq", - "ralyn/NPComposer-v2" - ] - }, - { - "architecture_id": "CogVLMForCausalLM", - "total_models": 2, - "sample_models": [ - "zai-org/cogvlm2-llama3-chat-19B", - "zai-org/cogvlm-chat-hf" - ] - }, - { - "architecture_id": "Jais2ForCausalLM", - "total_models": 2, - "sample_models": [ - "inceptionai/Jais-2-8B-Chat", - "inceptionai/Jais-2-70B-Chat" - ] - }, - { - "architecture_id": "Qwen2ForSequenceClassification", - "total_models": 2, - "sample_models": [ - "nvidia/AceMath-7B-RM", - "nvidia/Qwen2.5-CascadeRL-RM-72B" - ] - }, - { - "architecture_id": "ChatGLMForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "IAAR-Shanghai/xVerify-9B-C", - "qiuhuachuan/MeChat" - ] - }, - { - "architecture_id": "RavenForCausalLM", - "total_models": 2, - "sample_models": [ - "tomg-group-umd/huginn-0125", - "smcleish/Recurrent-Llama-3.2-train-recurrence-32" - ] - }, - { - "architecture_id": "YoutuForCausalLM", - "total_models": 2, - "sample_models": [ - "tencent/Youtu-LLM-2B-Base", - "tencent/Youtu-LLM-2B" - ] - }, - { - "architecture_id": "Qwen3VLMoeForConditionalGeneration", - "total_models": 2, - "sample_models": [ - "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4", - "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1" - ] - }, - { - "architecture_id": "MosaicGPT", - "total_models": 2, - "sample_models": [ - "anas-awadalla/mpt-1b-redpajama-200b", - "anas-awadalla/mpt-1b-redpajama-200b-dolly" - ] - }, - { - "architecture_id": "GTLMForCausalLM", - "total_models": 2, - "sample_models": [ - "Madras1/GTLM-1-2B-A350M", - "Madras1/GTLM-1-2B-A350M-fp16" - ] - }, - { - "architecture_id": "GPT2Model", - "total_models": 2, - "sample_models": [ - "cerebras/Cerebras-GPT-13B", - "keshan/sinhala-gpt2" - ] - }, - { - "architecture_id": "LiquidForCausalLM", - "total_models": 2, - "sample_models": [ - "reaperdoesntknow/DNA-175M", - "reaperdoesntknow/DNA-50M" - ] - }, - { - "architecture_id": "ModernBertDecoderForCausalLM", - "total_models": 2, - "sample_models": [ - "jhu-clsp/ettin-decoder-400m", - "jhu-clsp/ettin-decoder-32m" - ] - }, - { - "architecture_id": "GLAForCausalLM", - "total_models": 2, - "sample_models": [ - "fla-hub/gla-340M-15B", - "fla-hub/gla-1.3B-100B" - ] - }, - { - "architecture_id": "DuchifatCore", - "total_models": 2, - "sample_models": [ - "Raziel1234/Duchifat-2", - "razielAI/Duchifat-2.1-Instruct" - ] - }, - { - "architecture_id": "KonkanGPT", - "total_models": 2, - "sample_models": [ - "omdeep22/Gonyai-teo2", - "omdeep22/Gonyai-v1" - ] - }, - { - "architecture_id": "BertLMHeadModel", - "total_models": 2, - "sample_models": [ - "dicta-il/BEREL_3.0", - "hf-tiny-model-private/tiny-random-BertLMHeadModel" - ] - }, - { - "architecture_id": "RobertaForCausalLM", - "total_models": 2, - "sample_models": [ - "gokceuludogan/ChemBERTaLM", - "uf-aice-lab/math-roberta" - ] - }, - { - "architecture_id": "MossForCausalLM", - "total_models": 2, - "sample_models": [ - "OpenMOSS-Team/moss-moon-003-sft", - "OpenMOSS-Team/moss-moon-003-base" - ] - }, - { - "architecture_id": "WeDLMForCausalLM", - "total_models": 2, - "sample_models": [ - "tencent/WeDLM-8B-Base", - "tencent/WeDLM-8B-Instruct" - ] - }, - { - "architecture_id": "Rwkv5ForCausalLM", - "total_models": 2, - "sample_models": [ - "RWKV/rwkv-5-world-3b", - "RWKV/rwkv-5-world-1b5" - ] - }, - { - "architecture_id": "BartForCausalLM", - "total_models": 2, - "sample_models": [ - "sanchit-gandhi/tiny-random-bart-fp16", - "hf-tiny-model-private/tiny-random-BartForCausalLM" - ] - }, - { - "architecture_id": "BitnetForCausalLM", - "total_models": 2, + "architecture_id": "SDARForCausalLM", + "total_models": 5, "sample_models": [ - "1bitLLM/bitnet_b1_58-large", - "1bitLLM/bitnet_b1_58-3B" + "JetLM/SDAR-8B-Chat-b32", + "JetLM/SDAR-4B-Chat-b32", + "JetLM/SDAR-8B-Chat", + "JetLM/SDAR-1.7B-Chat-b32", + "JetLM/SDAR-1.7B-Chat" ] }, { - "architecture_id": "Int8OPTForCausalLM", - "total_models": 2, + "architecture_id": "BloomModel", + "total_models": 5, "sample_models": [ - "mit-han-lab/opt-125m-smoothquant", - "mit-han-lab/opt-6.7b-smoothquant" + "bigscience/bigscience-small-testing", + "TurkuNLP/gpt3-finnish-small", + "TurkuNLP/gpt3-finnish-large", + "TurkuNLP/gpt3-finnish-13B", + "TurkuNLP/gpt3-finnish-xl" ] }, { - "architecture_id": "Olmo2ForSequenceClassification", - "total_models": 2, + "architecture_id": "AfmoeForCausalLM", + "total_models": 5, "sample_models": [ - "allenai/OLMo-2-1124-7B-RM", - "LifeWiki-ai/OLMo-2-1124-7B-RM" + "arcee-ai/Trinity-Nano-Preview", + "arcee-ai/Trinity-Mini", + "arcee-ai/Trinity-Large-Preview", + "arcee-ai/Trinity-Nano-Base", + "arcee-ai/Trinity-Mini-Base" ] }, { - "architecture_id": "TranceptionLMHeadModel", - "total_models": 2, + "architecture_id": "LlavaQwen2ForCausalLM", + "total_models": 5, "sample_models": [ - "PascalNotin/Tranception_Large", - "PascalNotin/Tranception_Small" + "apple/FastVLM-0.5B", + "qnguyen3/nanoLLaVA", + "apple/FastVLM-1.5B", + "apple/FastVLM-7B", + "FreedomIntelligence/HuatuoGPT-Vision-7B" ] }, { - "architecture_id": "MultiScaleForCausalLM", - "total_models": 2, + "architecture_id": "HunYuanDenseV1ForCausalLM", + "total_models": 5, "sample_models": [ - "KoinicLabs/AXL-Vision-v2", - "KoinicLabs/AXL-Translate" + "tencent/Hunyuan-7B-Instruct", + "tencent/Hunyuan-0.5B-Pretrain", + "tencent/Hunyuan-4B-Instruct", + "tencent/Hunyuan-0.5B-Instruct", + "tencent/Hunyuan-1.8B-Instruct" ] }, { - "architecture_id": "GPT", - "total_models": 2, + "architecture_id": "PhiMoEForCausalLM", + "total_models": 4, "sample_models": [ - "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M", - "LH-Tech-AI/Apex-1.5-Instruct-350M" + "microsoft/Phi-tiny-MoE-instruct", + "microsoft/Phi-mini-MoE-instruct", + "microsoft/Phi-3.5-MoE-instruct", + "optimum-intel-internal-testing/phi-3.5-moe-tiny-random" ] }, { - "architecture_id": "BolmoForCausalLM", - "total_models": 2, + "architecture_id": "Starcoder2ForCausalLM", + "total_models": 4, "sample_models": [ - "allenai/Bolmo-1B", - "allenai/Bolmo-7B" + "bigcode/starcoder2-3b", + "bigcode/starcoder2-15b", + "bigcode/starcoder2-7b", + "bigcode/starcoder2-15b-instruct-v0.1" ] }, { - "architecture_id": "OpenMoeForCausalLM", - "total_models": 2, + "architecture_id": "Lfm2MoeForCausalLM", + "total_models": 4, "sample_models": [ - "hpcai-tech/openmoe-8B", - "OrionZheng/openmoe-8b" + "LiquidAI/LFM2-8B-A1B", + "LiquidAI/LFM2-24B-A2B", + "huihui-ai/Huihui-LFM2-24B-A2B-abliterated", + "huihui-ai/Huihui-LFM2-8B-A1B-abliterated" ] }, { - "architecture_id": "MiniMindForCausalLM", - "total_models": 2, + "architecture_id": "LLaDA2MoeModelLM", + "total_models": 4, "sample_models": [ - "yiwenX/MiniMind-MoE-640-120M", - "chujiamo/baiheng_0405" + "inclusionAI/LLaDA2.0-mini", + "inclusionAI/LLaDA2.1-mini", + "inclusionAI/LLaDA2.1-flash", + "inclusionAI/LLaDA2.0-flash" ] }, { - "architecture_id": "RWKV7ForCausalLM", - "total_models": 2, + "architecture_id": "LlamaForCausalLMEagle3", + "total_models": 4, "sample_models": [ - "RWKV/RWKV7-Goose-World3-1.5B-HF", - "fla-hub/rwkv7-1.5B-world" + "nvidia/gpt-oss-120b-Eagle3-short-context", + "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3", + "nvidia/gpt-oss-120b-Eagle3-long-context", + "nvidia/gpt-oss-120b-Eagle3-throughput" ] }, { - "architecture_id": "BottleneckT5LMWithPerturb", - "total_models": 2, + "architecture_id": "DeepseekForCausalLM", + "total_models": 4, "sample_models": [ - "thesephist/contra-bottleneck-t5-small-wikipedia", - "thesephist/contra-bottleneck-t5-base-wikipedia" + "deepseek-ai/deepseek-moe-16b-base", + "deepseek-ai/deepseek-moe-16b-chat", + "ai-sage/GigaChat-20B-A3B-base", + "ai-sage/GigaChat-20B-A3B-instruct" ] }, { - "architecture_id": "StableDiffcoderForCausalLM", - "total_models": 2, + "architecture_id": "OlmoHybridForCausalLM", + "total_models": 4, "sample_models": [ - "ByteDance-Seed/Stable-DiffCoder-8B-Instruct", - "ByteDance-Seed/Stable-DiffCoder-8B-Base" + "allenai/Olmo-Hybrid-7B", + "allenai/Olmo-Hybrid-Instruct-DPO-7B", + "allenai/Olmo-Hybrid-Instruct-SFT-7B", + "allenai/Olmo-Hybrid-Think-SFT-7B" ] }, { - "architecture_id": "OtterForConditionalGeneration", - "total_models": 2, + "architecture_id": "OuroForCausalLM", + "total_models": 4, "sample_models": [ - "luodian/OTTER-Video-LLaMA7B-DenseCaption", - "luodian/OTTER-MPT1B-RPJama-Init" + "ByteDance/Ouro-1.4B", + "ByteDance/Ouro-2.6B-Thinking", + "ByteDance/Ouro-1.4B-Thinking", + "ByteDance/Ouro-2.6B" ] }, { - "architecture_id": "MonkeyLMHeadModel", - "total_models": 2, + "architecture_id": "Glm4ForCausalLM", + "total_models": 4, "sample_models": [ - "echo840/Monkey-Chat", - "echo840/Monkey" + "zai-org/GLM-4-9B-0414", + "zai-org/GLM-Z1-32B-0414", + "zai-org/GLM-Z1-9B-0414", + "zai-org/GLM-4-32B-0414" ] }, { - "architecture_id": "IndexForCausalLM", - "total_models": 2, + "architecture_id": "ArceeForCausalLM", + "total_models": 4, "sample_models": [ - "IndexTeam/Index-1.9B-Chat", - "IndexTeam/Index-1.9B-Pure" + "optimum-intel-internal-testing/tiny-random-ArceeForCausalLM", + "arcee-ai/AFM-4.5B-Base", + "onnx-internal-testing/tiny-random-ArceeForCausalLM", + "arcee-ai/AFM-4.5B" ] }, { - "architecture_id": "PointLLMLlamaForCausalLM", - "total_models": 2, + "architecture_id": "BailingMoeV2ForCausalLM", + "total_models": 4, "sample_models": [ - "RunsenXu/PointLLM_7B_v1.1_init", - "RunsenXu/PointLLM_7B_v1.2" + "inclusionAI/Ling-mini-2.0", + "inclusionAI/Ling-1T", + "inclusionAI/Ring-mini-2.0", + "inclusionAI/Ling-flash-2.0" ] }, { - "architecture_id": "T5EncoderModel", - "total_models": 1, + "architecture_id": "AquilaForCausalLM", + "total_models": 4, "sample_models": [ - "XLabs-AI/xflux_text_encoders" + "BAAI/AquilaChat2-7B", + "katuni4ka/tiny-random-aquila2", + "BAAI/Aquila2-34B", + "katuni4ka/tiny-random-aquilachat" ] }, { - "architecture_id": "Step3p5ForCausalLM", - "total_models": 1, + "architecture_id": "RWForCausalLM", + "total_models": 4, "sample_models": [ - "stepfun-ai/Step-3.5-Flash" + "lightonai/alfred-40b-1023", + "vilm/vulture-40b", + "explosion-testing/refined-web-model-test", + "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2" ] }, { - "architecture_id": "AprielForCausalLM", - "total_models": 1, + "architecture_id": "XLNetLMHeadModel", + "total_models": 3, "sample_models": [ - "ServiceNow-AI/Apriel-5B-Instruct" + "xlnet/xlnet-base-cased", + "xlnet/xlnet-large-cased", + "sshleifer/tiny-xlnet-base-cased" ] }, { - "architecture_id": "IlamaForCausalLM", - "total_models": 1, + "architecture_id": "BioGptForCausalLM", + "total_models": 3, "sample_models": [ - "hmellor/Ilama-3.2-1B" + "microsoft/biogpt", + "microsoft/BioGPT-Large", + "microsoft/BioGPT-Large-PubMedQA" ] }, { - "architecture_id": "XCurOSForCausalLM", - "total_models": 1, + "architecture_id": "BambaForCausalLM", + "total_models": 3, "sample_models": [ - "XCurOS/XCurOS-0.1-8B-Instruct" + "hmellor/tiny-random-BambaForCausalLM", + "ibm-ai-platform/Bamba-9B-v1", + "ibm-ai-platform/Bamba-9B-v2" ] }, { - "architecture_id": "TarsierForConditionalGeneration", - "total_models": 1, + "architecture_id": "Exaone4ForCausalLM", + "total_models": 3, "sample_models": [ - "omni-research/Tarsier-7b" + "LGAI-EXAONE/EXAONE-4.0.1-32B", + "LGAI-EXAONE/EXAONE-4.0-1.2B", + "LGAI-EXAONE/EXAONE-4.0-32B" ] }, { - "architecture_id": "Plamo2ForCausalLM", - "total_models": 1, + "architecture_id": "MiMoForCausalLM", + "total_models": 3, "sample_models": [ - "pfnet/plamo-2-1b" + "XiaomiMiMo/MiMo-7B-Base", + "XiaomiMiMo/MiMo-7B-RL", + "XiaomiMiMo/MiMo-7B-SFT" ] }, { - "architecture_id": "HCXVisionForCausalLM", - "total_models": 1, + "architecture_id": "T5WithLMHeadModel", + "total_models": 3, "sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" + "Rostlab/prot_t5_xl_bfd", + "Salesforce/codet5-large", + "unicamp-dl/ptt5-base-portuguese-vocab" ] }, { - "architecture_id": "KimiLinearForCausalLM", - "total_models": 1, + "architecture_id": "GlmMoeDsaForCausalLM", + "total_models": 3, "sample_models": [ - "moonshotai/Kimi-Linear-48B-A3B-Instruct" + "zai-org/GLM-5", + "nvidia/GLM-5-NVFP4", + "cs2764/GLM-5_dq3-mlx" ] }, { - "architecture_id": "MiMoV2FlashForCausalLM", - "total_models": 1, + "architecture_id": "Step3p5ForCausalLM", + "total_models": 3, "sample_models": [ - "XiaomiMiMo/MiMo-V2-Flash" + "stepfun-ai/Step-3.5-Flash", + "tacos4me/Step-3.5-Flash-NVFP4", + "stepfun-ai/Step-3.5-Flash-Base" ] }, { - "architecture_id": "LongcatFlashForCausalLM", - "total_models": 1, + "architecture_id": "Zamba2ForCausalLM", + "total_models": 3, "sample_models": [ - "meituan-longcat/LongCat-Flash-Chat" + "Zyphra/Zamba2-1.2B-instruct", + "Zyphra/Zamba2-7B-Instruct", + "Zyphra/Zamba2-2.7B" ] }, { - "architecture_id": "InternLM3ForCausalLM", - "total_models": 1, + "architecture_id": "InternLMForCausalLM", + "total_models": 3, "sample_models": [ - "internlm/internlm3-8b-instruct" + "internlm/internlm-chat-7b", + "internlm/internlm-20b", + "internlm/internlm-7b" ] }, { - "architecture_id": "HyperCLOVAXForCausalLM", - "total_models": 1, + "architecture_id": "GlmForCausalLM", + "total_models": 3, "sample_models": [ - "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" + "zai-org/glm-4-9b-chat-hf", + "zai-org/glm-4-9b-hf", + "zai-org/glm-edge-1.5b-chat" ] }, { - "architecture_id": "GritLM", - "total_models": 1, + "architecture_id": "NemotronFlashForCausalLM", + "total_models": 3, "sample_models": [ - "parasail-ai/GritLM-7B-vllm" + "nvidia/Nemotron-Flash-3B", + "nvidia/Nemotron-Flash-3B-Instruct", + "nvidia/Nemotron-Flash-1B" ] }, { - "architecture_id": "BailingMoeV2_5ForCausalLM", - "total_models": 1, + "architecture_id": "Mistral3ForConditionalGeneration", + "total_models": 3, "sample_models": [ - "inclusionAI/Ring-2.5-1T" + "RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4", + "ArmGPT/ArmenianGPT-1.0-3B", + "srs6901/SOLARized-GraniStral-14B_2102_YeAM-HCT_32QKV" ] }, { - "architecture_id": "ExaoneMoEForCausalLM", - "total_models": 1, + "architecture_id": "Llama4ForConditionalGeneration", + "total_models": 3, "sample_models": [ - "LGAI-EXAONE/K-EXAONE-236B-A23B" + "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4", + "yujiepan/llama-4-tiny-random", + "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4" ] }, { - "architecture_id": "Grok1ModelForCausalLM", - "total_models": 1, + "architecture_id": "PersimmonForCausalLM", + "total_models": 3, "sample_models": [ - "hpcai-tech/grok-1" + "adept/persimmon-8b-chat", + "adept/persimmon-8b-base", + "pszemraj/perSLIMmon-8b-base" ] }, { - "architecture_id": "BailingMoeForCausalLM", - "total_models": 1, + "architecture_id": "JAISLMHeadModel", + "total_models": 3, "sample_models": [ - "inclusionAI/Ling-lite-1.5" + "inceptionai/jais-13b-chat", + "katuni4ka/tiny-random-jais", + "inceptionai/jais-13b" ] }, { - "architecture_id": "SolarForCausalLM", - "total_models": 1, + "architecture_id": "TrillionForCausalLM", + "total_models": 3, "sample_models": [ - "upstage/solar-pro-preview-instruct" + "trillionlabs/Tri-21B-Think", + "trillionlabs/Tri-21B", + "trillionlabs/Tri-21B-Think-Preview" ] }, { - "architecture_id": "HunYuanMoEV1ForCausalLM", - "total_models": 1, + "architecture_id": "IdeficsForVisionText2Text", + "total_models": 3, "sample_models": [ - "tencent/Hunyuan-A13B-Instruct" + "HuggingFaceM4/idefics-80b-instruct", + "HuggingFaceM4/idefics-9b", + "HuggingFaceM4/idefics-9b-instruct" ] }, { - "architecture_id": "GptOssPuzzleForCausalLM", - "total_models": 1, + "architecture_id": "OLMoForCausalLM", + "total_models": 3, "sample_models": [ - "nvidia/gpt-oss-puzzle-88B" + "allenai/OLMo-1B", + "allenai/OLMo-7B-Instruct", + "allenai/OLMo-7B" ] }, { - "architecture_id": "MiniMaxForCausalLM", - "total_models": 1, + "architecture_id": "modeling_camelidae.LlamaForCausalLM", + "total_models": 3, "sample_models": [ - "MiniMaxAI/MiniMax-Text-01-hf" + "hywu/Camelidae-8x34B", + "hywu/Camelidae-8x13B", + "hywu/Camelidae-8x7B" ] }, { - "architecture_id": "ModernBertForSequenceClassification", - "total_models": 1, + "architecture_id": "LISAForCausalLM", + "total_models": 3, "sample_models": [ - "opendatalab/meta-rater-professionalism-rating" + "xinlai/LISA-13B-llama2-v1", + "xinlai/LISA-7B-v1", + "MBZUAI/GLaMM-GranD-Pretrained" ] }, { - "architecture_id": "MiniCPM3ForCausalLM", - "total_models": 1, + "architecture_id": "RWKV7ForCausalLM", + "total_models": 3, "sample_models": [ - "openbmb/MiniCPM3-4B" + "RWKV/RWKV7-Goose-World3-1.5B-HF", + "fla-hub/rwkv7-1.5B-world", + "RWKV/RWKV7-Goose-World3-2.9B-HF" ] }, { - "architecture_id": "ArcticForCausalLM", - "total_models": 1, + "architecture_id": "MotifForCausalLM", + "total_models": 3, "sample_models": [ - "Snowflake/snowflake-arctic-instruct" + "Motif-Technologies/Motif-2-12.7B-Base", + "Motif-Technologies/Motif-2-12.7B-Instruct", + "Motif-Technologies/Motif-2.6b-v1.1-LC" ] }, { - "architecture_id": "IQuestLoopCoderForCausalLM", - "total_models": 1, + "architecture_id": "H2OVLChatModel", + "total_models": 2, "sample_models": [ - "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct" + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b" ] }, { - "architecture_id": "Plamo3ForCausalLM", - "total_models": 1, + "architecture_id": "KimiK25ForConditionalGeneration", + "total_models": 2, "sample_models": [ - "pfnet/plamo-3-nict-2b-base" + "nvidia/Kimi-K2.5-NVFP4", + "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B" ] }, { - "architecture_id": "TransformerForCausalLM", - "total_models": 1, + "architecture_id": "HCXVisionV2ForCausalLM", + "total_models": 2, "sample_models": [ - "fla-hub/transformer-1.3B-100B" + "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B", + "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B" ] }, { - "architecture_id": "Moondream", - "total_models": 1, + "architecture_id": "OpenAIGPTLMHeadModel", + "total_models": 2, "sample_models": [ - "vikhyatk/moondream1" + "openai-community/openai-gpt", + "lgaalves/gpt1" ] }, { - "architecture_id": "GraphT5TransformerForConditionalGeneration", - "total_models": 1, + "architecture_id": "MoshiForConditionalGeneration", + "total_models": 2, "sample_models": [ - "haitengzhao/gimlet" + "kmhf/hf-moshiko", + "kmhf/hf-moshika" ] }, { - "architecture_id": "GPT2LMHeadCustomModel", - "total_models": 1, + "architecture_id": "ReformerModelWithLMHead", + "total_models": 2, "sample_models": [ - "bigcode/santacoder" + "google/reformer-crime-and-punishment", + "google/reformer-enwik8" ] }, { - "architecture_id": "GPTRefactForCausalLM", - "total_models": 1, + "architecture_id": "Phi3VForCausalLM", + "total_models": 2, "sample_models": [ - "refactai/Refact-1_6B-fim" + "microsoft/Phi-3-vision-128k-instruct", + "TIGER-Lab/VLM2Vec-Full" ] }, { - "architecture_id": "TrillionForCausalLM", - "total_models": 1, + "architecture_id": "BartForConditionalGeneration", + "total_models": 2, "sample_models": [ - "trillionlabs/Tri-21B-Think" + "KomeijiForce/bart-large-emojilm", + "Nargizi/screeve-lemmatizer" ] }, { - "architecture_id": "InternLMXComposer2ForCausalLM", - "total_models": 1, + "architecture_id": "StarVectorForCausalLM", + "total_models": 2, "sample_models": [ - "internlm/internlm-xcomposer2-7b" + "starvector/starvector-1b-im2svg", + "starvector/starvector-8b-im2svg" ] }, { - "architecture_id": "NandiForCausalLM", - "total_models": 1, + "architecture_id": "KimiLinearForCausalLM", + "total_models": 2, "sample_models": [ - "Rta-AILabs/Nandi-Mini-150M" + "moonshotai/Kimi-Linear-48B-A3B-Instruct", + "moonshotai/Kimi-Linear-48B-A3B-Base" ] }, { - "architecture_id": "StableLMAlphaForCausalLM", - "total_models": 1, + "architecture_id": "DbrxForCausalLM", + "total_models": 2, "sample_models": [ - "stabilityai/stablelm-base-alpha-7b-v2" + "trl-internal-testing/tiny-DbrxForCausalLM", + "katuni4ka/tiny-random-dbrx" ] }, { - "architecture_id": "Param2MoEForCausalLM", - "total_models": 1, + "architecture_id": "Qwen2_5_VLForConditionalGeneration", + "total_models": 2, "sample_models": [ - "bharatgenai/Param2-17B-A2.4B-Thinking" + "nvidia/Qwen2.5-VL-7B-Instruct-NVFP4", + "OmniSVG/OmniSVG" ] }, { - "architecture_id": "InternLMXComposerForCausalLM", - "total_models": 1, + "architecture_id": "ChatGLMModel", + "total_models": 2, "sample_models": [ - "internlm/internlm-xcomposer-7b" + "zai-org/glm-4-9b", + "zai-org/codegeex4-all-9b" ] }, { - "architecture_id": "MobilintExaone4ForCausalLM", - "total_models": 1, + "architecture_id": "Llama4ForCausalLM", + "total_models": 2, "sample_models": [ - "mobilint/EXAONE-4.0-1.2B" + "trl-internal-testing/tiny-Llama4ForCausalLM", + "facebook/MobileLLM-R1-950M" ] }, { - "architecture_id": "PanguEmbeddedForCausalLM", - "total_models": 1, + "architecture_id": "Phi3SmallForCausalLM", + "total_models": 2, "sample_models": [ - "FreedomIntelligence/openPangu-Embedded-1B" + "microsoft/Phi-3-small-8k-instruct", + "microsoft/Phi-3-small-128k-instruct" ] }, { - "architecture_id": "ModelStarOLMhead", - "total_models": 1, + "architecture_id": "MiniMaxM1ForCausalLM", + "total_models": 2, "sample_models": [ - "Hawa-Al-Akram/StarO-Ai" + "MiniMaxAI/MiniMax-M1-40k", + "MiniMaxAI/MiniMax-M1-80k" ] }, { - "architecture_id": "TransfoXLLMHeadModel", - "total_models": 1, + "architecture_id": "CLIPT5ForConditionalGeneration", + "total_models": 2, "sample_models": [ - "transfo-xl/transfo-xl-wt103" + "zhiqiulin/clip-flant5-xxl", + "zhiqiulin/clip-flant5-xl" ] }, { - "architecture_id": "Qwen3TSForCausalLM", - "total_models": 1, + "architecture_id": "BailingMoeForCausalLM", + "total_models": 2, "sample_models": [ - "bytedance-research/ChatTS-8B" + "inclusionAI/Ling-lite-1.5", + "inclusionAI/Ling-lite" ] }, { - "architecture_id": "SparseLlamaForCausalLM", - "total_models": 1, + "architecture_id": "BitNetForCausalLM", + "total_models": 2, "sample_models": [ - "openbmb/NOSA-8B" + "microsoft/bitnet-b1.58-2B-4T", + "microsoft/bitnet-b1.58-2B-4T-bf16" ] }, { - "architecture_id": "DeltaNetForCausalLM", - "total_models": 1, + "architecture_id": "InternVLChatModel", + "total_models": 2, "sample_models": [ - "fla-hub/delta_net-1.3B-100B" + "numind/NuExtract-2-4B-experimental", + "numind/NuExtract-2-8B-experimental" ] }, { - "architecture_id": "CambrianQwenForCausalLM", - "total_models": 1, + "architecture_id": "Ernie4_5ForCausalLM", + "total_models": 2, "sample_models": [ - "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" + "baidu/ERNIE-4.5-0.3B-PT", + "baidu/ERNIE-4.5-0.3B-Base-PT" ] }, { - "architecture_id": "EvafrillMoForCausalLM", - "total_models": 1, + "architecture_id": "IQuestCoderForCausalLM", + "total_models": 2, "sample_models": [ - "pathcosmos/EVAFRILL-Mo-3B" + "IQuestLab/IQuest-Coder-V1-40B-Instruct", + "IQuestLab/IQuest-Coder-V1-7B-Instruct" ] }, { - "architecture_id": "InternVLChatModel", - "total_models": 1, + "architecture_id": "XverseForCausalLM", + "total_models": 2, "sample_models": [ - "numind/NuExtract-2-4B-experimental" + "xverse/XVERSE-7B-Chat", + "katuni4ka/tiny-random-xverse" ] }, { - "architecture_id": "VaultGemmaForCausalLM", - "total_models": 1, + "architecture_id": "Jais2ForCausalLM", + "total_models": 2, "sample_models": [ - "google/vaultgemma-1b" + "inceptionai/Jais-2-8B-Chat", + "inceptionai/Jais-2-70B-Chat" ] }, { - "architecture_id": "ZambaForCausalLM", - "total_models": 1, + "architecture_id": "StripedHyenaModelForCausalLM", + "total_models": 2, "sample_models": [ - "Zyphra/Zamba-7B-v1" + "togethercomputer/evo-1-8k-base", + "togethercomputer/evo-1-131k-base" ] }, { - "architecture_id": "CheXagentForCausalLM", - "total_models": 1, + "architecture_id": "AXK1ForCausalLM", + "total_models": 2, "sample_models": [ - "StanfordAIMI/CheXagent-2-3b" + "skt/A.X-K1", + "thkim93/axk1-2layers" ] }, { - "architecture_id": "GatedDeltaNetForCausalLM", - "total_models": 1, + "architecture_id": "RecurrentGemmaForCausalLM", + "total_models": 2, "sample_models": [ - "deqing/gdn-300M-v5-gdn" + "google/recurrentgemma-2b", + "google/recurrentgemma-2b-it" ] }, { - "architecture_id": "Qwen2TSForCausalLM", - "total_models": 1, + "architecture_id": "FlexOlmoForCausalLM", + "total_models": 2, "sample_models": [ - "bytedance-research/ChatTS-14B" + "allenai/Flex-reddit-2x7B-1T", + "shanearora/Flex-reddit-2x7B-1T" ] }, { - "architecture_id": "QEDForCausalLM", - "total_models": 1, + "architecture_id": "SolarOpenForCausalLM", + "total_models": 2, "sample_models": [ - "levossadtchi/QED-75M" + "upstage/Solar-Open-100B", + "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4" ] }, { - "architecture_id": "JetMoEForCausalLM", - "total_models": 1, + "architecture_id": "PenguinVLQwen3ForCausalLM", + "total_models": 2, "sample_models": [ - "jetmoe/jetmoe-8b" + "tencent/Penguin-VL-8B", + "tencent/Penguin-VL-2B" ] }, { - "architecture_id": "RecursiveLanguageModel", - "total_models": 1, + "architecture_id": "MolformerForCausalLM", + "total_models": 2, "sample_models": [ - "Girinath11/recursive-language-model-198m" + "ibm-research/GP-MoLFormer-Uniq", + "ralyn/NPComposer-v2" ] }, { - "architecture_id": "SeerAttnLlamaForCausalLM", - "total_models": 1, + "architecture_id": "GLAForCausalLM", + "total_models": 2, "sample_models": [ - "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates" + "fla-hub/gla-340M-15B", + "fla-hub/gla-1.3B-100B" ] }, { - "architecture_id": "LongcatCausalLM", - "total_models": 1, + "architecture_id": "MosaicGPT", + "total_models": 2, "sample_models": [ - "meituan-longcat/LongCat-Flash-Thinking-2601" + "anas-awadalla/mpt-1b-redpajama-200b", + "anas-awadalla/mpt-1b-redpajama-200b-dolly" ] }, { - "architecture_id": "GomeForCausalLM", - "total_models": 1, + "architecture_id": "Eagle3DraftModel", + "total_models": 2, "sample_models": [ - "Prositron/gome" + "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3", + "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3" ] }, { - "architecture_id": "MoYiForCausalLM", - "total_models": 1, + "architecture_id": "BolmoForCausalLM", + "total_models": 2, "sample_models": [ - "astanahub/alemllm" + "allenai/Bolmo-7B", + "allenai/Bolmo-1B" ] }, { - "architecture_id": "NanochatWasmFusedModel", - "total_models": 1, + "architecture_id": "JetMoEForCausalLM", + "total_models": 2, "sample_models": [ - "eastlondoner/nanochat-wasm-fused-preview-01" + "jetmoe/jetmoe-8b", + "jetmoe/jetmoe-8b-chat" ] }, { - "architecture_id": "LLM", - "total_models": 1, + "architecture_id": "CogVLMForCausalLM", + "total_models": 2, "sample_models": [ - "rudyon/linnet-497M" + "zai-org/cogvlm2-llama3-chat-19B", + "zai-org/cogvlm-chat-hf" ] }, { - "architecture_id": "MyAwesomeModelForCausalLM", - "total_models": 1, + "architecture_id": "WeDLMForCausalLM", + "total_models": 2, "sample_models": [ - "dongbobo/MyAwesomeModel" + "tencent/WeDLM-8B-Instruct", + "tencent/WeDLM-8B-Base" ] }, { - "architecture_id": "SwarmForCausalLM", - "total_models": 1, + "architecture_id": "YoutuForCausalLM", + "total_models": 2, "sample_models": [ - "reaperdoesntknow/SAGI" + "tencent/Youtu-LLM-2B-Base", + "tencent/Youtu-LLM-2B" ] }, { - "architecture_id": "CPMAntForCausalLM", - "total_models": 1, + "architecture_id": "ParamBharatGenForCausalLM", + "total_models": 2, "sample_models": [ - "openbmb/cpm-ant-10b" + "bharatgenai/Param-1-2.9B-Instruct", + "bharatgenai/AyurParam" ] }, { - "architecture_id": "Maira2ForConditionalGeneration", - "total_models": 1, + "architecture_id": "BitnetForCausalLM", + "total_models": 2, "sample_models": [ - "microsoft/maira-2" + "1bitLLM/bitnet_b1_58-3B", + "1bitLLM/bitnet_b1_58-large" ] }, { - "architecture_id": "CTRLLMHeadModel", - "total_models": 1, + "architecture_id": "SliderGPT", + "total_models": 2, "sample_models": [ - "sshleifer/tiny-ctrl" + "c-bone/CrystaLLM-pi_Mattergen-XRD", + "c-bone/CrystaLLM-pi_COD-XRD" ] }, { - "architecture_id": "SpatialLMQwenForCausalLM", - "total_models": 1, + "architecture_id": "BottleneckT5LMWithPerturb", + "total_models": 2, "sample_models": [ - "manycore-research/SpatialLM1.1-Qwen-0.5B" + "thesephist/contra-bottleneck-t5-base-wikipedia", + "thesephist/contra-bottleneck-t5-large-wikipedia" ] }, { - "architecture_id": "CoherenceMomentumModel", - "total_models": 1, + "architecture_id": "MptForCausalLM", + "total_models": 2, "sample_models": [ - "aisingapore/coherence-momentum" + "team-lucid/mptk-1b", + "explosion-testing/mpt-test" ] }, { - "architecture_id": "TAMELM", - "total_models": 1, + "architecture_id": "OpenMoeForCausalLM", + "total_models": 2, "sample_models": [ - "reaperdoesntknow/TameForCasualLM" + "hpcai-tech/openmoe-8B", + "OrionZheng/openmoe-8b" ] }, { - "architecture_id": "GPT2CustomLMHeadModel", + "architecture_id": "MiMoV2FlashForCausalLM", "total_models": 1, "sample_models": [ - "fxmarty/tiny-testing-gpt2-remote-code" + "XiaomiMiMo/MiMo-V2-Flash" ] }, { - "architecture_id": "GPT2", + "architecture_id": "T5EncoderModel", "total_models": 1, "sample_models": [ - "NamrataThakur/Small_Language_Model_MHA_53M_Pretrained" + "XLabs-AI/xflux_text_encoders" ] }, { - "architecture_id": "GQAGPT2", + "architecture_id": "XCurOSForCausalLM", "total_models": 1, "sample_models": [ - "NamrataThakur/Small_Language_Model_GQA_48M_Pretrained" + "XCurOS/XCurOS-0.1-8B-Instruct" ] }, { - "architecture_id": "MoEGPT2", + "architecture_id": "GPTNeoXJapaneseForCausalLM", "total_models": 1, "sample_models": [ - "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained" + "abeja/gpt-neox-japanese-2.7b" ] }, { - "architecture_id": "Esm2LlamaInstructForCausalLM", + "architecture_id": "IlamaForCausalLM", "total_models": 1, "sample_models": [ - "xiao-fei/Prot2Text-V2-11B-Instruct-hf" + "hmellor/Ilama-3.2-1B" ] }, { - "architecture_id": "ThinkerLM", + "architecture_id": "Plamo2ForCausalLM", "total_models": 1, "sample_models": [ - "prskid1000/micro-Omni" + "pfnet/plamo-2-1b" ] }, { - "architecture_id": "QHEARTForECGQA", + "architecture_id": "HCXVisionForCausalLM", "total_models": 1, "sample_models": [ - "Manhph2211/Q-HEART" + "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B" ] }, { - "architecture_id": "MoELLaVAQwen2ForCausalLM", + "architecture_id": "TarsierForConditionalGeneration", "total_models": 1, "sample_models": [ - "KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516" + "omni-research/Tarsier-7b" ] }, { - "architecture_id": "D3PMSanskritModel", + "architecture_id": "BaiChuanForCausalLM", "total_models": 1, "sample_models": [ - "bhsinghgrid/sanskrit-translation" + "baichuan-inc/Baichuan-7B" ] }, { - "architecture_id": "AV2TextForConditionalGeneration", + "architecture_id": "SarvamMoEForCausalLM", "total_models": 1, "sample_models": [ - "nguyenvulebinh/AV-HuBERT-MuAViC-en" + "sarvamai/sarvam-30b" ] }, { - "architecture_id": "GPTJXMoEForCausalLM", + "architecture_id": "LongcatFlashForCausalLM", "total_models": 1, "sample_models": [ - "Aletheia-ng/SabiYarn_MoE_translate" + "meituan-longcat/LongCat-Flash-Chat" ] }, { - "architecture_id": "Eagle3DeepseekV2ForCausalLM", + "architecture_id": "GPTRefactForCausalLM", "total_models": 1, "sample_models": [ - "nvidia/Kimi-K2.5-Thinking-Eagle3" + "refactai/Refact-1_6B-fim" ] }, { - "architecture_id": "Videollama2Qwen2ForCausalLM", + "architecture_id": "HyperCLOVAXForCausalLM", "total_models": 1, "sample_models": [ - "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp" + "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B" ] }, { - "architecture_id": "Speech2TextTransformerForConditionalGeneration", + "architecture_id": "ExaoneMoEForCausalLM", "total_models": 1, "sample_models": [ - "valhalla/s2t_mustc_multilinguial_medium" + "LGAI-EXAONE/K-EXAONE-236B-A23B" ] }, { - "architecture_id": "BlenderbotForConditionalGeneration", + "architecture_id": "HunYuanMoEV1ForCausalLM", "total_models": 1, "sample_models": [ - "thu-coai/blenderbot-400M-esconv" + "tencent/Hunyuan-A13B-Instruct" ] }, { - "architecture_id": "WhisperMixStyleForConditionalGeneration", + "architecture_id": "GritLM", "total_models": 1, "sample_models": [ - "wago5090/mixstyle_multi-s" + "parasail-ai/GritLM-7B-vllm" ] }, { - "architecture_id": "Autoencoder", + "architecture_id": "BailingMoeV2_5ForCausalLM", "total_models": 1, "sample_models": [ - "cccczshao/CALM-Autoencoder" + "inclusionAI/Ring-2.5-1T" ] }, { - "architecture_id": "AlinlightForCausalLM", + "architecture_id": "SolarForCausalLM", "total_models": 1, "sample_models": [ - "EngineerGL/Alinlight" + "upstage/solar-pro-preview-instruct" ] }, { - "architecture_id": "LlamaForCausalLMEagle", + "architecture_id": "JetNemotronForCausalLM", "total_models": 1, "sample_models": [ - "thunlp/LLaMA3-Instruct-8B-FR-Spec" + "jet-ai/Jet-Nemotron-2B" ] }, { - "architecture_id": "GuppyLM", + "architecture_id": "InternLM3ForCausalLM", "total_models": 1, "sample_models": [ - "arman-bd/guppylm-9M" + "internlm/internlm3-8b-instruct" ] }, { - "architecture_id": "FusionInDecoderForConditionalGeneration", + "architecture_id": "Grok1ModelForCausalLM", "total_models": 1, "sample_models": [ - "Intel/fid_flan_t5_base_nq" + "hpcai-tech/grok-1" ] }, { - "architecture_id": "EveMoEForCausalLM", + "architecture_id": "Qwen3VLMoeForConditionalGeneration", "total_models": 1, "sample_models": [ - "anthonym21/Eve-2-MoE-IT-272M" + "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4" ] }, { - "architecture_id": "Typhoon2Audio2AudioForConditionalGeneration", + "architecture_id": "MiniCPM3ForCausalLM", "total_models": 1, "sample_models": [ - "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct" + "openbmb/MiniCPM3-4B" ] }, { - "architecture_id": "LlaMAForCausalLM", + "architecture_id": "Emu3ForCausalLM", "total_models": 1, "sample_models": [ - "circulus/alpaca-7b" + "BAAI/Emu3-Chat" ] }, { - "architecture_id": "GeoVForCausalLM", + "architecture_id": "GRIN-MoE", "total_models": 1, "sample_models": [ - "GeoV/GeoV-9b" + "microsoft/GRIN-MoE" ] }, { - "architecture_id": "RobertaPreLayerNormForCausalLM", + "architecture_id": "AV2TextForConditionalGeneration", "total_models": 1, "sample_models": [ - "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM" + "nguyenvulebinh/AV-HuBERT-MuAViC-en" ] }, { - "architecture_id": "RuGPT3XLForCausalLM", + "architecture_id": "MiniMaxForCausalLM", "total_models": 1, "sample_models": [ - "evilfreelancer/ruGPT3XL" + "MiniMaxAI/MiniMax-Text-01-hf" ] }, { - "architecture_id": "TeleFLMForCausalLM", + "architecture_id": "ArcticForCausalLM", "total_models": 1, "sample_models": [ - "CofeAI/Tele-FLM-1T" + "Snowflake/snowflake-arctic-instruct" ] }, { - "architecture_id": "GPTModelForTextGeneration", + "architecture_id": "OrionForCausalLM", "total_models": 1, "sample_models": [ - "samkeet/GPT_124M-Instruct" + "OrionStarAI/Orion-14B-Chat" ] }, { - "architecture_id": "TFGPT2LMHeadModel", + "architecture_id": "IQuestLoopCoderForCausalLM", "total_models": 1, "sample_models": [ - "mymusise/gpt2-medium-chinese" + "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct" ] }, { - "architecture_id": "PegasusForCausalLM", + "architecture_id": "Moondream", "total_models": 1, "sample_models": [ - "hf-tiny-model-private/tiny-random-PegasusForCausalLM" + "vikhyatk/moondream1" ] }, { - "architecture_id": "ElectraForCausalLM", + "architecture_id": "SarvamMLAForCausalLM", "total_models": 1, "sample_models": [ - "smeoni/nbme-electra-large-generator" + "sarvamai/sarvam-105b" ] }, { - "architecture_id": "BlenderbotForCausalLM", + "architecture_id": "Plamo3ForCausalLM", "total_models": 1, "sample_models": [ - "hf-tiny-model-private/tiny-random-BlenderbotForCausalLM" + "pfnet/plamo-3-nict-2b-base" ] }, { - "architecture_id": "LIMEForCausalLM", + "architecture_id": "InternLMXComposer2ForCausalLM", "total_models": 1, "sample_models": [ - "anarlavrenov/lime-1b-instruct" + "internlm/internlm-xcomposer2-7b" ] }, { - "architecture_id": "ModernBertForMaskedLM", + "architecture_id": "GraphT5TransformerForConditionalGeneration", "total_models": 1, "sample_models": [ - "JorgeVanco/diffusionGPT" + "haitengzhao/gimlet" ] }, { - "architecture_id": "MvpForCausalLM", + "architecture_id": "InternLMXComposerForCausalLM", "total_models": 1, "sample_models": [ - "hf-tiny-model-private/tiny-random-MvpForCausalLM" + "internlm/internlm-xcomposer-7b" ] }, { - "architecture_id": "DenseLLM", + "architecture_id": "Dots1ForCausalLM", "total_models": 1, "sample_models": [ - "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0" + "rednote-hilab/dots.llm1.inst" ] }, { - "architecture_id": "FP8Qwen3ForCausalLM", + "architecture_id": "LlavaSearchLlamaForCausalLM", "total_models": 1, "sample_models": [ - "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809" + "craigwu/seal_vqa_7b" ] }, { - "architecture_id": "EnergyTransformer", + "architecture_id": "CheXagentForCausalLM", "total_models": 1, "sample_models": [ - "cccczshao/CALM-M" + "StanfordAIMI/CheXagent-2-3b" ] }, { - "architecture_id": "ConditionalGPT2LMHeadModel", + "architecture_id": "TransfoXLLMHeadModel", "total_models": 1, "sample_models": [ - "entropy/roberta_zinc_decoder" + "transfo-xl/transfo-xl-wt103" ] }, { - "architecture_id": "XModelForCausalLM", + "architecture_id": "ZetaGrid25B", "total_models": 1, "sample_models": [ - "XiaoduoAILab/Xmodel_LM" + "RthItalia/Rth-lm-code-25b" ] }, { - "architecture_id": "JiRackTernary1B", + "architecture_id": "TransformerForCausalLM", "total_models": 1, "sample_models": [ - "kgrabko/JiRackTernary_1b" + "fla-hub/transformer-1.3B-100B" ] }, { - "architecture_id": "DebertaV2ForCausalLM", + "architecture_id": "Qwen3VLForConditionalGeneration", "total_models": 1, "sample_models": [ - "ltg/deberta-xxlarge-fixed" + "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4" ] }, { - "architecture_id": "MoEGPTForCausalLM", + "architecture_id": "Rwkv6ForCausalLM", "total_models": 1, "sample_models": [ - "arnomatic/german-moe-gpt-v8-pretrained" + "RWKV/v6-Finch-1B6-HF" ] }, { - "architecture_id": "SongGenMixedForConditionalGeneration", + "architecture_id": "CambrianQwenForCausalLM", "total_models": 1, "sample_models": [ - "LiuZH-19/SongGen_mixed_pro" + "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B" ] }, { - "architecture_id": "SpectusForConditionalGeneration", + "architecture_id": "VaultGemmaForCausalLM", "total_models": 1, "sample_models": [ - "MS-ML/SpecTUS_pretrained_only" + "google/vaultgemma-1b" ] }, { - "architecture_id": "LSGBartForConditionalGeneration", + "architecture_id": "FP8Qwen2ForCausalLM", "total_models": 1, "sample_models": [ - "morenolq/LEGIT-BART-LSG-4096" + "xihc-ucb/Qwen2.5-7B-train-Quasar-1214" ] }, { - "architecture_id": "CloverLMForCausalLM", + "architecture_id": "SparseLlamaForCausalLM", "total_models": 1, "sample_models": [ - "daslab-testing/CloverLM" + "openbmb/NOSA-8B" ] }, { - "architecture_id": "Qwen2VLAudioForConditionalGeneration", + "architecture_id": "SpatialLMQwenForCausalLM", "total_models": 1, "sample_models": [ - "MayaKD/qwen2-vl-audio" + "manycore-research/SpatialLM1.1-Qwen-0.5B" ] }, { - "architecture_id": "FP8Qwen2ForCausalLM", + "architecture_id": "VSMForCausalLM", "total_models": 1, "sample_models": [ - "xihc-ucb/Qwen2.5-7B-train-Quasar-1214" + "craigwu/seal_vsm_7b" ] }, { - "architecture_id": "LSTMForCausalLM", + "architecture_id": "GPT2LMHeadCustomModel", "total_models": 1, "sample_models": [ - "deqing/lstm-window-4-v5" + "bigcode/santacoder" ] }, { - "architecture_id": "CheXagentForConditionalGeneration", + "architecture_id": "MoYiForCausalLM", "total_models": 1, "sample_models": [ - "StanfordAIMI/CheXagent-8b" + "astanahub/alemllm" ] }, { - "architecture_id": "LatentMoELLaVAPhiForCausalLM", + "architecture_id": "SeerAttnLlamaForCausalLM", "total_models": 1, "sample_models": [ - "KKHYA/llavaphi2-2.7b-finetune-latent-sparse-moe-4e-2k-freeze-1.0_20260304_075653" + "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates" ] }, { - "architecture_id": "GPTXForCausalLM", + "architecture_id": "RavenForCausalLM", "total_models": 1, "sample_models": [ - "AxiomicLabs/GPT-X-125m-15bt" + "tomg-group-umd/huginn-0125" ] }, { - "architecture_id": "OmniASRForConditionalGeneration", + "architecture_id": "GeoChatLlamaForCausalLM", "total_models": 1, "sample_models": [ - "bezzam/omniasr-llm-300m-v2" + "MBZUAI/geochat-7B" ] }, { - "architecture_id": "MiniMaxText01ForCausalLM", + "architecture_id": "Param2MoEForCausalLM", "total_models": 1, "sample_models": [ - "MiniMaxAI/MiniMax-Text-01" + "bharatgenai/Param2-17B-A2.4B-Thinking" ] }, { - "architecture_id": "LlavaCrystalForCausalLM", + "architecture_id": "AprielForCausalLM", "total_models": 1, "sample_models": [ - "LLM360/CrystalChat-7B-Web2Code" + "ServiceNow-AI/Apriel-5B-Instruct" ] }, { - "architecture_id": "MatriochkaForCausalLM", + "architecture_id": "PanguEmbeddedForCausalLM", "total_models": 1, "sample_models": [ - "nthngdy/matryoshka-3B" + "FreedomIntelligence/openPangu-Embedded-1B" ] }, { - "architecture_id": "MobileLLMP1ForCausalLM", + "architecture_id": "Phi4MMForCausalLM", "total_models": 1, "sample_models": [ - "facebook/MobileLLM-Pro-base" + "Yanis-Gerst/fine_tune" ] }, { - "architecture_id": "Bagel", + "architecture_id": "Maira2ForConditionalGeneration", "total_models": 1, "sample_models": [ - "lmms-lab/BAGEL-7B-MoT-ver.LE" + "microsoft/maira-2" ] }, { - "architecture_id": "InternLM2ForRewardModel", + "architecture_id": "MiniCPMSALAForCausalLM", "total_models": 1, "sample_models": [ - "internlm/internlm2_5-step-prover-critic" + "openbmb/MiniCPM-SALA" ] }, { - "architecture_id": "GeoChatLlamaForCausalLM", + "architecture_id": "GiddForDiffusionLM", "total_models": 1, "sample_models": [ - "MBZUAI/geochat-7B" + "dvruette/gidd-unif-3b" ] }, { - "architecture_id": "MobileLLMForCausalLM", + "architecture_id": "SteerlingForCausalLM", "total_models": 1, "sample_models": [ - "facebook/MobileLLM-125M" + "guidelabs/steerling-8b" ] }, { - "architecture_id": "SliderGPT", + "architecture_id": "StableLMAlphaForCausalLM", "total_models": 1, "sample_models": [ - "c-bone/CrystaLLM-pi_Mattergen-XRD" + "stabilityai/stablelm-base-alpha-7b-v2" ] }, { - "architecture_id": "CircuitGPTForCausalLM", + "architecture_id": "HGRNBitForCausalLM", "total_models": 1, "sample_models": [ - "openai/circuit-sparsity" + "ridger/MMfreeLM-370M" ] }, { - "architecture_id": "Qwen35ForCausalLM", + "architecture_id": "CheXagentForConditionalGeneration", "total_models": 1, "sample_models": [ - "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged" + "StanfordAIMI/CheXagent-8b" ] }, { - "architecture_id": "KORMoForCausalLM", + "architecture_id": "MiniMaxText01ForCausalLM", "total_models": 1, "sample_models": [ - "KORMo-Team/KORMo-10B-sft" + "MiniMaxAI/MiniMax-Text-01" ] }, { - "architecture_id": "MiniCPMSALAForCausalLM", + "architecture_id": "LamedPhi3ForCausalLM", "total_models": 1, "sample_models": [ - "openbmb/MiniCPM-SALA" + "GoodBaiBai88/M3D-LaMed-Phi-3-4B" ] }, { - "architecture_id": "GiddForDiffusionLM", + "architecture_id": "TorchMultiOmicsModel", "total_models": 1, "sample_models": [ - "dvruette/gidd-unif-3b" + "InstaDeepAI/ChatNT" ] }, { - "architecture_id": "MobilintEagle3Qwen2ForCausalLM", + "architecture_id": "MobileLlamaForCausalLM", "total_models": 1, "sample_models": [ - "mobilint/EAGLE3-JPharmatron-7B" + "mtgv/MobileVLM_V2-1.7B" ] }, { - "architecture_id": "Kanana2VecModel", + "architecture_id": "Phi4FlashForCausalLM", "total_models": 1, "sample_models": [ - "kakaocorp/kanana-nano-2.1b-embedding" + "microsoft/Phi-4-mini-flash-reasoning" ] }, { - "architecture_id": "JiRackTernaryModel", + "architecture_id": "DeciCoderForCausalLM", "total_models": 1, "sample_models": [ - "kgrabko/JiRackTernary_70b" + "Deci/DeciCoder-1b" ] }, { - "architecture_id": "Qwen3VLForConditionalGeneration", + "architecture_id": "GPT3DevLMHeadModel", "total_models": 1, "sample_models": [ - "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4" + "k050506koch/GPT3-dev-350m-2805" ] }, { - "architecture_id": "Gemma4ForCausalLM", + "architecture_id": "Qwen2VLForConditionalGeneration", "total_models": 1, "sample_models": [ - "LilaRest/gemma-4-31B-it-NVFP4-turbo" + "yujiepan/qwen2-vl-tiny-random" ] }, { - "architecture_id": "AeroForConditionalGeneration", + "architecture_id": "Qwen2ForSequenceClassification", "total_models": 1, "sample_models": [ - "lmms-lab/Aero-1-Audio" + "nvidia/AceMath-7B-RM" ] }, { - "architecture_id": "HeliumForCausalLM", + "architecture_id": "Kanana2VecModel", "total_models": 1, "sample_models": [ - "kyutai/helium-1-preview-2b" + "kakaocorp/kanana-nano-2.1b-embedding" ] }, { - "architecture_id": "DeciCoderForCausalLM", + "architecture_id": "EchoForCausalLM", "total_models": 1, "sample_models": [ - "Deci/DeciCoder-1b" + "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT" ] }, { - "architecture_id": "PolyLMHeadModel", + "architecture_id": "CTRLLMHeadModel", "total_models": 1, "sample_models": [ - "DAMO-NLP-MT/polylm-13b" + "sshleifer/tiny-ctrl" ] }, { - "architecture_id": "BD3LM", + "architecture_id": "LLaDAMoEModel", "total_models": 1, "sample_models": [ - "kuleshov-group/bd3lm-owt-block_size4" + "inclusionAI/LLaDA-MoE-7B-A1B-Base" ] }, { - "architecture_id": "LamedPhi3ForCausalLM", + "architecture_id": "CPMAntForCausalLM", "total_models": 1, "sample_models": [ - "GoodBaiBai88/M3D-LaMed-Phi-3-4B" + "openbmb/cpm-ant-10b" ] }, { - "architecture_id": "Emu3ForCausalLM", + "architecture_id": "ICONNForCausalLM", "total_models": 1, "sample_models": [ - "BAAI/Emu3-Chat" + "ICONNAI/ICONN-1-Mini-Beta" ] }, { - "architecture_id": "BunnyLlamaForCausalLM", + "architecture_id": "HeliumForCausalLM", "total_models": 1, "sample_models": [ - "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview" + "kyutai/helium-1-preview-2b" ] }, { - "architecture_id": "SKTOmniForConditionalGeneration", + "architecture_id": "DogeForCausalLM", "total_models": 1, "sample_models": [ - "Shrijanagain/SKT_OMNI_SUPREME" + "SmallDoge/Doge-20M" ] }, { - "architecture_id": "CambrianLlamaForCausalLM", + "architecture_id": "LongcatFlashNgramForCausalLM", "total_models": 1, "sample_models": [ - "nyu-visionx/cambrian-8b" + "meituan-longcat/LongCat-Flash-Lite" ] }, { - "architecture_id": "LlamaModel", + "architecture_id": "GPT", "total_models": 1, "sample_models": [ - "ngoan/NgoanYi" + "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M" ] }, { - "architecture_id": "SteerlingForCausalLM", + "architecture_id": "GPT2CustomLMHeadModel", "total_models": 1, "sample_models": [ - "guidelabs/steerling-8b" + "fxmarty/tiny-testing-gpt2-remote-code" ] }, { - "architecture_id": "TransnormerForCausalLM", + "architecture_id": "SKTOmniForConditionalGeneration", "total_models": 1, "sample_models": [ - "OpenNLPLab/TransNormerLLM-385M" + "Shrijanagain/SKT_OMNI_SUPREME" ] }, { - "architecture_id": "DUO", + "architecture_id": "MobileLLMForCausalLM", "total_models": 1, "sample_models": [ - "s-sahoo/duo-distilled" + "facebook/MobileLLM-125M" ] }, { - "architecture_id": "ErnieForCausalLM", + "architecture_id": "CircuitGPTForCausalLM", "total_models": 1, "sample_models": [ - "mohitsha/tiny-ernie-random-remote-code" + "openai/circuit-sparsity" ] }, { - "architecture_id": "ShikraLlamaForCausalLM", + "architecture_id": "Qwen3TSForCausalLM", "total_models": 1, "sample_models": [ - "shikras/shikra-7b-delta-v1" + "bytedance-research/ChatTS-8B" ] }, { - "architecture_id": "Rwkv7ForCausalLM", + "architecture_id": "ConditionalGPT", "total_models": 1, "sample_models": [ - "admijgjtjtjtjjg/dfdfdf" + "c-bone/CrystaLLM-pi_bandgap" ] }, { - "architecture_id": "ICONNForCausalLM", + "architecture_id": "DuchifatCore", "total_models": 1, "sample_models": [ - "ICONNAI/ICONN-1-Mini-Beta" + "Raziel1234/Duchifat-2" ] }, { - "architecture_id": "RubiRLM", + "architecture_id": "GPT2Model", "total_models": 1, "sample_models": [ - "DevHunterAI/RubiRLM-1B-Base" + "cerebras/Cerebras-GPT-13B" ] }, { - "architecture_id": "YiForCausalLM", + "architecture_id": "BD3LM", "total_models": 1, "sample_models": [ - "llmware/dragon-yi-6b-v0" + "kuleshov-group/bd3lm-owt-block_size4" ] }, { - "architecture_id": "SoraForSLM", + "architecture_id": "AeroForConditionalGeneration", "total_models": 1, "sample_models": [ - "Conlanger-LLM-CLEM/Sorie" + "lmms-lab/Aero-1-Audio" ] }, { - "architecture_id": "CpmBeeForCausalLM", + "architecture_id": "KORMoForCausalLM", "total_models": 1, "sample_models": [ - "openbmb/cpm-bee-10b" + "KORMo-Team/KORMo-10B-sft" ] }, { - "architecture_id": "HGRNBitForCausalLM", + "architecture_id": "PhariaForCausalLM", "total_models": 1, "sample_models": [ - "ridger/MMfreeLM-370M" + "Aleph-Alpha/Pharia-1-LLM-7B-control-hf" ] }, { - "architecture_id": "ZsGPT2LMHeadModel", + "architecture_id": "UMT5ForConditionalGeneration", "total_models": 1, "sample_models": [ - "claritylab/zero-shot-vanilla-gpt2" + "EleutherAI/pile-t5-xl" ] }, { - "architecture_id": "Phi4FlashForCausalLM", + "architecture_id": "ZambaForCausalLM", "total_models": 1, "sample_models": [ - "microsoft/Phi-4-mini-flash-reasoning" + "Zyphra/Zamba-7B-v1" ] }, { - "architecture_id": "MochivaForCausalLM", + "architecture_id": "PolyLMHeadModel", "total_models": 1, "sample_models": [ - "Mochiva-team/Mochiva-model" + "DAMO-NLP-MT/polylm-13b" ] }, { - "architecture_id": "HumanGPTForCausalLM", + "architecture_id": "RecursiveLanguageModel", "total_models": 1, "sample_models": [ - "YaoFeng/CHATPOSE-V0" + "Girinath11/recursive-language-model-198m" ] }, { - "architecture_id": "BTLMLMHeadModel", + "architecture_id": "SpatialLMLlamaForCausalLM", "total_models": 1, "sample_models": [ - "cerebras/btlm-3b-8k-base" + "manycore-research/SpatialLM1.1-Llama-1B" ] }, { - "architecture_id": "DotLMForCausalLM", + "architecture_id": "PointLLMLlamaForCausalLM", "total_models": 1, "sample_models": [ - "tensorfiend/DotLM-165M" + "RunsenXu/PointLLM_7B_v1.2" ] }, { - "architecture_id": "XMistralForCausalLM", + "architecture_id": "MegaForCausalLM", "total_models": 1, "sample_models": [ - "Hannibal046/xrag-7b" + "BEE-spoke-data/mega-ar-126m-4k" ] }, { - "architecture_id": "TelechatForCausalLM", + "architecture_id": "SongGenMixedForConditionalGeneration", "total_models": 1, "sample_models": [ - "Tele-AI/telechat-7B" + "LiuZH-19/SongGen_mixed_pro" ] }, { - "architecture_id": "FlamingoForCausalLM", + "architecture_id": "DUO", "total_models": 1, "sample_models": [ - "babylm/flamingo-2024" + "s-sahoo/duo-distilled" ] }, { - "architecture_id": "Qwen2VLForConditionalGeneration", + "architecture_id": "LlamaModel", "total_models": 1, "sample_models": [ - "typhoon-ai/typhoon2-qwen2vl-7b-vision-instruct" + "ngoan/NgoanYi" ] }, { - "architecture_id": "VStreamLlamaForCausalLM", + "architecture_id": "BailingMoeLinearV2ForCausalLM", "total_models": 1, "sample_models": [ - "IVGSZ/Flash-VStream-7b" + "inclusionAI/Ring-mini-linear-2.0" ] }, { - "architecture_id": "AquilaDenseForCausalLM", + "architecture_id": "BertLMHeadModel", "total_models": 1, "sample_models": [ - "BAAI/AquilaDense-7B" + "dicta-il/BEREL_3.0" ] }, { - "architecture_id": "LongLlamaForCausalLM", + "architecture_id": "Glm4MoeLiteSonicForCausalLM", "total_models": 1, "sample_models": [ - "syzymon/long_llama_3b" + "rpDungeon/GLM-4.7-Flash-SonicMOE" ] }, { - "architecture_id": "EmuForCausalLM", + "architecture_id": "Bagel", "total_models": 1, "sample_models": [ - "BAAI/Emu2-Chat" + "lmms-lab/BAGEL-7B-MoT-ver.LE" ] }, { - "architecture_id": "Lfm2Prototype1ForCausalLM", + "architecture_id": "GLaMMForCausalLM", "total_models": 1, "sample_models": [ - "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP" + "MBZUAI/GLaMM-FullScope" ] }, { - "architecture_id": "CogVLMVideoForCausalLM", + "architecture_id": "KonkanGPT", "total_models": 1, "sample_models": [ - "zai-org/VisionReward-Video" + "omdeep22/Gonyai-v1" ] }, { - "architecture_id": "MoELLaVAQWenForCausalLM", + "architecture_id": "Qwen3OmniMoeThinkerForConditionalGeneration", "total_models": 1, "sample_models": [ - "LanguageBind/MoE-LLaVA-Qwen-1.8B-4e" + "ngqtrung/Qwen3-Omni-Thinker-30B-Instruct" ] }, { - "architecture_id": "YayiForCausalLM", + "architecture_id": "C3QwenForCausalLM", "total_models": 1, "sample_models": [ - "wenge-research/yayi2-30b" + "liufanfanlff/C3-Context-Cascade-Compression" ] }, { - "architecture_id": "ArgonneModel", + "architecture_id": "MonoidForCausalLM", "total_models": 1, "sample_models": [ - "PursuitOfDataScience/Argonne2.5-base" + "NoesisLab/Spartacus-1B-Instruct" ] }, { - "architecture_id": "SkyworkForCausalLM", + "architecture_id": "ErnieForCausalLM", "total_models": 1, "sample_models": [ - "Skywork/Skywork-13B-base" + "mohitsha/tiny-ernie-random-remote-code" ] }, { - "architecture_id": "Qwen3ASRForConditionalGeneration", + "architecture_id": "TransnormerForCausalLM", "total_models": 1, "sample_models": [ - "bezzam/Qwen3-ASR-0.6B" + "OpenNLPLab/TransNormerLLM-385M" ] }, { - "architecture_id": "HymbaForCausalLM", + "architecture_id": "PKVGPT", "total_models": 1, "sample_models": [ - "nvidia/Hymba-1.5B-Instruct" + "c-bone/CrystaLLM-pi_SLME" ] }, { - "architecture_id": "MobiLlamaForCausalLM", + "architecture_id": "MedHemoModel", "total_models": 1, "sample_models": [ - "MBZUAI/MobiLlama-05B" + "amewebstudio/medhemo-earcp" ] }, { - "architecture_id": "HebrewGPTForCausalLM", + "architecture_id": "OpenLMForCausalLM", "total_models": 1, "sample_models": [ - "Slasky/HebrewGPT-1B" + "nick11roberts/SL-discrep-chinchilla-rw-params5M_maxstep760-flop_1_25e16_step_767" ] }, { - "architecture_id": "YuanForCausalLM", + "architecture_id": "MCGPTForCausalLM", "total_models": 1, "sample_models": [ - "IEITYuan/Yuan2-M32-hf" + "TopAI-1/MCGPT-1" ] }, { - "architecture_id": "MegaForCausalLM", + "architecture_id": "HymbaForCausalLM", "total_models": 1, "sample_models": [ - "BEE-spoke-data/mega-ar-126m-4k" + "nvidia/Hymba-1.5B-Instruct" ] }, { - "architecture_id": "Gemma4TextModel", + "architecture_id": "LlamaMoEForCausalLM", "total_models": 1, "sample_models": [ - "bRadu/gemma-4-E2B-it-textonly" + "llama-moe/LLaMA-MoE-v1-3_5B-2_8" ] } ] diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json index 1c8d879d0..b7868f462 100644 --- a/transformer_lens/tools/model_registry/data/supported_models.json +++ b/transformer_lens/tools/model_registry/data/supported_models.json @@ -77668,6 +77668,76 @@ "phase3_score": 100.0, "phase4_score": 89.9, "phase7_score": null + }, + { + "architecture_id": "CodeGenForCausalLM", + "model_id": "Salesforce/codegen-350M-mono", + "status": 1, + "verified_date": "2026-04-09", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 86.2, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "CodeGenForCausalLM", + "model_id": "Salesforce/codegen-350M-multi", + "status": 1, + "verified_date": "2026-04-09", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 84.5, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "CodeGenForCausalLM", + "model_id": "Salesforce/codegen-350M-nl", + "status": 1, + "verified_date": "2026-04-09", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 89.2, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "CodeGenForCausalLM", + "model_id": "Salesforce/codegen-2B-mono", + "status": 1, + "verified_date": "2026-04-09", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 85.4, + "phase7_score": null, + "phase8_score": null + }, + { + "architecture_id": "CodeGenForCausalLM", + "model_id": "Salesforce/codegen-2B-multi", + "status": 1, + "verified_date": "2026-04-09", + "metadata": null, + "note": "Full verification completed", + "phase1_score": 100.0, + "phase2_score": 100.0, + "phase3_score": 100.0, + "phase4_score": 67.5, + "phase7_score": null, + "phase8_score": null } ] } diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json index 3eae3fae0..9eb2e7648 100644 --- a/transformer_lens/tools/model_registry/data/verification_history.json +++ b/transformer_lens/tools/model_registry/data/verification_history.json @@ -1,5 +1,5 @@ { - "last_updated": "2026-04-09T13:22:45.115556", + "last_updated": "2026-04-09T16:34:36.818082", "records": [ { "model_id": "Macropodus/macbert4mdcspell_v1", @@ -11142,158 +11142,38 @@ "invalidation_reason": null }, { - "model_id": "fxmarty/really-tiny-falcon-testing", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-350M-mono", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", + "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 20/103 components failed (20 critical)", "invalidated": false, "invalidation_reason": null }, { - "model_id": "yujiepan/falcon-tiny-random", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-350M-mono", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "yujiepan/falcon-tiny-random", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: Falcon with ALiBi is not yet supported. Only RoPE-based Falcon models are currently handled.", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-7b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=50.0% < 75.0% (failed: process_bridge_weights, layer_norm_folding, weight_modifi \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-7b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=50.0% < 75.0% (failed: process_bridge_weights, layer_norm_folding, weight_modifi \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "fxmarty/really-tiny-falcon-testing", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "fxmarty/really-tiny-falcon-testing", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P1=50.0% < 100.0% (failed: forward_pass_logits); P3=89.5% but required tests failed \u2014 Tensors differ: max_diff=33.789673, mean_rel=2.566615", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "fxmarty/really-tiny-falcon-testing", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "fxmarty/really-tiny-falcon-testing", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Full verification completed with issues, low text quality: P3=95.0% (failed: process_bridge_weights)", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P1=0.0% < 100.0% (failed: all_components, forward_pass_logits); P3=89.5% but requir \u2014 24/147 components failed (24 critical)", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 24/123 components failed (24 critical)", - "invalidated": false, - "invalidation_reason": null - }, - { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", - "verified_date": "2026-04-09", - "verified_by": "verify_models", - "transformerlens_version": null, - "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 24/123 components failed (24 critical)", + "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null }, { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-350M-multi", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, - "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 471.0) \u2014 generated text may be incoherent", + "notes": "Full verification completed", "invalidated": false, "invalidation_reason": null }, { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-350M-nl", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, @@ -11302,8 +11182,8 @@ "invalidation_reason": null }, { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-2B-mono", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, @@ -11312,8 +11192,8 @@ "invalidation_reason": null }, { - "model_id": "tiiuae/falcon-rw-1b", - "architecture_id": "FalconForCausalLM", + "model_id": "Salesforce/codegen-2B-multi", + "architecture_id": "CodeGenForCausalLM", "verified_date": "2026-04-09", "verified_by": "verify_models", "transformerlens_version": null, From 1153ddeb3d2259681310701cc5380b262ce37f12 Mon Sep 17 00:00:00 2001 From: jlarson4 Date: Thu, 9 Apr 2026 17:19:51 -0500 Subject: [PATCH 2/2] Mypy and check format --- .../test_codegen_attention_bridge.py | 34 +++++++++---------- .../test_codegen_adapter.py | 17 ++++------ .../codegen_attention.py | 23 +++++-------- .../supported_architectures/codegen.py | 21 +++--------- 4 files changed, 38 insertions(+), 57 deletions(-) diff --git a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py index 5814b1418..a6a22e288 100644 --- a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py +++ b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py @@ -10,8 +10,7 @@ - KV cache is passed through to _update_kv_cache """ -from typing import Any -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import torch @@ -21,7 +20,6 @@ _rotate_every_two, ) - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -207,7 +205,9 @@ def test_q_k_v_projections_are_set(self): def test_no_c_proj_attribute_needed(self): """Construction must succeed when the original component has no c_proj.""" - from transformer_lens.model_bridge.generalized_components.linear import LinearBridge + from transformer_lens.model_bridge.generalized_components.linear import ( + LinearBridge, + ) config = _make_config() split_qkv, _, _, _ = _make_split_qkv(config.d_model) @@ -386,9 +386,9 @@ def zeroing_hook(tensor, hook): bridge.q.hook_out.add_hook(zeroing_hook) zeroed_out, _ = bridge(hs.clone(), position_ids=pos_ids) - assert not torch.allclose(baseline_out, zeroed_out), ( - "Zeroing hook_q should change the attention output" - ) + assert not torch.allclose( + baseline_out, zeroed_out + ), "Zeroing hook_q should change the attention output" # --------------------------------------------------------------------------- @@ -443,7 +443,7 @@ def capture_scores(tensor, hook): # Compute what scores would be WITHOUT RoPE raw_q = raw_q_values[0] # [B, S, D] - raw_k = k_lin(hs) # [B, S, D] + raw_k = k_lin(hs) # [B, S, D] n_heads = config.n_heads head_dim = config.d_model // n_heads q_plain = raw_q.view(B, S, n_heads, head_dim).transpose(1, 2).to(torch.float32) @@ -453,9 +453,9 @@ def capture_scores(tensor, hook): actual_scores = attn_scores_with_rope[0] # The scores MUST differ because RoPE was applied - assert not torch.allclose(actual_scores, scores_no_rope, atol=1e-4), ( - "Attention scores with and without RoPE should differ" - ) + assert not torch.allclose( + actual_scores, scores_no_rope, atol=1e-4 + ), "Attention scores with and without RoPE should differ" def test_partial_rotary_dim_leaves_pass_through_unchanged(self): """The head-dim slice beyond rotary_dim should not be rotated. @@ -516,9 +516,9 @@ def patched_apply_rope(tensor, sin, cos): # The slice sent into RoPE must equal the raw_q rotary slice q_rot_slice = q_passed[0] # [B, H, S, rotary_dim] raw_q_rot_slice = raw_q_heads.transpose(1, 2)[:, :, :, :rotary_dim] - assert torch.allclose(q_rot_slice, raw_q_rot_slice, atol=1e-5), ( - "Q slice sent to RoPE must equal the raw projection (pre-rotation)" - ) + assert torch.allclose( + q_rot_slice, raw_q_rot_slice, atol=1e-5 + ), "Q slice sent to RoPE must equal the raw projection (pre-rotation)" # --------------------------------------------------------------------------- @@ -540,6 +540,6 @@ def test_future_positions_have_zero_attention_weight(self): # attn_weights: [B, H, S, S]; upper triangle (future) must be ~0 for i in range(S): for j in range(i + 1, S): - assert torch.all(attn_weights[:, :, i, j].abs() < 1e-5), ( - f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)" - ) + assert torch.all( + attn_weights[:, :, i, j].abs() < 1e-5 + ), f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)" diff --git a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py index b76f36cce..efee81fc9 100644 --- a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py @@ -28,7 +28,6 @@ CodeGenArchitectureAdapter, ) - # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @@ -136,9 +135,7 @@ def test_blocks_ln1_name(self, adapter: CodeGenArchitectureAdapter) -> None: def test_no_ln2_in_blocks(self, adapter: CodeGenArchitectureAdapter) -> None: """CodeGen uses parallel attn+MLP sharing ln_1 — there must be no ln2.""" blocks = adapter.component_mapping["blocks"] - assert "ln2" not in blocks.submodules, ( - "CodeGen parallel block must not have ln2" - ) + assert "ln2" not in blocks.submodules, "CodeGen parallel block must not have ln2" def test_attn_is_codegen_attention_bridge(self, adapter: CodeGenArchitectureAdapter) -> None: blocks = adapter.component_mapping["blocks"] @@ -316,9 +313,9 @@ def test_factory_returns_codegen_adapter(self) -> None: cfg = _make_cfg() adapter = ArchitectureAdapterFactory.select_architecture_adapter(cfg) - assert isinstance(adapter, CodeGenArchitectureAdapter), ( - f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}" - ) + assert isinstance( + adapter, CodeGenArchitectureAdapter + ), f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}" def test_factory_key_is_codegen_for_causal_lm(self) -> None: """SUPPORTED_ARCHITECTURES must have a 'CodeGenForCausalLM' key.""" @@ -326,6 +323,6 @@ def test_factory_key_is_codegen_for_causal_lm(self) -> None: SUPPORTED_ARCHITECTURES, ) - assert "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES, ( - "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES" - ) + assert ( + "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES + ), "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES" diff --git a/transformer_lens/model_bridge/generalized_components/codegen_attention.py b/transformer_lens/model_bridge/generalized_components/codegen_attention.py index a4df9c170..e21bc46b0 100644 --- a/transformer_lens/model_bridge/generalized_components/codegen_attention.py +++ b/transformer_lens/model_bridge/generalized_components/codegen_attention.py @@ -10,7 +10,7 @@ - rotary_dim: if None, RoPE is applied to the full head dimension. """ -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, cast import torch @@ -24,7 +24,6 @@ JointQKVAttentionBridge, ) - # --------------------------------------------------------------------------- # Rotary helpers — GPT-J / CodeGen style ("rotate_every_two") # --------------------------------------------------------------------------- @@ -42,7 +41,7 @@ def _rotate_every_two(x: torch.Tensor) -> torch.Tensor: Returns: Tensor of the same shape with even/odd pairs rotated. """ - x1 = x[:, :, :, ::2] # even-indexed dims + x1 = x[:, :, :, ::2] # even-indexed dims x2 = x[:, :, :, 1::2] # odd-indexed dims x = torch.stack((-x2, x1), dim=-1) return x.flatten(-2) @@ -170,11 +169,7 @@ def get_random_inputs( if dtype is None: dtype = torch.float32 - d_model = ( - self.config.d_model - if self.config and hasattr(self.config, "d_model") - else 768 - ) + d_model = self.config.d_model if self.config and hasattr(self.config, "d_model") else 768 # Build the HF-style 4D causal mask: 0 where attended, -inf where masked. # Shape: [batch, 1, seq_len, seq_len] @@ -186,9 +181,7 @@ def get_random_inputs( causal[:, 0] = causal[:, 0].masked_fill(mask_upper, min_val) return { - "hidden_states": torch.randn( - batch_size, seq_len, d_model, device=device, dtype=dtype - ), + "hidden_states": torch.randn(batch_size, seq_len, d_model, device=device, dtype=dtype), "position_ids": torch.arange(seq_len, device=device) .unsqueeze(0) .expand(batch_size, -1), @@ -310,7 +303,7 @@ def _reconstruct_attention( # ---- RoPE ---- position_ids: Optional[torch.Tensor] = kwargs.get("position_ids", None) if position_ids is not None: - embed_positions: torch.Tensor = self.original_component.embed_positions # type: ignore[union-attr] + embed_positions = cast(torch.Tensor, self.original_component.embed_positions) # type: ignore[union-attr] # Move buffer to the right device if needed (mirrors HF forward) if embed_positions.device != position_ids.device: embed_positions = embed_positions.to(position_ids.device) @@ -336,7 +329,7 @@ def _reconstruct_attention( kv_seq_len = k.shape[-2] # ---- Scaled dot-product (fp32, matching HF CodeGen._attn) ---- - scale = self.original_component.scale_attn # type: ignore[union-attr] + scale = cast(torch.Tensor, self.original_component.scale_attn) # type: ignore[union-attr] q_f32 = q.to(torch.float32) k_f32 = k.to(torch.float32) @@ -364,7 +357,9 @@ def _reconstruct_attention( attn_output = torch.matmul(attn_weights, v) # Reshape [batch, heads, seq, head_dim] → [batch, seq, hidden] - attn_output = self._reshape_attn_output(attn_output, batch_size, seq_len, num_heads, head_dim) + attn_output = self._reshape_attn_output( + attn_output, batch_size, seq_len, num_heads, head_dim + ) # Output projection (fires hook_z via o.hook_in) attn_output = self._apply_output_projection(attn_output) diff --git a/transformer_lens/model_bridge/supported_architectures/codegen.py b/transformer_lens/model_bridge/supported_architectures/codegen.py index ee19a109c..c385833ae 100644 --- a/transformer_lens/model_bridge/supported_architectures/codegen.py +++ b/transformer_lens/model_bridge/supported_architectures/codegen.py @@ -2,7 +2,6 @@ from typing import Any -import torch import torch.nn as nn from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion @@ -54,24 +53,16 @@ def __init__(self, cfg: Any) -> None: # TransformerLens format [n_heads, d_model, d_head]. self.weight_processing_conversions = { "blocks.{i}.attn.q.weight": ParamProcessingConversion( - tensor_conversion=RearrangeTensorConversion( - "(n h) m -> n m h", n=self.cfg.n_heads - ), + tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads), ), "blocks.{i}.attn.k.weight": ParamProcessingConversion( - tensor_conversion=RearrangeTensorConversion( - "(n h) m -> n m h", n=self.cfg.n_heads - ), + tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads), ), "blocks.{i}.attn.v.weight": ParamProcessingConversion( - tensor_conversion=RearrangeTensorConversion( - "(n h) m -> n m h", n=self.cfg.n_heads - ), + tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads), ), "blocks.{i}.attn.o.weight": ParamProcessingConversion( - tensor_conversion=RearrangeTensorConversion( - "m (n h) -> n h m", n=self.cfg.n_heads - ), + tensor_conversion=RearrangeTensorConversion("m (n h) -> n h m", n=self.cfg.n_heads), ), } @@ -104,9 +95,7 @@ def __init__(self, cfg: Any) -> None: "unembed": UnembeddingBridge(name="lm_head"), } - def split_qkv_matrix( - self, attn_component: Any - ) -> tuple[nn.Linear, nn.Linear, nn.Linear]: + def split_qkv_matrix(self, attn_component: Any) -> tuple[nn.Linear, nn.Linear, nn.Linear]: """Split the fused QKV weight into separate Q, K, V linear modules. CodeGen uses GPT-J-style tensor-parallel partitioning with ``mp_num=4``