From e3585a40923b02b9120f9134a3329326f86d4aa0 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Thu, 9 Apr 2026 16:53:16 -0500
Subject: [PATCH 1/2] Initial CodeGen setup

---
 .../test_codegen_attention_bridge.py          |  545 +++
 .../supported_architectures/__init__.py       |    0
 .../test_codegen_adapter.py                   |  331 ++
 .../factories/architecture_adapter_factory.py |    2 +
 .../generalized_components/__init__.py        |    4 +
 .../codegen_attention.py                      |  372 ++
 .../model_bridge/sources/transformers.py      |    5 +-
 .../supported_architectures/__init__.py       |    4 +
 .../supported_architectures/codegen.py        |  150 +
 .../tools/model_registry/__init__.py          |    1 +
 .../data/architecture_gaps.json               | 3032 ++++++-----------
 .../model_registry/data/supported_models.json |   70 +
 .../data/verification_history.json            |  152 +-
 13 files changed, 2537 insertions(+), 2131 deletions(-)
 create mode 100644 tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
 create mode 100644 tests/unit/model_bridge/supported_architectures/__init__.py
 create mode 100644 tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
 create mode 100644 transformer_lens/model_bridge/generalized_components/codegen_attention.py
 create mode 100644 transformer_lens/model_bridge/supported_architectures/codegen.py

diff --git a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
new file mode 100644
index 000000000..5814b1418
--- /dev/null
+++ b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
@@ -0,0 +1,545 @@
+"""Unit tests for CodeGenAttentionBridge.
+
+Tests cover:
+- RoPE helper functions (_rotate_every_two, _apply_rotary_pos_emb)
+- CodeGenAttentionBridge initialisation and out_proj wiring
+- Forward pass: all hooks fire (hook_q, hook_k, hook_v, hook_attn_scores,
+  hook_pattern, hook_z, hook_result)
+- RoPE is applied to Q and K (partial rotary_dim path and full-dim path)
+- Causal masking is applied correctly
+- (Not yet covered here: KV cache pass-through to _update_kv_cache)
+"""
+
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import torch
+
+from transformer_lens.model_bridge.generalized_components.codegen_attention import (
+    CodeGenAttentionBridge,
+    _apply_rotary_pos_emb,
+    _rotate_every_two,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_config(
+    n_heads: int = 4,
+    d_model: int = 64,
+    rotary_dim: int = 8,  # must be <= head_dim (d_model // n_heads; 16 with the defaults)
+):
+    """Return a minimal attribute-bag config (head sizes plus rotary settings) for bridge tests."""
+
+    class Config:
+        pass
+
+    cfg = Config()
+    cfg.n_heads = n_heads
+    cfg.d_model = d_model
+    cfg.d_head = d_model // n_heads
+    cfg.positional_embedding_type = "rotary"
+    cfg.rotary_dim = rotary_dim
+    return cfg
+
+
+def _make_original_attention(
+    d_model: int = 64,
+    n_heads: int = 4,
+    rotary_dim: int = 8,  # must be <= head_dim (d_model // n_heads; 16 with the defaults)
+    max_positions: int = 512,
+):
+    """Create a MagicMock stand-in for a CodeGenAttention module with real qkv_proj/out_proj Linears."""
+    head_dim = d_model // n_heads
+    pos_embd_dim = rotary_dim if rotary_dim else d_model
+
+    # Sinusoidal positions buffer: shape [max_positions, pos_embd_dim] (sin half, then cos half)
+    inv_freq = 1.0 / (10000 ** (torch.arange(0, pos_embd_dim, 2, dtype=torch.int64) / pos_embd_dim))
+    sinusoid_inp = torch.einsum(
+        "i , j -> i j",
+        torch.arange(max_positions, dtype=torch.int64).float(),
+        inv_freq,
+    ).float()
+    embed_positions = torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)
+
+    attn = MagicMock(spec=torch.nn.Module)
+    attn.embed_positions = embed_positions
+    attn.rotary_dim = rotary_dim
+    attn.scale_attn = float(head_dim) ** 0.5
+    attn.layer_idx = 0
+
+    # out_proj — output projection weight [d_model, d_model] (no bias)
+    out_proj = torch.nn.Linear(d_model, d_model, bias=False)
+    attn.out_proj = out_proj
+
+    # qkv_proj — fused weight [3*d_model, d_model] (no bias)
+    qkv_proj = torch.nn.Linear(d_model, d_model * 3, bias=False)
+    attn.qkv_proj = qkv_proj
+
+    return attn
+
+
+def _make_split_qkv(d_model: int = 64):
+    """Return ``(split_qkv, q_lin, k_lin, v_lin)``; ``split_qkv`` ignores its argument and yields the three Linears."""
+    q_lin = torch.nn.Linear(d_model, d_model, bias=False)
+    k_lin = torch.nn.Linear(d_model, d_model, bias=False)
+    v_lin = torch.nn.Linear(d_model, d_model, bias=False)
+
+    def split_qkv(_component):
+        return q_lin, k_lin, v_lin
+
+    return split_qkv, q_lin, k_lin, v_lin
+
+
+def _make_bridge(config=None, split_qkv=None):
+    """Construct a CodeGenAttentionBridge ready for unit testing.
+
+    The bridge is built with an ``o`` LinearBridge submodule (matching how the
+    adapter passes ``"o": LinearBridge(name="out_proj")``).  Returns ``(bridge, original)``.
+    """
+    from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
+
+    if config is None:
+        config = _make_config()
+    if split_qkv is None:
+        split_qkv, _, _, _ = _make_split_qkv(config.d_model)
+
+    bridge = CodeGenAttentionBridge(
+        name="attn",
+        config=config,
+        split_qkv_matrix=split_qkv,
+        submodules={"o": LinearBridge(name="out_proj")},
+    )
+    original = _make_original_attention(
+        d_model=config.d_model,
+        n_heads=config.n_heads,
+        rotary_dim=config.rotary_dim,
+    )
+    bridge.set_original_component(original)
+    return bridge, original
+
+
+# ---------------------------------------------------------------------------
+# Rotary helper tests
+# ---------------------------------------------------------------------------
+
+
+class TestRotateEveryTwo:
+    """Tests for the _rotate_every_two function."""
+
+    def test_output_shape_matches_input(self):
+        """rotate_every_two must return a tensor of the same shape."""
+        x = torch.randn(2, 4, 8, 16)
+        out = _rotate_every_two(x)
+        assert out.shape == x.shape
+
+    def test_even_odd_rotation(self):
+        """Verify the rotation formula: (x0, x1) -> (-x1, x0)."""
+        # Use a simple 4-element last dimension so we can check by hand.
+        x = torch.tensor([[[[1.0, 2.0, 3.0, 4.0]]]])  # [1, 1, 1, 4]
+        out = _rotate_every_two(x)
+        # Each pair (x0, x1) = (x[2i], x[2i+1]); here x0 = [1, 3] and x1 = [2, 4].
+        # Output even positions hold -x1 = [-2, -4]; odd positions hold x0 = [1, 3].
+        # Interleaved: [-2, 1, -4, 3]
+        expected = torch.tensor([[[[-2.0, 1.0, -4.0, 3.0]]]])
+        assert torch.allclose(out, expected)
+
+    def test_double_rotation_is_negation(self):
+        """Applying rotate_every_two twice should return the negation of the input."""
+        x = torch.randn(1, 2, 5, 8)
+        out = _rotate_every_two(_rotate_every_two(x))
+        assert torch.allclose(out, -x, atol=1e-6)
+
+
+class TestApplyRotaryPosEmb:
+    """Tests for _apply_rotary_pos_emb, called with tensor [b, h, s, d] and sin/cos [b, s, d // 2]."""
+
+    def test_identity_with_zero_sin_unit_cos(self):
+        """With sin=0 and cos=1, RoPE should be an identity transform."""
+        b, h, s, d = 1, 2, 4, 8
+        tensor = torch.randn(b, h, s, d)
+        sin = torch.zeros(b, s, d // 2)
+        cos = torch.ones(b, s, d // 2)
+        out = _apply_rotary_pos_emb(tensor, sin, cos)
+        assert torch.allclose(out, tensor, atol=1e-6)
+
+    def test_output_shape_matches_input(self):
+        """Output shape must equal input shape."""
+        b, h, s, d = 2, 4, 6, 16
+        tensor = torch.randn(b, h, s, d)
+        sin = torch.randn(b, s, d // 2)
+        cos = torch.randn(b, s, d // 2)
+        out = _apply_rotary_pos_emb(tensor, sin, cos)
+        assert out.shape == tensor.shape
+
+    def test_rope_modifies_tensor(self):
+        """With random (unseeded) sin/cos values, the output must differ from the input."""
+        b, h, s, d = 1, 1, 3, 8
+        tensor = torch.randn(b, h, s, d)
+        sin = torch.randn(b, s, d // 2)
+        cos = torch.randn(b, s, d // 2)
+        out = _apply_rotary_pos_emb(tensor, sin, cos)
+        assert not torch.allclose(out, tensor)
+
+
+# ---------------------------------------------------------------------------
+# Initialisation tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAttentionBridgeInit:
+    """Tests for CodeGenAttentionBridge construction and set_original_component wiring."""
+
+    def test_out_proj_is_wired_after_set_original_component(self):
+        """bridge.o must wrap original.out_proj after set_original_component."""
+        bridge, original = _make_bridge()
+        assert bridge.o.original_component is original.out_proj
+
+    def test_q_k_v_projections_are_set(self):
+        """Q, K, V LinearBridges must have their original_component set."""
+        bridge, _ = _make_bridge()
+        assert bridge.q.original_component is not None
+        assert bridge.k.original_component is not None
+        assert bridge.v.original_component is not None
+
+    def test_no_c_proj_attribute_needed(self):
+        """Construction must succeed when the original component has no c_proj."""
+        from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
+
+        config = _make_config()
+        split_qkv, _, _, _ = _make_split_qkv(config.d_model)
+        bridge = CodeGenAttentionBridge(
+            name="attn",
+            config=config,
+            split_qkv_matrix=split_qkv,
+            submodules={"o": LinearBridge(name="out_proj")},
+        )
+        original = _make_original_attention()
+        # Defensive: drop c_proj if the stand-in happens to expose one
+        if hasattr(original, "c_proj"):
+            del original.c_proj
+        bridge.set_original_component(original)  # Must not raise
+        assert bridge.o.original_component is original.out_proj
+
+    def test_inherits_from_joint_qkv_attention_bridge(self):
+        """CodeGenAttentionBridge must subclass JointQKVAttentionBridge."""
+        from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import (
+            JointQKVAttentionBridge,
+        )
+
+        bridge, _ = _make_bridge()
+        assert isinstance(bridge, JointQKVAttentionBridge)
+
+
+# ---------------------------------------------------------------------------
+# Forward pass / hooks tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAttentionBridgeForward:
+    """Tests for the CodeGenAttentionBridge forward pass: output shapes and hook firing."""
+
+    def _position_ids(self, batch: int, seq: int) -> torch.Tensor:
+        return torch.arange(seq).unsqueeze(0).expand(batch, -1)
+
+    def test_forward_returns_tuple(self):
+        """forward() must return a tuple (attn_output, attn_weights)."""
+        bridge, _ = _make_bridge()
+        B, S, D = 1, 6, 64
+        hs = torch.randn(B, S, D)
+        pos_ids = self._position_ids(B, S)
+        out = bridge(hs, position_ids=pos_ids)
+        assert isinstance(out, tuple) and len(out) == 2
+
+    def test_output_shape(self):
+        """attn_output must have shape [batch, seq, d_model]."""
+        bridge, _ = _make_bridge()
+        B, S, D = 2, 8, 64
+        hs = torch.randn(B, S, D)
+        pos_ids = self._position_ids(B, S)
+        attn_out, _ = bridge(hs, position_ids=pos_ids)
+        assert attn_out.shape == (B, S, D)
+
+    def test_attn_weights_shape(self):
+        """attn_weights must have shape [batch, n_heads, seq, seq]."""
+        config = _make_config(n_heads=4, d_model=64)
+        bridge, _ = _make_bridge(config=config)
+        B, S = 1, 6
+        hs = torch.randn(B, S, config.d_model)
+        pos_ids = self._position_ids(B, S)
+        _, attn_weights = bridge(hs, position_ids=pos_ids)
+        assert attn_weights.shape == (B, config.n_heads, S, S)
+
+    def test_hook_q_fires(self):
+        """hook_q (q.hook_out) must be called during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.q.hook_out.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_q (q.hook_out) did not fire"
+
+    def test_hook_k_fires(self):
+        """hook_k (k.hook_out) must be called during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.k.hook_out.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_k (k.hook_out) did not fire"
+
+    def test_hook_v_fires(self):
+        """hook_v (v.hook_out) must be called during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.v.hook_out.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_v (v.hook_out) did not fire"
+
+    def test_hook_attn_scores_fires(self):
+        """hook_attn_scores must be called at some point during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.hook_attn_scores.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_attn_scores did not fire"
+
+    def test_hook_pattern_fires(self):
+        """hook_pattern must be called at some point during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.hook_pattern.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_pattern did not fire"
+
+    def test_hook_z_fires(self):
+        """hook_z (o.hook_in) must be called during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.o.hook_in.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_z (o.hook_in) did not fire"
+
+    def test_hook_result_fires(self):
+        """hook_result (hook_out) must be called during the forward pass."""
+        bridge, _ = _make_bridge()
+        fired = []
+
+        def hook_fn(tensor, hook):
+            fired.append(True)
+            return tensor
+
+        bridge.hook_out.add_hook(hook_fn)
+        B, S, D = 1, 4, 64
+        bridge(torch.randn(B, S, D), position_ids=self._position_ids(B, S))
+        assert fired, "hook_result (hook_out) did not fire"
+
+    def test_hook_q_mutation_affects_output(self):
+        """Zeroing the tensor at q.hook_out must change the final attention output."""
+        bridge, _ = _make_bridge()
+        B, S, D = 1, 4, 64
+        hs = torch.randn(B, S, D)
+        pos_ids = self._position_ids(B, S)
+
+        baseline_out, _ = bridge(hs.clone(), position_ids=pos_ids)
+
+        def zeroing_hook(tensor, hook):
+            return torch.zeros_like(tensor)
+
+        bridge.q.hook_out.add_hook(zeroing_hook)
+        zeroed_out, _ = bridge(hs.clone(), position_ids=pos_ids)
+
+        assert not torch.allclose(baseline_out, zeroed_out), (
+            "Zeroing hook_q should change the attention output"
+        )
+
+
+# ---------------------------------------------------------------------------
+# RoPE application tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAttentionBridgeRoPE:
+    """Tests verifying RoPE is correctly applied in the forward pass."""
+
+    def _position_ids(self, batch: int, seq: int) -> torch.Tensor:
+        """Return position ids ``0..seq-1`` expanded to shape ``[batch, seq]``."""
+        return torch.arange(seq).unsqueeze(0).expand(batch, -1)
+
+    def test_rope_changes_q_and_k(self):
+        """Attention scores must depend on the position ids fed to RoPE.
+
+        Comparing the hooked scores against a bare ``Q @ K^T`` computed in the
+        test is not a valid RoPE check: that reference ignores any scaling or
+        causal masking applied before ``hook_attn_scores``, so the two tensors
+        can differ even if RoPE is never applied.
+
+        Instead the same hidden states go through the same bridge twice, once
+        with positions ``0..S-1`` and once with those positions doubled.
+        Weights, scaling and masking are identical in both runs, so a change in
+        the scores can only come from the positional rotation of Q and K.
+        """
+        config = _make_config(n_heads=4, d_model=64, rotary_dim=16)
+        bridge, _ = _make_bridge(config=config)
+
+        B, S = 1, 6
+        hs = torch.randn(B, S, config.d_model)
+        pos_ids = self._position_ids(B, S)
+        # Doubling the ids doubles every relative distance i - j, so the rotated
+        # Q.K products change for all i != j.  Max id is 10, well inside the
+        # 512-row sinusoid table built by _make_original_attention.
+        stretched_pos_ids = pos_ids * 2
+
+        captured_scores = []
+
+        def capture_scores(tensor, hook):
+            captured_scores.append(tensor.detach().clone())
+            return tensor
+
+        # The hook stays registered for both forward passes below.
+        bridge.hook_attn_scores.add_hook(capture_scores)
+
+        bridge(hs, position_ids=pos_ids)
+        assert captured_scores, "hook_attn_scores did not fire"
+        scores_contiguous = captured_scores[0]
+
+        captured_scores.clear()
+        bridge(hs, position_ids=stretched_pos_ids)
+        assert captured_scores, "hook_attn_scores did not fire on the second pass"
+        scores_stretched = captured_scores[0]
+
+        assert scores_contiguous.shape == scores_stretched.shape
+
+        # Compare only strictly-past entries (query i, key j < i).  Future
+        # entries may hold mask fill values, and the diagonal has relative
+        # distance 0 in both runs, so neither is informative here.
+        past = torch.tril(torch.ones(S, S, dtype=torch.bool), diagonal=-1)
+        assert not torch.allclose(
+            scores_contiguous.masked_select(past),
+            scores_stretched.masked_select(past),
+            atol=1e-4,
+        ), "Attention scores should change when the RoPE position ids change"
+
+    def test_partial_rotary_dim_leaves_pass_through_unchanged(self):
+        """Only the leading ``rotary_dim`` features of each head are sent into RoPE.
+
+        ``_apply_rotary_pos_emb`` is wrapped so its inputs can be recorded.  If
+        the tensors it receives are exactly the first ``rotary_dim`` slices of
+        the raw Q and K projections, then the trailing ``head_dim - rotary_dim``
+        features never enter the rotation and pass through untouched.  The test
+        assumes RoPE inputs are laid out as ``[B, H, S, rotary_dim]``.
+        """
+        import transformer_lens.model_bridge.generalized_components.codegen_attention as codegen_attn_mod
+
+        config = _make_config(n_heads=2, d_model=16, rotary_dim=4)
+        split_qkv, q_lin, k_lin, _ = _make_split_qkv(config.d_model)
+        bridge, _ = _make_bridge(config=config, split_qkv=split_qkv)
+
+        B, S = 1, 4
+        hs = torch.randn(B, S, config.d_model)
+        pos_ids = self._position_ids(B, S)
+
+        n_heads = config.n_heads
+        head_dim = config.d_model // n_heads
+        rotary_dim = config.rotary_dim
+
+        # Raw Q/K projections split into heads: [B, S, D] -> [B, H, S, head_dim].
+        raw_q_heads = q_lin(hs).view(B, S, n_heads, head_dim).transpose(1, 2)
+        raw_k_heads = k_lin(hs).view(B, S, n_heads, head_dim).transpose(1, 2)
+
+        # Keep a handle on the real function so the wrapper can delegate to it.
+        original_fn = codegen_attn_mod._apply_rotary_pos_emb
+        rope_inputs = []
+
+        def recording_apply_rope(tensor, sin, cos):
+            # Clone so later in-place edits cannot alter what was recorded.
+            rope_inputs.append(tensor.detach().clone())
+            return original_fn(tensor, sin, cos)
+
+        # patch.object swaps the module-level function and restores it on exit,
+        # even if the forward pass raises.
+        with patch.object(codegen_attn_mod, "_apply_rotary_pos_emb", recording_apply_rope):
+            bridge(hs, position_ids=pos_ids)
+
+        assert rope_inputs, "RoPE was not applied to Q"
+        # Every tensor handed to RoPE must already be cut down to rotary_dim.
+        assert all(t.shape[-1] == rotary_dim for t in rope_inputs), (
+            "Only the rotary_dim slice of each head may be passed to RoPE"
+        )
+
+        def was_sent_to_rope(expected):
+            """Return True if some recorded RoPE input equals ``expected``."""
+            return any(
+                t.shape == expected.shape and torch.allclose(t, expected, atol=1e-5)
+                for t in rope_inputs
+            )
+
+        # Matching by value rather than by call index keeps the test independent
+        # of whether the bridge rotates Q or K first.
+        assert was_sent_to_rope(raw_q_heads[:, :, :, :rotary_dim]), (
+            "Q slice sent to RoPE must equal the raw projection (pre-rotation)"
+        )
+        assert was_sent_to_rope(raw_k_heads[:, :, :, :rotary_dim]), (
+            "K slice sent to RoPE must equal the raw projection (pre-rotation)"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Causal masking test
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAttentionBridgeCausalMask:
+    """Test causal masking of the attention weights returned by the bridge's forward pass."""
+
+    def test_future_positions_have_zero_attention_weight(self):
+        """Attention pattern must be lower-triangular (causal)."""
+        bridge, _ = _make_bridge()
+        B, S, D = 1, 6, 64
+        hs = torch.randn(B, S, D)
+        pos_ids = torch.arange(S).unsqueeze(0).expand(B, -1)
+
+        _, attn_weights = bridge(hs, position_ids=pos_ids)
+        # attn_weights: [B, H, S, S]; every entry with key j > query i (future) must be ~0
+        for i in range(S):
+            for j in range(i + 1, S):
+                assert torch.all(attn_weights[:, :, i, j].abs() < 1e-5), (
+                    f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)"
+                )
diff --git a/tests/unit/model_bridge/supported_architectures/__init__.py b/tests/unit/model_bridge/supported_architectures/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
new file mode 100644
index 000000000..b76f36cce
--- /dev/null
+++ b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
@@ -0,0 +1,331 @@
+"""Unit tests for CodeGenArchitectureAdapter.
+
+Tests cover:
+- Config attribute validation (all required attributes are set correctly)
+- Component mapping structure (correct bridge types, no ln2)
+- Weight conversion keys and structure
+- split_qkv_matrix correctness (numerical test with known weights)
+- Factory registration (CodeGenForCausalLM maps to the right adapter)
+"""
+
+from types import SimpleNamespace
+from typing import Any
+
+import pytest
+import torch
+import torch.nn as nn
+
+from transformer_lens.config import TransformerBridgeConfig
+from transformer_lens.model_bridge.generalized_components import (
+    BlockBridge,
+    CodeGenAttentionBridge,
+    EmbeddingBridge,
+    MLPBridge,
+    NormalizationBridge,
+    UnembeddingBridge,
+)
+from transformer_lens.model_bridge.supported_architectures.codegen import (
+    CodeGenArchitectureAdapter,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _make_cfg(
+    n_heads: int = 4,
+    d_model: int = 64,
+    n_layers: int = 2,
+    d_mlp: int = 256,
+    d_vocab: int = 1000,
+    n_ctx: int = 512,
+) -> TransformerBridgeConfig:
+    """Return a small TransformerBridgeConfig tagged ``architecture="CodeGenForCausalLM"``; d_head is d_model // n_heads."""
+    return TransformerBridgeConfig(
+        d_model=d_model,
+        d_head=d_model // n_heads,
+        n_layers=n_layers,
+        n_ctx=n_ctx,
+        n_heads=n_heads,
+        d_vocab=d_vocab,
+        d_mlp=d_mlp,
+        default_prepend_bos=True,
+        architecture="CodeGenForCausalLM",
+    )
+
+
+@pytest.fixture
+def cfg() -> TransformerBridgeConfig:
+    return _make_cfg()
+
+
+@pytest.fixture
+def adapter(cfg: TransformerBridgeConfig) -> CodeGenArchitectureAdapter:
+    return CodeGenArchitectureAdapter(cfg)
+
+
+# ---------------------------------------------------------------------------
+# Config attribute tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAdapterConfig:
+    """Tests the values of the config flags found on ``adapter.cfg`` after adapter construction."""
+
+    def test_normalization_type_is_ln(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.normalization_type == "LN"
+
+    def test_positional_embedding_type_is_rotary(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.positional_embedding_type == "rotary"
+
+    def test_final_rms_is_false(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.final_rms is False
+
+    def test_gated_mlp_is_false(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.gated_mlp is False
+
+    def test_attn_only_is_false(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.attn_only is False
+
+    def test_parallel_attn_mlp_is_true(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.cfg.parallel_attn_mlp is True
+
+
+# ---------------------------------------------------------------------------
+# Component mapping structure tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAdapterComponentMapping:
+    """Tests that component_mapping uses the expected bridge classes and component names."""
+
+    def test_embed_is_embedding_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert isinstance(adapter.component_mapping["embed"], EmbeddingBridge)
+
+    def test_embed_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.component_mapping["embed"].name == "transformer.wte"
+
+    def test_blocks_is_block_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert isinstance(adapter.component_mapping["blocks"], BlockBridge)
+
+    def test_blocks_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.component_mapping["blocks"].name == "transformer.h"
+
+    def test_ln_final_is_normalization_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert isinstance(adapter.component_mapping["ln_final"], NormalizationBridge)
+
+    def test_ln_final_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.component_mapping["ln_final"].name == "transformer.ln_f"
+
+    def test_unembed_is_unembedding_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert isinstance(adapter.component_mapping["unembed"], UnembeddingBridge)
+
+    def test_unembed_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert adapter.component_mapping["unembed"].name == "lm_head"
+
+    def test_blocks_ln1_is_normalization_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert isinstance(blocks.submodules["ln1"], NormalizationBridge)
+
+    def test_blocks_ln1_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert blocks.submodules["ln1"].name == "ln_1"
+
+    def test_no_ln2_in_blocks(self, adapter: CodeGenArchitectureAdapter) -> None:
+        """The block's submodules must not contain an ``ln2`` entry (parallel attn+MLP share ln_1)."""
+        blocks = adapter.component_mapping["blocks"]
+        assert "ln2" not in blocks.submodules, (
+            "CodeGen parallel block must not have ln2"
+        )
+
+    def test_attn_is_codegen_attention_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert isinstance(blocks.submodules["attn"], CodeGenAttentionBridge)
+
+    def test_attn_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert blocks.submodules["attn"].name == "attn"
+
+    def test_mlp_is_mlp_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert isinstance(blocks.submodules["mlp"], MLPBridge)
+
+    def test_mlp_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert blocks.submodules["mlp"].name == "mlp"
+
+    def test_mlp_in_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert blocks.submodules["mlp"].submodules["in"].name == "fc_in"
+
+    def test_mlp_out_name(self, adapter: CodeGenArchitectureAdapter) -> None:
+        blocks = adapter.component_mapping["blocks"]
+        assert blocks.submodules["mlp"].submodules["out"].name == "fc_out"
+
+
+# ---------------------------------------------------------------------------
+# Weight processing conversion tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenAdapterWeightConversions:
+    """Tests that weight_processing_conversions holds exactly the four attn q/k/v/o weight keys."""
+
+    def test_q_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert "blocks.{i}.attn.q.weight" in adapter.weight_processing_conversions
+
+    def test_k_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert "blocks.{i}.attn.k.weight" in adapter.weight_processing_conversions
+
+    def test_v_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert "blocks.{i}.attn.v.weight" in adapter.weight_processing_conversions
+
+    def test_o_weight_key_present(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert "blocks.{i}.attn.o.weight" in adapter.weight_processing_conversions
+
+    def test_exactly_four_conversion_keys(self, adapter: CodeGenArchitectureAdapter) -> None:
+        assert len(adapter.weight_processing_conversions) == 4
+
+
+# ---------------------------------------------------------------------------
+# split_qkv_matrix numerical correctness tests
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenSplitQKVMatrix:
+    """Numerical tests verifying the mp_num=4 QKV split logic."""
+
+    def _make_adapter_with_dmodel(self, d_model: int, n_heads: int) -> CodeGenArchitectureAdapter:
+        cfg = _make_cfg(d_model=d_model, n_heads=n_heads)
+        return CodeGenArchitectureAdapter(cfg)
+
+    def _make_attn_component(self, d_model: int) -> Any:
+        """Create a minimal attn component with a qkv_proj linear."""
+        attn = SimpleNamespace()
+        attn.qkv_proj = nn.Linear(d_model, d_model * 3, bias=False)
+        return attn
+
+    def test_returns_three_linear_modules(self) -> None:
+        """split_qkv_matrix must return exactly three nn.Linear modules."""
+        adapter = self._make_adapter_with_dmodel(64, 4)
+        attn = self._make_attn_component(64)
+        q, k, v = adapter.split_qkv_matrix(attn)
+        assert isinstance(q, nn.Linear)
+        assert isinstance(k, nn.Linear)
+        assert isinstance(v, nn.Linear)
+
+    def test_output_shapes_are_correct(self) -> None:
+        """Each of Q, K, V must have weight shape [n_embd, n_embd]."""
+        d_model = 64
+        adapter = self._make_adapter_with_dmodel(d_model, 4)
+        attn = self._make_attn_component(d_model)
+        q, k, v = adapter.split_qkv_matrix(attn)
+        assert q.weight.shape == (d_model, d_model)
+        assert k.weight.shape == (d_model, d_model)
+        assert v.weight.shape == (d_model, d_model)
+
+    def test_no_bias_on_outputs(self) -> None:
+        """The split linears must have no bias, matching qkv_proj."""
+        adapter = self._make_adapter_with_dmodel(64, 4)
+        attn = self._make_attn_component(64)
+        q, k, v = adapter.split_qkv_matrix(attn)
+        assert q.bias is None
+        assert k.bias is None
+        assert v.bias is None
+
+    def test_q_k_v_are_distinct(self) -> None:
+        """With a non-trivial weight, Q, K, V must differ from each other."""
+        adapter = self._make_adapter_with_dmodel(64, 4)
+        attn = self._make_attn_component(64)
+        # Re-draw qkv_proj from a normal distribution so the three slices differ
+        nn.init.normal_(attn.qkv_proj.weight)
+        q, k, v = adapter.split_qkv_matrix(attn)
+        # All three must differ
+        assert not torch.allclose(q.weight, k.weight), "Q and K weights must differ"
+        assert not torch.allclose(q.weight, v.weight), "Q and V weights must differ"
+        assert not torch.allclose(k.weight, v.weight), "K and V weights must differ"
+
+    def test_known_partition_ordering(self) -> None:
+        """Verify the mp_num=4 partition layout: within each partition [Q_part, V_part, K_part].
+
+        Every (partition, slot) block of the fused weight gets the unique
+        constant ``10 * partition + slot``, so both the slot -> Q/V/K mapping
+        and the order in which the partitions are concatenated are checked.
+        """
+        mp_num = 4
+        d_model = 64
+        n_heads = 4
+        local_dim = d_model // mp_num  # 16
+
+        adapter = self._make_adapter_with_dmodel(d_model, n_heads)
+        attn = self._make_attn_component(d_model)
+
+        # Fused rows viewed as [mp_num=4, 3, local_dim=16, d_model=64].
+        # Slot 0 = Q_part, slot 1 = V_part, slot 2 = K_part (CodeGen ordering).
+        partition = torch.arange(mp_num, dtype=torch.float32).view(mp_num, 1, 1, 1)
+        slot = torch.arange(3, dtype=torch.float32).view(1, 3, 1, 1)
+        w = (10.0 * partition + slot).expand(mp_num, 3, local_dim, d_model)
+
+        # Flatten back to [3*d_model, d_model] as qkv_proj expects
+        attn.qkv_proj.weight = nn.Parameter(w.reshape(3 * d_model, d_model))
+
+        q, k, v = adapter.split_qkv_matrix(attn)
+
+        # Row r of each split weight must originate from partition r // local_dim.
+        row_partition = (torch.arange(d_model) // local_dim).float()
+        expected_q = (10.0 * row_partition).unsqueeze(-1).expand(d_model, d_model)
+
+        assert torch.equal(q.weight, expected_q), "Q should come from slot 0 (Q_part)"
+        assert torch.equal(k.weight, expected_q + 2.0), "K should come from slot 2 (K_part)"
+        assert torch.equal(v.weight, expected_q + 1.0), "V should come from slot 1 (V_part)"
+
+    def test_forward_output_shape_with_split(self) -> None:
+        """After split, Q/K/V linears should produce correct output shapes."""
+        d_model = 64
+        adapter = self._make_adapter_with_dmodel(d_model, 4)
+        attn = self._make_attn_component(d_model)
+        q_lin, k_lin, v_lin = adapter.split_qkv_matrix(attn)
+
+        batch, seq = 2, 10
+        x = torch.randn(batch, seq, d_model)
+        assert q_lin(x).shape == (batch, seq, d_model)
+        assert k_lin(x).shape == (batch, seq, d_model)
+        assert v_lin(x).shape == (batch, seq, d_model)
+
+
+# ---------------------------------------------------------------------------
+# Factory registration test
+# ---------------------------------------------------------------------------
+
+
+class TestCodeGenFactoryRegistration:
+    """Tests that the factory maps CodeGenForCausalLM to the correct adapter.
+
+    These tests depend on the adapter being registered in
+    ``SUPPORTED_ARCHITECTURES``; they live in this file so that registration
+    is verified alongside the adapter tests rather than in a separate file.
+    """
+
+    def test_factory_returns_codegen_adapter(self) -> None:
+        """ArchitectureAdapterFactory must return a CodeGenArchitectureAdapter."""
+        from transformer_lens.factories.architecture_adapter_factory import (
+            ArchitectureAdapterFactory,
+        )
+
+        cfg = _make_cfg()
+        adapter = ArchitectureAdapterFactory.select_architecture_adapter(cfg)
+        assert isinstance(adapter, CodeGenArchitectureAdapter), (
+            f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}"
+        )
+
+    def test_factory_key_is_codegen_for_causal_lm(self) -> None:
+        """SUPPORTED_ARCHITECTURES must have a 'CodeGenForCausalLM' key."""
+        from transformer_lens.factories.architecture_adapter_factory import (
+            SUPPORTED_ARCHITECTURES,
+        )
+
+        assert "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES, (
+            "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES"
+        )
diff --git a/transformer_lens/factories/architecture_adapter_factory.py b/transformer_lens/factories/architecture_adapter_factory.py
index 458d1b073..1c6462cad 100644
--- a/transformer_lens/factories/architecture_adapter_factory.py
+++ b/transformer_lens/factories/architecture_adapter_factory.py
@@ -9,6 +9,7 @@
     ApertusArchitectureAdapter,
     BertArchitectureAdapter,
     BloomArchitectureAdapter,
+    CodeGenArchitectureAdapter,
     FalconArchitectureAdapter,
     Gemma1ArchitectureAdapter,
     Gemma2ArchitectureAdapter,
@@ -53,6 +54,7 @@
     "ApertusForCausalLM": ApertusArchitectureAdapter,
     "BertForMaskedLM": BertArchitectureAdapter,
     "BloomForCausalLM": BloomArchitectureAdapter,
+    "CodeGenForCausalLM": CodeGenArchitectureAdapter,
     "FalconForCausalLM": FalconArchitectureAdapter,
     "GemmaForCausalLM": Gemma1ArchitectureAdapter,  # Default to Gemma1 as it's the original version
     "Gemma1ForCausalLM": Gemma1ArchitectureAdapter,
diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py
index 334b262c0..3bbe8e356 100644
--- a/transformer_lens/model_bridge/generalized_components/__init__.py
+++ b/transformer_lens/model_bridge/generalized_components/__init__.py
@@ -9,6 +9,9 @@
 from transformer_lens.model_bridge.generalized_components.bloom_attention import (
     BloomAttentionBridge,
 )
+from transformer_lens.model_bridge.generalized_components.codegen_attention import (
+    CodeGenAttentionBridge,
+)
 from transformer_lens.model_bridge.generalized_components.bloom_block import (
     BloomBlockBridge,
 )
@@ -78,6 +81,7 @@
     "BlockBridge",
     "BloomBlockBridge",
     "BloomAttentionBridge",
+    "CodeGenAttentionBridge",
     "BloomMLPBridge",
     "CLIPVisionEncoderBridge",
     "CLIPVisionEncoderLayerBridge",
diff --git a/transformer_lens/model_bridge/generalized_components/codegen_attention.py b/transformer_lens/model_bridge/generalized_components/codegen_attention.py
new file mode 100644
index 000000000..a4df9c170
--- /dev/null
+++ b/transformer_lens/model_bridge/generalized_components/codegen_attention.py
@@ -0,0 +1,372 @@
+"""CodeGen-specific attention bridge component.
+
+CodeGen attention uses a fused QKV projection (qkv_proj) with a GPT-J-style
+``rotate_every_two`` rotary positional encoding applied to Q and K before the
+attention matmul.  The rotary embeddings are stored as a sinusoidal buffer
+(``embed_positions``) on the original ``CodeGenAttention`` module and are
+indexed by ``position_ids``.
+
+Optional parameters (may be absent in some CodeGen checkpoints):
+  - rotary_dim: if None, RoPE is applied to the full head dimension.
+"""
+
+from typing import Any, Callable, Dict, Optional
+
+import torch
+
+from transformer_lens.conversion_utils.conversion_steps.base_tensor_conversion import (
+    BaseTensorConversion,
+)
+from transformer_lens.model_bridge.generalized_components.base import (
+    GeneralizedComponent,
+)
+from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import (
+    JointQKVAttentionBridge,
+)
+
+
+# ---------------------------------------------------------------------------
+# Rotary helpers — GPT-J / CodeGen style ("rotate_every_two")
+# ---------------------------------------------------------------------------
+
+
+def _rotate_every_two(x: torch.Tensor) -> torch.Tensor:
+    """Rotate every pair of elements (GPT-J / CodeGen style).
+
+    Each adjacent pair ``(a, b)`` along the last dim becomes ``(-b, a)``.
+    Mirrors ``rotate_every_two`` in ``transformers.models.codegen.modeling_codegen``.
+
+    Args:
+        x: Tensor of shape ``[batch, heads, seq, head_dim]`` (``head_dim`` even).
+
+    Returns:
+        Tensor of the same shape with even/odd pairs rotated.
+    """
+    x1 = x[:, :, :, ::2]   # even-indexed dims
+    x2 = x[:, :, :, 1::2]  # odd-indexed dims
+    x = torch.stack((-x2, x1), dim=-1)
+    return x.flatten(-2)
+
+
+def _apply_rotary_pos_emb(
+    tensor: torch.Tensor,
+    sin: torch.Tensor,
+    cos: torch.Tensor,
+) -> torch.Tensor:
+    """Apply rotary positional embeddings (GPT-J / CodeGen style).
+
+    Adapted from ``apply_rotary_pos_emb`` in
+    ``transformers.models.codegen.modeling_codegen`` to work with tensors
+    in the ``[batch, heads, seq, head_dim]`` layout used by this bridge
+    (heads and seq are swapped relative to the HuggingFace helper).
+
+    Args:
+        tensor: ``[batch, heads, seq, rotary_dim]`` — the slice of Q or K that
+            will be rotated.
+        sin: ``[batch, seq, rotary_dim // 2]`` — the sin half of the sinusoidal
+            embedding (before ``repeat_interleave``).
+        cos: ``[batch, seq, rotary_dim // 2]`` — the cos half.
+
+    Returns:
+        Rotated tensor with the same shape as *tensor*.
+    """
+    # Expand sin/cos from [batch, seq, rotary_dim//2]
+    # to [batch, 1, seq, rotary_dim] so they broadcast with
+    # tensor of shape [batch, heads, seq, rotary_dim].
+    sin = torch.repeat_interleave(sin[:, None, :, :], 2, 3)  # [B, 1, seq, rot_dim]
+    cos = torch.repeat_interleave(cos[:, None, :, :], 2, 3)  # [B, 1, seq, rot_dim]
+    return (tensor * cos) + (_rotate_every_two(tensor) * sin)
+
+
+class CodeGenAttentionBridge(JointQKVAttentionBridge):
+    """Attention bridge for CodeGen models.
+
+    CodeGen uses:
+    - A fused ``qkv_proj`` linear (no bias).
+    - GPT-J-style ``rotate_every_two`` RoPE applied to Q and K before the
+      attention matmul.  Rotary embeddings are stored in the
+      ``embed_positions`` buffer of the original ``CodeGenAttention`` module
+      and indexed by ``position_ids``.
+    - Only the first ``rotary_dim`` dimensions of each head are rotated.
+      When ``rotary_dim`` is None the full head dimension is rotated.
+    - An ``out_proj`` linear output projection (no bias).
+
+    All TransformerLens hooks fire in the forward pass:
+    ``hook_q``, ``hook_k``, ``hook_v``, ``hook_attn_scores``,
+    ``hook_pattern``, ``hook_z`` (via ``o.hook_in``), ``hook_result``
+    (via ``hook_out``).
+    """
+
+    def __init__(
+        self,
+        name: str,
+        config: Any,
+        split_qkv_matrix: Optional[Callable] = None,
+        submodules: Optional[Dict[str, GeneralizedComponent]] = None,
+        qkv_conversion_rule: Optional[BaseTensorConversion] = None,
+        attn_conversion_rule: Optional[BaseTensorConversion] = None,
+        pattern_conversion_rule: Optional[BaseTensorConversion] = None,
+    ) -> None:
+        """Initialise the CodeGen attention bridge.
+
+        Args:
+            name: The name of this component.
+            config: Model configuration (must have ``n_heads``, ``d_head``,
+                and optionally ``rotary_dim``).
+            split_qkv_matrix: Callable that splits the fused QKV weight into
+                three ``nn.Linear`` modules for Q, K, and V.  Required — there
+                is no sensible default for CodeGen's mp_num=4 split logic.
+            submodules: Optional extra submodules to register.
+            qkv_conversion_rule: Optional conversion rule for Q/K/V outputs.
+            attn_conversion_rule: Optional conversion rule for the attention
+                output.
+            pattern_conversion_rule: Optional conversion rule for attention
+                patterns.
+        """
+        super().__init__(
+            name=name,
+            config=config,
+            split_qkv_matrix=split_qkv_matrix,
+            submodules=submodules,
+            qkv_conversion_rule=qkv_conversion_rule,
+            attn_conversion_rule=attn_conversion_rule,
+            pattern_conversion_rule=pattern_conversion_rule,
+            requires_position_embeddings=False,
+            requires_attention_mask=False,
+        )
+
+    # ------------------------------------------------------------------
+    # Component testing inputs
+    # ------------------------------------------------------------------
+
+    def get_random_inputs(
+        self,
+        batch_size: int = 2,
+        seq_len: int = 8,
+        device=None,
+        dtype=None,
+    ):
+        """Return random inputs for isolated component testing.
+
+        CodeGen attention requires ``position_ids`` (to index into
+        ``embed_positions``) and a HuggingFace-style 4D causal attention mask.
+        The mask is provided so that both the bridge and the HF component
+        apply identical causal masking during the ``all_components`` benchmark.
+
+        Args:
+            batch_size: Batch size.
+            seq_len: Sequence length.
+            device: Target device (defaults to CPU).
+            dtype: Tensor dtype (defaults to float32).
+
+        Returns:
+            Dict with ``hidden_states``, ``position_ids``, and
+            ``attention_mask`` suitable for both bridge and HF forward calls.
+        """
+        import torch
+
+        if device is None:
+            device = torch.device("cpu")
+        if dtype is None:
+            dtype = torch.float32
+
+        d_model = (
+            self.config.d_model
+            if self.config and hasattr(self.config, "d_model")
+            else 768
+        )
+
+        # Build the HF-style 4D causal mask: 0 where attended, dtype-min where masked.
+        # Shape: [batch, 1, seq_len, seq_len]
+        min_val = torch.finfo(dtype).min
+        causal = torch.zeros(batch_size, 1, seq_len, seq_len, device=device, dtype=dtype)
+        mask_upper = torch.triu(
+            torch.ones(seq_len, seq_len, device=device, dtype=torch.bool), diagonal=1
+        )
+        causal[:, 0] = causal[:, 0].masked_fill(mask_upper, min_val)
+
+        return {
+            "hidden_states": torch.randn(
+                batch_size, seq_len, d_model, device=device, dtype=dtype
+            ),
+            "position_ids": torch.arange(seq_len, device=device)
+            .unsqueeze(0)
+            .expand(batch_size, -1),
+            "attention_mask": causal,
+        }
+
+    # ------------------------------------------------------------------
+    # Component wiring
+    # ------------------------------------------------------------------
+
+    def set_original_component(self, original_component: torch.nn.Module) -> None:
+        """Wire the original CodeGenAttention and set up the output projection.
+
+        The base ``JointQKVAttentionBridge.set_original_component`` hardcodes
+        ``c_proj`` for the output projection wiring.  CodeGen uses ``out_proj``
+        instead, so we override here to wire it correctly after calling super.
+
+        Args:
+            original_component: The original ``CodeGenAttention`` layer.
+        """
+        # Let the base class split QKV; it will attempt (and fail-silently) the
+        # c_proj wiring because CodeGen has no c_proj attribute.
+        super().set_original_component(original_component)
+
+        # Wire out_proj explicitly.
+        if hasattr(self, "o") and hasattr(original_component, "out_proj"):
+            self.o.set_original_component(original_component.out_proj)
+
+    # ------------------------------------------------------------------
+    # Forward pass
+    # ------------------------------------------------------------------
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        """Forward pass through CodeGen attention with all hooks firing.
+
+        Manually reconstructs attention so that all TransformerLens hooks
+        (hook_q, hook_k, hook_v, hook_attn_scores, hook_pattern, hook_z,
+        hook_result) fire correctly.
+
+        CodeGen passes ``position_ids`` as a keyword argument; these are used
+        to index into the ``embed_positions`` sinusoidal buffer stored on the
+        original ``CodeGenAttention`` module.
+
+        Args:
+            *args: Positional arguments; ``hidden_states`` is ``args[0]``, else a kwarg.
+            **kwargs: Keyword arguments including ``position_ids`` (RoPE is
+                skipped when absent), ``attention_mask`` (optional),
+                ``layer_past`` (optional KV cache), ``cache_position`` (optional).
+
+        Returns:
+            Tuple of ``(attn_output, attn_weights)``.
+        """
+        if self.original_component is None:
+            raise RuntimeError(
+                f"Original component not set for {self.name}. "
+                "Call set_original_component() first."
+            )
+
+        # ---- 1. Extract hidden_states ----
+        if len(args) > 0 and isinstance(args[0], torch.Tensor):
+            hidden_states = args[0]
+        elif "hidden_states" in kwargs and isinstance(kwargs["hidden_states"], torch.Tensor):
+            hidden_states = kwargs["hidden_states"]
+        else:
+            raise ValueError("Could not find hidden_states in args or kwargs.")
+
+        # ---- 2. Input hook ----
+        hooked_input = self.hook_in(hidden_states)
+
+        # ---- 3. Q / K / V projections (fires hook_q, hook_k, hook_v) ----
+        q_output = self.q(hooked_input)
+        k_output = self.k(hooked_input)
+        v_output = self.v(hooked_input)
+
+        # ---- 4. Reconstruct attention with RoPE ----
+        attn_output, attn_weights = self._reconstruct_attention(
+            q_output, k_output, v_output, **kwargs
+        )
+
+        # ---- 5. Output hooks (fires hook_z via o.hook_in, hook_result via hook_out) ----
+        output = (attn_output, attn_weights)
+        output = self._process_output(output)
+        return output
+
+    def _reconstruct_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        **kwargs: Any,
+    ) -> tuple:
+        """Reconstruct attention with CodeGen's rotate_every_two RoPE.
+
+        This method:
+        1. Reshapes Q/K/V to ``[batch, heads, seq, head_dim]``.
+        2. Applies ``rotate_every_two`` RoPE to Q and K (first ``rotary_dim``
+           dimensions only when ``rotary_dim`` is set).
+        3. Runs scaled dot-product attention (fp32, matching HF CodeGen).
+        4. Fires ``hook_attn_scores`` and ``hook_pattern``.
+        5. Applies the output projection via ``self.o``.
+
+        Args:
+            q: Q tensor from the Q LinearBridge.
+            k: K tensor from the K LinearBridge.
+            v: V tensor from the V LinearBridge.
+            **kwargs: Forwarded kwargs; RoPE is skipped if ``position_ids`` is missing.
+
+        Returns:
+            ``(attn_output, attn_weights)`` tuple.
+        """
+        assert self.original_component is not None
+        assert self.config is not None
+
+        num_heads: int = self.config.n_heads
+
+        # Reshape to [batch, heads, seq, head_dim]
+        q, k, v, batch_size, seq_len, head_dim = self._reshape_qkv_to_heads(q, k, v, num_heads)
+
+        # ---- RoPE ----
+        position_ids: Optional[torch.Tensor] = kwargs.get("position_ids", None)
+        if position_ids is not None:
+            embed_positions: torch.Tensor = self.original_component.embed_positions  # type: ignore[union-attr]
+            # Move buffer to the right device if needed (mirrors HF forward)
+            if embed_positions.device != position_ids.device:
+                embed_positions = embed_positions.to(position_ids.device)
+
+            # sincos: [batch, seq, rotary_dim] (full dim = sin_half + cos_half)
+            sincos = embed_positions[position_ids]
+            half = sincos.shape[-1] // 2
+            sin, cos = sincos[:, :, :half], sincos[:, :, half:]
+
+            rotary_dim: Optional[int] = getattr(self.original_component, "rotary_dim", None)
+            if rotary_dim is not None:
+                # Only rotate the first rotary_dim dimensions; pass the rest through.
+                q_rot = _apply_rotary_pos_emb(q[:, :, :, :rotary_dim], sin, cos)
+                k_rot = _apply_rotary_pos_emb(k[:, :, :, :rotary_dim], sin, cos)
+                q = torch.cat([q_rot, q[:, :, :, rotary_dim:]], dim=-1)
+                k = torch.cat([k_rot, k[:, :, :, rotary_dim:]], dim=-1)
+            else:
+                q = _apply_rotary_pos_emb(q, sin, cos)
+                k = _apply_rotary_pos_emb(k, sin, cos)
+
+        # ---- KV cache ----
+        k, v = self._update_kv_cache(k, v, **kwargs)
+        kv_seq_len = k.shape[-2]
+
+        # ---- Scaled dot-product (fp32, matching HF CodeGen._attn) ----
+        scale = self.original_component.scale_attn  # type: ignore[union-attr]
+        q_f32 = q.to(torch.float32)
+        k_f32 = k.to(torch.float32)
+
+        attn_scores = torch.matmul(q_f32, k_f32.transpose(-2, -1))
+
+        attention_mask: Optional[torch.Tensor] = kwargs.get("attention_mask", None)
+        attn_scores = self._apply_reconstruct_attention_mask(
+            attn_scores=attn_scores,
+            attention_mask=attention_mask,
+            seq_len=kv_seq_len,
+            q_seq_len=seq_len,
+        )
+
+        # Divide by scale_attn (CodeGen divides *after* the mask, not before)
+        attn_scores = attn_scores / scale
+
+        attn_scores = self.hook_attn_scores(attn_scores)
+
+        # Softmax + dropout + hook_pattern
+        attn_weights = self._softmax_dropout_pattern(
+            attn_scores,
+            target_dtype=v.dtype,
+        )
+
+        attn_output = torch.matmul(attn_weights, v)
+
+        # Reshape [batch, heads, seq, head_dim] → [batch, seq, hidden]
+        attn_output = self._reshape_attn_output(attn_output, batch_size, seq_len, num_heads, head_dim)
+
+        # Output projection (fires hook_z via o.hook_in)
+        attn_output = self._apply_output_projection(attn_output)
+
+        return (attn_output, attn_weights)
diff --git a/transformer_lens/model_bridge/sources/transformers.py b/transformer_lens/model_bridge/sources/transformers.py
index 3f189e841..0dbf6ee70 100644
--- a/transformer_lens/model_bridge/sources/transformers.py
+++ b/transformer_lens/model_bridge/sources/transformers.py
@@ -166,9 +166,9 @@ def map_default_transformer_lens_config(hf_config):
         tl_config.sliding_window = source_config.sliding_window
     if getattr(hf_config, "use_parallel_residual", False):
         tl_config.parallel_attn_mlp = True
-    # GPT-J: parallel attn+MLP but missing use_parallel_residual in HF config
+    # GPT-J and CodeGen: parallel attn+MLP but missing use_parallel_residual in HF config
     arch_classes = getattr(hf_config, "architectures", []) or []
-    if any(a in ("GPTJForCausalLM",) for a in arch_classes):
+    if any(a in ("GPTJForCausalLM", "CodeGenForCausalLM") for a in arch_classes):
         tl_config.parallel_attn_mlp = True
     tl_config.default_prepend_bos = True
     return tl_config
@@ -205,6 +205,7 @@ def determine_architecture_from_hf_config(hf_config):
             "gemma3": "Gemma3ForCausalLM",
             "bert": "BertForMaskedLM",
             "bloom": "BloomForCausalLM",
+            "codegen": "CodeGenForCausalLM",
             "gptj": "GPTJForCausalLM",
             "gpt_neo": "GPTNeoForCausalLM",
             "gpt_neox": "GPTNeoXForCausalLM",
diff --git a/transformer_lens/model_bridge/supported_architectures/__init__.py b/transformer_lens/model_bridge/supported_architectures/__init__.py
index 2c32f6b38..1b24f3741 100644
--- a/transformer_lens/model_bridge/supported_architectures/__init__.py
+++ b/transformer_lens/model_bridge/supported_architectures/__init__.py
@@ -12,6 +12,9 @@
 from transformer_lens.model_bridge.supported_architectures.bloom import (
     BloomArchitectureAdapter,
 )
+from transformer_lens.model_bridge.supported_architectures.codegen import (
+    CodeGenArchitectureAdapter,
+)
 from transformer_lens.model_bridge.supported_architectures.falcon import (
     FalconArchitectureAdapter,
 )
@@ -131,6 +134,7 @@
     "ApertusArchitectureAdapter",
     "BertArchitectureAdapter",
     "BloomArchitectureAdapter",
+    "CodeGenArchitectureAdapter",
     "FalconArchitectureAdapter",
     "Gemma1ArchitectureAdapter",
     "Gemma2ArchitectureAdapter",
diff --git a/transformer_lens/model_bridge/supported_architectures/codegen.py b/transformer_lens/model_bridge/supported_architectures/codegen.py
new file mode 100644
index 000000000..ee19a109c
--- /dev/null
+++ b/transformer_lens/model_bridge/supported_architectures/codegen.py
@@ -0,0 +1,150 @@
+"""CodeGen architecture adapter."""
+
+from typing import Any
+
+import torch
+import torch.nn as nn
+
+from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion
+from transformer_lens.conversion_utils.param_processing_conversion import (
+    ParamProcessingConversion,
+)
+from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
+from transformer_lens.model_bridge.generalized_components import (
+    BlockBridge,
+    CodeGenAttentionBridge,
+    EmbeddingBridge,
+    LinearBridge,
+    MLPBridge,
+    NormalizationBridge,
+    UnembeddingBridge,
+)
+
+
+class CodeGenArchitectureAdapter(ArchitectureAdapter):
+    """Architecture adapter for CodeGen models.
+
+    CodeGen uses a parallel attention+MLP block (attn and MLP share the same
+    LayerNorm input and their outputs are summed).  The attention layer uses a
+    fused ``qkv_proj`` weight whose layout follows GPT-J's ``mp_num=4``
+    tensor-parallel partitioning: the rows are interleaved as
+    ``[Q_part, V_part, K_part]`` within each of the 4 MP partitions.
+
+    Optional Parameters (may be absent in some CodeGen checkpoints):
+    ---------------------------------------------------------------
+    - No bias on qkv_proj (fused QKV has no bias)
+    - No bias on out_proj
+    - NOTE(review): HF's mlp.fc_in / mlp.fc_out appear to carry biases — confirm
+    """
+
+    def __init__(self, cfg: Any) -> None:
+        """Initialize the CodeGen architecture adapter."""
+        super().__init__(cfg)
+
+        # Config attributes
+        self.cfg.normalization_type = "LN"
+        self.cfg.positional_embedding_type = "rotary"
+        self.cfg.final_rms = False
+        self.cfg.gated_mlp = False
+        self.cfg.attn_only = False
+        self.cfg.parallel_attn_mlp = True
+
+        # After split_qkv_matrix the individual Q/K/V weights have shape
+        # [n_embd, n_embd].  The conversions below rearrange them to the
+        # TransformerLens format [n_heads, d_model, d_head].
+        self.weight_processing_conversions = {
+            "blocks.{i}.attn.q.weight": ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(
+                    "(n h) m -> n m h", n=self.cfg.n_heads
+                ),
+            ),
+            "blocks.{i}.attn.k.weight": ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(
+                    "(n h) m -> n m h", n=self.cfg.n_heads
+                ),
+            ),
+            "blocks.{i}.attn.v.weight": ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(
+                    "(n h) m -> n m h", n=self.cfg.n_heads
+                ),
+            ),
+            "blocks.{i}.attn.o.weight": ParamProcessingConversion(
+                tensor_conversion=RearrangeTensorConversion(
+                    "m (n h) -> n h m", n=self.cfg.n_heads
+                ),
+            ),
+        }
+
+        self.component_mapping = {
+            "embed": EmbeddingBridge(name="transformer.wte"),
+            "blocks": BlockBridge(
+                name="transformer.h",
+                submodules={
+                    "ln1": NormalizationBridge(name="ln_1", config=self.cfg),
+                    # No ln2: CodeGen uses parallel attn+MLP that both read from ln_1
+                    "attn": CodeGenAttentionBridge(
+                        name="attn",
+                        config=self.cfg,
+                        split_qkv_matrix=self.split_qkv_matrix,
+                        submodules={
+                            "qkv": LinearBridge(name="qkv_proj"),
+                            "o": LinearBridge(name="out_proj"),
+                        },
+                    ),
+                    "mlp": MLPBridge(
+                        name="mlp",
+                        submodules={
+                            "in": LinearBridge(name="fc_in"),
+                            "out": LinearBridge(name="fc_out"),
+                        },
+                    ),
+                },
+            ),
+            "ln_final": NormalizationBridge(name="transformer.ln_f", config=self.cfg),
+            "unembed": UnembeddingBridge(name="lm_head"),
+        }
+
+    def split_qkv_matrix(
+        self, attn_component: Any
+    ) -> tuple[nn.Linear, nn.Linear, nn.Linear]:
+        """Split the fused QKV weight into separate Q, K, V linear modules.
+
+        CodeGen uses GPT-J-style tensor-parallel partitioning with ``mp_num=4``
+        partitions.  Within each partition the row order is
+        ``[Q_part, V_part, K_part]``, i.e. **not** the conventional Q/K/V order.
+
+        The fused weight has shape ``[3 * n_embd, n_embd]``.  We reshape to
+        ``[mp_num, 3, local_dim, n_embd]``, extract the three slices, then
+        flatten back to ``[n_embd, n_embd]`` for each of Q, K, V.
+
+        Args:
+            attn_component: The original ``CodeGenAttention`` module.
+
+        Returns:
+            Tuple of ``(q_linear, k_linear, v_linear)`` — three ``nn.Linear``
+            modules with no bias and weight shape ``[n_embd, n_embd]``.
+        """
+        mp_num = 4
+        n_embd = self.cfg.d_model
+
+        weight = attn_component.qkv_proj.weight  # [3*n_embd, n_embd]
+
+        # Partition into mp_num slices; within each: [Q_part, V_part, K_part]
+        local_dim = n_embd // mp_num
+        w = weight.reshape(mp_num, 3, local_dim, n_embd)
+
+        # Index 0 = Q, 1 = V, 2 = K  (CodeGen partition ordering)
+        W_Q = w[:, 0, :, :].reshape(n_embd, n_embd)
+        W_V = w[:, 1, :, :].reshape(n_embd, n_embd)
+        W_K = w[:, 2, :, :].reshape(n_embd, n_embd)
+
+        q_linear = nn.Linear(n_embd, n_embd, bias=False)
+        q_linear.weight = nn.Parameter(W_Q)
+
+        k_linear = nn.Linear(n_embd, n_embd, bias=False)
+        k_linear.weight = nn.Parameter(W_K)
+
+        v_linear = nn.Linear(n_embd, n_embd, bias=False)
+        v_linear.weight = nn.Parameter(W_V)
+
+        return q_linear, k_linear, v_linear
diff --git a/transformer_lens/tools/model_registry/__init__.py b/transformer_lens/tools/model_registry/__init__.py
index 409c3dc3f..e85aef2bc 100644
--- a/transformer_lens/tools/model_registry/__init__.py
+++ b/transformer_lens/tools/model_registry/__init__.py
@@ -45,6 +45,7 @@
     "ApertusForCausalLM",
     "BertForMaskedLM",
     "BloomForCausalLM",
+    "CodeGenForCausalLM",
     "FalconForCausalLM",
     "GemmaForCausalLM",
     "Gemma2ForCausalLM",
diff --git a/transformer_lens/tools/model_registry/data/architecture_gaps.json b/transformer_lens/tools/model_registry/data/architecture_gaps.json
index 68ef2bda5..7344d18b3 100644
--- a/transformer_lens/tools/model_registry/data/architecture_gaps.json
+++ b/transformer_lens/tools/model_registry/data/architecture_gaps.json
@@ -1,177 +1,129 @@
 {
-  "generated_at": "2026-04-09",
+  "generated_at": "2026-03-19",
   "scan_info": {
-    "total_scanned": 10000,
+    "total_scanned": 3517,
     "task_filter": "text-generation",
     "min_downloads": 500,
-    "scan_duration_seconds": 3.2
+    "scan_duration_seconds": 2.7
   },
-  "total_unsupported_architectures": 372,
-  "total_unsupported_models": 1416,
+  "total_unsupported_architectures": 258,
+  "total_unsupported_models": 1031,
   "gaps": [
-    {
-      "architecture_id": "Qwen3_5ForConditionalGeneration",
-      "total_models": 66,
-      "sample_models": [
-        "Tesslate/OmniCoder-9B",
-        "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4",
-        "croll83/Qwopus3.5-27B-v3-Abliterated",
-        "osoleve/Qwen3.5-27B-Text-NVFP4-MTP",
-        "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx",
-        "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-v2-NVFP4",
-        "Brooooooklyn/Qwen3.5-27B-unsloth-mlx",
-        "ShinePixelOrg/Qwopus3.5-27B-v3-NVFP4",
-        "aifeifei798/Qwen3.5-Queen-27B",
-        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled"
-      ]
-    },
     {
       "architecture_id": "Qwen3MoeForCausalLM",
-      "total_models": 55,
+      "total_models": 68,
       "sample_models": [
         "Qwen/Qwen3-30B-A3B",
         "Qwen/Qwen3-30B-A3B-Instruct-2507",
+        "Qwen/Qwen3-30B-A3B-Thinking-2507",
         "Qwen/Qwen3-Coder-30B-A3B-Instruct",
         "Qwen/Qwen3-235B-A22B",
-        "nvidia/Qwen3-30B-A3B-NVFP4",
-        "Qwen/Qwen3-30B-A3B-Thinking-2507",
         "trl-internal-testing/tiny-Qwen3MoeForCausalLM",
         "Qwen/Qwen3-235B-A22B-Instruct-2507",
         "Qwen/Qwen3-Coder-480B-A35B-Instruct",
-        "Qwen/Qwen3-235B-A22B-Thinking-2507"
+        "NVFP4/Qwen3-30B-A3B-Instruct-2507-FP4",
+        "nvidia/Qwen3-30B-A3B-NVFP4"
       ]
     },
     {
       "architecture_id": "DeepseekV3ForCausalLM",
-      "total_models": 51,
+      "total_models": 53,
       "sample_models": [
         "deepseek-ai/DeepSeek-R1",
         "deepseek-ai/DeepSeek-R1-0528",
         "deepseek-ai/DeepSeek-V3",
-        "nvidia/DeepSeek-R1-0528-NVFP4-v2",
         "deepseek-ai/DeepSeek-V3-0324",
-        "ai-sage/GigaChat3-10B-A1.8B",
+        "nvidia/DeepSeek-R1-0528-NVFP4-v2",
         "deepseek-ai/DeepSeek-V3.1",
-        "moonshotai/Kimi-K2-Instruct-0905",
-        "moonshotai/Kimi-K2-Instruct",
-        "moonshotai/Moonlight-16B-A3B-Instruct"
+        "ai-sage/GigaChat3-10B-A1.8B",
+        "trl-internal-testing/tiny-DeepseekV3ForCausalLM",
+        "nvidia/DeepSeek-V3-0324-NVFP4",
+        "moonshotai/Kimi-K2-Instruct"
       ]
     },
     {
-      "architecture_id": "NemotronHForCausalLM",
-      "total_models": 50,
+      "architecture_id": "Qwen3_5ForConditionalGeneration",
+      "total_models": 46,
       "sample_models": [
-        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
-        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
-        "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
-        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
-        "nvidia/Nemotron-Cascade-2-30B-A3B",
-        "nvidia/NVIDIA-Nemotron-3-Nano-4B-BF16",
-        "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
-        "unsloth/NVIDIA-Nemotron-3-Nano-4B"
+        "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled",
+        "osoleve/Qwen3.5-27B-Text-NVFP4-MTP",
+        "Tesslate/OmniCoder-9B",
+        "nightmedia/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-qx64-hi-mlx",
+        "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled",
+        "txn545/Qwen3.5-27B-NVFP4",
+        "mconcat/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-NVFP4",
+        "Jackrong/Qwen3.5-4B-Claude-4.6-Opus-Reasoning-Distilled",
+        "EganAI/qwen3.5-9b-terminal-merge",
+        "Jackrong/Qwen3.5-2B-Claude-4.6-Opus-Reasoning-Distilled"
       ]
     },
     {
-      "architecture_id": "Lfm2ForCausalLM",
-      "total_models": 34,
+      "architecture_id": "Qwen3NextForCausalLM",
+      "total_models": 35,
       "sample_models": [
-        "farbodtavakkoli/OTel-LLM-1.2B-IT",
-        "LiquidAI/LFM2.5-1.2B-Instruct",
-        "LiquidAI/LFM2-1.2B",
-        "LiquidAI/LFM2-350M",
-        "LiquidAI/LFM2.5-1.2B-Thinking",
-        "LiquidAI/LFM2.5-350M",
-        "LiquidAI/LFM2-2.6B-Exp",
-        "LiquidAI/LFM2.5-1.2B-Base",
-        "LiquidAI/LFM2-700M",
-        "unsloth/LFM2.5-1.2B-Instruct"
+        "Qwen/Qwen3-Coder-Next",
+        "Qwen/Qwen3-Next-80B-A3B-Instruct",
+        "GadflyII/Qwen3-Coder-Next-NVFP4",
+        "nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+        "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
+        "Qwen/Qwen3-Next-80B-A3B-Thinking",
+        "tiny-random/qwen3-next-moe",
+        "unsloth/Qwen3-Coder-Next",
+        "yujiepan/qwen3-next-moe-tiny-random",
+        "RedHatAI/Qwen3-Coder-Next-NVFP4"
       ]
     },
     {
-      "architecture_id": "Qwen3_5ForCausalLM",
+      "architecture_id": "FalconForCausalLM",
       "total_models": 32,
       "sample_models": [
-        "lukey03/Qwen3.5-9B-abliterated",
-        "GoodStartLabs/gin-rummy-hbc-qwen3.5-0.8b",
-        "aifeifei798/Darkidol-Ballad-27B",
-        "brocchirodrigo/anotaai-ajuda-qwen3_5_Q4",
-        "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v1",
-        "llmfan46/Darkidol-Ballad-27B-ultra-uncensored-heretic-v2",
-        "Phonsiri/Qwen3.5-9B-Thai-Law-Base",
-        "kai-os/Carnice-9b",
-        "aifeifei798/Darkidol-Ballad-9B",
-        "continuum-ai/qwen3.5-4b-code-forged"
-      ]
-    },
-    {
-      "architecture_id": "Gemma4ForConditionalGeneration",
-      "total_models": 30,
-      "sample_models": [
-        "nvidia/Gemma-4-31B-IT-NVFP4",
-        "dealignai/Gemma-4-31B-JANG_4M-CRACK",
-        "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4",
-        "bg-digitalservices/Gemma-4-E2B-NVFP4A16",
-        "dealignai/Gemma-4-31B-JANG_4M-Uncensored",
-        "bg-digitalservices/Gemma-4-E2B-it-NVFP4",
-        "bg-digitalservices/Gemma-4-26B-A4B-it-NVFP4A16",
-        "0xSero/gemma-4-21b-a4b-it-REAP",
-        "InfinimindCreations/gemma-4-E4B-it-uncensored",
-        "EganAI/gemma-4-31B-Claude-4.6-Opus-Reasoning-Distilled"
-      ]
-    },
-    {
-      "architecture_id": "CodeGenForCausalLM",
-      "total_models": 29,
-      "sample_models": [
-        "Salesforce/codegen-350M-mono",
-        "Salesforce/codegen-350M-multi",
-        "Salesforce/codegen-2B-mono",
-        "Salesforce/codegen-6B-multi",
-        "Salesforce/codegen-16B-nl",
-        "Salesforce/codegen-6B-nl",
-        "Salesforce/codegen-350M-nl",
-        "Salesforce/codegen-6B-mono",
-        "Salesforce/codegen-2B-multi",
-        "Salesforce/codegen-16B-mono"
-      ]
-    },
-    {
-      "architecture_id": "MPTForCausalLM",
-      "total_models": 24,
-      "sample_models": [
-        "vinai/PhoGPT-4B",
-        "anas-awadalla/mpt-7b",
-        "gl198976/mpt-7b-instruct",
-        "replit/replit-code-v1-3b",
-        "vinai/PhoGPT-4B-Chat",
-        "wtang06/mpt-125m-c4",
-        "echarlaix/tiny-mpt-random-remote-code",
-        "lightblue/japanese-mpt-7b",
-        "gl198976/mpt-7b",
-        "TehVenom/MPT-7b-InstructAndStorywriting-50_50-Merge"
+        "tiiuae/falcon-7b",
+        "tiiuae/falcon-7b-instruct",
+        "tiiuae/falcon-40b-instruct",
+        "tiiuae/falcon-40b",
+        "tiiuae/falcon-rw-1b",
+        "fxmarty/really-tiny-falcon-testing",
+        "vilsonrodrigues/falcon-7b-instruct-sharded",
+        "tiiuae/falcon-11B",
+        "euclaise/falcon_1b_stage2",
+        "explosion-testing/falcon-test"
       ]
     },
     {
       "architecture_id": "Qwen3_5MoeForConditionalGeneration",
-      "total_models": 23,
+      "total_models": 28,
       "sample_models": [
-        "nvidia/Qwen3.5-397B-A17B-NVFP4",
         "txn545/Qwen3.5-122B-A10B-NVFP4",
-        "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
-        "lukealonso/Qwen3.5-397B-A17B-NVFP4",
+        "nvidia/Qwen3.5-397B-A17B-NVFP4",
         "txn545/Qwen3.5-35B-A3B-NVFP4",
+        "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4",
+        "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx",
+        "lukealonso/Qwen3.5-397B-A17B-NVFP4",
         "nightmedia/Qwen3.5-122B-A10B-Text-mxfp4-mlx",
         "olka-fi/Qwen3.5-122B-A10B-MXFP4",
-        "nightmedia/Qwen3.5-35B-A3B-Text-qx64-hi-mlx",
-        "RepublicOfKorokke/Qwen3.5-35B-A3B-mlx-lm-mxfp4",
-        "bjk110/Qwen3.5-122B-A10B-abliterated-NVFP4"
+        "Jackrong/Qwen3.5-35B-A3B-Claude-4.6-Opus-Reasoning-Distilled",
+        "NexVeridian/Qwen3.5-35B-A3B-3bit"
+      ]
+    },
+    {
+      "architecture_id": "Lfm2ForCausalLM",
+      "total_models": 21,
+      "sample_models": [
+        "LiquidAI/LFM2-1.2B",
+        "LiquidAI/LFM2.5-1.2B-Instruct",
+        "LiquidAI/LFM2.5-1.2B-Base",
+        "LiquidAI/LFM2-350M",
+        "LiquidAI/LFM2.5-1.2B-Thinking",
+        "LiquidAI/LFM2-2.6B",
+        "LiquidAI/LFM2-2.6B-Exp",
+        "LiquidAI/LFM2-700M",
+        "unsloth/LFM2.5-1.2B-Instruct",
+        "LiquidAI/LFM2.5-1.2B-Thinking-ONNX"
       ]
     },
     {
       "architecture_id": "InternLM2ForCausalLM",
-      "total_models": 23,
+      "total_models": 19,
       "sample_models": [
         "internlm/internlm2-chat-7b",
         "internlm/internlm2_5-7b-chat",
@@ -182,87 +134,103 @@
         "internlm/internlm2-base-20b",
         "chujiezheng/internlm2-chat-20b-ExPO",
         "chujiezheng/internlm2-chat-7b-ExPO",
-        "internlm/internlm2-1_8b"
+        "AI4Chem/ChemLLM-7B-Chat-1_5-DPO"
       ]
     },
     {
-      "architecture_id": "Qwen3NextForCausalLM",
-      "total_models": 21,
+      "architecture_id": "Glm4MoeForCausalLM",
+      "total_models": 18,
       "sample_models": [
-        "Qwen/Qwen3-Coder-Next",
-        "Qwen/Qwen3-Next-80B-A3B-Instruct",
-        "GadflyII/Qwen3-Coder-Next-NVFP4",
-        "unsloth/Qwen3-Coder-Next",
-        "Qwen/Qwen3-Next-80B-A3B-Thinking",
-        "tiny-random/qwen3-next-moe",
-        "nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
-        "RedHatAI/Qwen3-Coder-Next-NVFP4",
-        "yujiepan/qwen3-next-moe-tiny-random",
-        "saricles/Qwen3-Coder-Next-NVFP4-GB10"
+        "zai-org/GLM-4.5-Air",
+        "zai-org/GLM-4.7",
+        "trl-internal-testing/tiny-Glm4MoeForCausalLM",
+        "zai-org/GLM-4.5",
+        "zai-org/GLM-4.6",
+        "Tengyunw/GLM-4.7-NVFP4",
+        "Salyut1/GLM-4.7-NVFP4",
+        "np-cr/testing-glm4-moe",
+        "ArliAI/GLM-4.6-Derestricted-v3",
+        "zai-org/GLM-4.5-Air-Base"
       ]
     },
     {
       "architecture_id": "JambaForCausalLM",
-      "total_models": 21,
+      "total_models": 17,
       "sample_models": [
         "ai21labs/AI21-Jamba-Mini-1.5",
         "ai21labs/Jamba-tiny-random",
-        "ai21labs/AI21-Jamba-Mini-1.6",
-        "ai21labs/AI21-Jamba-Large-1.5",
         "ai21labs/AI21-Jamba2-3B",
+        "ai21labs/AI21-Jamba-Reasoning-3B",
+        "ai21labs/AI21-Jamba-Large-1.5",
+        "ai21labs/AI21-Jamba-Mini-1.6",
         "ai21labs/AI21-Jamba-Large-1.6",
+        "microsoft/Dayhoff-170m-GR",
         "ai21labs/Jamba-v0.1",
-        "ai21labs/AI21-Jamba2-Mini",
-        "ai21labs/AI21-Jamba-Reasoning-3B",
-        "microsoft/Dayhoff-170m-GR"
+        "microsoft/Dayhoff-170M-GRS-112000"
       ]
     },
     {
       "architecture_id": "QWenLMHeadModel",
-      "total_models": 20,
+      "total_models": 16,
       "sample_models": [
-        "cckevinn/SeeClick",
-        "Qwen/Qwen-7B-Chat",
         "Qwen/Qwen-7B",
+        "Qwen/Qwen-7B-Chat",
         "Qwen/Qwen-VL-Chat",
         "Qwen/Qwen-VL",
-        "Qwen/Qwen-1_8B-Chat",
+        "Qwen/Qwen-14B-Chat-Int4",
         "Qwen/Qwen-14B-Chat",
+        "Qwen/Qwen-1_8B-Chat",
+        "Qwen/Qwen-72B",
         "Qwen/Qwen-14B",
-        "Xingyu-Zheng/Qwen-VL-Chat",
-        "Qwen/Qwen-72B"
+        "Qwen/Qwen-Audio-Chat"
+      ]
+    },
+    {
+      "architecture_id": "FalconH1ForCausalLM",
+      "total_models": 16,
+      "sample_models": [
+        "tiiuae/Falcon-H1-Tiny-90M-Instruct",
+        "tiiuae/Falcon-H1-0.5B-Base",
+        "tiiuae/Falcon-H1R-7B",
+        "tiiuae/Falcon-H1-7B-Instruct",
+        "tiiuae/Falcon-H1-34B-Base",
+        "tiiuae/Falcon-H1-34B-Instruct",
+        "tiiuae/Falcon-H1-1.5B-Base",
+        "tiiuae/Falcon-H1-7B-Base",
+        "tiiuae/Falcon-H1-3B-Base",
+        "tiiuae/Falcon-H1-1.5B-Deep-Base"
+      ]
+    },
+    {
+      "architecture_id": "NemotronHForCausalLM",
+      "total_models": 15,
+      "sample_models": [
+        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+        "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-Japanese",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16",
+        "unsloth/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4",
+        "OpenResearcher/OpenResearcher-30B-A3B",
+        "nvidia/NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
+        "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-Base-BF16"
       ]
     },
     {
       "architecture_id": "GPTBigCodeForCausalLM",
-      "total_models": 20,
+      "total_models": 15,
       "sample_models": [
         "bigcode/gpt_bigcode-santacoder",
         "bigcode/tiny_starcoder_py",
         "bigcode/starcoder",
         "bigcode/starcoderbase-1b",
         "ibm-granite/granite-20b-code-base-8k",
-        "GeorgiaTechResearchInstitute/starcoder-gpteacher-code-instruct",
-        "HuggingFaceH4/starchat-alpha",
-        "defog/sqlcoder2",
+        "ibm-granite/granite-20b-code-instruct-8k",
         "HuggingFaceH4/starchat-beta",
-        "LoupGarou/WizardCoder-Guanaco-15B-V1.0"
-      ]
-    },
-    {
-      "architecture_id": "XGLMForCausalLM",
-      "total_models": 18,
-      "sample_models": [
-        "facebook/xglm-564M",
-        "facebook/incoder-1B",
-        "facebook/xglm-7.5B",
-        "facebook/xglm-4.5B",
-        "facebook/xglm-1.7B",
-        "KoboldAI/fairseq-dense-2.7B",
-        "KoboldAI/fairseq-dense-125M",
-        "KoboldAI/fairseq-dense-355M",
-        "KoboldAI/fairseq-dense-13B",
-        "KoboldAI/fairseq-dense-1.3B"
+        "HuggingFaceH4/starchat-alpha",
+        "LoupGarou/WizardCoder-Guanaco-15B-V1.1",
+        "Danielbrdz/CodeBarcenas-1b"
       ]
     },
     {
@@ -273,376 +241,248 @@
         "cerebras/MiniMax-M2.1-REAP-139B-A10B",
         "MiniMaxAI/MiniMax-M2",
         "MiniMaxAI/MiniMax-M2.1",
-        "nvidia/MiniMax-M2.5-NVFP4",
         "cerebras/MiniMax-M2.5-REAP-139B-A10B",
-        "amd/MiniMax-M2.5-MXFP4",
+        "PrimeIntellect/MiniMax-M2.5-bf16",
+        "cerebras/MiniMax-M2.5-REAP-172B-A10B",
         "saricles/MiniMax-M2.5-REAP-172B-A10B-NVFP4-GB10",
         "aspctu/MiniMax-M2.5",
         "amd/MiniMax-M2.1-MXFP4"
       ]
     },
     {
-      "architecture_id": "DeciLMForCausalLM",
+      "architecture_id": "XGLMForCausalLM",
       "total_models": 14,
       "sample_models": [
-        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
-        "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
-        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-NVFP4",
-        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
-        "ConicCat/Llama3_3-Nemo-Super-Writer-49B",
-        "nvidia/Llama-3_1-Nemotron-51B-Instruct",
-        "FriendliAI/Llama-3_3-Nemotron-Super-49B-v1_5",
-        "FriendliAI/Llama-3_1-Nemotron-Ultra-253B-v1",
-        "NewstaR/Porpoise-6b-instruct",
-        "nvidia/Llama-3_1-Nemotron-Ultra-253B-CPT-v1"
+        "facebook/xglm-564M",
+        "facebook/xglm-7.5B",
+        "facebook/xglm-1.7B",
+        "KoboldAI/fairseq-dense-13B",
+        "facebook/xglm-4.5B",
+        "KoboldAI/fairseq-dense-125M",
+        "KoboldAI/fairseq-dense-2.7B",
+        "KoboldAI/fairseq-dense-355M",
+        "KoboldAI/fairseq-dense-1.3B",
+        "KoboldAI/fairseq-dense-6.7B"
       ]
     },
     {
-      "architecture_id": "FalconH1ForCausalLM",
-      "total_models": 14,
+      "architecture_id": "Glm4MoeLiteForCausalLM",
+      "total_models": 13,
       "sample_models": [
-        "tiiuae/Falcon-H1-0.5B-Base",
-        "tiiuae/Falcon-H1-3B-Base",
-        "tiiuae/Falcon-H1-7B-Base",
-        "tiiuae/Falcon-H1-1.5B-Deep-Base",
-        "tiiuae/Falcon-H1-34B-Base",
-        "tiiuae/Falcon-H1R-7B",
-        "tiiuae/Falcon-H1-1.5B-Base",
-        "tiiuae/Falcon-H1-Tiny-90M-Instruct",
-        "tiiuae/Falcon-H1-1.5B-Deep-Instruct",
-        "tiiuae/Falcon-H1-3B-Instruct"
+        "zai-org/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-NVFP4",
+        "unsloth/GLM-4.7-Flash",
+        "GadflyII/GLM-4.7-Flash-MTP-NVFP4",
+        "Olafangensan/GLM-4.7-Flash-heretic",
+        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
+        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
+        "TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill",
+        "Ex0bit/GLM-4.7-Flash-PRISM",
+        "MuXodious/GLM-4.7-Flash-absolute-heresy"
+      ]
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "total_models": 13,
+      "sample_models": [
+        "Salesforce/codegen-350M-mono",
+        "Salesforce/codegen-350M-multi",
+        "Salesforce/codegen-2B-mono",
+        "hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
+        "Salesforce/codegen-6B-multi",
+        "shailja/fine-tuned-codegen-16B-Verilog",
+        "katuni4ka/tiny-random-codegen2",
+        "Salesforce/codegen-2B-multi",
+        "Salesforce/codegen-6B-mono",
+        "Salesforce/codegen-6B-nl"
       ]
     },
     {
       "architecture_id": "RwkvForCausalLM",
-      "total_models": 14,
+      "total_models": 13,
       "sample_models": [
         "RWKV/v5-Eagle-7B-HF",
         "RWKV/rwkv-4-169m-pile",
         "beomi/KoRWKV-6B",
-        "RWKV/rwkv-4-430m-pile",
         "RWKV/rwkv-4-1b5-pile",
+        "RWKV/rwkv-4-430m-pile",
         "RWKV/rwkv-4-3b-pile",
-        "RWKV/rwkv-raven-1b5",
         "RWKV/rwkv-4-7b-pile",
-        "RWKV/rwkv-raven-3b",
-        "RWKV/rwkv-raven-14b"
+        "RWKV/rwkv-raven-1b5",
+        "RWKV/rwkv-4-14b-pile",
+        "RWKV/rwkv-raven-7b"
       ]
     },
     {
       "architecture_id": "DeepseekV2ForCausalLM",
-      "total_models": 13,
+      "total_models": 11,
       "sample_models": [
         "deepseek-ai/DeepSeek-V2-Lite-Chat",
         "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
         "deepseek-ai/DeepSeek-V2-Lite",
-        "deepseek-ai/DeepSeek-V2",
         "deepseek-ai/DeepSeek-V2-Chat",
-        "deepseek-ai/DeepSeek-Coder-V2-Instruct",
+        "deepseek-ai/DeepSeek-Coder-V2-Instruct-0724",
+        "deepseek-ai/DeepSeek-V2",
         "deepseek-ai/DeepSeek-V2.5",
+        "deepseek-ai/DeepSeek-Coder-V2-Instruct",
         "deepseek-ai/DeepSeek-V2-Chat-0628",
-        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base",
-        "Kwaipilot/KwaiCoder-DS-V2-Lite-Base"
+        "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
       ]
     },
     {
-      "architecture_id": "Glm4MoeForCausalLM",
-      "total_models": 13,
+      "architecture_id": "CohereForCausalLM",
+      "total_models": 10,
       "sample_models": [
-        "zai-org/GLM-4.5-Air",
-        "zai-org/GLM-4.7",
-        "trl-internal-testing/tiny-Glm4MoeForCausalLM",
-        "zai-org/GLM-4.5",
-        "zai-org/GLM-4.6",
-        "Tengyunw/GLM-4.7-NVFP4",
-        "np-cr/testing-glm4-moe",
-        "nvidia/GLM-4.7-NVFP4",
-        "Salyut1/GLM-4.7-NVFP4",
-        "ArliAI/GLM-4.6-Derestricted-v3"
+        "trl-internal-testing/tiny-CohereForCausalLM",
+        "CohereLabs/aya-23-8B",
+        "CohereLabs/aya-expanse-8b",
+        "CohereLabs/c4ai-command-r-v01",
+        "CohereLabs/aya-expanse-32b",
+        "NLPark/AnFeng_v3_Avocet",
+        "CohereLabs/aya-23-35B",
+        "CohereLabs/c4ai-command-r-plus-08-2024",
+        "CohereLabs/c4ai-command-r-08-2024",
+        "CohereLabs/c4ai-command-r-plus"
       ]
     },
     {
-      "architecture_id": "BaichuanForCausalLM",
-      "total_models": 13,
-      "sample_models": [
-        "baichuan-inc/Baichuan2-7B-Chat",
-        "baichuan-inc/Baichuan2-13B-Chat",
-        "baichuan-inc/Baichuan-13B-Chat",
-        "baichuan-inc/Baichuan2-7B-Base",
-        "baichuan-inc/Baichuan2-13B-Base",
-        "katuni4ka/tiny-random-baichuan2",
-        "sakuraumi/Sakura-13B-Galgame",
-        "zxbsmk/NSFW_13B_sft",
-        "katuni4ka/tiny-random-baichuan2-13b",
-        "baichuan-inc/Baichuan-13B-Base"
-      ]
-    },
-    {
-      "architecture_id": "LlavaLlamaForCausalLM",
-      "total_models": 13,
-      "sample_models": [
-        "LanguageBind/Video-LLaVA-7B",
-        "wisdomik/Quilt-Llava-v1.5-7b",
-        "liuhaotian/llava-llama-2-13b-chat-lightning-preview",
-        "lmms-lab/llama3-llava-next-8b",
-        "mmaaz60/LLaVA-7B-Lightening-v1-1",
-        "microsoft/llava-med-7b-delta",
-        "deepcs233/VisCoT-7b-336",
-        "ManishThota/Ollama_Video_llama_7B",
-        "EricPolaris/Quilt-Llava-v1.5-7b",
-        "liuhaotian/LLaVA-Lightning-7B-delta-v1-1"
-      ]
-    },
-    {
-      "architecture_id": "T5GemmaForConditionalGeneration",
-      "total_models": 12,
+      "architecture_id": "T5GemmaForConditionalGeneration",
+      "total_models": 10,
       "sample_models": [
         "google/t5gemma-s-s-prefixlm",
         "google/t5gemma-9b-9b-ul2",
         "google/t5gemma-b-b-ul2",
-        "google/t5gemma-2b-2b-prefixlm",
         "google/t5gemma-2b-2b-ul2",
-        "google/t5gemma-l-l-ul2-it",
-        "google/t5gemma-ml-ml-ul2-it",
         "google/t5gemma-b-b-prefixlm",
-        "google/t5gemma-s-s-prefixlm-it",
-        "google/t5gemma-s-s-ul2"
-      ]
-    },
-    {
-      "architecture_id": "MT5ForConditionalGeneration",
-      "total_models": 12,
-      "sample_models": [
-        "knowledgator/IUPAC2SMILES-canonical-base",
-        "knowledgator/SMILES2IUPAC-canonical-base",
-        "bigscience/mt0-small",
-        "bigscience/mt0-base",
-        "bigscience/mt0-large",
-        "bigscience/mt0-xl",
-        "bigscience/mt0-xxl",
-        "intelia-lab-uah/mt0-base_QG_SQAC",
-        "intelia-lab-uah/mt0-base_AE_SQAC",
-        "UBC-NLP/toucan-1.2B"
-      ]
-    },
-    {
-      "architecture_id": "LLaMAForCausalLM",
-      "total_models": 12,
-      "sample_models": [
-        "maicomputer/alpaca-13b",
-        "Enoch/llama-65b-hf",
-        "mncai/chatdoctor",
-        "AdaptLLM/law-LLM",
-        "Nitish-Garikoti/finance-LLM",
-        "boboto/LLaMA-65B-HF",
-        "AdaptLLM/finance-LLM",
-        "AdaptLLM/medicine-LLM",
-        "Rardilit/Panther_v1",
-        "James-WYang/BigTranslate"
-      ]
-    },
-    {
-      "architecture_id": "MiniCPMForCausalLM",
-      "total_models": 11,
-      "sample_models": [
-        "openbmb/MiniCPM-2B-sft-bf16",
-        "openbmb/MiniCPM4.1-8B",
-        "openbmb/MiniCPM-1B-sft-bf16",
-        "openbmb/MiniCPM4-0.5B",
-        "openbmb/MiniCPM-MoE-8x2B",
-        "katuni4ka/tiny-random-minicpm",
-        "openbmb/MiniCPM-S-1B-sft",
-        "openbmb/MiniCPM-2B-sft-fp32",
-        "openbmb/MiniCPM-2B-dpo-bf16",
-        "openbmb/MiniCPM4-8B"
+        "google/t5gemma-9b-9b-ul2-it",
+        "google/t5gemma-2b-2b-prefixlm",
+        "google/t5gemma-9b-2b-ul2-it",
+        "google/t5gemma-l-l-prefixlm",
+        "harshaljanjani/tiny-t5gemma-test"
       ]
     },
     {
-      "architecture_id": "Glm4MoeLiteForCausalLM",
+      "architecture_id": "Cohere2ForCausalLM",
       "total_models": 10,
       "sample_models": [
-        "zai-org/GLM-4.7-Flash",
-        "unsloth/GLM-4.7-Flash",
-        "cerebras/GLM-4.7-Flash-REAP-23B-A3B",
-        "GadflyII/GLM-4.7-Flash-NVFP4",
-        "huihui-ai/Huihui-GLM-4.7-Flash-abliterated",
-        "Olafangensan/GLM-4.7-Flash-heretic",
-        "Ex0bit/GLM-4.7-Flash-PRISM",
-        "jerrycheng233/model5_sft_16bit",
-        "aaravriyer193/chimpgpt-coder-elite",
-        "GadflyII/GLM-4.7-Flash-MTP-NVFP4"
+        "trl-internal-testing/tiny-Cohere2ForCausalLM",
+        "CohereLabs/tiny-aya-global",
+        "CohereLabs/c4ai-command-r7b-12-2024",
+        "CohereLabs/tiny-aya-base",
+        "CohereLabs/c4ai-command-a-03-2025",
+        "CohereLabs/c4ai-command-r7b-arabic-02-2025",
+        "CohereLabs/tiny-aya-water",
+        "CohereLabs/tiny-aya-fire",
+        "CohereLabs/command-a-reasoning-08-2025",
+        "CohereLabs/tiny-aya-earth"
       ]
     },
     {
-      "architecture_id": "CohereForCausalLM",
-      "total_models": 10,
+      "architecture_id": "DeciLMForCausalLM",
+      "total_models": 9,
       "sample_models": [
-        "trl-internal-testing/tiny-CohereForCausalLM",
-        "CohereLabs/aya-expanse-8b",
-        "CohereLabs/c4ai-command-r-v01",
-        "CohereLabs/aya-23-8B",
-        "NLPark/AnFeng_v3_Avocet",
-        "CohereLabs/aya-expanse-32b",
-        "CohereLabs/aya-23-35B",
-        "CohereLabs/c4ai-command-r-plus-08-2024",
-        "CohereLabs/c4ai-command-r-08-2024",
-        "CohereLabs/c4ai-command-r-plus"
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
+        "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5-NVFP4",
+        "Deci/DeciLM-7B-instruct",
+        "Deci/DeciLM-7B",
+        "NewstaR/Porpoise-6b-instruct",
+        "Danielbrdz/Barcenas-6b",
+        "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
+        "nvidia/Llama-3_1-Nemotron-51B-Instruct"
       ]
     },
     {
       "architecture_id": "DFlashDraftModel",
-      "total_models": 10,
+      "total_models": 9,
       "sample_models": [
         "z-lab/Qwen3-4B-DFlash-b16",
         "z-lab/Qwen3-8B-DFlash-b16",
         "z-lab/Qwen3.5-9B-DFlash",
-        "z-lab/Qwen3.5-4B-DFlash",
-        "z-lab/Qwen3.5-27B-DFlash",
         "z-lab/gpt-oss-20b-DFlash",
         "z-lab/gpt-oss-120b-DFlash",
-        "z-lab/Qwen3.5-35B-A3B-DFlash",
         "z-lab/LLaMA3.1-8B-Instruct-DFlash-UltraChat",
-        "z-lab/Qwen3-Coder-30B-A3B-DFlash"
-      ]
-    },
-    {
-      "architecture_id": "RWForCausalLM",
-      "total_models": 10,
-      "sample_models": [
-        "projecte-aina/aguila-7b",
-        "lightonai/alfred-40b-1023",
-        "explosion-testing/refined-web-model-test",
-        "vilm/vulture-40b",
-        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2",
-        "nomic-ai/gpt4all-falcon",
-        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
-        "OpenAssistant/falcon-40b-sft-top1-560",
-        "QuixiAI/WizardLM-Uncensored-Falcon-40b",
-        "mrm8488/falcoder-7b"
-      ]
-    },
-    {
-      "architecture_id": "DeepseekV32ForCausalLM",
-      "total_models": 9,
-      "sample_models": [
-        "deepseek-ai/DeepSeek-V3.2",
-        "deepseek-ai/DeepSeek-V3.2-Exp",
-        "nvidia/DeepSeek-V3.2-NVFP4",
-        "deepseek-ai/DeepSeek-V3.2-Speciale",
-        "deepseek-ai/DeepSeek-Math-V2",
-        "exolabs/DeepSeek-V3.2_bf16",
-        "deepseek-ai/DeepSeek-V3.2-Exp-Base",
-        "hyper-accel/tiny-random-deepseek-v32",
-        "cs2764/DeepSeek-V3.2_dq4-mlx"
-      ]
-    },
-    {
-      "architecture_id": "Cohere2ForCausalLM",
-      "total_models": 9,
-      "sample_models": [
-        "trl-internal-testing/tiny-Cohere2ForCausalLM",
-        "CohereLabs/tiny-aya-global",
-        "CohereLabs/c4ai-command-r7b-12-2024",
-        "CohereLabs/tiny-aya-base",
-        "CohereLabs/c4ai-command-r7b-arabic-02-2025",
-        "CohereLabs/c4ai-command-a-03-2025",
-        "CohereLabs/tiny-aya-water",
-        "CohereLabs/tiny-aya-fire",
-        "CohereLabs/tiny-aya-earth"
-      ]
-    },
-    {
-      "architecture_id": "HunYuanDenseV1ForCausalLM",
-      "total_models": 9,
-      "sample_models": [
-        "tencent/Hunyuan-7B-Instruct",
-        "tencent/Hunyuan-0.5B-Pretrain",
-        "tencent/Hunyuan-1.8B-Pretrain",
-        "tencent/Hunyuan-4B-Pretrain",
-        "tencent/Hunyuan-7B-Instruct-0124",
-        "tencent/Hunyuan-7B-Pretrain",
-        "tencent/Hunyuan-1.8B-Instruct",
-        "tencent/Hunyuan-0.5B-Instruct",
-        "tencent/Hunyuan-4B-Instruct"
+        "z-lab/Qwen3.5-35B-A3B-DFlash",
+        "z-lab/Qwen3-Coder-30B-A3B-DFlash",
+        "z-lab/Qwen3.5-4B-DFlash"
       ]
     },
     {
-      "architecture_id": "HybridQwen3ForCausalLM",
-      "total_models": 9,
+      "architecture_id": "LlavaQwenForCausalLM",
+      "total_models": 8,
       "sample_models": [
-        "amazon/GKA-primed-HQwen3-8B-Instruct",
-        "amazon/Mamba2-primed-HQwen3-8B-Instruct",
-        "amazon/GDN-primed-HQwen3-8B-Instruct",
-        "amazon/GDN-primed-HQwen3-32B-Instruct",
-        "amazon/GKA-primed-HQwen3-32B-Instruct",
-        "amazon/BMOJOF-primed-HQwen3-8B-Instruct",
-        "amazon/GKA-primed-HQwen3-8B-Reasoner",
-        "amazon/GDN-primed-HQwen3-8B-Reasoner",
-        "amazon/GKA-primed-HQwen3-32B-Reasoner"
+        "lmms-lab/llava-onevision-qwen2-7b-ov",
+        "lmms-lab/llava-onevision-qwen2-0.5b-ov",
+        "lmms-lab/llava-onevision-qwen2-7b-si",
+        "lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only",
+        "lmms-lab/llava-onevision-qwen2-7b-ov-chat",
+        "lmms-lab/llava-next-interleave-qwen-7b",
+        "lmms-lab/llava-onevision-qwen2-0.5b-si",
+        "lmms-lab/LongVA-7B"
       ]
     },
     {
-      "architecture_id": "BartForConditionalGeneration",
+      "architecture_id": "MiniCPMForCausalLM",
       "total_models": 8,
       "sample_models": [
-        "KomeijiForce/bart-large-emojilm",
-        "antalvdb/bart-base-spelling-nl",
-        "lmqg/bart-large-squad-qg",
-        "kengurukleo/deutsch_a2_transformer",
-        "shibing624/bart4csc-base-chinese",
-        "SkitCon/gec-spanish-BARTO-SYNTHETIC",
-        "Nargizi/screeve-lemmatizer",
-        "Tianlin668/MentalBART"
+        "openbmb/MiniCPM4.1-8B",
+        "openbmb/MiniCPM-2B-sft-bf16",
+        "openbmb/MiniCPM4-0.5B",
+        "openbmb/MiniCPM-1B-sft-bf16",
+        "openbmb/MiniCPM-MoE-8x2B",
+        "katuni4ka/tiny-random-minicpm",
+        "openbmb/MiniCPM4-8B",
+        "openbmb/MiniCPM-S-1B-sft"
       ]
     },
     {
-      "architecture_id": "MambaForCausalLM",
+      "architecture_id": "MT5ForConditionalGeneration",
       "total_models": 8,
       "sample_models": [
-        "state-spaces/mamba-130m-hf",
-        "state-spaces/mamba-2.8b-hf",
-        "state-spaces/mamba-1.4b-hf",
-        "state-spaces/mamba-370m-hf",
-        "state-spaces/mamba-790m-hf",
-        "NYTK/PULI-HuBA-mamba-130M",
-        "EchoLabs33/mamba-130m-hxq",
-        "TRI-ML/mamba-7b-rw"
+        "knowledgator/IUPAC2SMILES-canonical-base",
+        "knowledgator/SMILES2IUPAC-canonical-base",
+        "knowledgator/SMILES2IUPAC-canonical-small",
+        "bigscience/mt0-base",
+        "bigscience/mt0-small",
+        "HiTZ/Medical-mT5-large",
+        "bigscience/mt0-large",
+        "dreuxx26/Multilingual-grammar-Corrector-using-mT5-small"
       ]
     },
     {
-      "architecture_id": "Lfm2MoeForCausalLM",
+      "architecture_id": "Qwen3_5ForCausalLM",
       "total_models": 8,
       "sample_models": [
-        "farbodtavakkoli/OTel-LLM-24B-IT",
-        "LiquidAI/LFM2-8B-A1B",
-        "LiquidAI/LFM2-24B-A2B",
-        "LiquidAI/LFM2-8B-A1B-ONNX",
-        "LiquidAI/LFM2-24B-A2B-ONNX",
-        "unsloth/LFM2-8B-A1B",
-        "huihui-ai/Huihui-LFM2-24B-A2B-abliterated",
-        "MuXodious/LFM2-8B-A1B-absolute-heresy-MPOA"
+        "lukey03/Qwen3.5-9B-abliterated",
+        "osoleve/Qwen3.5-9B-Base-Text-NVFP4",
+        "Phonsiri/Qwen3.5-9B-Thai-Law-Base",
+        "Green-eyedDevil/Monika-9B",
+        "eerwitt/qwen-h-neurons-honest",
+        "rahul7star/albeit",
+        "nbeerbower/Huihui-Qwen3.5-9B-abliterated-Grimoire-ORPO",
+        "nahidstaq/html-section-retriever"
       ]
     },
     {
-      "architecture_id": "BloomModel",
+      "architecture_id": "MPTForCausalLM",
       "total_models": 8,
       "sample_models": [
-        "bigscience/bigscience-small-testing",
-        "TurkuNLP/gpt3-finnish-small",
-        "TurkuNLP/gpt3-finnish-large",
-        "TurkuNLP/gpt3-finnish-13B",
-        "BelleGroup/BELLE-7B-2M",
-        "norallm/norbloom-7b-scratch",
-        "Muennighoff/bloom-tiny-random",
-        "TurkuNLP/gpt3-finnish-xl"
+        "anas-awadalla/mpt-7b",
+        "wtang06/mpt-125m-c4",
+        "echarlaix/tiny-mpt-random-remote-code",
+        "lightblue/japanese-mpt-7b",
+        "vinai/PhoGPT-4B",
+        "Nethermind/Mpt-Instruct-DotNet-S",
+        "replit/replit-code-v1-3b",
+        "vinai/PhoGPT-4B-Chat"
       ]
     },
     {
       "architecture_id": "ExaoneForCausalLM",
       "total_models": 7,
       "sample_models": [
-        "LGAI-EXAONE/EXAONE-Deep-7.8B",
         "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct",
+        "LGAI-EXAONE/EXAONE-Deep-7.8B",
         "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct",
         "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct",
         "LGAI-EXAONE/EXAONE-3.5-32B-Instruct",
@@ -651,67 +491,66 @@
       ]
     },
     {
-      "architecture_id": "Zamba2ForCausalLM",
+      "architecture_id": "BaichuanForCausalLM",
       "total_models": 7,
       "sample_models": [
-        "Zyphra/Zamba2-1.2B-instruct",
-        "Zyphra/Zamba2-7B-Instruct",
-        "Zyphra/Zamba2-2.7B",
-        "EchoLabs33/zamba2-1.2b-hxq",
-        "Zyphra/Zamba2-2.7B-instruct",
-        "EchoLabs33/zamba2-2.7b-instruct-hxq",
-        "EchoLabs33/zamba2-7b-instruct-hxq"
+        "baichuan-inc/Baichuan2-7B-Chat",
+        "baichuan-inc/Baichuan2-13B-Chat",
+        "baichuan-inc/Baichuan-13B-Chat",
+        "katuni4ka/tiny-random-baichuan2",
+        "baichuan-inc/Baichuan2-7B-Base",
+        "katuni4ka/tiny-random-baichuan2-13b",
+        "baichuan-inc/Baichuan2-13B-Base"
       ]
     },
     {
-      "architecture_id": "LlamaForCausalLMEagle3",
+      "architecture_id": "SmolLM3ForCausalLM",
       "total_models": 7,
       "sample_models": [
-        "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
-        "nvidia/gpt-oss-120b-Eagle3-long-context",
-        "nvidia/gpt-oss-120b-Eagle3-short-context",
-        "Zjcxy-SmartAI/Eagle3-Qwen3-32B-zh",
-        "Zjcxy-SmartAI/Eagle3-Qwen3-8B-zh",
-        "nvidia/gpt-oss-120b-Eagle3-throughput",
-        "chankhavu/c2.eagle3-test"
+        "HuggingFaceTB/SmolLM3-3B",
+        "HuggingFaceTB/SmolLM3-3B-Base",
+        "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM",
+        "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM",
+        "HuggingFaceTB/SmolLM3-3B-ONNX",
+        "N-Bot-Int/SmolSam3-MEMGRPO",
+        "toroe/SmolLM-3B-Science-ES"
       ]
     },
     {
-      "architecture_id": "T5WithLMHeadModel",
+      "architecture_id": "ProGenForCausalLM",
       "total_models": 7,
       "sample_models": [
-        "Salesforce/codet5-large",
-        "unicamp-dl/ptt5-base-portuguese-vocab",
-        "Salesforce/codet5-large-ntp-py",
-        "Rostlab/prot_t5_xl_bfd",
-        "unicamp-dl/ptt5-small-portuguese-vocab",
-        "gagan3012/k2t",
-        "unicamp-dl/ptt5-large-portuguese-vocab"
+        "hugohrban/progen2-base",
+        "hugohrban/progen2-small",
+        "hugohrban/progen2-medium",
+        "hugohrban/progen2-oas",
+        "hugohrban/progen2-small-mix7",
+        "hugohrban/progen2-large",
+        "hugohrban/progen2-xlarge"
       ]
     },
     {
-      "architecture_id": "Rwkv6ForCausalLM",
-      "total_models": 7,
+      "architecture_id": "DeepseekV32ForCausalLM",
+      "total_models": 6,
       "sample_models": [
-        "RWKV/v6-Finch-1B6-HF",
-        "RWKV/v6-Finch-7B-HF",
-        "RWKV/rwkv-6-world-1b6",
-        "RWKV/v6-Finch-14B-HF",
-        "RWKV/v6-Finch-3B-HF",
-        "RWKV/rwkv-6-world-7b",
-        "RWKV/rwkv-6-world-3b-v2.1"
+        "deepseek-ai/DeepSeek-V3.2",
+        "deepseek-ai/DeepSeek-V3.2-Exp",
+        "nvidia/DeepSeek-V3.2-NVFP4",
+        "deepseek-ai/DeepSeek-V3.2-Speciale",
+        "deepseek-ai/DeepSeek-Math-V2",
+        "cerebras/DeepSeek-V3.2-REAP-508B-A37B"
       ]
     },
     {
-      "architecture_id": "GlmMoeDsaForCausalLM",
+      "architecture_id": "MambaForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "zai-org/GLM-5",
-        "nvidia/GLM-5-NVFP4",
-        "zai-org/GLM-5.1",
-        "cs2764/GLM-5-abliterated-dq4-mlx",
-        "0xSero/GLM-5-REAP-381B",
-        "cs2764/GLM-5-abliterated-dq3-mlx"
+        "state-spaces/mamba-130m-hf",
+        "state-spaces/mamba-2.8b-hf",
+        "state-spaces/mamba-370m-hf",
+        "state-spaces/mamba-1.4b-hf",
+        "state-spaces/mamba-790m-hf",
+        "TRI-ML/mamba-7b-rw"
       ]
     },
     {
@@ -727,154 +566,74 @@
       ]
     },
     {
-      "architecture_id": "DreamModel",
+      "architecture_id": "NemotronForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "Dream-org/Dream-v0-Instruct-7B",
-        "Dream-org/Dream-v0-Base-7B",
-        "Dream-org/Dream-Coder-v0-Instruct-7B",
-        "Zigeng/dParallel_Dream_7B_Instruct",
-        "Dream-org/Dream-Coder-v0-Base-7B",
-        "Dream-org/DreamOn-v0-7B"
+        "nvidia/Nemotron-Mini-4B-Instruct",
+        "nvidia/Minitron-8B-Base",
+        "badaoui/tiny-random-NemotronForCausalLM",
+        "nvidia/Minitron-4B-Base",
+        "thhaus/nemotron3-8b",
+        "dmvevents/Nemotron-Mini-4B-Instruct"
       ]
     },
     {
-      "architecture_id": "Phi3VForCausalLM",
+      "architecture_id": "HyenaDNAForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "microsoft/Phi-3-vision-128k-instruct",
-        "TIGER-Lab/VLM2Vec-Full",
-        "yujiepan/phi-3-vision-tiny-random",
-        "furonghuang-lab/tracevla_phi3v",
-        "Desm0nt/Phi-3-HornyVision-128k-instruct",
-        "failspy/Phi-3-vision-128k-instruct-abliterated-alpha"
+        "LongSafari/hyenadna-small-32k-seqlen-hf",
+        "LongSafari/hyenadna-tiny-1k-seqlen-hf",
+        "LongSafari/hyenadna-large-1m-seqlen-hf",
+        "LongSafari/hyenadna-medium-450k-seqlen-hf",
+        "LongSafari/hyenadna-medium-160k-seqlen-hf",
+        "LongSafari/hyenadna-tiny-1k-seqlen-d256-hf"
       ]
     },
     {
-      "architecture_id": "SDARForCausalLM",
+      "architecture_id": "LlavaLlamaForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "JetLM/SDAR-1.7B-Chat",
-        "JetLM/SDAR-8B-Chat-b32",
-        "JetLM/SDAR-8B-Chat",
-        "JetLM/SDAR-1.7B-Chat-b32",
-        "JetLM/SDAR-4B-Chat",
-        "JetLM/SDAR-4B-Chat-b32"
-      ]
-    },
-    {
-      "architecture_id": "HyenaDNAForCausalLM",
-      "total_models": 6,
-      "sample_models": [
-        "LongSafari/hyenadna-small-32k-seqlen-hf",
-        "LongSafari/hyenadna-medium-450k-seqlen-hf",
-        "LongSafari/hyenadna-large-1m-seqlen-hf",
-        "LongSafari/hyenadna-tiny-1k-seqlen-hf",
-        "LongSafari/hyenadna-medium-160k-seqlen-hf",
-        "LongSafari/hyenadna-tiny-16k-seqlen-d128-hf"
-      ]
-    },
-    {
-      "architecture_id": "AfmoeForCausalLM",
-      "total_models": 6,
-      "sample_models": [
-        "arcee-ai/Trinity-Nano-Preview",
-        "arcee-ai/Trinity-Large-Thinking",
-        "arcee-ai/Trinity-Mini",
-        "arcee-ai/Trinity-Nano-Base",
-        "arcee-ai/Trinity-Mini-Base",
-        "arcee-ai/Trinity-Large-Preview"
-      ]
-    },
-    {
-      "architecture_id": "AquilaForCausalLM",
-      "total_models": 6,
-      "sample_models": [
-        "BAAI/AquilaChat2-7B",
-        "katuni4ka/tiny-random-aquilachat",
-        "katuni4ka/tiny-random-aquila2",
-        "BAAI/Aquila2-34B",
-        "BAAI/AquilaChat2-34B",
-        "BAAI/AquilaChat2-34B-16K"
+        "LanguageBind/Video-LLaVA-7B",
+        "lmms-lab/llama3-llava-next-8b",
+        "liuhaotian/llava-llama-2-13b-chat-lightning-preview",
+        "wisdomik/Quilt-Llava-v1.5-7b",
+        "ManishThota/Ollama_Video_llama_7B",
+        "ShareGPTVideo/LLaVA-Hound-Pretrain"
       ]
     },
     {
-      "architecture_id": "OLMoForCausalLM",
+      "architecture_id": "LlavaLlamaModel",
       "total_models": 6,
       "sample_models": [
-        "allenai/OLMo-7B-Instruct",
-        "allenai/OLMo-7B",
-        "allenai/OLMo-1B",
-        "allenai/OLMo-7B-0424",
-        "allenai/OLMo-7B-Twin-2T",
-        "allenai/OLMo-7B-SFT"
+        "Efficient-Large-Model/VILA1.5-3b",
+        "Efficient-Large-Model/NVILA-Lite-8B",
+        "Efficient-Large-Model/NVILA-8B",
+        "Efficient-Large-Model/NVILA-15B",
+        "Efficient-Large-Model/VILA1.5-13b",
+        "Efficient-Large-Model/Llama-3-VILA1.5-8B"
       ]
     },
     {
-      "architecture_id": "DogeForCausalLM",
+      "architecture_id": "LLaMAForCausalLM",
       "total_models": 6,
       "sample_models": [
-        "SmallDoge/Doge-320M",
-        "SmallDoge/Doge-20M",
-        "SmallDoge/Doge-160M",
-        "SmallDoge/Doge-60M",
-        "SmallDoge/Doge-120M-MoE",
-        "SmallDoge/Doge-20M-MoE"
-      ]
-    },
-    {
-      "architecture_id": "SmolLM3ForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "HuggingFaceTB/SmolLM3-3B",
-        "HuggingFaceTB/SmolLM3-3B-Base",
-        "optimum-internal-testing/tiny-random-SmolLM3ForCausalLM",
-        "unsloth/SmolLM3-3B",
-        "onnx-internal-testing/tiny-random-SmolLM3ForCausalLM"
-      ]
-    },
-    {
-      "architecture_id": "XLNetLMHeadModel",
-      "total_models": 5,
-      "sample_models": [
-        "xlnet/xlnet-base-cased",
-        "xlnet/xlnet-large-cased",
-        "hfl/chinese-xlnet-base",
-        "sshleifer/tiny-xlnet-base-cased",
-        "textattack/xlnet-base-cased-imdb"
-      ]
-    },
-    {
-      "architecture_id": "BioGptForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "microsoft/biogpt",
-        "microsoft/BioGPT-Large",
-        "microsoft/BioGPT-Large-PubMedQA",
-        "hf-tiny-model-private/tiny-random-BioGptForCausalLM",
-        "zequnl/molxpt"
-      ]
-    },
-    {
-      "architecture_id": "Mistral3ForConditionalGeneration",
-      "total_models": 5,
-      "sample_models": [
-        "farbodtavakkoli/OTel-LLM-3B-IT",
-        "RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4",
-        "ArmGPT/ArmenianGPT-1.0-3B",
-        "JANGQ-AI/Mistral-Small-4-119B-A6B-JANG_2L",
-        "odytrice/kenichi-flash"
+        "Enoch/llama-65b-hf",
+        "Rardilit/Panther_v1",
+        "James-WYang/BigTranslate",
+        "mncai/chatdoctor",
+        "heegyu/LIMA-13b",
+        "maicomputer/alpaca-13b"
       ]
     },
     {
-      "architecture_id": "LLaDA2MoeModelLM",
+      "architecture_id": "LLaDAModelLM",
       "total_models": 5,
       "sample_models": [
-        "inclusionAI/LLaDA2.1-flash",
-        "inclusionAI/LLaDA2.0-mini",
-        "inclusionAI/LLaDA2.1-mini",
-        "inclusionAI/LLaDA2.0-mini-CAP",
-        "inclusionAI/LLaDA2.0-flash"
+        "GSAI-ML/LLaDA-8B-Instruct",
+        "GSAI-ML/LLaDA-8B-Base",
+        "GSAI-ML/LLaDA-1.5",
+        "d3LLM/d3LLM_LLaDA",
+        "Fraser/LLaDA-8B-Base-gg2m"
       ]
     },
     {
@@ -882,2387 +641,1674 @@
       "total_models": 5,
       "sample_models": [
         "trl-internal-testing/tiny-FalconMambaForCausalLM",
-        "tiiuae/falcon-mamba-7b-instruct",
-        "tiiuae/falcon-mamba-7b",
         "tiiuae/falcon-mamba-tiny-dev",
+        "tiiuae/falcon-mamba-7b",
+        "tiiuae/falcon-mamba-7b-instruct",
         "tiiuae/Falcon3-Mamba-7B-Instruct"
       ]
     },
     {
-      "architecture_id": "Eagle3Speculator",
-      "total_models": 5,
-      "sample_models": [
-        "RedHatAI/Qwen3-8B-speculator.eagle3",
-        "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3",
-        "RedHatAI/Llama-3.3-70B-Instruct-speculator.eagle3",
-        "RedHatAI/Qwen3-32B-speculator.eagle3",
-        "RedHatAI/Qwen3-14B-speculator.eagle3"
-      ]
-    },
-    {
-      "architecture_id": "NemotronForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "nvidia/Nemotron-Mini-4B-Instruct",
-        "nvidia/Minitron-8B-Base",
-        "nvidia/Minitron-4B-Base",
-        "badaoui/tiny-random-NemotronForCausalLM",
-        "thhaus/nemotron3-8b"
-      ]
-    },
-    {
-      "architecture_id": "ProGenForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "hugohrban/progen2-base",
-        "hugohrban/progen2-small",
-        "hugohrban/progen2-medium",
-        "hugohrban/progen2-large",
-        "hugohrban/progen2-small-mix7"
-      ]
-    },
-    {
-      "architecture_id": "Glm4ForCausalLM",
+      "architecture_id": "DreamModel",
       "total_models": 5,
       "sample_models": [
-        "zai-org/GLM-4-9B-0414",
-        "zai-org/GLM-Z1-32B-0414",
-        "zai-org/GLM-Z1-9B-0414",
-        "zai-org/GLM-4-32B-0414",
-        "zai-org/GLM-4-32B-Base-0414"
+        "Dream-org/Dream-v0-Instruct-7B",
+        "Dream-org/Dream-v0-Base-7B",
+        "Dream-org/Dream-Coder-v0-Instruct-7B",
+        "d3LLM/d3LLM_Dream",
+        "Dream-org/Dream-Coder-v0-Base-7B"
       ]
     },
     {
-      "architecture_id": "Eagle3DraftModel",
+      "architecture_id": "Eagle3Speculator",
       "total_models": 5,
       "sample_models": [
+        "RedHatAI/Qwen3-8B-speculator.eagle3",
         "RedHatAI/gpt-oss-20b-speculator.eagle3",
-        "RedHatAI/gpt-oss-120b-speculator.eagle3",
-        "RedHatAI/Qwen3-30B-A3B-Thinking-2507-speculator.eagle3",
-        "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3",
-        "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3"
-      ]
-    },
-    {
-      "architecture_id": "LlavaQwen2ForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "qnguyen3/nanoLLaVA",
-        "apple/FastVLM-0.5B",
-        "apple/FastVLM-1.5B",
-        "apple/FastVLM-7B",
-        "FreedomIntelligence/HuatuoGPT-Vision-7B"
-      ]
-    },
-    {
-      "architecture_id": "JAISLMHeadModel",
-      "total_models": 5,
-      "sample_models": [
-        "inceptionai/jais-13b-chat",
-        "katuni4ka/tiny-random-jais",
-        "inceptionai/jais-family-30b-8k",
-        "inceptionai/jais-13b",
-        "inceptionai/jais-family-13b-chat"
-      ]
-    },
-    {
-      "architecture_id": "MoAMetricLM",
-      "total_models": 5,
-      "sample_models": [
-        "reaperdoesntknow/MoA-150M",
-        "reaperdoesntknow/MoA-400M",
-        "reaperdoesntknow/MoA-155M",
-        "reaperdoesntknow/MoA-100M",
-        "reaperdoesntknow/DiscoverLM-70M"
-      ]
-    },
-    {
-      "architecture_id": "PldrllmForCausalLM",
-      "total_models": 5,
-      "sample_models": [
-        "fromthesky/PLDR-LLM-v51-SOC-110M-5",
-        "fromthesky/PLDR-LLM-v51-SOC-110M-2",
-        "fromthesky/PLDR-LLM-v51-SOC-110M-4",
-        "fromthesky/PLDR-LLM-v51-SOC-110M-3",
-        "fromthesky/PLDR-LLM-v51-SOC-110M-1"
-      ]
-    },
-    {
-      "architecture_id": "MBartForConditionalGeneration",
-      "total_models": 5,
-      "sample_models": [
-        "Pravopysnyk/best-unlp",
-        "DeepPavlov/mbart-large-50-ru-persona-chat",
-        "sn4kebyt3/ru-bart-large",
-        "MRNH/mbart-italian-grammar-corrector",
-        "MRNH/mbart-german-grammar-corrector"
-      ]
-    },
-    {
-      "architecture_id": "PhiMoEForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "microsoft/Phi-tiny-MoE-instruct",
-        "microsoft/Phi-mini-MoE-instruct",
-        "microsoft/Phi-3.5-MoE-instruct",
-        "optimum-intel-internal-testing/phi-3.5-moe-tiny-random"
-      ]
-    },
-    {
-      "architecture_id": "LlavaQwenForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "lmms-lab/llava-onevision-qwen2-7b-ov",
-        "lmms-lab/llava-onevision-qwen2-0.5b-ov",
-        "lmms-lab/llava-onevision-qwen2-0.5b-si",
-        "lmms-lab/llava-onevision-qwen2-7b-si"
-      ]
-    },
-    {
-      "architecture_id": "Starcoder2ForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "bigcode/starcoder2-3b",
-        "bigcode/starcoder2-7b",
-        "bigcode/starcoder2-15b",
-        "bigcode/starcoder2-15b-instruct-v0.1"
-      ]
-    },
-    {
-      "architecture_id": "GlmForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "zai-org/glm-4-9b-chat-hf",
-        "zai-org/glm-4-9b-hf",
-        "zai-org/glm-edge-4b-chat",
-        "zai-org/glm-edge-1.5b-chat"
-      ]
-    },
-    {
-      "architecture_id": "OuroForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "ByteDance/Ouro-1.4B",
-        "ByteDance/Ouro-2.6B-Thinking",
-        "ByteDance/Ouro-2.6B",
-        "ByteDance/Ouro-1.4B-Thinking"
+        "RedHatAI/Llama-3.1-8B-Instruct-speculator.eagle3",
+        "RedHatAI/Qwen3-32B-speculator.eagle3",
+        "RedHatAI/Qwen3-14B-speculator.eagle3"
       ]
     },
     {
       "architecture_id": "SeedOssForCausalLM",
-      "total_models": 4,
+      "total_models": 5,
       "sample_models": [
         "ByteDance-Seed/Seed-OSS-36B-Instruct",
-        "NousResearch/Hermes-4.3-36B",
         "ByteDance-Seed/Seed-OSS-36B-Base",
-        "mratsim/Seed-OSS-36B-Instruct-NVFP4"
-      ]
-    },
-    {
-      "architecture_id": "ArceeForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "arcee-ai/AFM-4.5B-Base",
-        "optimum-intel-internal-testing/tiny-random-ArceeForCausalLM",
-        "onnx-internal-testing/tiny-random-ArceeForCausalLM",
-        "arcee-ai/AFM-4.5B"
-      ]
-    },
-    {
-      "architecture_id": "BailingMoeV2ForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "inclusionAI/Ling-mini-2.0",
-        "inclusionAI/Ling-1T",
-        "inclusionAI/Ring-mini-2.0",
-        "inclusionAI/Ling-flash-2.0"
-      ]
-    },
-    {
-      "architecture_id": "MobilintLlamaForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "mobilint/Llama-3.2-3B-Instruct",
-        "mobilint/Llama-3.2-1B-Instruct",
-        "mobilint/Llama-3.1-8B-Instruct",
-        "mobilint/HyperCLOVAX-SEED-Text-Instruct-1.5B"
-      ]
-    },
-    {
-      "architecture_id": "MobilintQwen2ForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "mobilint/Qwen2.5-1.5B-Instruct",
-        "mobilint/Qwen2.5-0.5B-Instruct",
-        "mobilint/Qwen2.5-3B-Instruct",
-        "mobilint/Qwen2.5-7B-Instruct"
-      ]
-    },
-    {
-      "architecture_id": "MobilintQwen3ForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "mobilint/Qwen3-4B",
-        "mobilint/Qwen3-0.6B",
-        "mobilint/Qwen3-1.7B",
-        "mobilint/Qwen3-8B"
+        "NousResearch/Hermes-4.3-36B",
+        "mratsim/Seed-OSS-36B-Instruct-NVFP4",
+        "YanLabs/Seed-OSS-36B-Instruct-MPOA"
       ]
     },
     {
-      "architecture_id": "MobilintExaoneForCausalLM",
-      "total_models": 4,
+      "architecture_id": "Ernie4_5_MoeForCausalLM",
+      "total_models": 5,
       "sample_models": [
-        "mobilint/EXAONE-3.5-2.4B-Instruct",
-        "mobilint/EXAONE-Deep-2.4B",
-        "mobilint/EXAONE-3.5-7.8B-Instruct",
-        "mobilint/EXAONE-Deep-7.8B"
+        "baidu/ERNIE-4.5-21B-A3B-PT",
+        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
+        "baidu/ERNIE-4.5-21B-A3B-Thinking",
+        "baidu/ERNIE-4.5-300B-A47B-PT",
+        "baidu/ERNIE-4.5-300B-A47B-Paddle"
       ]
     },
     {
-      "architecture_id": "Qwen2_5_VLForConditionalGeneration",
-      "total_models": 4,
-      "sample_models": [
-        "nvidia/Qwen2.5-VL-7B-Instruct-NVFP4",
-        "OmniSVG/OmniSVG1.1_4B",
-        "OmniSVG/OmniSVG1.1_8B",
-        "OmniSVG/OmniSVG"
-      ]
-    },
-    {
-      "architecture_id": "IdeficsForVisionText2Text",
-      "total_models": 4,
-      "sample_models": [
-        "HuggingFaceM4/idefics-80b-instruct",
-        "HuggingFaceM4/idefics-9b",
-        "HuggingFaceM4/idefics-9b-instruct",
-        "HuggingFaceM4/idefics-80b"
-      ]
-    },
-    {
-      "architecture_id": "LISAForCausalLM",
-      "total_models": 4,
-      "sample_models": [
-        "xinlai/LISA-13B-llama2-v1",
-        "xinlai/LISA-7B-v1",
-        "xinlai/LISA-7B-v1-explanatory",
-        "xinlai/LISA-13B-llama2-v1-explanatory"
-      ]
-    },
-    {
-      "architecture_id": "LLaDAModelLM",
-      "total_models": 3,
-      "sample_models": [
-        "GSAI-ML/LLaDA-8B-Instruct",
-        "GSAI-ML/LLaDA-8B-Base",
-        "GSAI-ML/LLaDA-1.5"
-      ]
-    },
-    {
-      "architecture_id": "BambaForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "hmellor/tiny-random-BambaForCausalLM",
-        "ibm-ai-platform/Bamba-9B-v1",
-        "ibm-ai-platform/Bamba-9B-v2"
-      ]
-    },
-    {
-      "architecture_id": "InternLMForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "internlm/internlm-chat-7b",
-        "internlm/internlm-20b",
-        "internlm/internlm-7b"
-      ]
-    },
-    {
-      "architecture_id": "Ernie4_5_MoeForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "baidu/ERNIE-4.5-21B-A3B-PT",
-        "baidu/ERNIE-4.5-21B-A3B-Base-PT",
-        "baidu/ERNIE-4.5-21B-A3B-Thinking"
-      ]
-    },
-    {
-      "architecture_id": "Exaone4ForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "LGAI-EXAONE/EXAONE-4.0-1.2B",
-        "LGAI-EXAONE/EXAONE-4.0.1-32B",
-        "LGAI-EXAONE/EXAONE-4.0-32B"
-      ]
-    },
-    {
-      "architecture_id": "OlmoHybridForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "allenai/Olmo-Hybrid-7B",
-        "allenai/Olmo-Hybrid-Instruct-DPO-7B",
-        "allenai/Olmo-Hybrid-Instruct-SFT-7B"
-      ]
-    },
-    {
-      "architecture_id": "Llama4ForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "trl-internal-testing/tiny-Llama4ForCausalLM",
-        "pruna-test/test-save-tiny-random-llama4-smashed",
-        "facebook/MobileLLM-R1.5-360M"
-      ]
-    },
-    {
-      "architecture_id": "BitNetForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "microsoft/bitnet-b1.58-2B-4T",
-        "microsoft/bitnet-b1.58-2B-4T-bf16",
-        "iSolver-AI/FEnet"
-      ]
-    },
-    {
-      "architecture_id": "IQuestCoderForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "IQuestLab/IQuest-Coder-V1-40B-Instruct",
-        "IQuestLab/IQuest-Coder-V1-7B-Instruct",
-        "Multilingual-Multimodal-NLP/IndustrialCoder"
-      ]
-    },
-    {
-      "architecture_id": "XverseForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "xverse/XVERSE-7B-Chat",
-        "katuni4ka/tiny-random-xverse",
-        "xverse/XVERSE-13B-256K"
-      ]
-    },
-    {
-      "architecture_id": "PersimmonForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "adept/persimmon-8b-chat",
-        "adept/persimmon-8b-base",
-        "pszemraj/perSLIMmon-8b-base"
-      ]
-    },
-    {
-      "architecture_id": "RecurrentGemmaForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "google/recurrentgemma-2b",
-        "google/recurrentgemma-2b-it",
-        "google/recurrentgemma-9b"
-      ]
-    },
-    {
-      "architecture_id": "Llama4ForConditionalGeneration",
-      "total_models": 3,
-      "sample_models": [
-        "yujiepan/llama-4-tiny-random",
-        "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4",
-        "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4"
-      ]
-    },
-    {
-      "architecture_id": "LlavaLlamaModel",
-      "total_models": 3,
-      "sample_models": [
-        "Efficient-Large-Model/VILA1.5-3b",
-        "Efficient-Large-Model/NVILA-Lite-8B",
-        "Efficient-Large-Model/NVILA-8B"
-      ]
-    },
-    {
-      "architecture_id": "AraGPT2LMHeadModel",
-      "total_models": 3,
-      "sample_models": [
-        "QCRI/Fanar-2-Diwan",
-        "aubmindlab/aragpt2-mega",
-        "aubmindlab/aragpt2-large"
-      ]
-    },
-    {
-      "architecture_id": "RITAModelForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "lightonai/RITA_s",
-        "lightonai/RITA_xl",
-        "lightonai/RITA_l"
-      ]
-    },
-    {
-      "architecture_id": "NanoChatForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "Twobombs/nanochat-d34-sft-hf",
-        "pankajmathur/nanochat-d34-sft-hf",
-        "Nekochu/nanochat-d24"
-      ]
-    },
-    {
-      "architecture_id": "MobileLlamaForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "mtgv/MobileVLM_V2-1.7B",
-        "mtgv/MobileVLM_V2-7B",
-        "mtgv/MobileVLM_V2-3B"
-      ]
-    },
-    {
-      "architecture_id": "ParamBharatGenForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "bharatgenai/Param-1-5B",
-        "bharatgenai/AyurParam",
-        "bharatgenai/Param-1-2.9B-Instruct"
-      ]
-    },
-    {
-      "architecture_id": "modeling_camelidae.LlamaForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "hywu/Camelidae-8x34B",
-        "hywu/Camelidae-8x7B",
-        "hywu/Camelidae-8x13B"
-      ]
-    },
-    {
-      "architecture_id": "MptForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "yujiepan/mpt-tiny-random",
-        "explosion-testing/mpt-test",
-        "team-lucid/mptk-1b"
-      ]
-    },
-    {
-      "architecture_id": "BlueLMForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "vivo-ai/BlueLM-7B-Chat",
-        "vivo-ai/BlueLM-7B-Base",
-        "vivo-ai/BlueLM-7B-Chat-32K"
-      ]
-    },
-    {
-      "architecture_id": "LlamaMoEForCausalLM",
-      "total_models": 3,
-      "sample_models": [
-        "llama-moe/LLaMA-MoE-v1-3_5B-2_8",
-        "llama-moe/LLaMA-MoE-v1-3_0B-2_16",
-        "llama-moe/LLaMA-MoE-v1-3_5B-4_16"
-      ]
-    },
-    {
-      "architecture_id": "H2OVLChatModel",
-      "total_models": 2,
-      "sample_models": [
-        "h2oai/h2ovl-mississippi-800m",
-        "h2oai/h2ovl-mississippi-2b"
-      ]
-    },
-    {
-      "architecture_id": "KimiK25ForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "nvidia/Kimi-K2.5-NVFP4",
-        "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B"
-      ]
-    },
-    {
-      "architecture_id": "HCXVisionV2ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
-        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
-      ]
-    },
-    {
-      "architecture_id": "SolarOpenForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "upstage/Solar-Open-100B",
-        "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4"
-      ]
-    },
-    {
-      "architecture_id": "OpenAIGPTLMHeadModel",
-      "total_models": 2,
-      "sample_models": [
-        "openai-community/openai-gpt",
-        "lgaalves/gpt1"
-      ]
-    },
-    {
-      "architecture_id": "MoshiForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "kmhf/hf-moshiko",
-        "kmhf/hf-moshika"
-      ]
-    },
-    {
-      "architecture_id": "SarvamMLAForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "aoxo/sarvam-105b-uncensored",
-        "sarvamai/sarvam-105b"
-      ]
-    },
-    {
-      "architecture_id": "ReformerModelWithLMHead",
-      "total_models": 2,
-      "sample_models": [
-        "google/reformer-crime-and-punishment",
-        "google/reformer-enwik8"
-      ]
-    },
-    {
-      "architecture_id": "GPTNeoXJapaneseForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "abeja/gpt-neox-japanese-2.7b",
-        "hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM"
-      ]
-    },
-    {
-      "architecture_id": "SarvamMoEForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "aoxo/sarvam-30b-uncensored",
-        "sarvamai/sarvam-30b"
-      ]
-    },
-    {
-      "architecture_id": "MiMoForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "XiaomiMiMo/MiMo-7B-Base",
-        "XiaomiMiMo/MiMo-7B-RL"
-      ]
-    },
-    {
-      "architecture_id": "StarVectorForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "starvector/starvector-1b-im2svg",
-        "starvector/starvector-8b-im2svg"
-      ]
-    },
-    {
-      "architecture_id": "BaiChuanForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "baichuan-inc/Baichuan-7B",
-        "FreedomIntelligence/HuatuoGPT-7B"
-      ]
-    },
-    {
-      "architecture_id": "MiniMaxM1ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "MiniMaxAI/MiniMax-M1-40k",
-        "MiniMaxAI/MiniMax-M1-80k"
-      ]
-    },
-    {
-      "architecture_id": "DeepseekForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "deepseek-ai/deepseek-moe-16b-base",
-        "deepseek-ai/deepseek-moe-16b-chat"
-      ]
-    },
-    {
-      "architecture_id": "Phi3SmallForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "microsoft/Phi-3-small-8k-instruct",
-        "microsoft/Phi-3-small-128k-instruct"
-      ]
-    },
-    {
-      "architecture_id": "EchoForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT",
-        "ethicalabs/Echo-DSRN-114M-Base"
-      ]
-    },
-    {
-      "architecture_id": "Ernie4_5ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "baidu/ERNIE-4.5-0.3B-PT",
-        "baidu/ERNIE-4.5-0.3B-Base-PT"
-      ]
-    },
-    {
-      "architecture_id": "OrionForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "OrionStarAI/Orion-14B-Chat",
-        "OrionStarAI/Orion-14B-Base"
-      ]
-    },
-    {
-      "architecture_id": "NemotronFlashForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "nvidia/Nemotron-Flash-3B",
-        "nvidia/Nemotron-Flash-1B"
-      ]
-    },
-    {
-      "architecture_id": "AXK1ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "skt/A.X-K1",
-        "thkim93/axk1-2layers"
-      ]
-    },
-    {
-      "architecture_id": "DbrxForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "trl-internal-testing/tiny-DbrxForCausalLM",
-        "katuni4ka/tiny-random-dbrx"
-      ]
-    },
-    {
-      "architecture_id": "Dots1ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "rednote-hilab/dots.llm1.inst",
-        "rednote-hilab/dots.llm1.base"
-      ]
-    },
-    {
-      "architecture_id": "FlexOlmoForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "allenai/Flex-reddit-2x7B-1T",
-        "allenai/FlexOlmo-7x7B-1T-RT"
-      ]
-    },
-    {
-      "architecture_id": "ChatGLMModel",
-      "total_models": 2,
-      "sample_models": [
-        "zai-org/codegeex4-all-9b",
-        "zai-org/glm-4-9b"
-      ]
-    },
-    {
-      "architecture_id": "CLIPT5ForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "zhiqiulin/clip-flant5-xl",
-        "zhiqiulin/clip-flant5-xxl"
-      ]
-    },
-    {
-      "architecture_id": "PenguinVLQwen3ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "tencent/Penguin-VL-8B",
-        "tencent/Penguin-VL-2B"
-      ]
-    },
-    {
-      "architecture_id": "StripedHyenaModelForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "togethercomputer/evo-1-131k-base",
-        "togethercomputer/evo-1-8k-base"
-      ]
-    },
-    {
-      "architecture_id": "CrystalCoderLMHeadModel",
-      "total_models": 2,
-      "sample_models": [
-        "LLM360/Crystal",
-        "LLM360/CrystalChat"
-      ]
-    },
-    {
-      "architecture_id": "JetNemotronForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "jet-ai/Jet-Nemotron-2B",
-        "jet-ai/Jet-Nemotron-4B"
-      ]
-    },
-    {
-      "architecture_id": "Mamba2ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "deqing/mamba2-300M-v5-mamba2",
-        "EchoLabs33/mamba2-1.3b-hxq"
-      ]
-    },
-    {
-      "architecture_id": "MolformerForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "ibm-research/GP-MoLFormer-Uniq",
-        "ralyn/NPComposer-v2"
-      ]
-    },
-    {
-      "architecture_id": "CogVLMForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "zai-org/cogvlm2-llama3-chat-19B",
-        "zai-org/cogvlm-chat-hf"
-      ]
-    },
-    {
-      "architecture_id": "Jais2ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "inceptionai/Jais-2-8B-Chat",
-        "inceptionai/Jais-2-70B-Chat"
-      ]
-    },
-    {
-      "architecture_id": "Qwen2ForSequenceClassification",
-      "total_models": 2,
-      "sample_models": [
-        "nvidia/AceMath-7B-RM",
-        "nvidia/Qwen2.5-CascadeRL-RM-72B"
-      ]
-    },
-    {
-      "architecture_id": "ChatGLMForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "IAAR-Shanghai/xVerify-9B-C",
-        "qiuhuachuan/MeChat"
-      ]
-    },
-    {
-      "architecture_id": "RavenForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "tomg-group-umd/huginn-0125",
-        "smcleish/Recurrent-Llama-3.2-train-recurrence-32"
-      ]
-    },
-    {
-      "architecture_id": "YoutuForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "tencent/Youtu-LLM-2B-Base",
-        "tencent/Youtu-LLM-2B"
-      ]
-    },
-    {
-      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
-      "total_models": 2,
-      "sample_models": [
-        "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4",
-        "Oysiyl/qwen3-vl-30b-a3b-unslop-good-lora-v1"
-      ]
-    },
-    {
-      "architecture_id": "MosaicGPT",
-      "total_models": 2,
-      "sample_models": [
-        "anas-awadalla/mpt-1b-redpajama-200b",
-        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
-      ]
-    },
-    {
-      "architecture_id": "GTLMForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "Madras1/GTLM-1-2B-A350M",
-        "Madras1/GTLM-1-2B-A350M-fp16"
-      ]
-    },
-    {
-      "architecture_id": "GPT2Model",
-      "total_models": 2,
-      "sample_models": [
-        "cerebras/Cerebras-GPT-13B",
-        "keshan/sinhala-gpt2"
-      ]
-    },
-    {
-      "architecture_id": "LiquidForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "reaperdoesntknow/DNA-175M",
-        "reaperdoesntknow/DNA-50M"
-      ]
-    },
-    {
-      "architecture_id": "ModernBertDecoderForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "jhu-clsp/ettin-decoder-400m",
-        "jhu-clsp/ettin-decoder-32m"
-      ]
-    },
-    {
-      "architecture_id": "GLAForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "fla-hub/gla-340M-15B",
-        "fla-hub/gla-1.3B-100B"
-      ]
-    },
-    {
-      "architecture_id": "DuchifatCore",
-      "total_models": 2,
-      "sample_models": [
-        "Raziel1234/Duchifat-2",
-        "razielAI/Duchifat-2.1-Instruct"
-      ]
-    },
-    {
-      "architecture_id": "KonkanGPT",
-      "total_models": 2,
-      "sample_models": [
-        "omdeep22/Gonyai-teo2",
-        "omdeep22/Gonyai-v1"
-      ]
-    },
-    {
-      "architecture_id": "BertLMHeadModel",
-      "total_models": 2,
-      "sample_models": [
-        "dicta-il/BEREL_3.0",
-        "hf-tiny-model-private/tiny-random-BertLMHeadModel"
-      ]
-    },
-    {
-      "architecture_id": "RobertaForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "gokceuludogan/ChemBERTaLM",
-        "uf-aice-lab/math-roberta"
-      ]
-    },
-    {
-      "architecture_id": "MossForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "OpenMOSS-Team/moss-moon-003-sft",
-        "OpenMOSS-Team/moss-moon-003-base"
-      ]
-    },
-    {
-      "architecture_id": "WeDLMForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "tencent/WeDLM-8B-Base",
-        "tencent/WeDLM-8B-Instruct"
-      ]
-    },
-    {
-      "architecture_id": "Rwkv5ForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "RWKV/rwkv-5-world-3b",
-        "RWKV/rwkv-5-world-1b5"
-      ]
-    },
-    {
-      "architecture_id": "BartForCausalLM",
-      "total_models": 2,
-      "sample_models": [
-        "sanchit-gandhi/tiny-random-bart-fp16",
-        "hf-tiny-model-private/tiny-random-BartForCausalLM"
-      ]
-    },
-    {
-      "architecture_id": "BitnetForCausalLM",
-      "total_models": 2,
+      "architecture_id": "SDARForCausalLM",
+      "total_models": 5,
       "sample_models": [
-        "1bitLLM/bitnet_b1_58-large",
-        "1bitLLM/bitnet_b1_58-3B"
+        "JetLM/SDAR-8B-Chat-b32",
+        "JetLM/SDAR-4B-Chat-b32",
+        "JetLM/SDAR-8B-Chat",
+        "JetLM/SDAR-1.7B-Chat-b32",
+        "JetLM/SDAR-1.7B-Chat"
       ]
     },
     {
-      "architecture_id": "Int8OPTForCausalLM",
-      "total_models": 2,
+      "architecture_id": "BloomModel",
+      "total_models": 5,
       "sample_models": [
-        "mit-han-lab/opt-125m-smoothquant",
-        "mit-han-lab/opt-6.7b-smoothquant"
+        "bigscience/bigscience-small-testing",
+        "TurkuNLP/gpt3-finnish-small",
+        "TurkuNLP/gpt3-finnish-large",
+        "TurkuNLP/gpt3-finnish-13B",
+        "TurkuNLP/gpt3-finnish-xl"
       ]
     },
     {
-      "architecture_id": "Olmo2ForSequenceClassification",
-      "total_models": 2,
+      "architecture_id": "AfmoeForCausalLM",
+      "total_models": 5,
       "sample_models": [
-        "allenai/OLMo-2-1124-7B-RM",
-        "LifeWiki-ai/OLMo-2-1124-7B-RM"
+        "arcee-ai/Trinity-Nano-Preview",
+        "arcee-ai/Trinity-Mini",
+        "arcee-ai/Trinity-Large-Preview",
+        "arcee-ai/Trinity-Nano-Base",
+        "arcee-ai/Trinity-Mini-Base"
       ]
     },
     {
-      "architecture_id": "TranceptionLMHeadModel",
-      "total_models": 2,
+      "architecture_id": "LlavaQwen2ForCausalLM",
+      "total_models": 5,
       "sample_models": [
-        "PascalNotin/Tranception_Large",
-        "PascalNotin/Tranception_Small"
+        "apple/FastVLM-0.5B",
+        "qnguyen3/nanoLLaVA",
+        "apple/FastVLM-1.5B",
+        "apple/FastVLM-7B",
+        "FreedomIntelligence/HuatuoGPT-Vision-7B"
       ]
     },
     {
-      "architecture_id": "MultiScaleForCausalLM",
-      "total_models": 2,
+      "architecture_id": "HunYuanDenseV1ForCausalLM",
+      "total_models": 5,
       "sample_models": [
-        "KoinicLabs/AXL-Vision-v2",
-        "KoinicLabs/AXL-Translate"
+        "tencent/Hunyuan-7B-Instruct",
+        "tencent/Hunyuan-0.5B-Pretrain",
+        "tencent/Hunyuan-4B-Instruct",
+        "tencent/Hunyuan-0.5B-Instruct",
+        "tencent/Hunyuan-1.8B-Instruct"
       ]
     },
     {
-      "architecture_id": "GPT",
-      "total_models": 2,
+      "architecture_id": "PhiMoEForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M",
-        "LH-Tech-AI/Apex-1.5-Instruct-350M"
+        "microsoft/Phi-tiny-MoE-instruct",
+        "microsoft/Phi-mini-MoE-instruct",
+        "microsoft/Phi-3.5-MoE-instruct",
+        "optimum-intel-internal-testing/phi-3.5-moe-tiny-random"
       ]
     },
     {
-      "architecture_id": "BolmoForCausalLM",
-      "total_models": 2,
+      "architecture_id": "Starcoder2ForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "allenai/Bolmo-1B",
-        "allenai/Bolmo-7B"
+        "bigcode/starcoder2-3b",
+        "bigcode/starcoder2-15b",
+        "bigcode/starcoder2-7b",
+        "bigcode/starcoder2-15b-instruct-v0.1"
       ]
     },
     {
-      "architecture_id": "OpenMoeForCausalLM",
-      "total_models": 2,
+      "architecture_id": "Lfm2MoeForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "hpcai-tech/openmoe-8B",
-        "OrionZheng/openmoe-8b"
+        "LiquidAI/LFM2-8B-A1B",
+        "LiquidAI/LFM2-24B-A2B",
+        "huihui-ai/Huihui-LFM2-24B-A2B-abliterated",
+        "huihui-ai/Huihui-LFM2-8B-A1B-abliterated"
       ]
     },
     {
-      "architecture_id": "MiniMindForCausalLM",
-      "total_models": 2,
+      "architecture_id": "LLaDA2MoeModelLM",
+      "total_models": 4,
       "sample_models": [
-        "yiwenX/MiniMind-MoE-640-120M",
-        "chujiamo/baiheng_0405"
+        "inclusionAI/LLaDA2.0-mini",
+        "inclusionAI/LLaDA2.1-mini",
+        "inclusionAI/LLaDA2.1-flash",
+        "inclusionAI/LLaDA2.0-flash"
       ]
     },
     {
-      "architecture_id": "RWKV7ForCausalLM",
-      "total_models": 2,
+      "architecture_id": "LlamaForCausalLMEagle3",
+      "total_models": 4,
       "sample_models": [
-        "RWKV/RWKV7-Goose-World3-1.5B-HF",
-        "fla-hub/rwkv7-1.5B-world"
+        "nvidia/gpt-oss-120b-Eagle3-short-context",
+        "taobao-mnn/Qwen3-VL-8B-Instruct-Eagle3",
+        "nvidia/gpt-oss-120b-Eagle3-long-context",
+        "nvidia/gpt-oss-120b-Eagle3-throughput"
       ]
     },
     {
-      "architecture_id": "BottleneckT5LMWithPerturb",
-      "total_models": 2,
+      "architecture_id": "DeepseekForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "thesephist/contra-bottleneck-t5-small-wikipedia",
-        "thesephist/contra-bottleneck-t5-base-wikipedia"
+        "deepseek-ai/deepseek-moe-16b-base",
+        "deepseek-ai/deepseek-moe-16b-chat",
+        "ai-sage/GigaChat-20B-A3B-base",
+        "ai-sage/GigaChat-20B-A3B-instruct"
       ]
     },
     {
-      "architecture_id": "StableDiffcoderForCausalLM",
-      "total_models": 2,
+      "architecture_id": "OlmoHybridForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "ByteDance-Seed/Stable-DiffCoder-8B-Instruct",
-        "ByteDance-Seed/Stable-DiffCoder-8B-Base"
+        "allenai/Olmo-Hybrid-7B",
+        "allenai/Olmo-Hybrid-Instruct-DPO-7B",
+        "allenai/Olmo-Hybrid-Instruct-SFT-7B",
+        "allenai/Olmo-Hybrid-Think-SFT-7B"
       ]
     },
     {
-      "architecture_id": "OtterForConditionalGeneration",
-      "total_models": 2,
+      "architecture_id": "OuroForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "luodian/OTTER-Video-LLaMA7B-DenseCaption",
-        "luodian/OTTER-MPT1B-RPJama-Init"
+        "ByteDance/Ouro-1.4B",
+        "ByteDance/Ouro-2.6B-Thinking",
+        "ByteDance/Ouro-1.4B-Thinking",
+        "ByteDance/Ouro-2.6B"
       ]
     },
     {
-      "architecture_id": "MonkeyLMHeadModel",
-      "total_models": 2,
+      "architecture_id": "Glm4ForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "echo840/Monkey-Chat",
-        "echo840/Monkey"
+        "zai-org/GLM-4-9B-0414",
+        "zai-org/GLM-Z1-32B-0414",
+        "zai-org/GLM-Z1-9B-0414",
+        "zai-org/GLM-4-32B-0414"
       ]
     },
     {
-      "architecture_id": "IndexForCausalLM",
-      "total_models": 2,
+      "architecture_id": "ArceeForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "IndexTeam/Index-1.9B-Chat",
-        "IndexTeam/Index-1.9B-Pure"
+        "optimum-intel-internal-testing/tiny-random-ArceeForCausalLM",
+        "arcee-ai/AFM-4.5B-Base",
+        "onnx-internal-testing/tiny-random-ArceeForCausalLM",
+        "arcee-ai/AFM-4.5B"
       ]
     },
     {
-      "architecture_id": "PointLLMLlamaForCausalLM",
-      "total_models": 2,
+      "architecture_id": "BailingMoeV2ForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "RunsenXu/PointLLM_7B_v1.1_init",
-        "RunsenXu/PointLLM_7B_v1.2"
+        "inclusionAI/Ling-mini-2.0",
+        "inclusionAI/Ling-1T",
+        "inclusionAI/Ring-mini-2.0",
+        "inclusionAI/Ling-flash-2.0"
       ]
     },
     {
-      "architecture_id": "T5EncoderModel",
-      "total_models": 1,
+      "architecture_id": "AquilaForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "XLabs-AI/xflux_text_encoders"
+        "BAAI/AquilaChat2-7B",
+        "katuni4ka/tiny-random-aquila2",
+        "BAAI/Aquila2-34B",
+        "katuni4ka/tiny-random-aquilachat"
       ]
     },
     {
-      "architecture_id": "Step3p5ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "RWForCausalLM",
+      "total_models": 4,
       "sample_models": [
-        "stepfun-ai/Step-3.5-Flash"
+        "lightonai/alfred-40b-1023",
+        "vilm/vulture-40b",
+        "explosion-testing/refined-web-model-test",
+        "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2"
       ]
     },
     {
-      "architecture_id": "AprielForCausalLM",
-      "total_models": 1,
+      "architecture_id": "XLNetLMHeadModel",
+      "total_models": 3,
       "sample_models": [
-        "ServiceNow-AI/Apriel-5B-Instruct"
+        "xlnet/xlnet-base-cased",
+        "xlnet/xlnet-large-cased",
+        "sshleifer/tiny-xlnet-base-cased"
       ]
     },
     {
-      "architecture_id": "IlamaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BioGptForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "hmellor/Ilama-3.2-1B"
+        "microsoft/biogpt",
+        "microsoft/BioGPT-Large",
+        "microsoft/BioGPT-Large-PubMedQA"
       ]
     },
     {
-      "architecture_id": "XCurOSForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BambaForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "XCurOS/XCurOS-0.1-8B-Instruct"
+        "hmellor/tiny-random-BambaForCausalLM",
+        "ibm-ai-platform/Bamba-9B-v1",
+        "ibm-ai-platform/Bamba-9B-v2"
       ]
     },
     {
-      "architecture_id": "TarsierForConditionalGeneration",
-      "total_models": 1,
+      "architecture_id": "Exaone4ForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "omni-research/Tarsier-7b"
+        "LGAI-EXAONE/EXAONE-4.0.1-32B",
+        "LGAI-EXAONE/EXAONE-4.0-1.2B",
+        "LGAI-EXAONE/EXAONE-4.0-32B"
       ]
     },
     {
-      "architecture_id": "Plamo2ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "MiMoForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "pfnet/plamo-2-1b"
+        "XiaomiMiMo/MiMo-7B-Base",
+        "XiaomiMiMo/MiMo-7B-RL",
+        "XiaomiMiMo/MiMo-7B-SFT"
       ]
     },
     {
-      "architecture_id": "HCXVisionForCausalLM",
-      "total_models": 1,
+      "architecture_id": "T5WithLMHeadModel",
+      "total_models": 3,
       "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
+        "Rostlab/prot_t5_xl_bfd",
+        "Salesforce/codet5-large",
+        "unicamp-dl/ptt5-base-portuguese-vocab"
       ]
     },
     {
-      "architecture_id": "KimiLinearForCausalLM",
-      "total_models": 1,
+      "architecture_id": "GlmMoeDsaForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "moonshotai/Kimi-Linear-48B-A3B-Instruct"
+        "zai-org/GLM-5",
+        "nvidia/GLM-5-NVFP4",
+        "cs2764/GLM-5_dq3-mlx"
       ]
     },
     {
-      "architecture_id": "MiMoV2FlashForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Step3p5ForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "XiaomiMiMo/MiMo-V2-Flash"
+        "stepfun-ai/Step-3.5-Flash",
+        "tacos4me/Step-3.5-Flash-NVFP4",
+        "stepfun-ai/Step-3.5-Flash-Base"
       ]
     },
     {
-      "architecture_id": "LongcatFlashForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Zamba2ForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "meituan-longcat/LongCat-Flash-Chat"
+        "Zyphra/Zamba2-1.2B-instruct",
+        "Zyphra/Zamba2-7B-Instruct",
+        "Zyphra/Zamba2-2.7B"
       ]
     },
     {
-      "architecture_id": "InternLM3ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "InternLMForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "internlm/internlm3-8b-instruct"
+        "internlm/internlm-chat-7b",
+        "internlm/internlm-20b",
+        "internlm/internlm-7b"
       ]
     },
     {
-      "architecture_id": "HyperCLOVAXForCausalLM",
-      "total_models": 1,
+      "architecture_id": "GlmForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
+        "zai-org/glm-4-9b-chat-hf",
+        "zai-org/glm-4-9b-hf",
+        "zai-org/glm-edge-1.5b-chat"
       ]
     },
     {
-      "architecture_id": "GritLM",
-      "total_models": 1,
+      "architecture_id": "NemotronFlashForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "parasail-ai/GritLM-7B-vllm"
+        "nvidia/Nemotron-Flash-3B",
+        "nvidia/Nemotron-Flash-3B-Instruct",
+        "nvidia/Nemotron-Flash-1B"
       ]
     },
     {
-      "architecture_id": "BailingMoeV2_5ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Mistral3ForConditionalGeneration",
+      "total_models": 3,
       "sample_models": [
-        "inclusionAI/Ring-2.5-1T"
+        "RedHatAI/Mistral-Small-3.2-24B-Instruct-2506-NVFP4",
+        "ArmGPT/ArmenianGPT-1.0-3B",
+        "srs6901/SOLARized-GraniStral-14B_2102_YeAM-HCT_32QKV"
       ]
     },
     {
-      "architecture_id": "ExaoneMoEForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Llama4ForConditionalGeneration",
+      "total_models": 3,
       "sample_models": [
-        "LGAI-EXAONE/K-EXAONE-236B-A23B"
+        "RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4",
+        "yujiepan/llama-4-tiny-random",
+        "RedHatAI/Llama-4-Maverick-17B-128E-Instruct-NVFP4"
       ]
     },
     {
-      "architecture_id": "Grok1ModelForCausalLM",
-      "total_models": 1,
+      "architecture_id": "PersimmonForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "hpcai-tech/grok-1"
+        "adept/persimmon-8b-chat",
+        "adept/persimmon-8b-base",
+        "pszemraj/perSLIMmon-8b-base"
       ]
     },
     {
-      "architecture_id": "BailingMoeForCausalLM",
-      "total_models": 1,
+      "architecture_id": "JAISLMHeadModel",
+      "total_models": 3,
       "sample_models": [
-        "inclusionAI/Ling-lite-1.5"
+        "inceptionai/jais-13b-chat",
+        "katuni4ka/tiny-random-jais",
+        "inceptionai/jais-13b"
       ]
     },
     {
-      "architecture_id": "SolarForCausalLM",
-      "total_models": 1,
+      "architecture_id": "TrillionForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "upstage/solar-pro-preview-instruct"
+        "trillionlabs/Tri-21B-Think",
+        "trillionlabs/Tri-21B",
+        "trillionlabs/Tri-21B-Think-Preview"
       ]
     },
     {
-      "architecture_id": "HunYuanMoEV1ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "IdeficsForVisionText2Text",
+      "total_models": 3,
       "sample_models": [
-        "tencent/Hunyuan-A13B-Instruct"
+        "HuggingFaceM4/idefics-80b-instruct",
+        "HuggingFaceM4/idefics-9b",
+        "HuggingFaceM4/idefics-9b-instruct"
       ]
     },
     {
-      "architecture_id": "GptOssPuzzleForCausalLM",
-      "total_models": 1,
+      "architecture_id": "OLMoForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "nvidia/gpt-oss-puzzle-88B"
+        "allenai/OLMo-1B",
+        "allenai/OLMo-7B-Instruct",
+        "allenai/OLMo-7B"
       ]
     },
     {
-      "architecture_id": "MiniMaxForCausalLM",
-      "total_models": 1,
+      "architecture_id": "modeling_camelidae.LlamaForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "MiniMaxAI/MiniMax-Text-01-hf"
+        "hywu/Camelidae-8x34B",
+        "hywu/Camelidae-8x13B",
+        "hywu/Camelidae-8x7B"
       ]
     },
     {
-      "architecture_id": "ModernBertForSequenceClassification",
-      "total_models": 1,
+      "architecture_id": "LISAForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "opendatalab/meta-rater-professionalism-rating"
+        "xinlai/LISA-13B-llama2-v1",
+        "xinlai/LISA-7B-v1",
+        "MBZUAI/GLaMM-GranD-Pretrained"
       ]
     },
     {
-      "architecture_id": "MiniCPM3ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "RWKV7ForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "openbmb/MiniCPM3-4B"
+        "RWKV/RWKV7-Goose-World3-1.5B-HF",
+        "fla-hub/rwkv7-1.5B-world",
+        "RWKV/RWKV7-Goose-World3-2.9B-HF"
       ]
     },
     {
-      "architecture_id": "ArcticForCausalLM",
-      "total_models": 1,
+      "architecture_id": "MotifForCausalLM",
+      "total_models": 3,
       "sample_models": [
-        "Snowflake/snowflake-arctic-instruct"
+        "Motif-Technologies/Motif-2-12.7B-Base",
+        "Motif-Technologies/Motif-2-12.7B-Instruct",
+        "Motif-Technologies/Motif-2.6b-v1.1-LC"
       ]
     },
     {
-      "architecture_id": "IQuestLoopCoderForCausalLM",
-      "total_models": 1,
+      "architecture_id": "H2OVLChatModel",
+      "total_models": 2,
       "sample_models": [
-        "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"
+        "h2oai/h2ovl-mississippi-800m",
+        "h2oai/h2ovl-mississippi-2b"
       ]
     },
     {
-      "architecture_id": "Plamo3ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "KimiK25ForConditionalGeneration",
+      "total_models": 2,
       "sample_models": [
-        "pfnet/plamo-3-nict-2b-base"
+        "nvidia/Kimi-K2.5-NVFP4",
+        "Ex0bit/Kimi-K2.5-PRISM-REAP-530B-A32B"
       ]
     },
     {
-      "architecture_id": "TransformerForCausalLM",
-      "total_models": 1,
+      "architecture_id": "HCXVisionV2ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "fla-hub/transformer-1.3B-100B"
+        "naver-hyperclovax/HyperCLOVAX-SEED-Omni-8B",
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-32B"
       ]
     },
     {
-      "architecture_id": "Moondream",
-      "total_models": 1,
+      "architecture_id": "OpenAIGPTLMHeadModel",
+      "total_models": 2,
       "sample_models": [
-        "vikhyatk/moondream1"
+        "openai-community/openai-gpt",
+        "lgaalves/gpt1"
       ]
     },
     {
-      "architecture_id": "GraphT5TransformerForConditionalGeneration",
-      "total_models": 1,
+      "architecture_id": "MoshiForConditionalGeneration",
+      "total_models": 2,
       "sample_models": [
-        "haitengzhao/gimlet"
+        "kmhf/hf-moshiko",
+        "kmhf/hf-moshika"
       ]
     },
     {
-      "architecture_id": "GPT2LMHeadCustomModel",
-      "total_models": 1,
+      "architecture_id": "ReformerModelWithLMHead",
+      "total_models": 2,
       "sample_models": [
-        "bigcode/santacoder"
+        "google/reformer-crime-and-punishment",
+        "google/reformer-enwik8"
       ]
     },
     {
-      "architecture_id": "GPTRefactForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Phi3VForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "refactai/Refact-1_6B-fim"
+        "microsoft/Phi-3-vision-128k-instruct",
+        "TIGER-Lab/VLM2Vec-Full"
       ]
     },
     {
-      "architecture_id": "TrillionForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BartForConditionalGeneration",
+      "total_models": 2,
       "sample_models": [
-        "trillionlabs/Tri-21B-Think"
+        "KomeijiForce/bart-large-emojilm",
+        "Nargizi/screeve-lemmatizer"
       ]
     },
     {
-      "architecture_id": "InternLMXComposer2ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "StarVectorForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "internlm/internlm-xcomposer2-7b"
+        "starvector/starvector-1b-im2svg",
+        "starvector/starvector-8b-im2svg"
       ]
     },
     {
-      "architecture_id": "NandiForCausalLM",
-      "total_models": 1,
+      "architecture_id": "KimiLinearForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "Rta-AILabs/Nandi-Mini-150M"
+        "moonshotai/Kimi-Linear-48B-A3B-Instruct",
+        "moonshotai/Kimi-Linear-48B-A3B-Base"
       ]
     },
     {
-      "architecture_id": "StableLMAlphaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "DbrxForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "stabilityai/stablelm-base-alpha-7b-v2"
+        "trl-internal-testing/tiny-DbrxForCausalLM",
+        "katuni4ka/tiny-random-dbrx"
       ]
     },
     {
-      "architecture_id": "Param2MoEForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Qwen2_5_VLForConditionalGeneration",
+      "total_models": 2,
       "sample_models": [
-        "bharatgenai/Param2-17B-A2.4B-Thinking"
+        "nvidia/Qwen2.5-VL-7B-Instruct-NVFP4",
+        "OmniSVG/OmniSVG"
       ]
     },
     {
-      "architecture_id": "InternLMXComposerForCausalLM",
-      "total_models": 1,
+      "architecture_id": "ChatGLMModel",
+      "total_models": 2,
       "sample_models": [
-        "internlm/internlm-xcomposer-7b"
+        "zai-org/glm-4-9b",
+        "zai-org/codegeex4-all-9b"
       ]
     },
     {
-      "architecture_id": "MobilintExaone4ForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Llama4ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "mobilint/EXAONE-4.0-1.2B"
+        "trl-internal-testing/tiny-Llama4ForCausalLM",
+        "facebook/MobileLLM-R1-950M"
       ]
     },
     {
-      "architecture_id": "PanguEmbeddedForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Phi3SmallForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "FreedomIntelligence/openPangu-Embedded-1B"
+        "microsoft/Phi-3-small-8k-instruct",
+        "microsoft/Phi-3-small-128k-instruct"
       ]
     },
     {
-      "architecture_id": "ModelStarOLMhead",
-      "total_models": 1,
+      "architecture_id": "MiniMaxM1ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "Hawa-Al-Akram/StarO-Ai"
+        "MiniMaxAI/MiniMax-M1-40k",
+        "MiniMaxAI/MiniMax-M1-80k"
       ]
     },
     {
-      "architecture_id": "TransfoXLLMHeadModel",
-      "total_models": 1,
+      "architecture_id": "CLIPT5ForConditionalGeneration",
+      "total_models": 2,
       "sample_models": [
-        "transfo-xl/transfo-xl-wt103"
+        "zhiqiulin/clip-flant5-xxl",
+        "zhiqiulin/clip-flant5-xl"
       ]
     },
     {
-      "architecture_id": "Qwen3TSForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BailingMoeForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "bytedance-research/ChatTS-8B"
+        "inclusionAI/Ling-lite-1.5",
+        "inclusionAI/Ling-lite"
       ]
     },
     {
-      "architecture_id": "SparseLlamaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BitNetForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "openbmb/NOSA-8B"
+        "microsoft/bitnet-b1.58-2B-4T",
+        "microsoft/bitnet-b1.58-2B-4T-bf16"
       ]
     },
     {
-      "architecture_id": "DeltaNetForCausalLM",
-      "total_models": 1,
+      "architecture_id": "InternVLChatModel",
+      "total_models": 2,
       "sample_models": [
-        "fla-hub/delta_net-1.3B-100B"
+        "numind/NuExtract-2-4B-experimental",
+        "numind/NuExtract-2-8B-experimental"
       ]
     },
     {
-      "architecture_id": "CambrianQwenForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Ernie4_5ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
+        "baidu/ERNIE-4.5-0.3B-PT",
+        "baidu/ERNIE-4.5-0.3B-Base-PT"
       ]
     },
     {
-      "architecture_id": "EvafrillMoForCausalLM",
-      "total_models": 1,
+      "architecture_id": "IQuestCoderForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "pathcosmos/EVAFRILL-Mo-3B"
+        "IQuestLab/IQuest-Coder-V1-40B-Instruct",
+        "IQuestLab/IQuest-Coder-V1-7B-Instruct"
       ]
     },
     {
-      "architecture_id": "InternVLChatModel",
-      "total_models": 1,
+      "architecture_id": "XverseForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "numind/NuExtract-2-4B-experimental"
+        "xverse/XVERSE-7B-Chat",
+        "katuni4ka/tiny-random-xverse"
       ]
     },
     {
-      "architecture_id": "VaultGemmaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Jais2ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "google/vaultgemma-1b"
+        "inceptionai/Jais-2-8B-Chat",
+        "inceptionai/Jais-2-70B-Chat"
       ]
     },
     {
-      "architecture_id": "ZambaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "StripedHyenaModelForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "Zyphra/Zamba-7B-v1"
+        "togethercomputer/evo-1-8k-base",
+        "togethercomputer/evo-1-131k-base"
       ]
     },
     {
-      "architecture_id": "CheXagentForCausalLM",
-      "total_models": 1,
+      "architecture_id": "AXK1ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "StanfordAIMI/CheXagent-2-3b"
+        "skt/A.X-K1",
+        "thkim93/axk1-2layers"
       ]
     },
     {
-      "architecture_id": "GatedDeltaNetForCausalLM",
-      "total_models": 1,
+      "architecture_id": "RecurrentGemmaForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "deqing/gdn-300M-v5-gdn"
+        "google/recurrentgemma-2b",
+        "google/recurrentgemma-2b-it"
       ]
     },
     {
-      "architecture_id": "Qwen2TSForCausalLM",
-      "total_models": 1,
+      "architecture_id": "FlexOlmoForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "bytedance-research/ChatTS-14B"
+        "allenai/Flex-reddit-2x7B-1T",
+        "shanearora/Flex-reddit-2x7B-1T"
       ]
     },
     {
-      "architecture_id": "QEDForCausalLM",
-      "total_models": 1,
+      "architecture_id": "SolarOpenForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "levossadtchi/QED-75M"
+        "upstage/Solar-Open-100B",
+        "nota-ai/Solar-Open-100B-NotaMoEQuant-Int4"
       ]
     },
     {
-      "architecture_id": "JetMoEForCausalLM",
-      "total_models": 1,
+      "architecture_id": "PenguinVLQwen3ForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "jetmoe/jetmoe-8b"
+        "tencent/Penguin-VL-8B",
+        "tencent/Penguin-VL-2B"
       ]
     },
     {
-      "architecture_id": "RecursiveLanguageModel",
-      "total_models": 1,
+      "architecture_id": "MolformerForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "Girinath11/recursive-language-model-198m"
+        "ibm-research/GP-MoLFormer-Uniq",
+        "ralyn/NPComposer-v2"
       ]
     },
     {
-      "architecture_id": "SeerAttnLlamaForCausalLM",
-      "total_models": 1,
+      "architecture_id": "GLAForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates"
+        "fla-hub/gla-340M-15B",
+        "fla-hub/gla-1.3B-100B"
       ]
     },
     {
-      "architecture_id": "LongcatCausalLM",
-      "total_models": 1,
+      "architecture_id": "MosaicGPT",
+      "total_models": 2,
       "sample_models": [
-        "meituan-longcat/LongCat-Flash-Thinking-2601"
+        "anas-awadalla/mpt-1b-redpajama-200b",
+        "anas-awadalla/mpt-1b-redpajama-200b-dolly"
       ]
     },
     {
-      "architecture_id": "GomeForCausalLM",
-      "total_models": 1,
+      "architecture_id": "Eagle3DraftModel",
+      "total_models": 2,
       "sample_models": [
-        "Prositron/gome"
+        "RedHatAI/Qwen3-30B-A3B-Instruct-2507-speculator.eagle3",
+        "RedHatAI/Qwen3-235B-A22B-Instruct-2507-speculator.eagle3"
       ]
     },
     {
-      "architecture_id": "MoYiForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BolmoForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "astanahub/alemllm"
+        "allenai/Bolmo-7B",
+        "allenai/Bolmo-1B"
       ]
     },
     {
-      "architecture_id": "NanochatWasmFusedModel",
-      "total_models": 1,
+      "architecture_id": "JetMoEForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "eastlondoner/nanochat-wasm-fused-preview-01"
+        "jetmoe/jetmoe-8b",
+        "jetmoe/jetmoe-8b-chat"
       ]
     },
     {
-      "architecture_id": "LLM",
-      "total_models": 1,
+      "architecture_id": "CogVLMForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "rudyon/linnet-497M"
+        "zai-org/cogvlm2-llama3-chat-19B",
+        "zai-org/cogvlm-chat-hf"
       ]
     },
     {
-      "architecture_id": "MyAwesomeModelForCausalLM",
-      "total_models": 1,
+      "architecture_id": "WeDLMForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "dongbobo/MyAwesomeModel"
+        "tencent/WeDLM-8B-Instruct",
+        "tencent/WeDLM-8B-Base"
       ]
     },
     {
-      "architecture_id": "SwarmForCausalLM",
-      "total_models": 1,
+      "architecture_id": "YoutuForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "reaperdoesntknow/SAGI"
+        "tencent/Youtu-LLM-2B-Base",
+        "tencent/Youtu-LLM-2B"
       ]
     },
     {
-      "architecture_id": "CPMAntForCausalLM",
-      "total_models": 1,
+      "architecture_id": "ParamBharatGenForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "openbmb/cpm-ant-10b"
+        "bharatgenai/Param-1-2.9B-Instruct",
+        "bharatgenai/AyurParam"
       ]
     },
     {
-      "architecture_id": "Maira2ForConditionalGeneration",
-      "total_models": 1,
+      "architecture_id": "BitnetForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "microsoft/maira-2"
+        "1bitLLM/bitnet_b1_58-3B",
+        "1bitLLM/bitnet_b1_58-large"
       ]
     },
     {
-      "architecture_id": "CTRLLMHeadModel",
-      "total_models": 1,
+      "architecture_id": "SliderGPT",
+      "total_models": 2,
       "sample_models": [
-        "sshleifer/tiny-ctrl"
+        "c-bone/CrystaLLM-pi_Mattergen-XRD",
+        "c-bone/CrystaLLM-pi_COD-XRD"
       ]
     },
     {
-      "architecture_id": "SpatialLMQwenForCausalLM",
-      "total_models": 1,
+      "architecture_id": "BottleneckT5LMWithPerturb",
+      "total_models": 2,
       "sample_models": [
-        "manycore-research/SpatialLM1.1-Qwen-0.5B"
+        "thesephist/contra-bottleneck-t5-base-wikipedia",
+        "thesephist/contra-bottleneck-t5-large-wikipedia"
       ]
     },
     {
-      "architecture_id": "CoherenceMomentumModel",
-      "total_models": 1,
+      "architecture_id": "MptForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "aisingapore/coherence-momentum"
+        "team-lucid/mptk-1b",
+        "explosion-testing/mpt-test"
       ]
     },
     {
-      "architecture_id": "TAMELM",
-      "total_models": 1,
+      "architecture_id": "OpenMoeForCausalLM",
+      "total_models": 2,
       "sample_models": [
-        "reaperdoesntknow/TameForCasualLM"
+        "hpcai-tech/openmoe-8B",
+        "OrionZheng/openmoe-8b"
       ]
     },
     {
-      "architecture_id": "GPT2CustomLMHeadModel",
+      "architecture_id": "MiMoV2FlashForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "fxmarty/tiny-testing-gpt2-remote-code"
+        "XiaomiMiMo/MiMo-V2-Flash"
       ]
     },
     {
-      "architecture_id": "GPT2",
+      "architecture_id": "T5EncoderModel",
       "total_models": 1,
       "sample_models": [
-        "NamrataThakur/Small_Language_Model_MHA_53M_Pretrained"
+        "XLabs-AI/xflux_text_encoders"
       ]
     },
     {
-      "architecture_id": "GQAGPT2",
+      "architecture_id": "XCurOSForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "NamrataThakur/Small_Language_Model_GQA_48M_Pretrained"
+        "XCurOS/XCurOS-0.1-8B-Instruct"
       ]
     },
     {
-      "architecture_id": "MoEGPT2",
+      "architecture_id": "GPTNeoXJapaneseForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "NamrataThakur/Small_Language_Model_MOE_127M_Pretrained"
+        "abeja/gpt-neox-japanese-2.7b"
       ]
     },
     {
-      "architecture_id": "Esm2LlamaInstructForCausalLM",
+      "architecture_id": "IlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "xiao-fei/Prot2Text-V2-11B-Instruct-hf"
+        "hmellor/Ilama-3.2-1B"
       ]
     },
     {
-      "architecture_id": "ThinkerLM",
+      "architecture_id": "Plamo2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "prskid1000/micro-Omni"
+        "pfnet/plamo-2-1b"
       ]
     },
     {
-      "architecture_id": "QHEARTForECGQA",
+      "architecture_id": "HCXVisionForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Manhph2211/Q-HEART"
+        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
       ]
     },
     {
-      "architecture_id": "MoELLaVAQwen2ForCausalLM",
+      "architecture_id": "TarsierForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "KKHYA/llavaqwen2.5-0.5b-finetune-moe-4e-2k_20260331_194516"
+        "omni-research/Tarsier-7b"
       ]
     },
     {
-      "architecture_id": "D3PMSanskritModel",
+      "architecture_id": "BaiChuanForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "bhsinghgrid/sanskrit-translation"
+        "baichuan-inc/Baichuan-7B"
       ]
     },
     {
-      "architecture_id": "AV2TextForConditionalGeneration",
+      "architecture_id": "SarvamMoEForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "nguyenvulebinh/AV-HuBERT-MuAViC-en"
+        "sarvamai/sarvam-30b"
       ]
     },
     {
-      "architecture_id": "GPTJXMoEForCausalLM",
+      "architecture_id": "LongcatFlashForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Aletheia-ng/SabiYarn_MoE_translate"
+        "meituan-longcat/LongCat-Flash-Chat"
       ]
     },
     {
-      "architecture_id": "Eagle3DeepseekV2ForCausalLM",
+      "architecture_id": "GPTRefactForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "nvidia/Kimi-K2.5-Thinking-Eagle3"
+        "refactai/Refact-1_6B-fim"
       ]
     },
     {
-      "architecture_id": "Videollama2Qwen2ForCausalLM",
+      "architecture_id": "HyperCLOVAXForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "QuangTuan/MultiMood-7B-GRPO-VisualAudioText-Comp"
+        "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B"
       ]
     },
     {
-      "architecture_id": "Speech2TextTransformerForConditionalGeneration",
+      "architecture_id": "ExaoneMoEForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "valhalla/s2t_mustc_multilinguial_medium"
+        "LGAI-EXAONE/K-EXAONE-236B-A23B"
       ]
     },
     {
-      "architecture_id": "BlenderbotForConditionalGeneration",
+      "architecture_id": "HunYuanMoEV1ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "thu-coai/blenderbot-400M-esconv"
+        "tencent/Hunyuan-A13B-Instruct"
       ]
     },
     {
-      "architecture_id": "WhisperMixStyleForConditionalGeneration",
+      "architecture_id": "GritLM",
       "total_models": 1,
       "sample_models": [
-        "wago5090/mixstyle_multi-s"
+        "parasail-ai/GritLM-7B-vllm"
       ]
     },
     {
-      "architecture_id": "Autoencoder",
+      "architecture_id": "BailingMoeV2_5ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "cccczshao/CALM-Autoencoder"
+        "inclusionAI/Ring-2.5-1T"
       ]
     },
     {
-      "architecture_id": "AlinlightForCausalLM",
+      "architecture_id": "SolarForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "EngineerGL/Alinlight"
+        "upstage/solar-pro-preview-instruct"
       ]
     },
     {
-      "architecture_id": "LlamaForCausalLMEagle",
+      "architecture_id": "JetNemotronForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "thunlp/LLaMA3-Instruct-8B-FR-Spec"
+        "jet-ai/Jet-Nemotron-2B"
       ]
     },
     {
-      "architecture_id": "GuppyLM",
+      "architecture_id": "InternLM3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "arman-bd/guppylm-9M"
+        "internlm/internlm3-8b-instruct"
       ]
     },
     {
-      "architecture_id": "FusionInDecoderForConditionalGeneration",
+      "architecture_id": "Grok1ModelForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Intel/fid_flan_t5_base_nq"
+        "hpcai-tech/grok-1"
       ]
     },
     {
-      "architecture_id": "EveMoEForCausalLM",
+      "architecture_id": "Qwen3VLMoeForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "anthonym21/Eve-2-MoE-IT-272M"
+        "RedHatAI/Qwen3-VL-235B-A22B-Instruct-NVFP4"
       ]
     },
     {
-      "architecture_id": "Typhoon2Audio2AudioForConditionalGeneration",
+      "architecture_id": "MiniCPM3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "typhoon-ai/llama3.1-typhoon2-audio-8b-instruct"
+        "openbmb/MiniCPM3-4B"
       ]
     },
     {
-      "architecture_id": "LlaMAForCausalLM",
+      "architecture_id": "Emu3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "circulus/alpaca-7b"
+        "BAAI/Emu3-Chat"
       ]
     },
     {
-      "architecture_id": "GeoVForCausalLM",
+      "architecture_id": "GRIN-MoE",
       "total_models": 1,
       "sample_models": [
-        "GeoV/GeoV-9b"
+        "microsoft/GRIN-MoE"
       ]
     },
     {
-      "architecture_id": "RobertaPreLayerNormForCausalLM",
+      "architecture_id": "AV2TextForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-RobertaPreLayerNormForCausalLM"
+        "nguyenvulebinh/AV-HuBERT-MuAViC-en"
       ]
     },
     {
-      "architecture_id": "RuGPT3XLForCausalLM",
+      "architecture_id": "MiniMaxForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "evilfreelancer/ruGPT3XL"
+        "MiniMaxAI/MiniMax-Text-01-hf"
       ]
     },
     {
-      "architecture_id": "TeleFLMForCausalLM",
+      "architecture_id": "ArcticForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "CofeAI/Tele-FLM-1T"
+        "Snowflake/snowflake-arctic-instruct"
       ]
     },
     {
-      "architecture_id": "GPTModelForTextGeneration",
+      "architecture_id": "OrionForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "samkeet/GPT_124M-Instruct"
+        "OrionStarAI/Orion-14B-Chat"
       ]
     },
     {
-      "architecture_id": "TFGPT2LMHeadModel",
+      "architecture_id": "IQuestLoopCoderForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "mymusise/gpt2-medium-chinese"
+        "IQuestLab/IQuest-Coder-V1-40B-Loop-Instruct"
       ]
     },
     {
-      "architecture_id": "PegasusForCausalLM",
+      "architecture_id": "Moondream",
       "total_models": 1,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-PegasusForCausalLM"
+        "vikhyatk/moondream1"
       ]
     },
     {
-      "architecture_id": "ElectraForCausalLM",
+      "architecture_id": "SarvamMLAForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "smeoni/nbme-electra-large-generator"
+        "sarvamai/sarvam-105b"
       ]
     },
     {
-      "architecture_id": "BlenderbotForCausalLM",
+      "architecture_id": "Plamo3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-BlenderbotForCausalLM"
+        "pfnet/plamo-3-nict-2b-base"
       ]
     },
     {
-      "architecture_id": "LIMEForCausalLM",
+      "architecture_id": "InternLMXComposer2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "anarlavrenov/lime-1b-instruct"
+        "internlm/internlm-xcomposer2-7b"
       ]
     },
     {
-      "architecture_id": "ModernBertForMaskedLM",
+      "architecture_id": "GraphT5TransformerForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "JorgeVanco/diffusionGPT"
+        "haitengzhao/gimlet"
       ]
     },
     {
-      "architecture_id": "MvpForCausalLM",
+      "architecture_id": "InternLMXComposerForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "hf-tiny-model-private/tiny-random-MvpForCausalLM"
+        "internlm/internlm-xcomposer-7b"
       ]
     },
     {
-      "architecture_id": "DenseLLM",
+      "architecture_id": "Dots1ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "AlgoDriveAI/Sanskrit_Akkadian_LLM_v1.0"
+        "rednote-hilab/dots.llm1.inst"
       ]
     },
     {
-      "architecture_id": "FP8Qwen3ForCausalLM",
+      "architecture_id": "LlavaSearchLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "xihc-ucb/Qwen3-8B-Base-train-Quasar-0809"
+        "craigwu/seal_vqa_7b"
       ]
     },
     {
-      "architecture_id": "EnergyTransformer",
+      "architecture_id": "CheXagentForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "cccczshao/CALM-M"
+        "StanfordAIMI/CheXagent-2-3b"
       ]
     },
     {
-      "architecture_id": "ConditionalGPT2LMHeadModel",
+      "architecture_id": "TransfoXLLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "entropy/roberta_zinc_decoder"
+        "transfo-xl/transfo-xl-wt103"
       ]
     },
     {
-      "architecture_id": "XModelForCausalLM",
+      "architecture_id": "ZetaGrid25B",
       "total_models": 1,
       "sample_models": [
-        "XiaoduoAILab/Xmodel_LM"
+        "RthItalia/Rth-lm-code-25b"
       ]
     },
     {
-      "architecture_id": "JiRackTernary1B",
+      "architecture_id": "TransformerForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "kgrabko/JiRackTernary_1b"
+        "fla-hub/transformer-1.3B-100B"
       ]
     },
     {
-      "architecture_id": "DebertaV2ForCausalLM",
+      "architecture_id": "Qwen3VLForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "ltg/deberta-xxlarge-fixed"
+        "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4"
       ]
     },
     {
-      "architecture_id": "MoEGPTForCausalLM",
+      "architecture_id": "Rwkv6ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "arnomatic/german-moe-gpt-v8-pretrained"
+        "RWKV/v6-Finch-1B6-HF"
       ]
     },
     {
-      "architecture_id": "SongGenMixedForConditionalGeneration",
+      "architecture_id": "CambrianQwenForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "LiuZH-19/SongGen_mixed_pro"
+        "nyu-visionx/Scale-RAE-Qwen1.5B_DiT2.4B"
       ]
     },
     {
-      "architecture_id": "SpectusForConditionalGeneration",
+      "architecture_id": "VaultGemmaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "MS-ML/SpecTUS_pretrained_only"
+        "google/vaultgemma-1b"
       ]
     },
     {
-      "architecture_id": "LSGBartForConditionalGeneration",
+      "architecture_id": "FP8Qwen2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "morenolq/LEGIT-BART-LSG-4096"
+        "xihc-ucb/Qwen2.5-7B-train-Quasar-1214"
       ]
     },
     {
-      "architecture_id": "CloverLMForCausalLM",
+      "architecture_id": "SparseLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "daslab-testing/CloverLM"
+        "openbmb/NOSA-8B"
       ]
     },
     {
-      "architecture_id": "Qwen2VLAudioForConditionalGeneration",
+      "architecture_id": "SpatialLMQwenForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "MayaKD/qwen2-vl-audio"
+        "manycore-research/SpatialLM1.1-Qwen-0.5B"
       ]
     },
     {
-      "architecture_id": "FP8Qwen2ForCausalLM",
+      "architecture_id": "VSMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "xihc-ucb/Qwen2.5-7B-train-Quasar-1214"
+        "craigwu/seal_vsm_7b"
       ]
     },
     {
-      "architecture_id": "LSTMForCausalLM",
+      "architecture_id": "GPT2LMHeadCustomModel",
       "total_models": 1,
       "sample_models": [
-        "deqing/lstm-window-4-v5"
+        "bigcode/santacoder"
       ]
     },
     {
-      "architecture_id": "CheXagentForConditionalGeneration",
+      "architecture_id": "MoYiForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "StanfordAIMI/CheXagent-8b"
+        "astanahub/alemllm"
       ]
     },
     {
-      "architecture_id": "LatentMoELLaVAPhiForCausalLM",
+      "architecture_id": "SeerAttnLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "KKHYA/llavaphi2-2.7b-finetune-latent-sparse-moe-4e-2k-freeze-1.0_20260304_075653"
+        "SeerAttention/SeerAttention-Llama-3.1-8B-AttnGates"
       ]
     },
     {
-      "architecture_id": "GPTXForCausalLM",
+      "architecture_id": "RavenForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "AxiomicLabs/GPT-X-125m-15bt"
+        "tomg-group-umd/huginn-0125"
       ]
     },
     {
-      "architecture_id": "OmniASRForConditionalGeneration",
+      "architecture_id": "GeoChatLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "bezzam/omniasr-llm-300m-v2"
+        "MBZUAI/geochat-7B"
       ]
     },
     {
-      "architecture_id": "MiniMaxText01ForCausalLM",
+      "architecture_id": "Param2MoEForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "MiniMaxAI/MiniMax-Text-01"
+        "bharatgenai/Param2-17B-A2.4B-Thinking"
       ]
     },
     {
-      "architecture_id": "LlavaCrystalForCausalLM",
+      "architecture_id": "AprielForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "LLM360/CrystalChat-7B-Web2Code"
+        "ServiceNow-AI/Apriel-5B-Instruct"
       ]
     },
     {
-      "architecture_id": "MatriochkaForCausalLM",
+      "architecture_id": "PanguEmbeddedForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "nthngdy/matryoshka-3B"
+        "FreedomIntelligence/openPangu-Embedded-1B"
       ]
     },
     {
-      "architecture_id": "MobileLLMP1ForCausalLM",
+      "architecture_id": "Phi4MMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "facebook/MobileLLM-Pro-base"
+        "Yanis-Gerst/fine_tune"
       ]
     },
     {
-      "architecture_id": "Bagel",
+      "architecture_id": "Maira2ForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "lmms-lab/BAGEL-7B-MoT-ver.LE"
+        "microsoft/maira-2"
       ]
     },
     {
-      "architecture_id": "InternLM2ForRewardModel",
+      "architecture_id": "MiniCPMSALAForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "internlm/internlm2_5-step-prover-critic"
+        "openbmb/MiniCPM-SALA"
       ]
     },
     {
-      "architecture_id": "GeoChatLlamaForCausalLM",
+      "architecture_id": "GiddForDiffusionLM",
       "total_models": 1,
       "sample_models": [
-        "MBZUAI/geochat-7B"
+        "dvruette/gidd-unif-3b"
       ]
     },
     {
-      "architecture_id": "MobileLLMForCausalLM",
+      "architecture_id": "SteerlingForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "facebook/MobileLLM-125M"
+        "guidelabs/steerling-8b"
       ]
     },
     {
-      "architecture_id": "SliderGPT",
+      "architecture_id": "StableLMAlphaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "c-bone/CrystaLLM-pi_Mattergen-XRD"
+        "stabilityai/stablelm-base-alpha-7b-v2"
       ]
     },
     {
-      "architecture_id": "CircuitGPTForCausalLM",
+      "architecture_id": "HGRNBitForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "openai/circuit-sparsity"
+        "ridger/MMfreeLM-370M"
       ]
     },
     {
-      "architecture_id": "Qwen35ForCausalLM",
+      "architecture_id": "CheXagentForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "JeffGreen311/Eve-V2-Unleashed-Qwen3.5-8B-Liberated-4K-4B-Merged"
+        "StanfordAIMI/CheXagent-8b"
       ]
     },
     {
-      "architecture_id": "KORMoForCausalLM",
+      "architecture_id": "MiniMaxText01ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "KORMo-Team/KORMo-10B-sft"
+        "MiniMaxAI/MiniMax-Text-01"
       ]
     },
     {
-      "architecture_id": "MiniCPMSALAForCausalLM",
+      "architecture_id": "LamedPhi3ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "openbmb/MiniCPM-SALA"
+        "GoodBaiBai88/M3D-LaMed-Phi-3-4B"
       ]
     },
     {
-      "architecture_id": "GiddForDiffusionLM",
+      "architecture_id": "TorchMultiOmicsModel",
       "total_models": 1,
       "sample_models": [
-        "dvruette/gidd-unif-3b"
+        "InstaDeepAI/ChatNT"
       ]
     },
     {
-      "architecture_id": "MobilintEagle3Qwen2ForCausalLM",
+      "architecture_id": "MobileLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "mobilint/EAGLE3-JPharmatron-7B"
+        "mtgv/MobileVLM_V2-1.7B"
       ]
     },
     {
-      "architecture_id": "Kanana2VecModel",
+      "architecture_id": "Phi4FlashForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "kakaocorp/kanana-nano-2.1b-embedding"
+        "microsoft/Phi-4-mini-flash-reasoning"
       ]
     },
     {
-      "architecture_id": "JiRackTernaryModel",
+      "architecture_id": "DeciCoderForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "kgrabko/JiRackTernary_70b"
+        "Deci/DeciCoder-1b"
       ]
     },
     {
-      "architecture_id": "Qwen3VLForConditionalGeneration",
+      "architecture_id": "GPT3DevLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "RedHatAI/Qwen3-VL-32B-Instruct-NVFP4"
+        "k050506koch/GPT3-dev-350m-2805"
       ]
     },
     {
-      "architecture_id": "Gemma4ForCausalLM",
+      "architecture_id": "Qwen2VLForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "LilaRest/gemma-4-31B-it-NVFP4-turbo"
+        "yujiepan/qwen2-vl-tiny-random"
       ]
     },
     {
-      "architecture_id": "AeroForConditionalGeneration",
+      "architecture_id": "Qwen2ForSequenceClassification",
       "total_models": 1,
       "sample_models": [
-        "lmms-lab/Aero-1-Audio"
+        "nvidia/AceMath-7B-RM"
       ]
     },
     {
-      "architecture_id": "HeliumForCausalLM",
+      "architecture_id": "Kanana2VecModel",
       "total_models": 1,
       "sample_models": [
-        "kyutai/helium-1-preview-2b"
+        "kakaocorp/kanana-nano-2.1b-embedding"
       ]
     },
     {
-      "architecture_id": "DeciCoderForCausalLM",
+      "architecture_id": "EchoForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Deci/DeciCoder-1b"
+        "ethicalabs/Echo-DSRN-486M-v0.7.6-SFT"
       ]
     },
     {
-      "architecture_id": "PolyLMHeadModel",
+      "architecture_id": "CTRLLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "DAMO-NLP-MT/polylm-13b"
+        "sshleifer/tiny-ctrl"
       ]
     },
     {
-      "architecture_id": "BD3LM",
+      "architecture_id": "LLaDAMoEModel",
       "total_models": 1,
       "sample_models": [
-        "kuleshov-group/bd3lm-owt-block_size4"
+        "inclusionAI/LLaDA-MoE-7B-A1B-Base"
       ]
     },
     {
-      "architecture_id": "LamedPhi3ForCausalLM",
+      "architecture_id": "CPMAntForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "GoodBaiBai88/M3D-LaMed-Phi-3-4B"
+        "openbmb/cpm-ant-10b"
       ]
     },
     {
-      "architecture_id": "Emu3ForCausalLM",
+      "architecture_id": "ICONNForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "BAAI/Emu3-Chat"
+        "ICONNAI/ICONN-1-Mini-Beta"
       ]
     },
     {
-      "architecture_id": "BunnyLlamaForCausalLM",
+      "architecture_id": "HeliumForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "typhoon-ai/llama-3-typhoon-v1.5-8b-vision-preview"
+        "kyutai/helium-1-preview-2b"
       ]
     },
     {
-      "architecture_id": "SKTOmniForConditionalGeneration",
+      "architecture_id": "DogeForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Shrijanagain/SKT_OMNI_SUPREME"
+        "SmallDoge/Doge-20M"
       ]
     },
     {
-      "architecture_id": "CambrianLlamaForCausalLM",
+      "architecture_id": "LongcatFlashNgramForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "nyu-visionx/cambrian-8b"
+        "meituan-longcat/LongCat-Flash-Lite"
       ]
     },
     {
-      "architecture_id": "LlamaModel",
+      "architecture_id": "GPT",
       "total_models": 1,
       "sample_models": [
-        "ngoan/NgoanYi"
+        "LH-Tech-AI/Apex-1.5-Coder-Instruct-350M"
       ]
     },
     {
-      "architecture_id": "SteerlingForCausalLM",
+      "architecture_id": "GPT2CustomLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "guidelabs/steerling-8b"
+        "fxmarty/tiny-testing-gpt2-remote-code"
       ]
     },
     {
-      "architecture_id": "TransnormerForCausalLM",
+      "architecture_id": "SKTOmniForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "OpenNLPLab/TransNormerLLM-385M"
+        "Shrijanagain/SKT_OMNI_SUPREME"
       ]
     },
     {
-      "architecture_id": "DUO",
+      "architecture_id": "MobileLLMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "s-sahoo/duo-distilled"
+        "facebook/MobileLLM-125M"
       ]
     },
     {
-      "architecture_id": "ErnieForCausalLM",
+      "architecture_id": "CircuitGPTForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "mohitsha/tiny-ernie-random-remote-code"
+        "openai/circuit-sparsity"
       ]
     },
     {
-      "architecture_id": "ShikraLlamaForCausalLM",
+      "architecture_id": "Qwen3TSForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "shikras/shikra-7b-delta-v1"
+        "bytedance-research/ChatTS-8B"
       ]
     },
     {
-      "architecture_id": "Rwkv7ForCausalLM",
+      "architecture_id": "ConditionalGPT",
       "total_models": 1,
       "sample_models": [
-        "admijgjtjtjtjjg/dfdfdf"
+        "c-bone/CrystaLLM-pi_bandgap"
       ]
     },
     {
-      "architecture_id": "ICONNForCausalLM",
+      "architecture_id": "DuchifatCore",
       "total_models": 1,
       "sample_models": [
-        "ICONNAI/ICONN-1-Mini-Beta"
+        "Raziel1234/Duchifat-2"
       ]
     },
     {
-      "architecture_id": "RubiRLM",
+      "architecture_id": "GPT2Model",
       "total_models": 1,
       "sample_models": [
-        "DevHunterAI/RubiRLM-1B-Base"
+        "cerebras/Cerebras-GPT-13B"
       ]
     },
     {
-      "architecture_id": "YiForCausalLM",
+      "architecture_id": "BD3LM",
       "total_models": 1,
       "sample_models": [
-        "llmware/dragon-yi-6b-v0"
+        "kuleshov-group/bd3lm-owt-block_size4"
       ]
     },
     {
-      "architecture_id": "SoraForSLM",
+      "architecture_id": "AeroForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "Conlanger-LLM-CLEM/Sorie"
+        "lmms-lab/Aero-1-Audio"
       ]
     },
     {
-      "architecture_id": "CpmBeeForCausalLM",
+      "architecture_id": "KORMoForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "openbmb/cpm-bee-10b"
+        "KORMo-Team/KORMo-10B-sft"
       ]
     },
     {
-      "architecture_id": "HGRNBitForCausalLM",
+      "architecture_id": "PhariaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "ridger/MMfreeLM-370M"
+        "Aleph-Alpha/Pharia-1-LLM-7B-control-hf"
       ]
     },
     {
-      "architecture_id": "ZsGPT2LMHeadModel",
+      "architecture_id": "UMT5ForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "claritylab/zero-shot-vanilla-gpt2"
+        "EleutherAI/pile-t5-xl"
       ]
     },
     {
-      "architecture_id": "Phi4FlashForCausalLM",
+      "architecture_id": "ZambaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "microsoft/Phi-4-mini-flash-reasoning"
+        "Zyphra/Zamba-7B-v1"
       ]
     },
     {
-      "architecture_id": "MochivaForCausalLM",
+      "architecture_id": "PolyLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "Mochiva-team/Mochiva-model"
+        "DAMO-NLP-MT/polylm-13b"
       ]
     },
     {
-      "architecture_id": "HumanGPTForCausalLM",
+      "architecture_id": "RecursiveLanguageModel",
       "total_models": 1,
       "sample_models": [
-        "YaoFeng/CHATPOSE-V0"
+        "Girinath11/recursive-language-model-198m"
       ]
     },
     {
-      "architecture_id": "BTLMLMHeadModel",
+      "architecture_id": "SpatialLMLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "cerebras/btlm-3b-8k-base"
+        "manycore-research/SpatialLM1.1-Llama-1B"
       ]
     },
     {
-      "architecture_id": "DotLMForCausalLM",
+      "architecture_id": "PointLLMLlamaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "tensorfiend/DotLM-165M"
+        "RunsenXu/PointLLM_7B_v1.2"
       ]
     },
     {
-      "architecture_id": "XMistralForCausalLM",
+      "architecture_id": "MegaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Hannibal046/xrag-7b"
+        "BEE-spoke-data/mega-ar-126m-4k"
       ]
     },
     {
-      "architecture_id": "TelechatForCausalLM",
+      "architecture_id": "SongGenMixedForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "Tele-AI/telechat-7B"
+        "LiuZH-19/SongGen_mixed_pro"
       ]
     },
     {
-      "architecture_id": "FlamingoForCausalLM",
+      "architecture_id": "DUO",
       "total_models": 1,
       "sample_models": [
-        "babylm/flamingo-2024"
+        "s-sahoo/duo-distilled"
       ]
     },
     {
-      "architecture_id": "Qwen2VLForConditionalGeneration",
+      "architecture_id": "LlamaModel",
       "total_models": 1,
       "sample_models": [
-        "typhoon-ai/typhoon2-qwen2vl-7b-vision-instruct"
+        "ngoan/NgoanYi"
       ]
     },
     {
-      "architecture_id": "VStreamLlamaForCausalLM",
+      "architecture_id": "BailingMoeLinearV2ForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "IVGSZ/Flash-VStream-7b"
+        "inclusionAI/Ring-mini-linear-2.0"
       ]
     },
     {
-      "architecture_id": "AquilaDenseForCausalLM",
+      "architecture_id": "BertLMHeadModel",
       "total_models": 1,
       "sample_models": [
-        "BAAI/AquilaDense-7B"
+        "dicta-il/BEREL_3.0"
       ]
     },
     {
-      "architecture_id": "LongLlamaForCausalLM",
+      "architecture_id": "Glm4MoeLiteSonicForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "syzymon/long_llama_3b"
+        "rpDungeon/GLM-4.7-Flash-SonicMOE"
       ]
     },
     {
-      "architecture_id": "EmuForCausalLM",
+      "architecture_id": "Bagel",
       "total_models": 1,
       "sample_models": [
-        "BAAI/Emu2-Chat"
+        "lmms-lab/BAGEL-7B-MoT-ver.LE"
       ]
     },
     {
-      "architecture_id": "Lfm2Prototype1ForCausalLM",
+      "architecture_id": "GLaMMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "nntsuzu/LFM2-SFT-Prototype01-1.2B-JP"
+        "MBZUAI/GLaMM-FullScope"
       ]
     },
     {
-      "architecture_id": "CogVLMVideoForCausalLM",
+      "architecture_id": "KonkanGPT",
       "total_models": 1,
       "sample_models": [
-        "zai-org/VisionReward-Video"
+        "omdeep22/Gonyai-v1"
       ]
     },
     {
-      "architecture_id": "MoELLaVAQWenForCausalLM",
+      "architecture_id": "Qwen3OmniMoeThinkerForConditionalGeneration",
       "total_models": 1,
       "sample_models": [
-        "LanguageBind/MoE-LLaVA-Qwen-1.8B-4e"
+        "ngqtrung/Qwen3-Omni-Thinker-30B-Instruct"
       ]
     },
     {
-      "architecture_id": "YayiForCausalLM",
+      "architecture_id": "C3QwenForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "wenge-research/yayi2-30b"
+        "liufanfanlff/C3-Context-Cascade-Compression"
       ]
     },
     {
-      "architecture_id": "ArgonneModel",
+      "architecture_id": "MonoidForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "PursuitOfDataScience/Argonne2.5-base"
+        "NoesisLab/Spartacus-1B-Instruct"
       ]
     },
     {
-      "architecture_id": "SkyworkForCausalLM",
+      "architecture_id": "ErnieForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Skywork/Skywork-13B-base"
+        "mohitsha/tiny-ernie-random-remote-code"
       ]
     },
     {
-      "architecture_id": "Qwen3ASRForConditionalGeneration",
+      "architecture_id": "TransnormerForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "bezzam/Qwen3-ASR-0.6B"
+        "OpenNLPLab/TransNormerLLM-385M"
       ]
     },
     {
-      "architecture_id": "HymbaForCausalLM",
+      "architecture_id": "PKVGPT",
       "total_models": 1,
       "sample_models": [
-        "nvidia/Hymba-1.5B-Instruct"
+        "c-bone/CrystaLLM-pi_SLME"
       ]
     },
     {
-      "architecture_id": "MobiLlamaForCausalLM",
+      "architecture_id": "MedHemoModel",
       "total_models": 1,
       "sample_models": [
-        "MBZUAI/MobiLlama-05B"
+        "amewebstudio/medhemo-earcp"
       ]
     },
     {
-      "architecture_id": "HebrewGPTForCausalLM",
+      "architecture_id": "OpenLMForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "Slasky/HebrewGPT-1B"
+        "nick11roberts/SL-discrep-chinchilla-rw-params5M_maxstep760-flop_1_25e16_step_767"
       ]
     },
     {
-      "architecture_id": "YuanForCausalLM",
+      "architecture_id": "MCGPTForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "IEITYuan/Yuan2-M32-hf"
+        "TopAI-1/MCGPT-1"
       ]
     },
     {
-      "architecture_id": "MegaForCausalLM",
+      "architecture_id": "HymbaForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "BEE-spoke-data/mega-ar-126m-4k"
+        "nvidia/Hymba-1.5B-Instruct"
       ]
     },
     {
-      "architecture_id": "Gemma4TextModel",
+      "architecture_id": "LlamaMoEForCausalLM",
       "total_models": 1,
       "sample_models": [
-        "bRadu/gemma-4-E2B-it-textonly"
+        "llama-moe/LLaMA-MoE-v1-3_5B-2_8"
       ]
     }
   ]
diff --git a/transformer_lens/tools/model_registry/data/supported_models.json b/transformer_lens/tools/model_registry/data/supported_models.json
index 1c8d879d0..b7868f462 100644
--- a/transformer_lens/tools/model_registry/data/supported_models.json
+++ b/transformer_lens/tools/model_registry/data/supported_models.json
@@ -77668,6 +77668,76 @@
       "phase3_score": 100.0,
       "phase4_score": 89.9,
       "phase7_score": null
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "model_id": "Salesforce/codegen-350M-mono",
+      "status": 1,
+      "verified_date": "2026-04-09",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 86.2,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "model_id": "Salesforce/codegen-350M-multi",
+      "status": 1,
+      "verified_date": "2026-04-09",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 84.5,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "model_id": "Salesforce/codegen-350M-nl",
+      "status": 1,
+      "verified_date": "2026-04-09",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 89.2,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "model_id": "Salesforce/codegen-2B-mono",
+      "status": 1,
+      "verified_date": "2026-04-09",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 85.4,
+      "phase7_score": null,
+      "phase8_score": null
+    },
+    {
+      "architecture_id": "CodeGenForCausalLM",
+      "model_id": "Salesforce/codegen-2B-multi",
+      "status": 1,
+      "verified_date": "2026-04-09",
+      "metadata": null,
+      "note": "Full verification completed",
+      "phase1_score": 100.0,
+      "phase2_score": 100.0,
+      "phase3_score": 100.0,
+      "phase4_score": 67.5,
+      "phase7_score": null,
+      "phase8_score": null
     }
   ]
 }
diff --git a/transformer_lens/tools/model_registry/data/verification_history.json b/transformer_lens/tools/model_registry/data/verification_history.json
index 3eae3fae0..9eb2e7648 100644
--- a/transformer_lens/tools/model_registry/data/verification_history.json
+++ b/transformer_lens/tools/model_registry/data/verification_history.json
@@ -1,5 +1,5 @@
 {
-  "last_updated": "2026-04-09T13:22:45.115556",
+  "last_updated": "2026-04-09T16:34:36.818082",
   "records": [
     {
       "model_id": "Macropodus/macbert4mdcspell_v1",
@@ -11142,158 +11142,38 @@
       "invalidation_reason": null
     },
     {
-      "model_id": "fxmarty/really-tiny-falcon-testing",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-350M-mono",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
+      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 20/103 components failed (20 critical)",
       "invalidated": false,
       "invalidation_reason": null
     },
     {
-      "model_id": "yujiepan/falcon-tiny-random",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-350M-mono",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "yujiepan/falcon-tiny-random",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P1=0.0% < 100.0% (failed: load_bridge_unprocessed) \u2014 Failed to load unprocessed TransformerBridge: Falcon with ALiBi is not yet supported. Only RoPE-based Falcon models are currently handled.",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-7b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=50.0% < 75.0% (failed: process_bridge_weights, layer_norm_folding, weight_modifi \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-7b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=50.0% < 75.0% (failed: process_bridge_weights, layer_norm_folding, weight_modifi \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "fxmarty/really-tiny-falcon-testing",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "fxmarty/really-tiny-falcon-testing",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P1=50.0% < 100.0% (failed: forward_pass_logits); P3=89.5% but required tests failed \u2014 Tensors differ: max_diff=33.789673, mean_rel=2.566615",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "fxmarty/really-tiny-falcon-testing",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P3=55.6% < 75.0% (failed: process_bridge_weights, weight_modification, hook_functio \u2014 Critical backward hooks check failed: Output 0 of BackwardHookFunctionBackward is a view and is being modified inplace. This view was created inside a",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "fxmarty/really-tiny-falcon-testing",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Full verification completed with issues, low text quality: P3=95.0% (failed: process_bridge_weights)",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P1=0.0% < 100.0% (failed: all_components, forward_pass_logits); P3=89.5% but requir \u2014 24/147 components failed (24 critical)",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 24/123 components failed (24 critical)",
-      "invalidated": false,
-      "invalidation_reason": null
-    },
-    {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
-      "verified_date": "2026-04-09",
-      "verified_by": "verify_models",
-      "transformerlens_version": null,
-      "notes": "Below threshold: P1=50.0% < 100.0% (failed: all_components) \u2014 24/123 components failed (24 critical)",
+      "notes": "Full verification completed",
       "invalidated": false,
       "invalidation_reason": null
     },
     {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-350M-multi",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,
-      "notes": "Below threshold: P3=89.5% but required tests failed: logits_equivalence, loss_equivalence \u2014 Text quality score: 72.2/100 (avg perplexity: 471.0) \u2014 generated text may be incoherent",
+      "notes": "Full verification completed",
       "invalidated": false,
       "invalidation_reason": null
     },
     {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-350M-nl",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,
@@ -11302,8 +11182,8 @@
       "invalidation_reason": null
     },
     {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-2B-mono",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,
@@ -11312,8 +11192,8 @@
       "invalidation_reason": null
     },
     {
-      "model_id": "tiiuae/falcon-rw-1b",
-      "architecture_id": "FalconForCausalLM",
+      "model_id": "Salesforce/codegen-2B-multi",
+      "architecture_id": "CodeGenForCausalLM",
       "verified_date": "2026-04-09",
       "verified_by": "verify_models",
       "transformerlens_version": null,

From 1153ddeb3d2259681310701cc5380b262ce37f12 Mon Sep 17 00:00:00 2001
From: jlarson4 <jonahalarson@comcast.net>
Date: Thu, 9 Apr 2026 17:19:51 -0500
Subject: [PATCH 2/2] Mypy and check format

---
 .../test_codegen_attention_bridge.py          | 34 +++++++++----------
 .../test_codegen_adapter.py                   | 17 ++++------
 .../codegen_attention.py                      | 23 +++++--------
 .../supported_architectures/codegen.py        | 21 +++---------
 4 files changed, 38 insertions(+), 57 deletions(-)

diff --git a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
index 5814b1418..a6a22e288 100644
--- a/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
+++ b/tests/unit/model_bridge/generalized_components/test_codegen_attention_bridge.py
@@ -10,8 +10,7 @@
 - KV cache is passed through to _update_kv_cache
 """
 
-from typing import Any
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 import torch
 
@@ -21,7 +20,6 @@
     _rotate_every_two,
 )
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -207,7 +205,9 @@ def test_q_k_v_projections_are_set(self):
 
     def test_no_c_proj_attribute_needed(self):
         """Construction must succeed when the original component has no c_proj."""
-        from transformer_lens.model_bridge.generalized_components.linear import LinearBridge
+        from transformer_lens.model_bridge.generalized_components.linear import (
+            LinearBridge,
+        )
 
         config = _make_config()
         split_qkv, _, _, _ = _make_split_qkv(config.d_model)
@@ -386,9 +386,9 @@ def zeroing_hook(tensor, hook):
         bridge.q.hook_out.add_hook(zeroing_hook)
         zeroed_out, _ = bridge(hs.clone(), position_ids=pos_ids)
 
-        assert not torch.allclose(baseline_out, zeroed_out), (
-            "Zeroing hook_q should change the attention output"
-        )
+        assert not torch.allclose(
+            baseline_out, zeroed_out
+        ), "Zeroing hook_q should change the attention output"
 
 
 # ---------------------------------------------------------------------------
@@ -443,7 +443,7 @@ def capture_scores(tensor, hook):
 
         # Compute what scores would be WITHOUT RoPE
         raw_q = raw_q_values[0]  # [B, S, D]
-        raw_k = k_lin(hs)        # [B, S, D]
+        raw_k = k_lin(hs)  # [B, S, D]
         n_heads = config.n_heads
         head_dim = config.d_model // n_heads
         q_plain = raw_q.view(B, S, n_heads, head_dim).transpose(1, 2).to(torch.float32)
@@ -453,9 +453,9 @@ def capture_scores(tensor, hook):
         actual_scores = attn_scores_with_rope[0]
 
         # The scores MUST differ because RoPE was applied
-        assert not torch.allclose(actual_scores, scores_no_rope, atol=1e-4), (
-            "Attention scores with and without RoPE should differ"
-        )
+        assert not torch.allclose(
+            actual_scores, scores_no_rope, atol=1e-4
+        ), "Attention scores with and without RoPE should differ"
 
     def test_partial_rotary_dim_leaves_pass_through_unchanged(self):
         """The head-dim slice beyond rotary_dim should not be rotated.
@@ -516,9 +516,9 @@ def patched_apply_rope(tensor, sin, cos):
         # The slice sent into RoPE must equal the raw_q rotary slice
         q_rot_slice = q_passed[0]  # [B, H, S, rotary_dim]
         raw_q_rot_slice = raw_q_heads.transpose(1, 2)[:, :, :, :rotary_dim]
-        assert torch.allclose(q_rot_slice, raw_q_rot_slice, atol=1e-5), (
-            "Q slice sent to RoPE must equal the raw projection (pre-rotation)"
-        )
+        assert torch.allclose(
+            q_rot_slice, raw_q_rot_slice, atol=1e-5
+        ), "Q slice sent to RoPE must equal the raw projection (pre-rotation)"
 
 
 # ---------------------------------------------------------------------------
@@ -540,6 +540,6 @@ def test_future_positions_have_zero_attention_weight(self):
         # attn_weights: [B, H, S, S]; upper triangle (future) must be ~0
         for i in range(S):
             for j in range(i + 1, S):
-                assert torch.all(attn_weights[:, :, i, j].abs() < 1e-5), (
-                    f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)"
-                )
+                assert torch.all(
+                    attn_weights[:, :, i, j].abs() < 1e-5
+                ), f"attn_weights[:, :, {i}, {j}] should be ~0 (future position)"
diff --git a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
index b76f36cce..efee81fc9 100644
--- a/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
+++ b/tests/unit/model_bridge/supported_architectures/test_codegen_adapter.py
@@ -28,7 +28,6 @@
     CodeGenArchitectureAdapter,
 )
 
-
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@@ -136,9 +135,7 @@ def test_blocks_ln1_name(self, adapter: CodeGenArchitectureAdapter) -> None:
     def test_no_ln2_in_blocks(self, adapter: CodeGenArchitectureAdapter) -> None:
         """CodeGen uses parallel attn+MLP sharing ln_1 — there must be no ln2."""
         blocks = adapter.component_mapping["blocks"]
-        assert "ln2" not in blocks.submodules, (
-            "CodeGen parallel block must not have ln2"
-        )
+        assert "ln2" not in blocks.submodules, "CodeGen parallel block must not have ln2"
 
     def test_attn_is_codegen_attention_bridge(self, adapter: CodeGenArchitectureAdapter) -> None:
         blocks = adapter.component_mapping["blocks"]
@@ -316,9 +313,9 @@ def test_factory_returns_codegen_adapter(self) -> None:
 
         cfg = _make_cfg()
         adapter = ArchitectureAdapterFactory.select_architecture_adapter(cfg)
-        assert isinstance(adapter, CodeGenArchitectureAdapter), (
-            f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}"
-        )
+        assert isinstance(
+            adapter, CodeGenArchitectureAdapter
+        ), f"Expected CodeGenArchitectureAdapter, got {type(adapter).__name__}"
 
     def test_factory_key_is_codegen_for_causal_lm(self) -> None:
         """SUPPORTED_ARCHITECTURES must have a 'CodeGenForCausalLM' key."""
@@ -326,6 +323,6 @@ def test_factory_key_is_codegen_for_causal_lm(self) -> None:
             SUPPORTED_ARCHITECTURES,
         )
 
-        assert "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES, (
-            "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES"
-        )
+        assert (
+            "CodeGenForCausalLM" in SUPPORTED_ARCHITECTURES
+        ), "CodeGenForCausalLM must be registered in SUPPORTED_ARCHITECTURES"
diff --git a/transformer_lens/model_bridge/generalized_components/codegen_attention.py b/transformer_lens/model_bridge/generalized_components/codegen_attention.py
index a4df9c170..e21bc46b0 100644
--- a/transformer_lens/model_bridge/generalized_components/codegen_attention.py
+++ b/transformer_lens/model_bridge/generalized_components/codegen_attention.py
@@ -10,7 +10,7 @@
   - rotary_dim: if None, RoPE is applied to the full head dimension.
 """
 
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, cast
 
 import torch
 
@@ -24,7 +24,6 @@
     JointQKVAttentionBridge,
 )
 
-
 # ---------------------------------------------------------------------------
 # Rotary helpers — GPT-J / CodeGen style ("rotate_every_two")
 # ---------------------------------------------------------------------------
@@ -42,7 +41,7 @@ def _rotate_every_two(x: torch.Tensor) -> torch.Tensor:
     Returns:
         Tensor of the same shape with even/odd pairs rotated.
     """
-    x1 = x[:, :, :, ::2]   # even-indexed dims
+    x1 = x[:, :, :, ::2]  # even-indexed dims
     x2 = x[:, :, :, 1::2]  # odd-indexed dims
     x = torch.stack((-x2, x1), dim=-1)
     return x.flatten(-2)
@@ -170,11 +169,7 @@ def get_random_inputs(
         if dtype is None:
             dtype = torch.float32
 
-        d_model = (
-            self.config.d_model
-            if self.config and hasattr(self.config, "d_model")
-            else 768
-        )
+        d_model = self.config.d_model if self.config and hasattr(self.config, "d_model") else 768
 
         # Build the HF-style 4D causal mask: 0 where attended, -inf where masked.
         # Shape: [batch, 1, seq_len, seq_len]
@@ -186,9 +181,7 @@ def get_random_inputs(
         causal[:, 0] = causal[:, 0].masked_fill(mask_upper, min_val)
 
         return {
-            "hidden_states": torch.randn(
-                batch_size, seq_len, d_model, device=device, dtype=dtype
-            ),
+            "hidden_states": torch.randn(batch_size, seq_len, d_model, device=device, dtype=dtype),
             "position_ids": torch.arange(seq_len, device=device)
             .unsqueeze(0)
             .expand(batch_size, -1),
@@ -310,7 +303,7 @@ def _reconstruct_attention(
         # ---- RoPE ----
         position_ids: Optional[torch.Tensor] = kwargs.get("position_ids", None)
         if position_ids is not None:
-            embed_positions: torch.Tensor = self.original_component.embed_positions  # type: ignore[union-attr]
+            embed_positions = cast(torch.Tensor, self.original_component.embed_positions)  # type: ignore[union-attr]
             # Move buffer to the right device if needed (mirrors HF forward)
             if embed_positions.device != position_ids.device:
                 embed_positions = embed_positions.to(position_ids.device)
@@ -336,7 +329,7 @@ def _reconstruct_attention(
         kv_seq_len = k.shape[-2]
 
         # ---- Scaled dot-product (fp32, matching HF CodeGen._attn) ----
-        scale = self.original_component.scale_attn  # type: ignore[union-attr]
+        scale = cast(torch.Tensor, self.original_component.scale_attn)  # type: ignore[union-attr]
         q_f32 = q.to(torch.float32)
         k_f32 = k.to(torch.float32)
 
@@ -364,7 +357,9 @@ def _reconstruct_attention(
         attn_output = torch.matmul(attn_weights, v)
 
         # Reshape [batch, heads, seq, head_dim] → [batch, seq, hidden]
-        attn_output = self._reshape_attn_output(attn_output, batch_size, seq_len, num_heads, head_dim)
+        attn_output = self._reshape_attn_output(
+            attn_output, batch_size, seq_len, num_heads, head_dim
+        )
 
         # Output projection (fires hook_z via o.hook_in)
         attn_output = self._apply_output_projection(attn_output)
diff --git a/transformer_lens/model_bridge/supported_architectures/codegen.py b/transformer_lens/model_bridge/supported_architectures/codegen.py
index ee19a109c..c385833ae 100644
--- a/transformer_lens/model_bridge/supported_architectures/codegen.py
+++ b/transformer_lens/model_bridge/supported_architectures/codegen.py
@@ -2,7 +2,6 @@
 
 from typing import Any
 
-import torch
 import torch.nn as nn
 
 from transformer_lens.conversion_utils.conversion_steps import RearrangeTensorConversion
@@ -54,24 +53,16 @@ def __init__(self, cfg: Any) -> None:
         # TransformerLens format [n_heads, d_model, d_head].
         self.weight_processing_conversions = {
             "blocks.{i}.attn.q.weight": ParamProcessingConversion(
-                tensor_conversion=RearrangeTensorConversion(
-                    "(n h) m -> n m h", n=self.cfg.n_heads
-                ),
+                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads),
             ),
             "blocks.{i}.attn.k.weight": ParamProcessingConversion(
-                tensor_conversion=RearrangeTensorConversion(
-                    "(n h) m -> n m h", n=self.cfg.n_heads
-                ),
+                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads),
             ),
             "blocks.{i}.attn.v.weight": ParamProcessingConversion(
-                tensor_conversion=RearrangeTensorConversion(
-                    "(n h) m -> n m h", n=self.cfg.n_heads
-                ),
+                tensor_conversion=RearrangeTensorConversion("(n h) m -> n m h", n=self.cfg.n_heads),
             ),
             "blocks.{i}.attn.o.weight": ParamProcessingConversion(
-                tensor_conversion=RearrangeTensorConversion(
-                    "m (n h) -> n h m", n=self.cfg.n_heads
-                ),
+                tensor_conversion=RearrangeTensorConversion("m (n h) -> n h m", n=self.cfg.n_heads),
             ),
         }
 
@@ -104,9 +95,7 @@ def __init__(self, cfg: Any) -> None:
             "unembed": UnembeddingBridge(name="lm_head"),
         }
 
-    def split_qkv_matrix(
-        self, attn_component: Any
-    ) -> tuple[nn.Linear, nn.Linear, nn.Linear]:
+    def split_qkv_matrix(self, attn_component: Any) -> tuple[nn.Linear, nn.Linear, nn.Linear]:
         """Split the fused QKV weight into separate Q, K, V linear modules.
 
         CodeGen uses GPT-J-style tensor-parallel partitioning with ``mp_num=4``