Skip to content

Commit d3ee9c3

Browse files
author
Igor Morozov
committed
fix: resolve NameError when MultiHeadAttention is called with w_init=None
`w_init_scale` was referenced but never defined, causing a NameError whenever MultiHeadAttention is instantiated with the default w_init=None. The fix replaces the undefined variable with the literal 1.0, which matches the upstream haiku VarianceScaling default.

Adds two focused regression tests:
- test_w_init_none_does_not_raise: exercises the formerly-broken code path
- test_w_init_explicit_still_works: confirms explicit w_init is unaffected

Fixes #11
1 parent b062a56 commit d3ee9c3

2 files changed

Lines changed: 56 additions & 1 deletion

File tree

crystalformer/src/attention.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,7 +84,7 @@ def __init__(
8484
self.dropout_rate = dropout_rate
8585

8686
if w_init is None:
87-            w_init = hk.initializers.VarianceScaling(w_init_scale)
87+            w_init = hk.initializers.VarianceScaling(1.0)
8888
self.w_init = w_init
8989
self.with_bias = with_bias
9090
self.b_init = b_init

tests/test_attention.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
"""Tests for MultiHeadAttention -- focuses on the w_init=None fix.
2+
3+
Before the fix, calling MultiHeadAttention with w_init=None (the default)
4+
raised NameError: name 'w_init_scale' is not defined.
5+
"""
6+
from config import *
7+
8+
from crystalformer.src.attention import MultiHeadAttention
9+
10+
11+
def test_w_init_none_does_not_raise():
12+
"""w_init=None (the default) must not raise NameError.
13+
14+
Regression test for: NameError: name 'w_init_scale' is not defined.
15+
The fix replaces the undefined variable with the literal value 1.0,
16+
matching the upstream haiku default for VarianceScaling.
17+
"""
18+
def fn(q, k, v):
19+
mha = MultiHeadAttention(
20+
num_heads=2,
21+
key_size=8,
22+
model_size=16,
23+
w_init=None, # default -- was broken before fix
24+
)
25+
return mha(q, k, v)
26+
27+
f = hk.without_apply_rng(hk.transform(fn))
28+
key = jax.random.PRNGKey(0)
29+
x = jax.random.normal(key, (4, 16))
30+
params = f.init(key, x, x, x)
31+
out = f.apply(params, x, x, x)
32+
33+
assert out.shape == (4, 16)
34+
assert jnp.isfinite(out).all(), "w_init=None path produces NaN/Inf"
35+
36+
37+
def test_w_init_explicit_still_works():
38+
"""Explicit w_init continues to work after the fix."""
39+
def fn(q, k, v):
40+
mha = MultiHeadAttention(
41+
num_heads=2,
42+
key_size=8,
43+
model_size=16,
44+
w_init=hk.initializers.VarianceScaling(1.0),
45+
)
46+
return mha(q, k, v)
47+
48+
f = hk.without_apply_rng(hk.transform(fn))
49+
key = jax.random.PRNGKey(1)
50+
x = jax.random.normal(key, (4, 16))
51+
params = f.init(key, x, x, x)
52+
out = f.apply(params, x, x, x)
53+
54+
assert out.shape == (4, 16)
55+
assert jnp.isfinite(out).all()

0 commit comments

Comments
 (0)