9 changes: 8 additions & 1 deletion early_exit/patching/attention_mixins/qwen2.py
@@ -127,7 +127,13 @@ def patched_attention_forward(
    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    if not hasattr(self, "num_heads"):
        self.num_heads = self.config.num_attention_heads
    if not hasattr(self, "num_key_value_heads"):
        self.num_key_value_heads = self.config.num_key_value_heads
    if not hasattr(self, "hidden_size"):
        self.hidden_size = self.config.hidden_size

    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -144,6 +150,7 @@ def patched_attention_forward(
    cos, sin = position_embeddings
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)


    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
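The `hasattr` fallbacks added in this hunk guard against newer transformers releases, where the Qwen2 attention module no longer stores `num_heads`, `num_key_value_heads`, and `hidden_size` as instance attributes and the values are only available on the module's config. Below is a minimal, standalone sketch of the same backfill pattern; the helper name and the generic `attn_module` argument are illustrative and not part of this repository.

# Sketch of the compatibility fallback used in the diff above: backfill
# per-module attributes from the module's config when they are missing.
def ensure_attention_attrs(attn_module) -> None:
    """Backfill attributes that older patched code expects on the attention module."""
    # Maps the attribute name expected on the module to the config field it
    # should be copied from (names mirror the diff; everything else is illustrative).
    fallbacks = {
        "num_heads": "num_attention_heads",
        "num_key_value_heads": "num_key_value_heads",
        "hidden_size": "hidden_size",
    }
    for attr, config_field in fallbacks.items():
        if not hasattr(attn_module, attr):
            setattr(attn_module, attr, getattr(attn_module.config, config_field))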
@@ -0,0 +1,5 @@
{
"system_prompt": "",
"task_context": "I am going to give you a story and a question about the story. Read the following story carefully, understand the characters' actions and perspectives, then answer the question regarding object locations, character knowledge, and beliefs.",
"prefiller": ""
}
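This new JSON file is a prompt configuration for the story/question task: an empty system prompt, the task framing in `task_context`, and an empty `prefiller` for seeding the assistant turn. The sketch below shows one way such a config might be consumed; the function name, file-path argument, and message-assembly order are assumptions rather than the repository's actual API.

# Hedged sketch: load a prompt config like the JSON above and assemble chat messages.
import json

def build_prompt(config_path: str, story: str, question: str) -> list[dict]:
    with open(config_path) as f:
        cfg = json.load(f)
    # The task_context frames the story and question; the prefiller (if non-empty)
    # seeds the assistant turn so generation starts from a fixed stub.
    user_message = f"{cfg['task_context']}\n\n{story}\n\nQuestion: {question}"
    messages = [
        {"role": "system", "content": cfg["system_prompt"]},
        {"role": "user", "content": user_message},
    ]
    if cfg.get("prefiller"):
        messages.append({"role": "assistant", "content": cfg["prefiller"]})
    return messages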