model/tokenizer.py (6 changes: 5 additions & 1 deletion)

@@ -355,7 +355,11 @@ def merge_vocab(pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]:
     p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
     for word in v_in:
         w_out = p.sub(''.join(pair), word)
-        v_out[w_out] = v_in[word]
+        # Combine counts if multiple words collapse into the same token
+        if w_out in v_out:
+            v_out[w_out] += v_in[word]
+        else:
+            v_out[w_out] = v_in[word]
 
     return v_out
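
For context, a minimal standalone sketch of the fixed merge step. It assumes, as in the standard BPE reference implementation, that vocabulary keys are space-separated symbol sequences and that bigram is the escaped, space-joined pair (that line sits just above the hunk and is not shown in the diff); the dict.get accumulation is an equivalent, more compact formulation of the patch, not the code the PR actually uses:

    import re
    from typing import Dict, Tuple

    def merge_vocab(pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]:
        # Match the pair only when it appears as whole, space-separated symbols.
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        v_out: Dict[str, int] = {}
        for word, count in v_in.items():
            w_out = p.sub(''.join(pair), word)
            # 'a b c' and 'ab c' both rewrite to 'ab c', so counts must be
            # summed rather than overwritten (the bug this PR fixes).
            v_out[w_out] = v_out.get(w_out, 0) + count
        return v_out

    print(merge_vocab(('a', 'b'), {'a b c': 2, 'ab c': 1}))  # {'ab c': 3}

Before the fix, the second write clobbered the first, leaving 'ab c' with the last-processed count (here 1) instead of 3.
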
tests/test_tokenizer.py (6 changes: 6 additions & 0 deletions)

@@ -36,6 +36,12 @@ def test_merge_vocab():
     assert 'a b' not in merged
 
 
+def test_merge_vocab_accumulates_counts():
+    vocab = {'a b c': 2, 'ab c': 1}
+    merged = merge_vocab(('a', 'b'), vocab)
+    assert merged['ab c'] == 3
+
+
 def test_save_load_roundtrip(tmp_path):
     freqs = {
         'a': 1,
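
The tmp_path fixture in the neighboring test indicates a pytest suite, so the new regression test can presumably be run on its own with:

    pytest tests/test_tokenizer.py::test_merge_vocab_accumulates_counts -q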