From 29060d7bd667a38e3af119c54a14bd30876bbc16 Mon Sep 17 00:00:00 2001 From: SYSTEMS-OPERATOR <155610697+SYSTEMS-OPERATOR@users.noreply.github.com> Date: Fri, 27 Jun 2025 05:26:11 -0400 Subject: [PATCH] Fix merge_vocab frequency accumulation --- model/tokenizer.py | 6 +++++- tests/test_tokenizer.py | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/model/tokenizer.py b/model/tokenizer.py index 449e8fc..d4051aa 100644 --- a/model/tokenizer.py +++ b/model/tokenizer.py @@ -355,7 +355,11 @@ def merge_vocab(pair: Tuple[str, str], v_in: Dict[str, int]) -> Dict[str, int]: p = re.compile(r'(?