From 6adc949d1b41c176d2df343116ff294c92e41c51 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Fri, 25 Apr 2025 15:23:14 +0200
Subject: [PATCH 1/6] Update .gitignore

---
 .gitignore | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.gitignore b/.gitignore
index f9cb175a..af998906 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,3 +167,11 @@ cython_debug/
 /logs
 /results_buffer
 electra_pretrained.ckpt
+
+build
+.virtual_documents
+.jupyter
+chebai.egg-info
+lightning_logs
+logs
+.isort.cfg

From f71dc108987a039a32f3d6015df98d487c238026 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Fri, 25 Apr 2025 16:01:54 +0200
Subject: [PATCH 2/6] reader: add dictionary for constant-time token lookup

---
 chebai/preprocessing/reader.py | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index 345b2567..9b244c82 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -1,4 +1,5 @@
 import os
+from itertools import islice
 from typing import Any, Dict, List, Optional, Tuple
 
 import deepsmiles
@@ -137,13 +138,16 @@ def name(cls) -> str:
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         with open(self.token_path, "r") as pk:
-            self.cache = [x.strip() for x in pk]
+            self.cache: Dict[str, int] = {
+                token.strip(): idx for idx, token in enumerate(pk)
+            }
+        self._loaded_tokens_count = len(self.cache)
 
     def _get_token_index(self, token: str) -> int:
         """Returns a unique number for each token, automatically adds new tokens."""
         if not str(token) in self.cache:
-            self.cache.append(str(token))
-        return self.cache.index(str(token)) + EMBEDDING_OFFSET
+            self.cache[(str(token))] = len(self.cache) - 1  # as index begins from 0
+        return self.cache[str(token)] + EMBEDDING_OFFSET
 
     def _read_data(self, raw_data: str) -> List[int]:
         """
@@ -161,10 +165,22 @@ def on_finish(self) -> None:
         """
         Saves the current cache of tokens to the token file.
         This method is called after all data processing is complete.
""" - with open(self.token_path, "w") as pk: - print(f"saving {len(self.cache)} tokens to {self.token_path}...") - print(f"first 10 tokens: {self.cache[:10]}") - pk.writelines([f"{c}\n" for c in self.cache]) + print(f"first 10 tokens: {list(islice(self.cache, 10))}") + + total_tokens = len(self.cache) + if total_tokens > self._loaded_tokens_count: + print("New tokens added to the cache, Saving them to token file.....") + + # For python 3.7+, the standard dict type preserves insertion order, and is iterated over in same order + # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights + # https://mail.python.org/pipermail/python-dev/2017-December/151283.html + new_tokens = list( + islice(self.cache, self._loaded_tokens_count, total_tokens) + ) + + with open(self.token_path, "a") as pk: + print(f"saving new {len(new_tokens)} tokens to {self.token_path}...") + pk.writelines([f"{c}\n" for c in new_tokens]) class DeepChemDataReader(ChemDataReader): From 93dcf26ad9594d1759304f05e31c1d05f723ee4b Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 25 Apr 2025 16:07:24 +0200 Subject: [PATCH 3/6] reader: fix index for new element --- chebai/preprocessing/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py index 9b244c82..b72f7ac7 100644 --- a/chebai/preprocessing/reader.py +++ b/chebai/preprocessing/reader.py @@ -146,7 +146,7 @@ def __init__(self, *args, **kwargs): def _get_token_index(self, token: str) -> int: """Returns a unique number for each token, automatically adds new tokens.""" if not str(token) in self.cache: - self.cache[(str(token))] = len(self.cache) - 1 # as index begins from 0 + self.cache[(str(token))] = len(self.cache) return self.cache[str(token)] + EMBEDDING_OFFSET def _read_data(self, raw_data: str) -> List[int]: From e04fc5e123771d30a17c20ecbdb5e8af91280ba7 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Fri, 25 Apr 2025 16:45:15 +0200 Subject: [PATCH 4/6] update tests for dict lookup --- tests/unit/readers/testChemDataReader.py | 20 +++++++++++--------- tests/unit/readers/testDeepChemDataReader.py | 16 +++++++++------- tests/unit/readers/testSelfiesReader.py | 14 ++++++++------ 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py index 0c1c4d6f..75884c92 100644 --- a/tests/unit/readers/testChemDataReader.py +++ b/tests/unit/readers/testChemDataReader.py @@ -27,14 +27,16 @@ def setUpClass(cls, mock_file: mock_open) -> None: """ cls.reader = ChemDataReader(token_path="/mock/path") # After initializing, cls.reader.cache should now be set to ['C', 'O', 'N', '=', '1', '('] - assert cls.reader.cache == [ - "C", - "O", - "N", - "=", - "1", - "(", - ], "Initial cache does not match expected values." + assert list(cls.reader.cache.items()) == list( + { + "C": 0, + "O": 1, + "N": 2, + "=": 3, + "1": 4, + "(": 5, + }.items() + ), "Initial cache does not match expected values or the order doesn't match." 
     def test_read_data(self) -> None:
         """
@@ -87,7 +89,7 @@ def test_read_data_with_new_token(self) -> None:
         )
         # Ensure it's at the correct index
         self.assertEqual(
-            self.reader.cache.index("[H-]"),
+            self.reader.cache["[H-]"],
             index_for_last_token,
             "The new token '[H-]' was not added at the correct index in the cache.",
         )
diff --git a/tests/unit/readers/testDeepChemDataReader.py b/tests/unit/readers/testDeepChemDataReader.py
index dc29c9a6..d0688fa4 100644
--- a/tests/unit/readers/testDeepChemDataReader.py
+++ b/tests/unit/readers/testDeepChemDataReader.py
@@ -27,12 +27,14 @@ def setUpClass(cls, mock_file: mock_open) -> None:
         """
         cls.reader = DeepChemDataReader(token_path="/mock/path")
         # After initializing, cls.reader.cache should now be set to ['C', 'O', 'c', ')']
-        assert cls.reader.cache == [
-            "C",
-            "O",
-            "c",
-            ")",
-        ], "Cache initialization did not match expected tokens."
+        assert list(cls.reader.cache.items()) == list(
+            {
+                "C": 0,
+                "O": 1,
+                "c": 2,
+                ")": 3,
+            }.items()
+        ), "Cache initialization did not match the expected tokens or their order."
 
     def test_read_data(self) -> None:
         """
@@ -95,7 +97,7 @@ def test_read_data_with_new_token(self) -> None:
         )
         # Ensure it's at the correct index
         self.assertEqual(
-            self.reader.cache.index("[H-]"),
+            self.reader.cache["[H-]"],
             index_for_last_token,
             "The new token '[H-]' was not added to the correct index in the cache.",
         )
diff --git a/tests/unit/readers/testSelfiesReader.py b/tests/unit/readers/testSelfiesReader.py
index 411fc63b..c915a0fa 100644
--- a/tests/unit/readers/testSelfiesReader.py
+++ b/tests/unit/readers/testSelfiesReader.py
@@ -27,11 +27,13 @@ def setUpClass(cls, mock_file: mock_open) -> None:
         """
         cls.reader = SelfiesReader(token_path="/mock/path")
         # After initializing, cls.reader.cache should now be set to ['[C]', '[O]', '[=C]']
-        assert cls.reader.cache == [
-            "[C]",
-            "[O]",
-            "[=C]",
-        ], "Cache initialization did not match expected tokens."
+        assert list(cls.reader.cache.items()) == list(
+            {
+                "[C]": 0,
+                "[O]": 1,
+                "[=C]": 2,
+            }.items()
+        ), "Cache initialization did not match the expected tokens or their order."
 
     def test_read_data(self) -> None:
         """
@@ -98,7 +100,7 @@ def test_read_data_with_new_token(self) -> None:
         )
         # Ensure it's at the correct index
         self.assertEqual(
-            self.reader.cache.index("[H-1]"),
+            self.reader.cache["[H-1]"],
             index_for_last_token,
             "The new token '[H-1]' was not added at the correct index in the cache.",
         )

From 1bf0f1e7468a60ed05cc4bafaed2eb46bbc1b643 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Fri, 25 Apr 2025 16:46:03 +0200
Subject: [PATCH 5/6] reader: assert minimum Python version

---
 chebai/preprocessing/reader.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
index b72f7ac7..0224a101 100644
--- a/chebai/preprocessing/reader.py
+++ b/chebai/preprocessing/reader.py
@@ -1,4 +1,5 @@
 import os
+import sys
 from itertools import islice
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -171,6 +172,10 @@ def on_finish(self) -> None:
         if total_tokens > self._loaded_tokens_count:
             print("New tokens were added to the cache; saving them to the token file...")
 
+            assert sys.version_info >= (
+                3,
+                7,
+            ), "This code requires Python 3.7 or higher."
             # For Python 3.7+, the standard dict type preserves insertion order and is iterated over in the same order
             # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights
             # https://mail.python.org/pipermail/python-dev/2017-December/151283.html

From b51229e4f97d2d19dc7e4c91e61093d040b3fac1 Mon Sep 17 00:00:00 2001
From: aditya0by0
Date: Fri, 25 Apr 2025 17:06:25 +0200
Subject: [PATCH 6/6] tests: add tests for on_finish method of ChemDataReader

---
 tests/unit/readers/testChemDataReader.py | 51 ++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/tests/unit/readers/testChemDataReader.py b/tests/unit/readers/testChemDataReader.py
index 75884c92..5e6fb099 100644
--- a/tests/unit/readers/testChemDataReader.py
+++ b/tests/unit/readers/testChemDataReader.py
@@ -104,6 +104,57 @@ def test_read_data_with_invalid_input(self) -> None:
         with self.assertRaises(ValueError):
             self.reader._read_data(raw_data)
 
+    @patch("builtins.open", new_callable=mock_open)
+    def test_finish_method_for_new_tokens(self, mock_file: mock_open) -> None:
+        """
+        Test the on_finish method to ensure it appends only the new tokens, in order, to the token file.
+        """
+        # Simulate that some tokens were already loaded
+        self.reader._loaded_tokens_count = 6  # 6 tokens already loaded
+        self.reader.cache = {
+            "C": 0,
+            "O": 1,
+            "N": 2,
+            "=": 3,
+            "1": 4,
+            "(": 5,
+            "[H-]": 6,  # New token 1
+            "Br": 7,  # New token 2
+            "Cl": 8,  # New token 3
+            "Na": 9,  # New token 4
+            "Mg": 10,  # New token 5
+        }
+
+        # Run the on_finish method
+        self.reader.on_finish()
+
+        # Check that the file was opened in append mode ('a')
+        mock_file.assert_called_with(self.reader.token_path, "a")
+
+        # Verify that the new tokens were written in the correct order
+        mock_file().writelines.assert_called_with(
+            ["[H-]\n", "Br\n", "Cl\n", "Na\n", "Mg\n"]
+        )
+
+    def test_finish_method_no_new_tokens(self) -> None:
+        """
+        Test the on_finish method when no new tokens were added (the cache is unchanged).
+        """
+        self.reader._loaded_tokens_count = 6  # No new tokens
+        self.reader.cache = {
+            "C": 0,
+            "O": 1,
+            "N": 2,
+            "=": 3,
+            "1": 4,
+            "(": 5,
+        }
+
+        with patch("builtins.open", new_callable=mock_open) as mock_file:
+            self.reader.on_finish()
+            # Check that no new tokens were written
+            mock_file().writelines.assert_not_called()
+
 
 if __name__ == "__main__":
     unittest.main()
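
---

Taken together, patches 2 through 5 swap the reader's list-based token cache (O(n) list.index lookups) for a dict that maps each token to its position, rely on the Python 3.7+ guarantee that dicts iterate in insertion order (so a token's value always equals its line number in the token file), and make on_finish append only the tokens added since the file was loaded. The following standalone sketch illustrates that pattern; the TokenCache class, its method names, and the file handling are hypothetical and written for illustration only, not code from the chebai repository, and the EMBEDDING_OFFSET applied by the real reader is omitted.

from itertools import islice
from typing import Dict


class TokenCache:
    """Dict-backed token index: O(1) lookup, append-only persistence (illustrative sketch)."""

    def __init__(self, token_path: str) -> None:
        self.token_path = token_path
        # On Python 3.7+ dicts preserve insertion order, so each token's
        # value equals its line number in the token file.
        with open(token_path, "r") as fh:
            self.cache: Dict[str, int] = {
                line.strip(): idx for idx, line in enumerate(fh)
            }
        self._loaded_count = len(self.cache)

    def index(self, token: str) -> int:
        """Return the token's index, adding unseen tokens at the end."""
        # Dict membership test and insertion are O(1); the old
        # list.index() lookup was O(n) per token.
        if token not in self.cache:
            self.cache[token] = len(self.cache)
        return self.cache[token]

    def save(self) -> None:
        """Append only the tokens added since the file was loaded."""
        # Iterating the dict walks its keys in insertion order; islice
        # lazily skips the first _loaded_count entries.
        new_tokens = islice(self.cache, self._loaded_count, len(self.cache))
        with open(self.token_path, "a") as fh:
            fh.writelines(f"{tok}\n" for tok in new_tokens)
        self._loaded_count = len(self.cache)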
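
A hypothetical usage run of the sketch above, assuming a token file tokens.txt that already contains the lines C, O, and N:

cache = TokenCache("tokens.txt")  # loads {"C": 0, "O": 1, "N": 2}
cache.index("C")     # 0 -- existing token, no write needed
cache.index("[H-]")  # 3 -- new token, added to the in-memory dict
cache.save()         # appends only "[H-]" to tokens.txt

Appending rather than rewriting the whole file keeps previously assigned indices stable across runs, which is exactly what patch 6 verifies: the file must be opened with mode "a" and writelines must receive only the new tokens, in insertion order.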