From 0a24fdcf8b578ea9b200f02aa651580931fd9d86 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 7 Jun 2025 19:59:33 +0200 Subject: [PATCH 1/2] optimize index encode using dict instead of list - for constant time search --- .../preprocessing/property_encoder.py | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/chebai_graph/preprocessing/property_encoder.py b/chebai_graph/preprocessing/property_encoder.py index 497025c..b648961 100644 --- a/chebai_graph/preprocessing/property_encoder.py +++ b/chebai_graph/preprocessing/property_encoder.py @@ -1,8 +1,11 @@ import abc import os -import torch from typing import Optional +import torch +import sys +from itertools import islice + class PropertyEncoder(abc.ABC): def __init__(self, property, **kwargs): @@ -40,7 +43,9 @@ def __init__(self, property, indices_dir=None, **kwargs): self.dirname = indices_dir # load already existing cache with open(self.index_path, "r") as pk: - self.cache = [x.strip() for x in pk] + self.cache: dict[str, int] = { + token.strip(): idx for idx, token in enumerate(pk) + } self.index_length_start = len(self.cache) self.offset = 0 @@ -64,19 +69,33 @@ def index_path(self): def on_finish(self): """Save cache""" - with open(self.index_path, "w") as pk: - new_length = len(self.cache) - self.index_length_start - pk.writelines([f"{c}\n" for c in self.cache]) - print( - f"saved index of property {self.property.name} to {self.index_path}, " - f"index length: {len(self.cache)} (new: {new_length})" - ) + total_tokens = len(self.cache) + if total_tokens > self.index_length_start: + print("New tokens added to the cache, Saving them to index token file.....") + + assert sys.version_info >= ( + 3, + 7, + ), "This code requires Python 3.7 or higher." 
+ # For python 3.7+, the standard dict type preserves insertion order, and is iterated over in the same order + # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights + # https://mail.python.org/pipermail/python-dev/2017-December/151283.html + new_tokens = list(islice(self.cache, self.index_length_start, total_tokens)) + + with open(self.index_path, "a") as pk: + pk.writelines([f"{c}\n" for c in new_tokens]) + print( + f"New {len(new_tokens)} tokens append to index of property {self.property.name} to {self.index_path}..." + ) + print( + f"Now, the total length of the index of property {self.property.name} is {total_tokens}" + ) def encode(self, token): """Returns a unique number for each token, automatically adds new tokens to the cache.""" if not str(token) in self.cache: - self.cache.append(str(token)) - return torch.tensor([self.cache.index(str(token)) + self.offset]) + self.cache[(str(token))] = len(self.cache) + return torch.tensor([self.cache[str(token)] + self.offset]) class OneHotEncoder(IndexEncoder): From 775c1889eab4fbb016fbcb41e37018d71899835b Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 7 Jun 2025 20:05:44 +0200 Subject: [PATCH 2/2] dirname based on where the class is defined rather than the file --- chebai_graph/preprocessing/property_encoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chebai_graph/preprocessing/property_encoder.py b/chebai_graph/preprocessing/property_encoder.py index b648961..f998396 100644 --- a/chebai_graph/preprocessing/property_encoder.py +++ b/chebai_graph/preprocessing/property_encoder.py @@ -5,6 +5,7 @@ import torch import sys from itertools import islice +import inspect class PropertyEncoder(abc.ABC): @@ -39,7 +40,7 @@ class IndexEncoder(PropertyEncoder): def __init__(self, property, indices_dir=None, **kwargs): super().__init__(property, **kwargs) if indices_dir is None: - indices_dir = os.path.dirname(__file__) + indices_dir = os.path.dirname(inspect.getfile(self.__class__))
self.dirname = indices_dir # load already existing cache with open(self.index_path, "r") as pk: