From 0a24fdcf8b578ea9b200f02aa651580931fd9d86 Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 7 Jun 2025 19:59:33 +0200 Subject: [PATCH 1/2] optimize index encode using dict instead of list - for constant time search --- .../preprocessing/property_encoder.py | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/chebai_graph/preprocessing/property_encoder.py b/chebai_graph/preprocessing/property_encoder.py index 497025c..b648961 100644 --- a/chebai_graph/preprocessing/property_encoder.py +++ b/chebai_graph/preprocessing/property_encoder.py @@ -1,8 +1,11 @@ import abc import os -import torch from typing import Optional +import torch +import sys +from itertools import islice + class PropertyEncoder(abc.ABC): def __init__(self, property, **kwargs): @@ -40,7 +43,9 @@ def __init__(self, property, indices_dir=None, **kwargs): self.dirname = indices_dir # load already existing cache with open(self.index_path, "r") as pk: - self.cache = [x.strip() for x in pk] + self.cache: dict[str, int] = { + token.strip(): idx for idx, token in enumerate(pk) + } self.index_length_start = len(self.cache) self.offset = 0 @@ -64,19 +69,33 @@ def index_path(self): def on_finish(self): """Save cache""" - with open(self.index_path, "w") as pk: - new_length = len(self.cache) - self.index_length_start - pk.writelines([f"{c}\n" for c in self.cache]) - print( - f"saved index of property {self.property.name} to {self.index_path}, " - f"index length: {len(self.cache)} (new: {new_length})" - ) + total_tokens = len(self.cache) + if total_tokens > self.index_length_start: + print("New tokens added to the cache, Saving them to index token file.....") + + assert sys.version_info >= ( + 3, + 7, + ), "This code requires Python 3.7 or higher." 
+ # For python 3.7+, the standard dict type preserves insertion order, and is iterated over in the same order + # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights + # https://mail.python.org/pipermail/python-dev/2017-December/151283.html + new_tokens = list(islice(self.cache, self.index_length_start, total_tokens)) + + with open(self.index_path, "a") as pk: + pk.writelines([f"{c}\n" for c in new_tokens]) + print( + f"New {len(new_tokens)} tokens append to index of property {self.property.name} to {self.index_path}..." + ) + print( + f"Now, the total length of the index of property {self.property.name} is {total_tokens}" + ) def encode(self, token): """Returns a unique number for each token, automatically adds new tokens to the cache.""" if not str(token) in self.cache: - self.cache.append(str(token)) - return torch.tensor([self.cache.index(str(token)) + self.offset]) + self.cache[(str(token))] = len(self.cache) + return torch.tensor([self.cache[str(token)] + self.offset]) class OneHotEncoder(IndexEncoder): From 775c1889eab4fbb016fbcb41e37018d71899835b Mon Sep 17 00:00:00 2001 From: aditya0by0 Date: Sat, 7 Jun 2025 20:05:44 +0200 Subject: [PATCH 2/2] dirname based on where the class is defined rather than the file --- chebai_graph/preprocessing/property_encoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chebai_graph/preprocessing/property_encoder.py b/chebai_graph/preprocessing/property_encoder.py index b648961..f998396 100644 --- a/chebai_graph/preprocessing/property_encoder.py +++ b/chebai_graph/preprocessing/property_encoder.py @@ -5,6 +5,7 @@ import torch import sys from itertools import islice +import inspect class PropertyEncoder(abc.ABC): @@ -39,7 +40,7 @@ class IndexEncoder(PropertyEncoder): def __init__(self, property, indices_dir=None, **kwargs): super().__init__(property, **kwargs) if indices_dir is None: - indices_dir = os.path.dirname(__file__) + indices_dir = os.path.dirname(inspect.getfile(self.__class__))
self.dirname = indices_dir # load already existing cache with open(self.index_path, "r") as pk: