From 624450561eaeed737c931ac4d91a409c74bcc2f4 Mon Sep 17 00:00:00 2001 From: Grecil Date: Tue, 25 Jun 2024 12:11:50 +0530 Subject: [PATCH 1/3] Added optional argument "model_dir" to specify the directory the model should be downloaded to or loaded from. This saves a lot of time, memory and network resources. --- pinecone_text/sparse/splade_encoder.py | 38 +++++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/pinecone_text/sparse/splade_encoder.py b/pinecone_text/sparse/splade_encoder.py index 240a9df..f4b26dc 100644 --- a/pinecone_text/sparse/splade_encoder.py +++ b/pinecone_text/sparse/splade_encoder.py @@ -1,5 +1,6 @@ from typing import List, Union, Optional - +from os import PathLike +import os try: import torch except (OSError, ImportError, ModuleNotFoundError) as e: @@ -26,11 +27,12 @@ class SpladeEncoder(BaseSparseEncoder): Currently only supports inference with naver/splade-cocondenser-ensembledistil """ - def __init__(self, max_seq_length: int = 256, device: Optional[str] = None): + def __init__(self, max_seq_length: int = 256, device: Optional[str] = None, model_dir:Optional[PathLike[str]] = None): """ Args: max_seq_length: Maximum sequence length for the model. Must be between 1 and 512. device: Device to use for inference. Defaults to GPU if available, otherwise CPU. + model_dir: Directory to download and load model from. Saves time and resources. 
Example: @@ -61,12 +63,34 @@ def __init__(self, max_seq_length: int = 256, device: Optional[str] = None): device = device or ("cuda" if torch.cuda.is_available() else "cpu") self.device = device - - model = "naver/splade-cocondenser-ensembledistil" - self.tokenizer = AutoTokenizer.from_pretrained(model) - self.model = AutoModelForMaskedLM.from_pretrained(model).to(self.device) + expected_model_name = "naver/splade-cocondenser-ensembledistil" + if not self._is_correct_model(model_dir, expected_model_name): + self.tokenizer,self.model=self._download_model(model_dir, expected_model_name) + else: + self.tokenizer = AutoTokenizer.from_pretrained(model_dir) + self.model = AutoModelForMaskedLM.from_pretrained(model_dir).to(self.device) self.max_seq_length = max_seq_length - + def _is_correct_model(self, model_dir, expected_model_name): + # Check for the presence of specific files that indicate the correct model + config_path = os.path.join(model_dir, 'config.json') + if not os.path.exists(config_path): + return False + + with open(config_path, 'r') as config_file: + config = json.load(config_file) + return config.get("model_type") == expected_model_name + + def _download_model(self, model_dir, model_name): + # Ensure the directory exists + os.makedirs(model_dir, exist_ok=True) + + # Download the tokenizer and model + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.save_pretrained(model_dir) + + model = AutoModelForMaskedLM.from_pretrained(model_name) + model.save_pretrained(model_dir) + return tokenizer,model def encode_documents( self, texts: Union[str, List[str]] ) -> Union[SparseVector, List[SparseVector]]: From c421223ad6dffdfdeea83e8edef6b8ad9d696322 Mon Sep 17 00:00:00 2001 From: Grecil Date: Tue, 25 Jun 2024 12:19:06 +0530 Subject: [PATCH 2/3] if model_dir not given, work as usual --- pinecone_text/sparse/splade_encoder.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pinecone_text/sparse/splade_encoder.py 
b/pinecone_text/sparse/splade_encoder.py index f4b26dc..19a6567 100644 --- a/pinecone_text/sparse/splade_encoder.py +++ b/pinecone_text/sparse/splade_encoder.py @@ -64,11 +64,15 @@ def __init__(self, max_seq_length: int = 256, device: Optional[str] = None, mode device = device or ("cuda" if torch.cuda.is_available() else "cpu") self.device = device expected_model_name = "naver/splade-cocondenser-ensembledistil" - if not self._is_correct_model(model_dir, expected_model_name): - self.tokenizer,self.model=self._download_model(model_dir, expected_model_name) + if model_dir: + if not self._is_correct_model(model_dir, expected_model_name): + self.tokenizer,self.model=self._download_model(model_dir, expected_model_name) + else: + self.tokenizer = AutoTokenizer.from_pretrained(model_dir) + self.model = AutoModelForMaskedLM.from_pretrained(model_dir).to(self.device) else: - self.tokenizer = AutoTokenizer.from_pretrained(model_dir) - self.model = AutoModelForMaskedLM.from_pretrained(model_dir).to(self.device) + self.tokenizer = AutoTokenizer.from_pretrained(expected_model_name) + self.model = AutoModelForMaskedLM.from_pretrained(expected_model_name).to(self.device) self.max_seq_length = max_seq_length def _is_correct_model(self, model_dir, expected_model_name): # Check for the presence of specific files that indicate the correct model From b66c92d00af519e5e1986758e262db7e34f9f17a Mon Sep 17 00:00:00 2001 From: Grecil Date: Tue, 25 Jun 2024 12:43:46 +0530 Subject: [PATCH 3/3] Fix missing json import and check "_name_or_path" instead of "model_type" in _is_correct_model --- pinecone_text/sparse/splade_encoder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pinecone_text/sparse/splade_encoder.py b/pinecone_text/sparse/splade_encoder.py index 19a6567..f5225ee 100644 --- a/pinecone_text/sparse/splade_encoder.py +++ b/pinecone_text/sparse/splade_encoder.py @@ -1,6 +1,7 @@ from typing import List, Union, Optional from os import PathLike import os +import json try: import torch except (OSError, ImportError, ModuleNotFoundError) as e: @@ -82,7 
+83,7 @@ def _is_correct_model(self, model_dir, expected_model_name): with open(config_path, 'r') as config_file: config = json.load(config_file) - return config.get("model_type") == expected_model_name + return config.get("_name_or_path") == expected_model_name def _download_model(self, model_dir, model_name): # Ensure the directory exists