diff --git a/README.md b/README.md
index a6f8b9e..1850f0a 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,6 @@
-
 Vicinity is a light-weight, low-dependency vector store. It provides a simple and intuitive interface for nearest neighbor search, with support for different backends and evaluation.
 
 There are many nearest neighbors packages and methods out there. However, we found it difficult to compare them. Every package has its own interface, quirks, and limitations, and learning a new package can be time-consuming. In addition to that, how do you effectively evaluate different packages? How do you know which one is the best for your use case?
@@ -43,7 +42,7 @@ Install the package with:
 ```bash
 pip install vicinity
 ```
-Optionally, [install any of the supported backends](#installation), or simply install all of them with:
+Optionally, [install specific backends and integrations](#installation), or simply install all of them with:
 ```bash
 pip install vicinity[all]
 ```
@@ -87,6 +86,13 @@ vicinity.save('my_vector_store')
 vicinity = Vicinity.load('my_vector_store')
 ```
 
+Pushing a vector store to and loading it from the Hugging Face Hub:
+
+```python
+vicinity.push_to_hub(model_name_or_path='my-embedding-model', repo_id='my_vector_store')
+vicinity = Vicinity.load_from_hub(repo_id='my_vector_store')
+```
+
 Evaluating a backend:
 
 ```python
@@ -167,9 +173,18 @@ The following installation options are available:
 # Install the base package
 pip install vicinity
 
-# Install all backends
+# Install all integrations and backends
 pip install vicinity[all]
 
+# Install all integrations
+pip install vicinity[integrations]
+
+# Install specific integrations
+pip install vicinity[huggingface]
+
+# Install all backends
+pip install vicinity[backends]
+
 # Install specific backends
 pip install vicinity[annoy]
 pip install vicinity[faiss]
diff --git a/pyproject.toml b/pyproject.toml
index a2d6087..c306cbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,6 +42,14 @@ dev = [
     "ruff",
     "setuptools"
 ]
+
+# Integrations
+huggingface = ["datasets"]
+integrations = [
+    "datasets"
+]
+
+# Backends
 hnsw = ["hnswlib"]
 pynndescent = [
     "pynndescent>=0.5.10",
@@ -53,7 +61,20 @@ annoy = ["annoy"]
 faiss = ["faiss-cpu"]
 usearch = ["usearch"]
 voyager = ["voyager"]
+backends = [
+    "hnswlib",
+    "pynndescent>=0.5.10",
+    "numba>=0.59.0",
+    "llvmlite>=0.42.0",
+    "numpy>=1.24.0",
+    "annoy",
+    "faiss-cpu",
+    "usearch",
+    "voyager"
+]
+
 all = [
+    "datasets",
     "hnswlib",
     "pynndescent>=0.5.10",
     "numba>=0.59.0",
diff --git a/tests/integrations/test_huggingface.py b/tests/integrations/test_huggingface.py
new file mode 100644
index 0000000..bf4fc43
--- /dev/null
+++ b/tests/integrations/test_huggingface.py
@@ -0,0 +1,33 @@
+from __future__ import annotations
+
+import io
+import sys
+
+from vicinity import Vicinity
+from vicinity.datatypes import Backend
+from vicinity.integrations.huggingface import _MODEL_NAME_OR_PATH_PRINT_STATEMENT
+
+BackendType = tuple[Backend, str]
+
+
+def test_load_from_hub(vicinity_instance: Vicinity) -> None:
+    """
+    Test Vicinity.load_from_hub.
+
+    :param vicinity_instance: A Vicinity instance.
+    """
+    repo_id = "davidberenstein1957/my-vicinity-repo"
+    # Use the first part of the print statement to test whether the model name or path is printed
+    expected_print_statement = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0]
+
+    # Capture the output
+    captured_output = io.StringIO()
+    sys.stdout = captured_output
+
+    Vicinity.load_from_hub(repo_id=repo_id)
+
+    # Reset the redirect
+    sys.stdout = sys.__stdout__
+
+    # Check that the expected message is in the output
+    assert expected_print_statement in captured_output.getvalue()
diff --git a/vicinity/integrations/dataset_card_template.md b/vicinity/integrations/dataset_card_template.md
new file mode 100644
index 0000000..085a7a5
--- /dev/null
+++ b/vicinity/integrations/dataset_card_template.md
@@ -0,0 +1,30 @@
+---
+tags:
+- vicinity
+- vector-store
+---
+
+# Dataset Card for {repo_id}
+
+This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends.
+
+It contains a vector space with {num_items} items.
+
+## Usage
+
+You can load this dataset using the following code:
+
+```python
+from vicinity import Vicinity
+vicinity = Vicinity.load_from_hub("{repo_id}")
+```
+
+After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector.
+
+## Configuration
+
+The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder.
+
+```json
+{config}
+```
\ No newline at end of file
diff --git a/vicinity/integrations/huggingface.py b/vicinity/integrations/huggingface.py
new file mode 100644
index 0000000..0e75129
--- /dev/null
+++ b/vicinity/integrations/huggingface.py
@@ -0,0 +1,138 @@
+from __future__ import annotations
+
+import json
+import logging
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from vicinity.backends import BasicVectorStore, get_backend_class
+from vicinity.datatypes import Backend
+
+if TYPE_CHECKING:
+    from huggingface_hub import CommitInfo
+
+    from vicinity.vicinity import Vicinity
+
+_HUB_IMPORT_ERROR = ImportError(
+    "`datasets` and `huggingface_hub` are required to push to the Hugging Face Hub. Please install them with `pip install 'vicinity[huggingface]'`"
+)
+_MODEL_NAME_OR_PATH_PRINT_STATEMENT = (
+    "Embeddings in this Vicinity instance were created with the following model name or path: {model_name_or_path}"
+)
+
+logger = logging.getLogger(__name__)
+
+
+class HuggingFaceMixin:
+    def push_to_hub(
+        self,
+        model_name_or_path: str,
+        repo_id: str,
+        token: str | None = None,
+        private: bool = False,
+        **kwargs: Any,
+    ) -> "CommitInfo":
+        """
+        Push the Vicinity instance to the Hugging Face Hub.
+
+        :param model_name_or_path: The name of the model or the path to the local directory
+            that was used to create the embeddings in the Vicinity instance.
+        :param repo_id: The repository ID on the Hugging Face Hub.
+        :param token: Optional authentication token for private repositories.
+        :param private: Whether to create a private repository.
+        :param **kwargs: Additional arguments passed to Dataset.push_to_hub().
+        :return: The commit info.
+        """
+        try:
+            from datasets import Dataset
+            from huggingface_hub import DatasetCard, upload_file, upload_folder
+        except ImportError:
+            raise _HUB_IMPORT_ERROR
+
+        # Create and push a dataset with the items and vectors
+        if isinstance(self.items[0], dict):
+            dataset_dict = {k: [item[k] for item in self.items] for k in self.items[0].keys()}
+        else:
+            dataset_dict = {"items": self.items}
+        if self.vector_store is not None:
+            dataset_dict["vectors"] = self.vector_store.vectors
+        dataset = Dataset.from_dict(dataset_dict)
+        dataset.push_to_hub(repo_id, token=token, private=private, **kwargs)
+
+        # Save the backend and config files to a temp directory and upload them
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+
+            # Save and upload the backend
+            self.backend.save(temp_path)
+            upload_folder(
+                repo_id=repo_id,
+                folder_path=temp_path,
+                token=token,
+                repo_type="dataset",
+                path_in_repo="backend",
+            )
+
+            # Save and upload the config
+            config = {
+                "metadata": self.metadata,
+                "backend_type": self.backend.backend_type.value,
+                "model_name_or_path": model_name_or_path,
+            }
+            config_path = temp_path / "config.json"
+            config_path.write_text(json.dumps(config))
+            upload_file(
+                repo_id=repo_id,
+                path_or_fileobj=config_path,
+                token=token,
+                repo_type="dataset",
+                path_in_repo="config.json",
+            )
+
+        # Load the dataset card template from the path relative to this module
+        template_path = Path(__file__).parent / "dataset_card_template.md"
+        template = template_path.read_text()
+        content = template.format(repo_id=repo_id, num_items=len(self.items), config=json.dumps(config, indent=4))
+        return DatasetCard(content=content).push_to_hub(repo_id=repo_id, token=token, repo_type="dataset")
+
+    @classmethod
+    def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity":
+        """
+        Load a Vicinity instance from the Hugging Face Hub.
+
+        :param repo_id: The repository ID on the Hugging Face Hub.
+        :param token: Optional authentication token for private repositories.
+        :param **kwargs: Additional arguments passed to load_dataset.
+        :return: A Vicinity instance loaded from the Hub.
+ """ + try: + from datasets import load_dataset + from huggingface_hub import snapshot_download + except ImportError: + raise _HUB_IMPORT_ERROR + + # Load dataset and extract items and vectors + dataset = load_dataset(repo_id, token=token, split="train", **kwargs) + if "items" in dataset.column_names: + items = dataset["items"] + else: + # Create items from all columns except 'vectors' + items = [] + columns = [col for col in dataset.column_names if col != "vectors"] + for i in range(len(dataset)): + items.append({col: dataset[col][i] for col in columns}) + has_vectors = "vectors" in dataset.column_names + vector_store = BasicVectorStore(vectors=dataset["vectors"]) if has_vectors else None + + # Download and load config and backend + repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset")) + with open(repo_path / "config.json") as f: + config = json.load(f) + model_name_or_path = config.pop("model_name_or_path") + + print(_MODEL_NAME_OR_PATH_PRINT_STATEMENT.format(model_name_or_path=model_name_or_path)) + backend_type = Backend(config["backend_type"]) + backend = get_backend_class(backend_type).load(repo_path / "backend") + + return cls(items=items, backend=backend, metadata=config["metadata"], vector_store=vector_store) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 6c4c74f..1b106b1 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -19,8 +19,10 @@ logger = logging.getLogger(__name__) +from vicinity.integrations.huggingface import HuggingFaceMixin -class Vicinity: + +class Vicinity(HuggingFaceMixin): """ Work with vector representations of items.