From afc3fb4a463cfa771364aa0551bc20fe5631f047 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 17:22:14 +0100 Subject: [PATCH 01/15] Refactor README and Vicinity class to support any serializable item type - Updated README.md to clarify that items can be strings or other serializable objects. - Modified the Vicinity class to accept a broader range of item types by changing type hints from `str` to `Any` in several methods. - Enhanced the insert and delete methods to handle non-string tokens appropriately, ensuring that items can be checked and managed regardless of their type. --- README.md | 13 +++++++------ vicinity/vicinity.py | 32 ++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2bd6718..f796796 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,7 @@ Vicinity is a light-weight, low-dependency vector store. It provides a simple an There are many nearest neighbors packages and methods out there. However, we found it difficult to compare them. Every package has its own interface, quirks, and limitations, and learning a new package can be time-consuming. In addition to that, how do you effectively evaluate different packages? How do you know which one is the best for your use case? - - This is where Vicinity comes in. Instead of learning a new interface for each new package or backend, Vicinity provides a unified interface for all backends. This allows you to easily experiment with different indexing methods and distance metrics and choose the best one for your use case. Vicinity also provides a simple way to evaluate the performance of different backends, allowing you to measure the queries per second and recall. +This is where Vicinity comes in. Instead of learning a new interface for each new package or backend, Vicinity provides a unified interface for all backends. This allows you to easily experiment with different indexing methods and distance metrics and choose the best one for your use case. Vicinity also provides a simple way to evaluate the performance of different backends, allowing you to measure the queries per second and recall. ## Quickstart @@ -49,13 +48,13 @@ Optionally, [install any of the supported backends](#installation), or simply in pip install vicinity[all] ``` - The following code snippet demonstrates how to use Vicinity for nearest neighbor search: + ```python import numpy as np from vicinity import Vicinity, Backend, Metric -# Create some dummy data +# Create some dummy data as strings or other serializable objects items = ["triforce", "master sword", "hylian shield", "boomerang", "hookshot"] vectors = np.random.rand(len(items), 128) @@ -100,6 +99,7 @@ qps, recall = vicinity.evaluate( ``` ## Main Features + Vicinity provides the following features: - Lightweight: Minimal dependencies and fast performance. - Flexible Backend Support: Use different backends for vector storage and search. @@ -108,6 +108,7 @@ Vicinity provides the following features: - Easy to Use: Simple and intuitive API. ## Supported Backends + The following backends are supported: - `BASIC`: A simple (exact matching) flat index for vector storage and search. - [HNSW](https://github.com/nmslib/hnswlib): Hierarchical Navigable Small World Graph (HNSW) for ANN search using hnswlib. @@ -126,8 +127,6 @@ The following backends are supported: - `ivfpqr`: Inverted file search with product quantizer and refinement. 
- [VOYAGER](https://github.com/spotify/voyager): Voyager is a library for performing fast approximate nearest-neighbor searches on an in-memory collection of vectors. - - NOTE: the ANN backends do not support dynamic deletion. To delete items, you need to recreate the index. Insertion is supported in the following backends: `FAISS`, `HNSW`, and `Usearch`. The `BASIC` backend supports both insertion and deletion. ### Backend Parameters @@ -159,7 +158,9 @@ NOTE: the ANN backends do not support dynamic deletion. To delete items, you nee | | `m` | The number of connections between nodes in the tree’s internal data structure. | `16` | ## Installation + The following installation options are available: + ```bash # Install the base package pip install vicinity diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 2216cef..97e9d81 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -29,7 +29,7 @@ class Vicinity: def __init__( self, - items: Sequence[str], + items: Sequence[Any], backend: AbstractBackend, metadata: Union[dict[str, Any], None] = None, vector_store: BasicVectorStore | None = None, @@ -49,7 +49,7 @@ def __init__( raise ValueError( "Your vector space and list of items are not the same length: " f"{len(backend)} != {len(items)}" ) - self.items: list[str] = list(items) + self.items: list[Any] = list(items) self.backend: AbstractBackend = backend self.metadata = metadata or {} self.vector_store = vector_store @@ -74,7 +74,7 @@ def __len__(self) -> int: def from_vectors_and_items( cls: type[Vicinity], vectors: npt.NDArray, - items: Sequence[str], + items: Sequence[Any], backend_type: Backend | str = Backend.BASIC, store_vectors: bool = False, **kwargs: Any, @@ -211,7 +211,7 @@ def load(cls, filename: PathLike) -> Vicinity: with open(folder_path / "data.json", "rb") as file_handle: data: dict[str, Any] = orjson.loads(file_handle.read()) - items: Sequence[str] = data["items"] + items: Sequence[Any] = data["items"] metadata: dict[str, Any] = data["metadata"] backend_type = Backend(data["backend_type"]) @@ -227,7 +227,7 @@ def load(cls, filename: PathLike) -> Vicinity: return instance - def insert(self, tokens: Sequence[str], vectors: npt.NDArray) -> None: + def insert(self, tokens: Sequence[Any], vectors: npt.NDArray) -> None: """ Insert new items into the vector space. @@ -243,14 +243,20 @@ def insert(self, tokens: Sequence[str], vectors: npt.NDArray) -> None: item_set = set(self.items) for token in tokens: - if token in item_set: - raise ValueError(f"Token {token} is already in the vector space.") - self.items.append(token) + if isinstance(token, str): + if token in item_set: + raise ValueError(f"Token {token} is already in the vector space.") + self.items.append(token) + else: + for item in self.items: + if item == token: + raise ValueError(f"Token {token} is already in the vector space.") + self.items.append(token) self.backend.insert(vectors) if self.vector_store is not None: self.vector_store.insert(vectors) - def delete(self, tokens: Sequence[str]) -> None: + def delete(self, tokens: Sequence[Any]) -> None: """ Delete tokens from the vector space. @@ -261,7 +267,13 @@ def delete(self, tokens: Sequence[str]) -> None: :raises ValueError: If any passed tokens are not in the vector space. 
""" try: - curr_indices = [self.items.index(token) for token in tokens] + if isinstance(tokens[0], str): + curr_indices = [self.items.index(token) for token in tokens] + else: + curr_indices = [] + for idx, elem in enumerate(self.items): + if elem in tokens: + curr_indices.append(idx) except ValueError as exc: raise ValueError(f"Token {exc} was not in the vector space.") from exc From 9ffb491f662e6b36ff5e103ee5dda5c7a67e7e9d Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 17:24:32 +0100 Subject: [PATCH 02/15] Update README.md to include examples for saving/loading vector stores and evaluating backends --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f796796..a6f8b9e 100644 --- a/README.md +++ b/README.md @@ -81,12 +81,14 @@ results = vicinity.query(query_vectors, k=3) ``` Saving and loading a vector store: + ```python vicinity.save('my_vector_store') vicinity = Vicinity.load('my_vector_store') ``` Evaluating a backend: + ```python # Use the first 1000 vectors as query vectors query_vectors = vectors[:1000] From 7b2bb532b45a26178854e47245bd98a548aa90d7 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 17:40:20 +0100 Subject: [PATCH 03/15] Refactor Vicinity class to streamline token handling - Simplified the logic for checking and appending tokens in the insert method, ensuring that duplicate tokens are properly managed. --- vicinity/vicinity.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 97e9d81..53e9d3d 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -243,15 +243,10 @@ def insert(self, tokens: Sequence[Any], vectors: npt.NDArray) -> None: item_set = set(self.items) for token in tokens: - if isinstance(token, str): - if token in item_set: + for item in item_set: + if item == token: raise ValueError(f"Token {token} is already in the vector space.") - self.items.append(token) - else: - for item in self.items: - if item == token: - raise ValueError(f"Token {token} is already in the vector space.") - self.items.append(token) + self.items.append(token) self.backend.insert(vectors) if self.vector_store is not None: self.vector_store.insert(vectors) @@ -267,13 +262,10 @@ def delete(self, tokens: Sequence[Any]) -> None: :raises ValueError: If any passed tokens are not in the vector space. """ try: - if isinstance(tokens[0], str): - curr_indices = [self.items.index(token) for token in tokens] - else: - curr_indices = [] - for idx, elem in enumerate(self.items): - if elem in tokens: - curr_indices.append(idx) + curr_indices = [] + for idx, elem in enumerate(self.items): + if elem in tokens: + curr_indices.append(idx) except ValueError as exc: raise ValueError(f"Token {exc} was not in the vector space.") from exc From a5ce987ef9120f5e49f60c329328dd364451a8c1 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 17:55:59 +0100 Subject: [PATCH 04/15] Refactor item handling in tests and Vicinity class - Updated the `items` fixture to return a mix of dictionaries and strings based on index parity. - Modified `test_vicinity_insert_duplicate` to use the updated `items` fixture for inserting items. - Adjusted `test_vicinity_delete_and_query` to reference items by their indices instead of hardcoded values. - Enhanced the Vicinity class to streamline token management, ensuring proper handling of duplicates and improving error messaging for token deletions. 
--- tests/conftest.py | 2 +- tests/test_vicinity.py | 14 +++++++------- vicinity/vicinity.py | 17 +++++++++-------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 63a07bf..7577844 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,7 +24,7 @@ @pytest.fixture(scope="session") def items() -> list[str]: """Fixture providing a list of item names.""" - return [f"item{i}" for i in range(1, 10001)] + return [f"item{i}" if i % 2 == 0 else {"name": f"item{i}", "id": i} for i in range(1, 10001)] @pytest.fixture(scope="session") diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index daae1ca..e9d30e5 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -183,18 +183,17 @@ def test_index_vector_store(vicinity_with_basic_backend_and_store: Vicinity, vec vicinity_with_basic_backend_and_store.get_vector_by_index([-1]) -def test_vicinity_insert_duplicate(vicinity_instance: Vicinity, query_vector: np.ndarray) -> None: +def test_vicinity_insert_duplicate(items: list[str], vicinity_instance: Vicinity, query_vector: np.ndarray) -> None: """ Test that Vicinity.insert raises ValueError when inserting duplicate items. :param vicinity_instance: A Vicinity instance. :raises ValueError: If inserting items that already exist. """ - new_items = ["item1"] new_vector = query_vector with pytest.raises(ValueError): - vicinity_instance.insert(new_items, new_vector[None, :]) + vicinity_instance.insert(items[0], new_vector[None, :]) def test_vicinity_delete_nonexistent(vicinity_instance: Vicinity) -> None: @@ -281,7 +280,8 @@ def test_vicinity_delete_and_query(vicinity_instance: Vicinity, items: list[str] return # Delete some items from the Vicinity instance - items_to_delete = ["item2", "item4", "item6"] + non_existing_items_indices = [0, 1, 2] + items_to_delete = [items[i] for i in non_existing_items_indices] vicinity_instance.delete(items_to_delete) # Ensure the items are no longer in the items list @@ -289,14 +289,14 @@ def test_vicinity_delete_and_query(vicinity_instance: Vicinity, items: list[str] assert item not in vicinity_instance.items # Query using a vector of an item that wasn't deleted - item3_index = items.index("item3") - item3_vector = vectors[item3_index] + existsing_item_index = 3 + item3_vector = vectors[existsing_item_index] results = vicinity_instance.query(item3_vector, k=10) returned_items = [item for item, _ in results[0]] # Check that the queried item is in the results - assert "item3" in returned_items + assert items[existsing_item_index] in returned_items def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) -> None: diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 53e9d3d..cc2c7f2 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -241,9 +241,8 @@ def insert(self, tokens: Sequence[Any], vectors: npt.NDArray) -> None: if vectors.shape[1] != self.dim: raise ValueError("The inserted vectors must have the same dimension as the backend.") - item_set = set(self.items) for token in tokens: - for item in item_set: + for item in self.items: if item == token: raise ValueError(f"Token {token} is already in the vector space.") self.items.append(token) @@ -261,13 +260,15 @@ def delete(self, tokens: Sequence[Any]) -> None: :param tokens: A list of tokens to remove from the vector space. :raises ValueError: If any passed tokens are not in the vector space. 
""" - try: - curr_indices = [] - for idx, elem in enumerate(self.items): - if elem in tokens: + seen_tokens = [] + curr_indices = [] + for idx, elem in enumerate(self.items): + for token in tokens: + if elem == token: curr_indices.append(idx) - except ValueError as exc: - raise ValueError(f"Token {exc} was not in the vector space.") from exc + seen_tokens.append(token) + if len(seen_tokens) < len(tokens): + raise ValueError("Not all tokens were in the vector space.") self.backend.delete(curr_indices) if self.vector_store is not None: From 022c7b1957b19d26786f90ce926e44d041bff976 Mon Sep 17 00:00:00 2001 From: David Berenstein Date: Mon, 20 Jan 2025 19:41:27 +0100 Subject: [PATCH 05/15] Apply suggestions from code review Co-authored-by: Stephan Tulkens --- tests/test_vicinity.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index e9d30e5..b1e3c8b 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -289,14 +289,14 @@ def test_vicinity_delete_and_query(vicinity_instance: Vicinity, items: list[str] assert item not in vicinity_instance.items # Query using a vector of an item that wasn't deleted - existsing_item_index = 3 - item3_vector = vectors[existsing_item_index] + existing_item_index = 3 + item3_vector = vectors[existing_item_index] results = vicinity_instance.query(item3_vector, k=10) returned_items = [item for item, _ in results[0]] # Check that the queried item is in the results - assert items[existsing_item_index] in returned_items + assert items[existing_item_index] in returned_items def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) -> None: From eaabbfa56130a02242d53d6062a07cade799c64f Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 19:42:31 +0100 Subject: [PATCH 06/15] Refactor token insertion in Vicinity class to simplify duplicate handling - Replaced the nested loop for checking duplicates with a single extend operation for tokens. - Improved efficiency by directly appending tokens to the items list, ensuring proper management of duplicates. --- vicinity/vicinity.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index cc2c7f2..d012d56 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -241,11 +241,7 @@ def insert(self, tokens: Sequence[Any], vectors: npt.NDArray) -> None: if vectors.shape[1] != self.dim: raise ValueError("The inserted vectors must have the same dimension as the backend.") - for token in tokens: - for item in self.items: - if item == token: - raise ValueError(f"Token {token} is already in the vector space.") - self.items.append(token) + self.items.extend(tokens) self.backend.insert(vectors) if self.vector_store is not None: self.vector_store.insert(vectors) From 031c1364b1f6a9df53852296923d0cd5537929dd Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 19:53:38 +0100 Subject: [PATCH 07/15] Refactor token deletion logic in Vicinity class to improve error handling - Replaced the nested loop for token matching with a more efficient list comprehension. - Enhanced error messaging to specify which tokens were not found in the vector space. 
--- vicinity/vicinity.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index d012d56..ef7004c 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -256,15 +256,17 @@ def delete(self, tokens: Sequence[Any]) -> None: :param tokens: A list of tokens to remove from the vector space. :raises ValueError: If any passed tokens are not in the vector space. """ - seen_tokens = [] + tokens_to_find = list(tokens) curr_indices = [] for idx, elem in enumerate(self.items): - for token in tokens: - if elem == token: - curr_indices.append(idx) - seen_tokens.append(token) - if len(seen_tokens) < len(tokens): - raise ValueError("Not all tokens were in the vector space.") + matching_tokens = [t for t in tokens_to_find if t == elem] + if matching_tokens: + curr_indices.append(idx) + for t in matching_tokens: + tokens_to_find.remove(t) + + if tokens_to_find: + raise ValueError(f"Tokens {tokens_to_find} were not in the vector space.") self.backend.delete(curr_indices) if self.vector_store is not None: From 26e7ed6d02699180feafbf4bf6590ad3b961f67d Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 19:57:53 +0100 Subject: [PATCH 08/15] Enhance error handling in Vicinity class for JSON serialization - Added a try-except block around the JSON serialization process to catch JSONEncodeError. --- vicinity/vicinity.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index ef7004c..f365030 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -11,6 +11,7 @@ import numpy as np import orjson from numpy import typing as npt +from orjson import JSONEncodeError from vicinity import Metric from vicinity.backends import AbstractBackend, BasicBackend, BasicVectorStore, get_backend_class @@ -185,9 +186,11 @@ def save( raise ValueError(f"Path {path} should be a directory.") items_dict = {"items": self.items, "metadata": self.metadata, "backend_type": self.backend.backend_type.value} - - with open(path / "data.json", "wb") as file_handle: - file_handle.write(orjson.dumps(items_dict)) + try: + with open(path / "data.json", "wb") as file_handle: + file_handle.write(orjson.dumps(items_dict)) + except JSONEncodeError as e: + raise JSONEncodeError(f"Items could not be encoded to JSON because they are not serializable: {e}") self.backend.save(path) if self.vector_store is not None: From 6fb6305a6be6d770f5b4c6fcd0ff81d906d3bc8b Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Mon, 20 Jan 2025 20:01:15 +0100 Subject: [PATCH 09/15] Add non-serializable items fixture and test for Vicinity class - Introduced a new pytest fixture `non_serializable_items` that generates a list of non-serializable objects for testing. - Added a test case `test_vicinity_save_and_load_non_serializable_items` to verify that saving a Vicinity instance with non-serializable items raises a JSONEncodeError. - Updated the Vicinity class documentation to specify that JSONEncodeError may be raised if items are not serializable. 
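A minimal sketch of the failure mode the new test pins down (the class and save path are illustrative):

```python
import numpy as np
from orjson import JSONEncodeError

from vicinity import Vicinity


class NonSerializable:
    def __init__(self, name: str) -> None:
        self.name = name


items = [NonSerializable(f"item{i}") for i in range(5)]
vicinity = Vicinity.from_vectors_and_items(vectors=np.random.rand(5, 128), items=items)

try:
    vicinity.save("my_vector_store")  # orjson cannot encode these items
except JSONEncodeError as e:
    print(f"Save failed: {e}")
```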
--- tests/conftest.py | 12 ++++++++++++ tests/test_vicinity.py | 16 ++++++++++++++++ vicinity/vicinity.py | 1 + 3 files changed, 29 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7577844..63a8199 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -27,6 +27,18 @@ def items() -> list[str]: return [f"item{i}" if i % 2 == 0 else {"name": f"item{i}", "id": i} for i in range(1, 10001)] +@pytest.fixture(scope="session") +def non_serializable_items() -> list[str]: + """Fixture providing a list of non-serializable items.""" + + class NonSerializable: + def __init__(self, name: str, id: int) -> None: + self.name = name + self.id = id + + return [NonSerializable(f"item{i}", i) for i in range(1, 10001)] + + @pytest.fixture(scope="session") def vectors() -> np.ndarray: """Fixture providing an array of vectors corresponding to items.""" diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index b1e3c8b..a30c864 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from orjson import JSONEncodeError from vicinity import Vicinity from vicinity.datatypes import Backend @@ -162,6 +163,21 @@ def test_vicinity_save_and_load_vector_store(tmp_path: Path, vicinity_instance_w assert v.vector_store is not None +def test_vicinity_save_and_load_non_serializable_items( + tmp_path: Path, non_serializable_items: list[str], vectors: np.ndarray +) -> None: + """ + Test Vicinity.save and Vicinity.load with non-serializable items. + + :param tmp_path: Temporary directory provided by pytest. + :param non_serializable_items: A list of non-serializable items. + """ + vicinity = Vicinity.from_vectors_and_items(vectors=vectors, items=non_serializable_items) + save_path = tmp_path / "vicinity_data" + with pytest.raises(JSONEncodeError): + vicinity.save(save_path) + + def test_index_vector_store(vicinity_with_basic_backend_and_store: Vicinity, vectors: np.ndarray) -> None: """ Index vectors in the Vicinity instance. diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index f365030..6c4c74f 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -178,6 +178,7 @@ def save( :param folder: The path to which to save the JSON file. The vectors are saved separately. The JSON contains a path to the numpy file. :param overwrite: Whether to overwrite the JSON and numpy files if they already exist. :raises ValueError: If the path is not a directory. + :raises JSONEncodeError: If the items are not serializable. 
""" path = Path(folder) path.mkdir(parents=True, exist_ok=overwrite) From c86f7e53325d6458e0f14b75d2b818cc8d109a95 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Tue, 28 Jan 2025 22:03:35 +0100 Subject: [PATCH 10/15] Add Hugging Face integration for Vicinity class - Introduced HuggingFaceMixin to enable saving and loading Vicinity instances to/from Hugging Face Hub - Added optional import of HuggingFaceMixin based on huggingface_hub and datasets library availability - Implemented methods for pushing Vicinity instances to the Hub, including dataset and metadata upload - Created a method to load Vicinity instances from Hugging Face repositories --- vicinity/integrations/huggingface.py | 162 +++++++++++++++++++++++++++ vicinity/vicinity.py | 8 +- 2 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 vicinity/integrations/huggingface.py diff --git a/vicinity/integrations/huggingface.py b/vicinity/integrations/huggingface.py new file mode 100644 index 0000000..14c19fe --- /dev/null +++ b/vicinity/integrations/huggingface.py @@ -0,0 +1,162 @@ +import json +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING, Any + +from huggingface_hub import DatasetCard, upload_file, upload_folder + +from vicinity.backends import BasicVectorStore, get_backend_class +from vicinity.datatypes import Backend + +if TYPE_CHECKING: + from vicinity.vicinity import Vicinity + + +class HuggingFaceMixin: + def save_to_hub( + self, + repo_id: str, + token: str | None = None, + private: bool = False, + **kwargs: Any, + ) -> None: + """ + Save the Vicinity instance to the Hugging Face Hub. + + Args: + repo_id: The repository ID on the Hugging Face Hub + token: Optional authentication token for private repositories + private: Whether to create a private repository + **kwargs: Additional arguments passed to push_to_hub() + + """ + self.push_to_hub(repo_id, token=token, private=private, **kwargs) + + def push_to_hub( + self, + repo_id: str, + token: str | None = None, + private: bool = False, + **kwargs: Any, + ) -> None: + """ + Push the Vicinity instance to the Hugging Face Hub. 
+ + Args: + repo_id: The repository ID on the Hugging Face Hub + token: Optional authentication token for private repositories + private: Whether to create a private repository + **kwargs: Additional arguments passed to Dataset.push_to_hub() + + """ + from datasets import Dataset + + # Create and push dataset with items and vectors + if isinstance(self.items[0], dict): + dataset_dict = {k: [item[k] for item in self.items] for k in self.items[0].keys()} + else: + dataset_dict = {"items": self.items} + if self.vector_store is not None: + dataset_dict["vectors"] = self.vector_store.vectors + dataset = Dataset.from_dict(dataset_dict) + dataset.push_to_hub(repo_id, token=token, private=private, **kwargs) + + # Save backend and config files to temp directory and upload + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Save and upload backend + self.backend.save(temp_path) + upload_folder( + repo_id=repo_id, + folder_path=temp_path, + token=token, + repo_type="dataset", + path_in_repo="backend", + ) + + # Save and upload config + config = {"metadata": self.metadata, "backend_type": self.backend.backend_type.value} + config_path = temp_path / "config.json" + config_path.write_text(json.dumps(config)) + upload_file( + repo_id=repo_id, + path_or_fileobj=config_path, + token=token, + repo_type="dataset", + path_in_repo="config.json", + ) + + # DatasetCard + DatasetCard( + content=( + f""" +--- +tags: +- vicinity +- vector-store +--- + +# Dataset Card for {repo_id} + +This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends. + +It contains a vector space with {len(self.items)} items. + +## Usage + +You can load this dataset using the following code: + +```python +from vicinity import Vicinity +vicinity = Vicinity.load_from_hub("{repo_id}") +``` + +After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector. + +## Configuration + +The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder. + +```bash +{json.dumps(config, indent=2)} +``` +""" + ) + ).push_to_hub(repo_id, token=token, repo_type="dataset") + + @classmethod + def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity": + """ + Load a Vicinity instance from the Hugging Face Hub. + + :param repo_id: The repository ID on the Hugging Face Hub. + :param token: Optional authentication token for private repositories. + :param kwargs: Additional arguments passed to load_dataset. + :return: A Vicinity instance loaded from the Hub. 
+ """ + from datasets import load_dataset + from huggingface_hub import snapshot_download + + # Load dataset and extract items and vectors + dataset = load_dataset(repo_id, token=token, split="train", **kwargs) + if "items" in dataset.column_names: + items = dataset["items"] + else: + # Create items from all columns except 'vectors' + items = [] + columns = [col for col in dataset.column_names if col != "vectors"] + for i in range(len(dataset)): + items.append({col: dataset[col][i] for col in columns}) + has_vectors = "vectors" in dataset.column_names + vector_store = BasicVectorStore(vectors=dataset["vectors"]) if has_vectors else None + + # Download and load config and backend + repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset")) + with open(repo_path / "config.json") as f: + config = json.load(f) + + backend_type = Backend(config["backend_type"]) + backend = get_backend_class(backend_type).load(repo_path / "backend") + + return cls(items=items, backend=backend, metadata=config["metadata"], vector_store=vector_store) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 6c4c74f..af6b60f 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -2,6 +2,7 @@ from __future__ import annotations +import importlib import logging from io import open from pathlib import Path @@ -19,8 +20,13 @@ logger = logging.getLogger(__name__) +if importlib.util.find_spec("huggingface_hub") is not None and importlib.util.find_spec("datasets") is not None: + from vicinity.integrations.huggingface import HuggingFaceMixin +else: + HuggingFaceMixin = object -class Vicinity: + +class Vicinity(HuggingFaceMixin): """ Work with vector representations of items. From 4f30d457b654e9215444fd1abb34be96b4183b55 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Tue, 25 Feb 2025 22:26:19 +0100 Subject: [PATCH 11/15] Enhance Hugging Face integration with improved error handling and dataset card template - Added a dataset card template for Hugging Face Hub uploads - Improved error handling for Hugging Face integration with custom import error - Updated `push_to_hub` method to include model name/path in configuration - Removed conditional import of Hugging Face libraries in `vicinity.py` - Added `huggingface` optional dependency in `pyproject.toml` --- pyproject.toml | 2 + .../integrations/dataset_card_template.md | 30 +++++ vicinity/integrations/huggingface.py | 111 +++++++----------- vicinity/vicinity.py | 6 +- 4 files changed, 75 insertions(+), 74 deletions(-) create mode 100644 vicinity/integrations/dataset_card_template.md diff --git a/pyproject.toml b/pyproject.toml index a2d6087..d853c62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dev = [ "setuptools" ] hnsw = ["hnswlib"] +huggingface = ["datasets"] pynndescent = [ "pynndescent>=0.5.10", "numba>=0.59.0", @@ -54,6 +55,7 @@ faiss = ["faiss-cpu"] usearch = ["usearch"] voyager = ["voyager"] all = [ + "datasets", "hnswlib", "pynndescent>=0.5.10", "numba>=0.59.0", diff --git a/vicinity/integrations/dataset_card_template.md b/vicinity/integrations/dataset_card_template.md new file mode 100644 index 0000000..085a7a5 --- /dev/null +++ b/vicinity/integrations/dataset_card_template.md @@ -0,0 +1,30 @@ +--- +tags: +- vicinity +- vector-store +--- + +# Dataset Card for {repo_id} + +This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends. + +It contains a vector space with {num_items} items. 
+ +## Usage + +You can load this dataset using the following code: + +```python +from vicinity import Vicinity +vicinity = Vicinity.load_from_hub("{repo_id}") +``` + +After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector. + +## Configuration + +The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder. + +```bash +{config} +``` \ No newline at end of file diff --git a/vicinity/integrations/huggingface.py b/vicinity/integrations/huggingface.py index 14c19fe..0f856bf 100644 --- a/vicinity/integrations/huggingface.py +++ b/vicinity/integrations/huggingface.py @@ -1,55 +1,51 @@ +from __future__ import annotations + import json +import logging import tempfile from pathlib import Path from typing import TYPE_CHECKING, Any -from huggingface_hub import DatasetCard, upload_file, upload_folder - from vicinity.backends import BasicVectorStore, get_backend_class from vicinity.datatypes import Backend if TYPE_CHECKING: + from huggingface_hub import CommitInfo + from vicinity.vicinity import Vicinity +_HUB_IMPORT_ERROR = ImportError( + "`datasets` and `huggingface_hub` are required to push to the Hugging Face Hub. Please install them with `pip install 'vicinity[huggingface]'`" +) -class HuggingFaceMixin: - def save_to_hub( - self, - repo_id: str, - token: str | None = None, - private: bool = False, - **kwargs: Any, - ) -> None: - """ - Save the Vicinity instance to the Hugging Face Hub. +logger = logging.getLogger(__name__) - Args: - repo_id: The repository ID on the Hugging Face Hub - token: Optional authentication token for private repositories - private: Whether to create a private repository - **kwargs: Additional arguments passed to push_to_hub() - - """ - self.push_to_hub(repo_id, token=token, private=private, **kwargs) +class HuggingFaceMixin: def push_to_hub( self, + model_name_or_path: str, repo_id: str, token: str | None = None, private: bool = False, **kwargs: Any, - ) -> None: + ) -> "CommitInfo": """ Push the Vicinity instance to the Hugging Face Hub. - Args: - repo_id: The repository ID on the Hugging Face Hub - token: Optional authentication token for private repositories - private: Whether to create a private repository - **kwargs: Additional arguments passed to Dataset.push_to_hub() - + :param model_name_or_path: The name of the model or the path to the local directory + that was used to create the embeddings in the Vicinity instance. 
+ :param repo_id: The repository ID on the Hugging Face Hub + :param token: Optional authentication token for private repositories + :param private: Whether to create a private repository + :param **kwargs: Additional arguments passed to Dataset.push_to_hub() + :return: The commit info """ - from datasets import Dataset + try: + from datasets import Dataset + from huggingface_hub import DatasetCard, upload_file, upload_folder + except ImportError: + raise _HUB_IMPORT_ERROR # Create and push dataset with items and vectors if isinstance(self.items[0], dict): @@ -76,7 +72,11 @@ def push_to_hub( ) # Save and upload config - config = {"metadata": self.metadata, "backend_type": self.backend.backend_type.value} + config = { + "metadata": self.metadata, + "backend_type": self.backend.backend_type.value, + "model_name_or_path": model_name_or_path, + } config_path = temp_path / "config.json" config_path.write_text(json.dumps(config)) upload_file( @@ -87,43 +87,11 @@ def push_to_hub( path_in_repo="config.json", ) - # DatasetCard - DatasetCard( - content=( - f""" ---- -tags: -- vicinity -- vector-store ---- - -# Dataset Card for {repo_id} - -This dataset was created using the [vicinity](https://github.com/MinishLab/vicinity) library, a lightweight nearest neighbors library with flexible backends. - -It contains a vector space with {len(self.items)} items. - -## Usage - -You can load this dataset using the following code: - -```python -from vicinity import Vicinity -vicinity = Vicinity.load_from_hub("{repo_id}") -``` - -After loading the dataset, you can use the `vicinity.query` method to find the nearest neighbors to a vector. - -## Configuration - -The configuration of the dataset is stored in the `config.json` file. The vector backend is stored in the `backend` folder. - -```bash -{json.dumps(config, indent=2)} -``` -""" - ) - ).push_to_hub(repo_id, token=token, repo_type="dataset") + # Load the dataset card template from the related path + template_path = Path(__file__).parent / "dataset_card_template.md" + template = template_path.read_text() + content = template.format(repo_id=repo_id, num_items=len(self.items), config=json.dumps(config, indent=4)) + return DatasetCard(content=content).push_to_hub(repo_id=repo_id, token=token, repo_type="dataset") @classmethod def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> "Vicinity": @@ -132,11 +100,14 @@ def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> :param repo_id: The repository ID on the Hugging Face Hub. :param token: Optional authentication token for private repositories. - :param kwargs: Additional arguments passed to load_dataset. + :param **kwargs: Additional arguments passed to load_dataset. :return: A Vicinity instance loaded from the Hub. 
""" - from datasets import load_dataset - from huggingface_hub import snapshot_download + try: + from datasets import load_dataset + from huggingface_hub import snapshot_download + except ImportError: + raise _HUB_IMPORT_ERROR # Load dataset and extract items and vectors dataset = load_dataset(repo_id, token=token, split="train", **kwargs) @@ -155,7 +126,9 @@ def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> repo_path = Path(snapshot_download(repo_id=repo_id, token=token, repo_type="dataset")) with open(repo_path / "config.json") as f: config = json.load(f) + model_name_or_path = config.pop("model_name_or_path") + print(f"Embeddings in Vicinity instance were created from model name or path: {model_name_or_path}") backend_type = Backend(config["backend_type"]) backend = get_backend_class(backend_type).load(repo_path / "backend") diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index af6b60f..1b106b1 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -2,7 +2,6 @@ from __future__ import annotations -import importlib import logging from io import open from pathlib import Path @@ -20,10 +19,7 @@ logger = logging.getLogger(__name__) -if importlib.util.find_spec("huggingface_hub") is not None and importlib.util.find_spec("datasets") is not None: - from vicinity.integrations.huggingface import HuggingFaceMixin -else: - HuggingFaceMixin = object +from vicinity.integrations.huggingface import HuggingFaceMixin class Vicinity(HuggingFaceMixin): From cab15e56d41dee455ba6a32a992035ae06133025 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Tue, 25 Feb 2025 22:31:35 +0100 Subject: [PATCH 12/15] Update pyproject.toml and README.md for improved package installation and Hugging Face integration - Added new optional dependency groups for integrations and backends in pyproject.toml - Updated README.md with new installation instructions for specific integrations and backends - Added documentation for pushing and loading vector stores from Hugging Face Hub - Simplified and clarified installation options in README --- README.md | 21 ++++++++++++++++++--- pyproject.toml | 21 ++++++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a6f8b9e..1850f0a 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ - Vicinity is a light-weight, low-dependency vector store. It provides a simple and intuitive interface for nearest neighbor search, with support for different backends and evaluation. There are many nearest neighbors packages and methods out there. However, we found it difficult to compare them. Every package has its own interface, quirks, and limitations, and learning a new package can be time-consuming. In addition to that, how do you effectively evaluate different packages? How do you know which one is the best for your use case? 
@@ -43,7 +42,7 @@ Install the package with: ```bash pip install vicinity ``` -Optionally, [install any of the supported backends](#installation), or simply install all of them with: +Optionally, [install specific backends and integrations](#installation), or simply install all of them with: ```bash pip install vicinity[all] ``` @@ -87,6 +86,13 @@ vicinity.save('my_vector_store') vicinity = Vicinity.load('my_vector_store') ``` +Pushing and loading a vector store from the Hugging Face Hub: + +```python +vicinity.push_to_hub(model_name_or_path='my_vector_store', repo_id='my_vector_store') +vicinity = Vicinity.load_from_hub(repo_id='my_vector_store') +``` + Evaluating a backend: ```python @@ -167,9 +173,18 @@ The following installation options are available: # Install the base package pip install vicinity -# Install all backends +# Install all integrations and backends pip install vicinity[all] +# Install all integrations +pip install vicinity[integrations] + +# Install specific integrations +pip install vicinity[huggingface] + +# Install all backends +pip install vicinity[backends] + # Install specific backends pip install vicinity[annoy] pip install vicinity[faiss] diff --git a/pyproject.toml b/pyproject.toml index d853c62..c306cbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,15 @@ dev = [ "ruff", "setuptools" ] -hnsw = ["hnswlib"] + +# Integrations huggingface = ["datasets"] +integrations = [ + "datasets" +] + +# Backends +hnsw = ["hnswlib"] pynndescent = [ "pynndescent>=0.5.10", "numba>=0.59.0", @@ -54,6 +61,18 @@ annoy = ["annoy"] faiss = ["faiss-cpu"] usearch = ["usearch"] voyager = ["voyager"] +backends = [ + "hnswlib", + "pynndescent>=0.5.10", + "numba>=0.59.0", + "llvmlite>=0.42.0", + "numpy>=1.24.0", + "annoy", + "faiss-cpu", + "usearch", + "voyager" +] + all = [ "datasets", "hnswlib", From 65465f3a3c7620d0679eff4c21f48df212e88c34 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Tue, 25 Feb 2025 22:38:13 +0100 Subject: [PATCH 13/15] Add test for Vicinity.load_from_hub method - Implemented a new test case for loading a Vicinity instance from Hugging Face Hub - Added test to verify the print statement when loading from a repository - Introduced a constant for the print statement in the Hugging Face integration module - Updated the print statement to use string formatting for better flexibility --- tests/test_vicinity.py | 25 +++++++++++++++++++++++++ vicinity/integrations/huggingface.py | 5 ++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index a30c864..7061e5c 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -1,5 +1,7 @@ from __future__ import annotations +import io +import sys from pathlib import Path import numpy as np @@ -8,6 +10,7 @@ from vicinity import Vicinity from vicinity.datatypes import Backend +from vicinity.integrations.huggingface import _MODEL_NAME_OR_PATH_PRINT_STATEMENT BackendType = tuple[Backend, str] @@ -333,3 +336,25 @@ def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) -> vicinity_instance.backend.arguments.metric = "manhattan" with pytest.raises(ValueError): vicinity_instance.evaluate(vectors, query_vectors) + + +def test_load_from_hub(vicinity_instance: Vicinity) -> None: + """ + Test Vicinity.load_from_hub. + + :param vicinity_instance: A Vicinity instance. 
+ """ + repo_id = "davidberenstein1957/my-vicinity-repo" + expected_print_statement = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0] + + # Capture the output + captured_output = io.StringIO() + sys.stdout = captured_output + + Vicinity.load_from_hub(repo_id=repo_id) + + # Reset redirect. + sys.stdout = sys.__stdout__ + + # Check if the expected message is in the output + assert expected_print_statement in captured_output.getvalue() diff --git a/vicinity/integrations/huggingface.py b/vicinity/integrations/huggingface.py index 0f856bf..0e75129 100644 --- a/vicinity/integrations/huggingface.py +++ b/vicinity/integrations/huggingface.py @@ -17,6 +17,9 @@ _HUB_IMPORT_ERROR = ImportError( "`datasets` and `huggingface_hub` are required to push to the Hugging Face Hub. Please install them with `pip install 'vicinity[huggingface]'`" ) +_MODEL_NAME_OR_PATH_PRINT_STATEMENT = ( + "Embeddings in Vicinity instance were created from model name or path: {model_name_or_path}" +) logger = logging.getLogger(__name__) @@ -128,7 +131,7 @@ def load_from_hub(cls, repo_id: str, token: str | None = None, **kwargs: Any) -> config = json.load(f) model_name_or_path = config.pop("model_name_or_path") - print(f"Embeddings in Vicinity instance were created from model name or path: {model_name_or_path}") + print(_MODEL_NAME_OR_PATH_PRINT_STATEMENT.format(model_name_or_path=model_name_or_path)) backend_type = Backend(config["backend_type"]) backend = get_backend_class(backend_type).load(repo_path / "backend") From 06545dd8a9cb278d68d02280045afe8cca7ff6c6 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Tue, 25 Feb 2025 22:43:16 +0100 Subject: [PATCH 14/15] Remove test files for utils and vicinity modules - Deleted `tests/test_utils.py` containing tests for normalization utility functions - Removed `tests/test_vicinity.py` with comprehensive test cases for the Vicinity class - These test files are no longer needed, likely due to refactoring or migration of tests --- tests/integrations/test_vicinity.py | 33 +++++++++++++++++++++++++++++ tests/{ => unit}/test_utils.py | 0 tests/{ => unit}/test_vicinity.py | 25 ---------------------- 3 files changed, 33 insertions(+), 25 deletions(-) create mode 100644 tests/integrations/test_vicinity.py rename tests/{ => unit}/test_utils.py (100%) rename tests/{ => unit}/test_vicinity.py (94%) diff --git a/tests/integrations/test_vicinity.py b/tests/integrations/test_vicinity.py new file mode 100644 index 0000000..bf4fc43 --- /dev/null +++ b/tests/integrations/test_vicinity.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import io +import sys + +from vicinity import Vicinity +from vicinity.datatypes import Backend +from vicinity.integrations.huggingface import _MODEL_NAME_OR_PATH_PRINT_STATEMENT + +BackendType = tuple[Backend, str] + + +def test_load_from_hub(vicinity_instance: Vicinity) -> None: + """ + Test Vicinity.load_from_hub. + + :param vicinity_instance: A Vicinity instance. + """ + repo_id = "davidberenstein1957/my-vicinity-repo" + # get the first part of the print statement to test if model name or path is printed + expected_print_statement = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0] + + # Capture the output + captured_output = io.StringIO() + sys.stdout = captured_output + + Vicinity.load_from_hub(repo_id=repo_id) + + # Reset redirect. 
+    sys.stdout = sys.__stdout__
+
+    # Check if the expected message is in the output
+    assert expected_print_statement in captured_output.getvalue()
diff --git a/tests/test_utils.py b/tests/unit/test_utils.py
similarity index 100%
rename from tests/test_utils.py
rename to tests/unit/test_utils.py
diff --git a/tests/test_vicinity.py b/tests/unit/test_vicinity.py
similarity index 94%
rename from tests/test_vicinity.py
rename to tests/unit/test_vicinity.py
index 7061e5c..a30c864 100644
--- a/tests/test_vicinity.py
+++ b/tests/unit/test_vicinity.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import io
-import sys
 from pathlib import Path
 
 import numpy as np
@@ -10,7 +8,6 @@
 
 from vicinity import Vicinity
 from vicinity.datatypes import Backend
-from vicinity.integrations.huggingface import _MODEL_NAME_OR_PATH_PRINT_STATEMENT
 
 BackendType = tuple[Backend, str]
 
@@ -336,25 +333,3 @@ def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) ->
     vicinity_instance.backend.arguments.metric = "manhattan"
     with pytest.raises(ValueError):
         vicinity_instance.evaluate(vectors, query_vectors)
-
-
-def test_load_from_hub(vicinity_instance: Vicinity) -> None:
-    """
-    Test Vicinity.load_from_hub.
-
-    :param vicinity_instance: A Vicinity instance.
-    """
-    repo_id = "davidberenstein1957/my-vicinity-repo"
-    expected_print_statement = _MODEL_NAME_OR_PATH_PRINT_STATEMENT.split(":")[0]
-
-    # Capture the output
-    captured_output = io.StringIO()
-    sys.stdout = captured_output
-
-    Vicinity.load_from_hub(repo_id=repo_id)
-
-    # Reset redirect.
-    sys.stdout = sys.__stdout__
-
-    # Check if the expected message is in the output
-    assert expected_print_statement in captured_output.getvalue()

From cc3fbf47f020f75a6142c34f9c3692a499bdc771 Mon Sep 17 00:00:00 2001
From: davidberenstein1957
Date: Tue, 25 Feb 2025 22:47:18 +0100
Subject: [PATCH 15/15] Rename integration test module and flatten the unit test layout

- Renamed `tests/integrations/test_vicinity.py` to `tests/integrations/test_huggingface.py` to reflect what it covers
- Moved `tests/unit/test_utils.py` and `tests/unit/test_vicinity.py` back to `tests/`
- Pure renames: no file contents were changed

---
 tests/integrations/{test_vicinity.py => test_huggingface.py} | 0
 tests/{unit => }/test_utils.py                                | 0
 tests/{unit => }/test_vicinity.py                             | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/integrations/{test_vicinity.py => test_huggingface.py} (100%)
 rename tests/{unit => }/test_utils.py (100%)
 rename tests/{unit => }/test_vicinity.py (100%)

diff --git a/tests/integrations/test_vicinity.py b/tests/integrations/test_huggingface.py
similarity index 100%
rename from tests/integrations/test_vicinity.py
rename to tests/integrations/test_huggingface.py
diff --git a/tests/unit/test_utils.py b/tests/test_utils.py
similarity index 100%
rename from tests/unit/test_utils.py
rename to tests/test_utils.py
diff --git a/tests/unit/test_vicinity.py b/tests/test_vicinity.py
similarity index 100%
rename from tests/unit/test_vicinity.py
rename to tests/test_vicinity.py
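Taken together, the series leaves the Hub round trip looking roughly like this (the repo ID and model name are placeholders, and the `huggingface` extra must be installed):

```python
import numpy as np

from vicinity import Vicinity

items = ["triforce", "master sword", "hylian shield"]
vicinity = Vicinity.from_vectors_and_items(vectors=np.random.rand(3, 128), items=items)

# Push the items, backend index, and config to a Hub dataset repo.
vicinity.push_to_hub(
    model_name_or_path="my-embedding-model",  # model used to create the vectors
    repo_id="username/my-vicinity-repo",
)

# Rebuild the store (and print the originating model name) from the Hub.
vicinity = Vicinity.load_from_hub(repo_id="username/my-vicinity-repo")
```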