From 803e2e80b582ce5c7498bd53726d9ee9caf656d1 Mon Sep 17 00:00:00 2001 From: Patrick Ge <53650488+cococo2000@users.noreply.github.com> Date: Fri, 3 Jan 2025 20:43:39 +0800 Subject: [PATCH 1/4] Update bvb-run.yml Add sptag and pgvector to action check --- .github/workflows/bvb-run.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/bvb-run.yml b/.github/workflows/bvb-run.yml index 62fc451..0aed2fb 100644 --- a/.github/workflows/bvb-run.yml +++ b/.github/workflows/bvb-run.yml @@ -36,6 +36,8 @@ jobs: - weaviate - redis - elasticsearch + - sptag + - pgvector # - vearch # Vearch test is disabled temporarily due to disk space constraints. # GPU support is not available in the free tier # - milvus_gpu From 33e2fcfeffb0dd1acc8d0b1c0b54fb681f131cd1 Mon Sep 17 00:00:00 2001 From: "Gzx@151" Date: Fri, 3 Jan 2025 22:53:59 +0800 Subject: [PATCH 2/4] fix sptag: add super.init - missing docker_cient --- bigvectorbench/algorithms/sptag/module.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/bigvectorbench/algorithms/sptag/module.py b/bigvectorbench/algorithms/sptag/module.py index 14a2cf1..d2143b5 100644 --- a/bigvectorbench/algorithms/sptag/module.py +++ b/bigvectorbench/algorithms/sptag/module.py @@ -1,12 +1,8 @@ """ SPTAG module for BigVectorBench framework. """ -import subprocess -from time import sleep import numpy as np import SPTAG from bigvectorbench.algorithms.base.module import BaseANN -import csv -import shutil def metric_mapping(_metric: str): """ @@ -31,6 +27,7 @@ class SPTAGBase(BaseANN): """SPTAG implementation""" def __init__(self, metric: str, dim: int): + super().__init__() self._metric = metric self._dim = dim self._metric_type = metric_mapping(metric) @@ -109,7 +106,7 @@ def set_query_arguments(self): def query(self, v, n, filter_expr=None): if filter_expr is not None: - raise ValueError( f"[SPTAG] have not supported filter-query!!!" ) + raise ValueError("[SPTAG] have not supported filter-query!!!") j = SPTAG.AnnIndex.Load(self.index_name) # print(j.Search(v.tobytes(), n)[0]) # print(j.Search(v.tobytes(), n)[1]) From 06c7a3dc3b2c13387ac827e2e17476afda46a7ec Mon Sep 17 00:00:00 2001 From: "Gzx@151" Date: Fri, 3 Jan 2025 22:58:52 +0800 Subject: [PATCH 3/4] add docker_tag filter --- bigvectorbench/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bigvectorbench/main.py b/bigvectorbench/main.py index aa5bc7b..bc015fc 100644 --- a/bigvectorbench/main.py +++ b/bigvectorbench/main.py @@ -516,6 +516,10 @@ def main(): force=args.force, ) + if args.docker_tag: + logger.info("running only %s", args.docker_tag) + definitions = [d for d in definitions if d.docker_tag == args.docker_tag] + if args.algorithm: logger.info("running only %s", args.algorithm) definitions = [d for d in definitions if d.algorithm == args.algorithm] From 62fcd334ba8946f6b2edfb132d880f42611d1fba Mon Sep 17 00:00:00 2001 From: "Gzx@151" Date: Fri, 3 Jan 2025 23:16:55 +0800 Subject: [PATCH 4/4] fix pgvector: only hnsw, add and fix insert, update op --- bigvectorbench/algorithms/pgvector/config.yml | 22 +++++------ bigvectorbench/algorithms/pgvector/module.py | 37 +++++++++---------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/bigvectorbench/algorithms/pgvector/config.yml b/bigvectorbench/algorithms/pgvector/config.yml index 3439f18..06011a0 100644 --- a/bigvectorbench/algorithms/pgvector/config.yml +++ b/bigvectorbench/algorithms/pgvector/config.yml @@ -15,16 +15,16 @@ float: arg_groups: [{M: 24, efConstruction: 200}] # args: {} query_args: [[10, 20, 40, 80, 120, 200, 400, 800]] - - base_args: ["@metric"] - constructor: PGVectorIVFFLAT - disabled: false - docker_tag: bigvectorbench-pgvector - module: bigvectorbench.algorithms.pgvector - name: pgvector-ivfflat - run_groups: - IVFFLAT_32: - arg_groups: [{ nlist: 32 }] - query_args: [[1, 4, 8, 16, 32]] + # - base_args: ["@metric"] + # constructor: PGVectorIVFFLAT + # disabled: false + # docker_tag: bigvectorbench-pgvector + # module: bigvectorbench.algorithms.pgvector + # name: pgvector-ivfflat + # run_groups: + # IVFFLAT_32: + # arg_groups: [{ nlist: 32 }] + # query_args: [[1, 4, 8, 16, 32]] # IVFFLAT_64: # arg_groups: [{ nlist: 64 }] # query_args: [[4, 16, 32, 48, 64]] @@ -45,4 +45,4 @@ float: # query_args: [[256, 1024, 2048, 3072, 4096]] # IVFFLAT_8192: # arg_groups: [{ nlist: 8192 }] - # query_args: [[512, 2048, 4096, 6144, 8192]] \ No newline at end of file + # query_args: [[512, 2048, 4096, 6144, 8192]] diff --git a/bigvectorbench/algorithms/pgvector/module.py b/bigvectorbench/algorithms/pgvector/module.py index cd66587..527db6c 100644 --- a/bigvectorbench/algorithms/pgvector/module.py +++ b/bigvectorbench/algorithms/pgvector/module.py @@ -11,9 +11,10 @@ class PGVector(BaseANN): def __init__(self, metric, method_param): + super().__init__() self._metric = metric - self._m = method_param['M'] - self._ef_construction = method_param['efConstruction'] + self._m = method_param.get("M", 32) + self._ef_construction = method_param.get("efConstruction", 40) self._cur = None self.labels = None self.label_names = None @@ -26,7 +27,6 @@ def __init__(self, metric, method_param): self._query = "SELECT id FROM items ORDER BY embedding <-> %s LIMIT %s" else: raise RuntimeError(f"Unknown metric {metric}") - def get_vector_index(self): """Get vector index""" @@ -53,7 +53,7 @@ def load_data( table_definition = f"id integer, embedding vector({embeddings.shape[1]})" cur.execute(f"CREATE TABLE items ({table_definition})") cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN") - + if labels is not None and label_names is not None: with cur.copy(f"COPY items (id, embedding, {', '.join(label_names)}) FROM STDIN WITH (FORMAT BINARY)") as copy: copy.set_types(["int4", "vector"] + ["int4" for _ in label_names]) @@ -64,16 +64,16 @@ def load_data( copy.set_types(["int4", "vector"]) for i, embedding in enumerate(embeddings): copy.write_row((i, embedding)) - + print("Creating index...") - + if self._metric == "angular": cur.execute("CREATE INDEX ON items USING %s (embedding vector_cosine_ops) WITH (m = %d, ef_construction = %d)" % (self.index,self._m, self._ef_construction)) elif self._metric == "euclidean": cur.execute("CREATE INDEX ON items USING %s (embedding vector_l2_ops) WITH (m = %d, ef_construction = %d)" % (self.index,self._m, self._ef_construction)) else: raise RuntimeError(f"Unknown metric {self._metric}") - + print("Done!") self._cur = cur @@ -109,7 +109,7 @@ def get_memory_usage(self): def __str__(self): return f"PGVector(m={self._m}, ef_construction={self._ef_construction}, ef_search={self._ef_search})" - + def insert(self, embeddings: np.ndarray, labels: np.ndarray | None = None) -> None: """ Single insert data @@ -122,9 +122,9 @@ def insert(self, embeddings: np.ndarray, labels: np.ndarray | None = None) -> No None """ if labels is not None and self.label_names is not None: - insert_sentence = (f"INSERT INTO items (id,embedding,{', '.join(self.label_names)}) VALUES ({self.num_entities+1},{embeddings},{', '.join(labels)})") + insert_sentence = f"INSERT INTO items (id,embedding,{', '.join(self.label_names)}) VALUES ({self.num_entities+1},'[{', '.join(map(str, embeddings.tolist()))}]',{', '.join(map(str, labels.tolist()))})" else: - insert_sentence = (f"INSERT INTO items (id,embedding) VALUES ({self.num_entities+1},{embeddings}") + insert_sentence = f"INSERT INTO items (id,embedding) VALUES ({self.num_entities+1},'[{', '.join(map(str, embeddings.tolist()))}]')" self._cur.execute(insert_sentence) self.num_entities += 1 @@ -142,10 +142,10 @@ def update( Returns: None """ - update_item = (f"embeddings = {embeddings},") + update_item = f"embedding = '[{', '.join(map(str, embeddings.tolist()))}]'" if labels is not None and self.label_names is not None: - for i in enumerate(self.label_names): - update_item += f"{self.label_names[i]} = {labels[i]}" + for i in range(len(self.label_names)): + update_item += f", {self.label_names[i]} = {str(labels[i].item())}" update_sentence = (f"UPDATE items SET {update_item} where id = {index}") self._cur.execute(update_sentence) @@ -171,7 +171,7 @@ def delete( class PGVectorHNSW(PGVector): def __init__(self, metric: str, index_param: dict): super().__init__(metric, index_param) - self._nlinks = index_param.get("nlinks", 32) + self._m = index_param.get("M", 32) self._efConstruction = index_param.get("efConstruction", 40) def get_vector_index(self): @@ -183,16 +183,15 @@ def set_query_arguments(self, efSearch: int = 40): Set query arguments for pgvector query with hnsw index """ self.search_params = { - "metric_type": self._metric_type, + "metric_type": self._metric, "efSearch": efSearch, } - self.name = f"pgvector HNSW metric:{self._metric}, nlinks:{self._nlinks}, efConstruction:{self._efConstruction}, efSearch:{efSearch}" + self.name = f"pgvector HNSW metric:{self._metric}, M:{self._m}, efConstruction:{self._efConstruction}, efSearch:{efSearch}" class PGVectorIVFFLAT(PGVector): def __init__(self, metric: str, index_param: dict): super().__init__(metric, index_param) self._nlinks = index_param.get("nlinks", 32) - self._efConstruction = index_param.get("efConstruction", 40) def get_vector_index(self): """Get IVFFLAT vector index""" @@ -203,7 +202,7 @@ def set_query_arguments(self, efSearch: int = 40): Set query arguments for pgvector query with ivfflat index """ self.search_params = { - "metric_type": self._metric_type, + "metric_type": self._metric, "efSearch": efSearch, } - self.name = f"pgvector ivfflat metric:{self._metric}, nlinks:{self._nlinks}, efConstruction:{self._efConstruction}, efSearch:{efSearch}" \ No newline at end of file + self.name = f"pgvector ivfflat metric:{self._metric}, nlinks:{self._nlinks}, efSearch:{efSearch}"