Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions bigvectorbench/algorithms/pgvector/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FROM bigvectorbench-base

# Container timezone; tzdata (installed below) reads /etc/timezone noninteractively.
ENV TZ=Asia/Shanghai
# Fix: double quotes so $TZ expands — the original single-quoted form wrote the
# literal string "$TZ" into /etc/timezone.
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo "$TZ" > /etc/timezone

# Toolchain + PGDG repo + PostgreSQL 16 in a single layer:
# update+install together avoids the stale-apt-cache bug, and removing the
# package lists in the same layer keeps the image small.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        postgresql-common \
        tzdata \
    && /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh -y \
    && apt-get install -y --no-install-recommends \
        postgresql-16 \
        postgresql-server-dev-16 \
    && rm -rf /var/lib/apt/lists/*

# Python-side dependencies for the benchmark driver.
RUN pip install --no-cache-dir psutil 'psycopg[binary]' pgvector

# Trust all local connections so the benchmark can connect without a password.
RUN echo "local all all trust" > /etc/postgresql/16/main/pg_hba.conf

# Build the pgvector extension from source with architecture-appropriate
# vector-width flags (SVE on aarch64, AVX-512 preference on x86_64).
RUN git clone https://github.com/pgvector/pgvector /tmp/pgvector
RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "aarch64" ]; then \
        OPTFLAGS="-march=native -msve-vector-bits=512"; \
    elif [ "$ARCH" = "x86_64" ]; then \
        OPTFLAGS="-march=native -mprefer-vector-width=512"; \
    else \
        OPTFLAGS="-march=native"; \
    fi && \
    cd /tmp/pgvector && \
    make clean && \
    make OPTFLAGS="$OPTFLAGS" && \
    make install

# Initialize the benchmark role/database as the postgres service account.
USER postgres
RUN service postgresql start && \
    psql -c "CREATE USER bvb WITH ENCRYPTED PASSWORD 'bvb'" && \
    psql -c "CREATE DATABASE bvb" && \
    psql -c "GRANT ALL PRIVILEGES ON DATABASE bvb TO bvb" && \
    psql -d bvb -c "GRANT ALL ON SCHEMA public TO bvb" && \
    psql -d bvb -c "CREATE EXTENSION vector" && \
    psql -c "ALTER USER bvb SET maintenance_work_mem = '4GB'" && \
    psql -c "ALTER USER bvb SET max_parallel_maintenance_workers = 0" && \
    psql -c "ALTER SYSTEM SET shared_buffers = '4GB'"
USER root

# ENTRYPOINT ["bash"]
48 changes: 48 additions & 0 deletions bigvectorbench/algorithms/pgvector/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Benchmark definitions for pgvector; `constructor` names classes in
# bigvectorbench/algorithms/pgvector/module.py, `docker_tag` the image built
# from the sibling Dockerfile.
float:
  any:
    # HNSW: arg_groups supply index-build params (M, efConstruction);
    # query_args are the ef_search values swept at query time.
  - base_args: ["@metric"]
    constructor: PGVectorHNSW
    disabled: false
    docker_tag: bigvectorbench-pgvector
    module: bigvectorbench.algorithms.pgvector
    name: pgvector-hnsw
    run_groups:
      M-16:
        arg_groups: [{M: 16, efConstruction: 200}]
        # args: {}
        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
      M-24:
        arg_groups: [{M: 24, efConstruction: 200}]
        # args: {}
        query_args: [[10, 20, 40, 80, 120, 200, 400, 800]]
    # IVFFlat: arg_groups supply nlist (number of inverted lists);
    # query_args are the probe counts swept at query time.
    # NOTE(review): PGVector.__init__ in module.py reads method_param['M'] and
    # ['efConstruction'] with direct indexing; this group supplies only
    # `nlist` — confirm the module tolerates the missing keys.
  - base_args: ["@metric"]
    constructor: PGVectorIVFFLAT
    disabled: false
    docker_tag: bigvectorbench-pgvector
    module: bigvectorbench.algorithms.pgvector
    name: pgvector-ivfflat
    run_groups:
      IVFFLAT_32:
        arg_groups: [{ nlist: 32 }]
        query_args: [[1, 4, 8, 16, 32]]
      # IVFFLAT_64:
      #   arg_groups: [{ nlist: 64 }]
      #   query_args: [[4, 16, 32, 48, 64]]
      # IVFFLAT_128:
      #   arg_groups: [{ nlist: 128 }]
      #   query_args: [[8, 32, 64, 96, 128]]
      # IVFFLAT_512:
      #   arg_groups: [{ nlist: 512 }]
      #   query_args: [[32, 128, 256, 384, 512]]
      # IVFFLAT_1024:
      #   arg_groups: [{ nlist: 1024 }]
      #   query_args: [[64, 256, 512, 768, 1024]]
      # IVFFLAT_2048:
      #   arg_groups: [{ nlist: 2048 }]
      #   query_args: [[128, 512, 1024, 1536, 2048]]
      # IVFFLAT_4096:
      #   arg_groups: [{ nlist: 4096 }]
      #   query_args: [[256, 1024, 2048, 3072, 4096]]
      # IVFFLAT_8192:
      #   arg_groups: [{ nlist: 8192 }]
      #   query_args: [[512, 2048, 4096, 6144, 8192]]
209 changes: 209 additions & 0 deletions bigvectorbench/algorithms/pgvector/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
""" Pgvector module for BigVectorBench framework. """

import subprocess
import sys
import numpy as np
import pgvector.psycopg
import psycopg
import os

from bigvectorbench.algorithms.base.module import BaseANN

class PGVector(BaseANN):
    """Base pgvector adapter: stores vectors in a PostgreSQL ``items`` table
    and answers nearest-neighbour queries through a pgvector index.

    Subclasses select the index access method via :meth:`get_vector_index`
    ("hnsw" or "ivfflat").
    """

    def __init__(self, metric, method_param):
        """
        Args:
            metric (str): "angular" (cosine) or "euclidean" (L2).
            method_param (dict): index-build parameters; hnsw configs carry
                M/efConstruction, ivfflat configs carry nlist.

        Raises:
            RuntimeError: if ``metric`` is not supported.
        """
        self._metric = metric
        # .get() with defaults so ivfflat configs (which supply only `nlist`)
        # no longer raise KeyError on the hnsw-only keys.
        self._m = method_param.get("M", 16)
        self._ef_construction = method_param.get("efConstruction", 200)
        # ivfflat build parameter (number of inverted lists).
        self._nlist = method_param.get("nlist", 100)
        self._cur = None  # psycopg cursor, created in load_data()
        self.labels = None
        self.label_names = None
        self.label_types = None
        self.index = self.get_vector_index()

        # Unfiltered k-NN query; <=> is cosine distance, <-> is L2 distance.
        if metric == "angular":
            self._query = "SELECT id FROM items ORDER BY embedding <=> %s LIMIT %s"
        elif metric == "euclidean":
            self._query = "SELECT id FROM items ORDER BY embedding <-> %s LIMIT %s"
        else:
            raise RuntimeError(f"Unknown metric {metric}")

    def get_vector_index(self):
        """Return the pgvector access method name ("hnsw" / "ivfflat")."""
        raise NotImplementedError()

    def load_data(
        self,
        embeddings: np.array,
        labels: np.ndarray | None = None,
        label_names: list[str] | None = None,
        label_types: list[str] | None = None,
    ) -> None:
        """Start PostgreSQL, (re)create the ``items`` table, bulk-load the
        vectors via binary COPY and build the vector index."""
        subprocess.run("service postgresql start", shell=True, check=True, stdout=sys.stdout, stderr=sys.stderr)
        conn = psycopg.connect(user="bvb", password="bvb", dbname="bvb", autocommit=True)
        pgvector.psycopg.register_vector(conn)
        cur = conn.cursor()
        cur.execute("DROP TABLE IF EXISTS items")
        self.label_names = label_names
        self.label_types = label_types
        if labels is not None and label_names is not None and label_types is not None:
            # Map dataset dtype names to PostgreSQL column types. (Previously
            # pg_types was computed but unused and every label column was
            # hardcoded to `integer`.)
            pg_types = ["integer" if t == "int32" else t for t in label_types]
            additional_columns = ", ".join(f"{name} {pg_type}" for name, pg_type in zip(label_names, pg_types))
            table_definition = f"id integer, embedding vector({embeddings.shape[1]}), {additional_columns}"
        else:
            table_definition = f"id integer, embedding vector({embeddings.shape[1]})"
        cur.execute(f"CREATE TABLE items ({table_definition})")
        # PLAIN storage keeps vectors inline (never TOASTed) for faster scans.
        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")

        if labels is not None and label_names is not None:
            # NOTE(review): label values are streamed as int4 — assumes all
            # label columns are integer-typed; confirm against label_types.
            with cur.copy(f"COPY items (id, embedding, {', '.join(label_names)}) FROM STDIN WITH (FORMAT BINARY)") as copy:
                copy.set_types(["int4", "vector"] + ["int4" for _ in label_names])
                for i, embedding in enumerate(embeddings):
                    copy.write_row((i, embedding) + tuple(int(x) for x in labels[i]))
        else:
            with cur.copy("COPY items (id, embedding) FROM STDIN WITH (FORMAT BINARY)") as copy:
                copy.set_types(["int4", "vector"])
                for i, embedding in enumerate(embeddings):
                    copy.write_row((i, embedding))

        print("Creating index...")

        if self._metric == "angular":
            opclass = "vector_cosine_ops"
        elif self._metric == "euclidean":
            opclass = "vector_l2_ops"
        else:
            raise RuntimeError(f"Unknown metric {self._metric}")
        # hnsw and ivfflat take different build options; the previous code
        # passed the hnsw-only (m, ef_construction) pair to both.
        if self.index == "hnsw":
            options = f"WITH (m = {self._m}, ef_construction = {self._ef_construction})"
        else:
            options = f"WITH (lists = {self._nlist})"
        cur.execute(f"CREATE INDEX ON items USING {self.index} (embedding {opclass}) {options}")

        print("Done!")
        self._cur = cur

    def parse_filter_expr(self, filter_expr: str) -> str:
        """Parse filter expression and return SQL WHERE clause"""

        print(f"Received filter expression: {filter_expr}")
        return filter_expr

    def set_query_arguments(self, ef_search):
        """Apply the per-session search knob: hnsw.ef_search for HNSW,
        ivfflat.probes for IVFFlat (ef_search is not a valid ivfflat GUC)."""
        self._ef_search = ef_search
        setting = "ef_search" if self.index == "hnsw" else "probes"
        self._cur.execute("SET %s.%s = %d" % (self.index, setting, ef_search))

    def query(self, v: np.array, n: int, filter_expr: str | None = None) -> list[int]:
        """Return the ids of the n nearest neighbours of v, optionally
        restricted by a SQL-compatible filter expression."""
        if filter_expr:
            # The benchmark filter grammar uses `==`; SQL equality is `=`.
            filter_expr = filter_expr.replace("==", "=")
            # Fix: use the metric-appropriate operator — the filtered path
            # previously always used `<->` even for the angular metric.
            op = "<=>" if self._metric == "angular" else "<->"
            query = f"""
                SELECT id FROM items
                WHERE {filter_expr}
                ORDER BY embedding {op} %s
                LIMIT %s
            """
        else:
            query = self._query
        self._cur.execute(query, (v, n), binary=True, prepare=True)
        return [id for id, in self._cur.fetchall()]

    def get_memory_usage(self):
        """Return the on-disk size of the vector index in KiB (0 before
        load_data has run). Relies on PostgreSQL's default generated index
        name `items_embedding_idx`."""
        if self._cur is None:
            return 0
        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
        return self._cur.fetchone()[0] / 1024

    def __str__(self):
        # getattr: _ef_search only exists after set_query_arguments() ran
        # (the previous version raised AttributeError before that).
        return f"PGVector(m={self._m}, ef_construction={self._ef_construction}, ef_search={getattr(self, '_ef_search', None)})"

    def insert(self, embeddings: np.ndarray, labels: np.ndarray | None = None) -> None:
        """
        Single insert data

        Args:
            embeddings (np.ndarray): embeddings
            labels (np.ndarray): labels

        Returns:
            None
        """
        # NOTE(review): self.num_entities is presumably maintained by BaseANN;
        # it is never initialized in this class — confirm.
        # Parameterized statements: the previous string-built SQL had a
        # missing closing parenthesis in the unlabeled branch and interpolated
        # a numpy array repr directly into the statement.
        if labels is not None and self.label_names is not None:
            columns = ", ".join(self.label_names)
            placeholders = ", ".join("%s" for _ in self.label_names)
            self._cur.execute(
                f"INSERT INTO items (id, embedding, {columns}) VALUES (%s, %s, {placeholders})",
                (self.num_entities + 1, embeddings) + tuple(int(x) for x in labels),
            )
        else:
            self._cur.execute(
                "INSERT INTO items (id, embedding) VALUES (%s, %s)",
                (self.num_entities + 1, embeddings),
            )
        self.num_entities += 1

    def update(
        self, index: int, embeddings: np.ndarray, labels: np.ndarray | None = None
    ) -> None:
        """
        Single update data

        Args:
            index (int): index to update
            embeddings (np.ndarray): embeddings
            labels (np.ndarray): labels

        Returns:
            None
        """
        # Fixes vs. the previous version: the vector column is `embedding`
        # (not `embeddings`); `for i in enumerate(...)` yielded tuples that
        # were then used as list indices (TypeError); and a trailing comma was
        # left in SET when no labels were given.
        assignments = ["embedding = %s"]
        params: list = [embeddings]
        if labels is not None and self.label_names is not None:
            for name, value in zip(self.label_names, labels):
                assignments.append(f"{name} = %s")
                params.append(int(value))
        params.append(index)
        self._cur.execute(
            f"UPDATE items SET {', '.join(assignments)} WHERE id = %s", params
        )

    def delete(
        self,
        index: int,
    ) -> None:
        """
        Single delete data

        Args:
            index (int): index to delete

        Returns:
            None
        """
        # Parameterized to avoid building SQL by string interpolation.
        self._cur.execute("DELETE FROM items WHERE id = %s", (index,))


class PGVectorHNSW(PGVector):
    """PGVector adapter using pgvector's `hnsw` access method."""

    def __init__(self, metric: str, index_param: dict):
        super().__init__(metric, index_param)
        # NOTE(review): config.yml supplies M/efConstruction (consumed by the
        # base class); "nlinks" is absent there, so these defaults apply.
        self._nlinks = index_param.get("nlinks", 32)
        self._efConstruction = index_param.get("efConstruction", 40)

    def get_vector_index(self):
        """Get HNSW vector index"""
        return "hnsw"

    def set_query_arguments(self, efSearch: int = 40):
        """
        Set query arguments for pgvector query with hnsw index.

        Fixes: the previous version referenced the undefined attribute
        `_metric_type` (AttributeError) and never applied efSearch to the
        database session, so queries ran with the server default.
        """
        self._ef_search = efSearch
        self.search_params = {
            "metric_type": self._metric,
            "efSearch": efSearch,
        }
        if self._cur is not None:
            self._cur.execute(f"SET hnsw.ef_search = {int(efSearch)}")
        self.name = f"pgvector HNSW metric:{self._metric}, nlinks:{self._nlinks}, efConstruction:{self._efConstruction}, efSearch:{efSearch}"

class PGVectorIVFFLAT(PGVector):
    """PGVector adapter using pgvector's `ivfflat` access method."""

    def __init__(self, metric: str, index_param: dict):
        super().__init__(metric, index_param)
        # NOTE(review): config.yml supplies only `nlist` for ivfflat; these
        # hnsw-style attributes exist only for the `name` string below.
        self._nlinks = index_param.get("nlinks", 32)
        self._efConstruction = index_param.get("efConstruction", 40)

    def get_vector_index(self):
        """Get IVFFLAT vector index"""
        return "ivfflat"

    def set_query_arguments(self, efSearch: int = 40):
        """
        Set query arguments for pgvector query with ivfflat index.

        Fixes: the previous version referenced the undefined attribute
        `_metric_type` (AttributeError) and never applied the setting to the
        session; the ivfflat tunable is `ivfflat.probes` — `ef_search` is an
        hnsw-only GUC.
        """
        self._ef_search = efSearch
        self.search_params = {
            "metric_type": self._metric,
            "efSearch": efSearch,
        }
        if self._cur is not None:
            self._cur.execute(f"SET ivfflat.probes = {int(efSearch)}")
        self.name = f"pgvector ivfflat metric:{self._metric}, nlinks:{self._nlinks}, efConstruction:{self._efConstruction}, efSearch:{efSearch}"
34 changes: 34 additions & 0 deletions bigvectorbench/algorithms/sptag/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
FROM bigvectorbench-base

WORKDIR /home/app

# Fix: `RUN set GIT_LFS_SKIP_SMUDGE=1` was a no-op — POSIX `set` does not
# export variables, and nothing from one RUN layer survives into the next.
# ENV makes the flag visible to the clone below (skips LFS blob downloads).
ENV GIT_LFS_SKIP_SMUDGE=1
RUN git clone --recurse-submodules https://github.com/microsoft/SPTAG

# Replace the bundled zstd submodule with the upstream repository.
WORKDIR /home/app/SPTAG/ThirdParty
RUN rm -rf zstd && git clone https://github.com/facebook/zstd.git

WORKDIR /home/app

# update+install in one layer (avoids stale apt caches); drop the package
# lists in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        libboost-all-dev \
        libtbb-dev \
        software-properties-common \
        swig \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Patch https://github.com/microsoft/SPTAG/issues/243
# RUN cd SPTAG && \
#     wget -qO- https://github.com/pabs3/SPTAG/commit/bd9c25d1409325ac45ebeb7f1e8fc87d03ec478c.patch | git apply && \
#     cd ..

# SPTAG defaults to Python 2 if it's found on the system, so as a hack, we remove it. See https://github.com/microsoft/SPTAG/blob/master/Wrappers/CMakeLists.txt
RUN apt-get -y remove libpython2.7

# Compile (WORKDIR creates the build directory; no `cd` chain needed)
WORKDIR /home/app/SPTAG/build
RUN cmake .. && make
WORKDIR /home/app

# so python can find the SPTAG module
ENV PYTHONPATH=/home/app/SPTAG/Release
# RUN python3 -c 'import SPTAG'

# COPY bigvectorbench/algorithms/sptag/docker-compose.yml ./SPTAG
# COPY docker-compose.yml ./SPTAG
# COPY bigvectorbench/algorithms/sptag/Dockerfile.builder ./SPTAG

14 changes: 14 additions & 0 deletions bigvectorbench/algorithms/sptag/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Benchmark definition for SPTAG's BKT (balanced k-means tree) index;
# `constructor` names the class in bigvectorbench/algorithms/sptag,
# `docker_tag` the image built from the sibling Dockerfile.
float:
  euclidean:
  - base_args: ["@metric", "@dimension"]
    constructor: SPTAGBKT
    disabled: False
    docker_tag: bigvectorbench-sptag
    module: bigvectorbench.algorithms.sptag
    name: sptag-bkt
    run_groups:
      BKT:
        args:
          nlist: [64]
        # query_args are query-time search-width values; the commented list
        # below holds further candidate values for wider sweeps.
        query_args: [[100]]
        # , 128, 256, 512, 1024, 2048, 4096, 8192
23 changes: 23 additions & 0 deletions bigvectorbench/algorithms/sptag/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# NOTE(review): the top-level `version` key is obsolete under Compose v2
# (ignored with a warning); kept here for older docker-compose binaries.
version: '3.8'

services:
  # One-shot build container: installs the toolchain with gcc-8/g++-8 and
  # compiles the project bind-mounted at /app (artifacts land in ./build on
  # the host through the volume).
  # NOTE(review): libboost-serialization-dev and libboost-regex-dev appear
  # twice in the install list — harmless to apt, but worth deduplicating.
  builder:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - .:/app
    command: /bin/bash -c "apt-get update && apt-get -y install wget build-essential swig cmake git libnuma-dev python3.8-dev python3-distutils gcc-8 g++-8 libboost-filesystem-dev libboost-test-dev libboost-serialization-dev libboost-regex-dev libboost-serialization-dev libboost-regex-dev libboost-thread-dev libboost-system-dev && wget https://bootstrap.pypa.io/get-pip.py && python3.8 get-pip.py && python3.8 -m pip install numpy && export CC=/usr/bin/gcc-8 && export CXX=/usr/bin/g++-8 && mkdir build && cd build && cmake .. && make -j && cd .."

  # Interactive container that uses the build output (PYTHONPATH points at
  # the Release directory produced by the builder service).
  app:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      - .:/app
    environment:
      - DEBIAN_FRONTEND=noninteractive
      - PYTHONPATH=/app/Release
    command: /bin/bash
    depends_on:
      - builder
Loading
Loading