From c80dffae9d92967b0701fdbcf458203eacc00f43 Mon Sep 17 00:00:00 2001
From: AnnaWheel
Date: Wed, 1 Jan 2025 14:51:08 +0800
Subject: [PATCH] add artificial_datasets_create

---
 README.md                     |  75 ++++-
 bigvectorbench/datasets.py    |  33 +-
 bigvectorbench/runner.py      |   8 +-
 create_artificial_datasets.py | 572 ++++++++++++++++++++++++++++++++++
 4 files changed, 679 insertions(+), 9 deletions(-)
 create mode 100644 create_artificial_datasets.py

diff --git a/README.md b/README.md
index 5bb4dff..6be00a9 100644
--- a/README.md
+++ b/README.md
@@ -102,12 +102,15 @@ python run.py --dataset app_reviews-384-euclidean-filter --count 100 --runs 3 --
   - redis-hnsw
 - [Elasitcsearch](https://www.elastic.co/)
   - elasticsearch-hnsw
-
+- [SPTAG](https://github.com/microsoft/SPTAG)
+  - sptag-bkt
+- [pgvector](https://github.com/pgvector/pgvector)
+  - pgvector-hnsw
+  - pgvector-ivfflat
+
 **TODO**
 
 - [Vespa](https://vespa.ai/)
-- [SPTAG](https://github.com/microsoft/SPTAG)
-- [pgvector](https://github.com/pgvector/pgvector)
 
 ## Use-cases for Compound Queries
 
@@ -237,6 +240,72 @@ The dataset at [Hugging Face - dbpedia-entities-openai3-text-embedding-3-large-3
 | dbpedia-entities-openai3-text-embedding-3-large-1536-1000k-euclidean | 990,000 / 10,000 | [OpenAI text-embedding-3-large](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-1536-1M) | 1536 | Euclidean | [link1](https://huggingface.co/datasets/Patrickcode/BigVectorBench/resolve/main/dbpedia-entities-openai3-text-embedding-3-large-1536-1000k-euclidean.hdf5), [link2](https://hf-mirror.com/datasets/Patrickcode/BigVectorBench/resolve/main/dbpedia-entities-openai3-text-embedding-3-large-1536-1000k-euclidean.hdf5) | [dbpedia-entities](https://huggingface.co/datasets/BeIR/dbpedia-entity) |
 | dbpedia-entities-openai3-text-embedding-3-large-3072-1000k-euclidean | 990,000 / 10,000 | [OpenAI text-embedding-3-large](https://huggingface.co/datasets/Qdrant/dbpedia-entities-openai3-text-embedding-3-large-3072-1M) | 3072 | Euclidean | [link1](https://huggingface.co/datasets/Patrickcode/BigVectorBench/resolve/main/dbpedia-entities-openai3-text-embedding-3-large-3072-1000k-euclidean.hdf5), [link2](https://hf-mirror.com/datasets/Patrickcode/BigVectorBench/resolve/main/dbpedia-entities-openai3-text-embedding-3-large-3072-1000k-euclidean.hdf5) | [dbpedia-entities](https://huggingface.co/datasets/BeIR/dbpedia-entity) |
 
+## ARTIFICIAL WORKLOADS
+
+### BUILD
+
+The command below will create an artificial dataset for testing.
+
+```bash
+python create_artificial_datasets.py
+```
+
+Arguments:
+
+- `--type`: the generation mode, `random` or `average` (default: `random`); `average` targets the filter ratio exactly, while `random` perturbs it slightly per query
+- `--n`: the number of training vectors to generate (default: 10000)
+- `--m`: the number of query (test) vectors to generate (default: 1000)
+- `--d`: the dimension of the generated vectors (default: 128)
+- `--l`: the number of labels attached to each vector (default: 1)
+- `--a`: the target filter ratio in percent, i.e. roughly `a`% of the training vectors satisfy each query's label filter (default: 80)
+- `--metric`: the distance metric to use (default: `inner_product`; `euclidean` and `angular` are also supported)
+- `--maxlabel`: the maximum label value to generate (default: 100000)
+- `--center`: the number of cluster centers used to generate the vectors (default: 100)
+- `--path`: an optional subdirectory under `data/` for the output file (default: empty)
+- `--topk`: the number of ground-truth neighbors computed per query (default: 200)
+
+### FORMAT
+
+- HDF5 format:
+  - Attributes:
+    - `type`: the type of the dataset (default: `ann`)
+      - `ann` or `dense`: ann datasets and large-scale datasets
+      - `filter-ann`: filter-ann datasets
+      - `mm-ann`: multi-modal datasets
+      - `mv-ann`: multi-vector datasets
+      - `sparse`: sparse datasets
+    - `distance`: the distance computation method (must be specified)
+      - `euclidean`: Euclidean distance
+      - `angular`: Angular distance
+      - `hamming`: Hamming distance
+      - `jaccard`: Jaccard distance
+    - `filter_expr_func`: the filter expression function (only available for the filter-ann datasets)
+    - `label_names`: the names of the labels (only available for the filter-ann datasets)
+    - `label_types`: the types of the labels (only available for the filter-ann datasets, e.g., `int32`)
+  - Datasets:
+    - `train_vec`: the training vectors (only available for the filter-ann datasets)
+    - `train_label`: the training labels (only available for the filter-ann datasets)
+    - `test_vec`: the query vectors (only available for the filter-ann datasets)
+    - `test_label`: the query labels (only available for the filter-ann datasets)
+    - `distances`: the ground truth distances between the query vectors and the training vectors
+    - `neighbors`: the ground truth neighbors containing the indices of the nearest neighbors
+
+### STORE and USE
+
+- The generated dataset is written to the `./data` directory (or `./data/<path>` when `--path` is set) as `artificial-<type>-<d>d-<l>l-<a>a-<metric>-10N.hdf5`, where `N` is the number of zero digits in `--n` (e.g., `107` for 10,000,000 training vectors).
+- To benchmark a new artificial dataset, add its name as a key of `ART_DATASETS` in `./bigvectorbench/datasets.py` (around line 947), as shown in the sketch below.
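+
+For example, regenerating a workload similar to `artificial-average-128d-2l-50a-euclidean-107` from the table below would look roughly like this (the argument values are illustrative, not necessarily the ones used to publish that file):
+
+```bash
+python create_artificial_datasets.py --type average --n 10000000 --m 10000 --d 128 --l 2 --a 50 --metric euclidean --center 100 --topk 200
+```
+
+A minimal sketch of the registration step in `bigvectorbench/datasets.py` (the dataset name is a placeholder for whatever file you generated; `artificial_dataset` is the download helper added by this patch):
+
+```python
+ART_DATASETS: Dict[str, Callable[[str], None]] = {
+    # ... existing entries ...
+    # hypothetical, locally generated workload
+    "artificial-average-128d-2l-50a-euclidean-107": lambda out_fn: artificial_dataset(
+        out_fn, "artificial-average-128d-2l-50a-euclidean-107"
+    ),
+}
+```
+
+If the `.hdf5` file was generated locally, keeping it in `./data/` under the registered name should avoid any download attempt.
+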
+ +### Completed artificial workloads + +| Dataset | Data / Query Points | Type | Dimension | Distance | Label Numbers | Filter Ratio |Download | Raw Data | +| :--------------------------------------------------------------------: | :------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------: | :---------: | :---------: | :--------: | :---------: |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------: | +| msong-1filter-80a | 990,000 / 10,000 | real vectors with artificial labels | 420 | Euclidean | 1 | 80% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/msong-1filter-80a.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/msong-1filter-80a.hdf5) | [msong](https://www.cse.cuhk.edu.hk/systems/hash/gqr/datasets.html) | +| deep1M-2filter-50a | 1,000,000 / 10,000 | real vectors with artificial labels | 256 | Euclidean | 2 | 50% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/deep1M-2filter-50a.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/deep1M-2filter-50a.hdf5) | [deep1M](https://www.cse.cuhk.edu.hk/systems/hash/gqr/datasets.html) | +| tiny5m-6filter-12a | 5,000,000 / 10,000 | real vectors with artificial labels | 384 | Euclidean | 6 | 12% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/tiny5m-6filter-12a.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/tiny5m-6filter-12a.hdf5) | [tiny5m](https://www.cse.cuhk.edu.hk/systems/hash/gqr/datasets.html) | +| sift10m-6filter-6a | 10,000,000 / 10,000 | real vectors with artificial labels | 128 | Euclidean | 6 | 6% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/sift10m-6filter-6a.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/sift10m-6filter-6a.hdf5) | [sift10m](https://www.cse.cuhk.edu.hk/systems/hash/gqr/datasets.html) | +| artificial-average-128d-1l-80a-euclidean-107 | 10,000,000 / 10,000 |artificial vectors with labels | 128 | Euclidean | 1 | 80% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-128d-1l-80a-euclidean-107.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-128d-1l-80a-euclidean-107.hdf5) | - | +| artificial-average-128d-2l-50a-euclidean-107 | 10,000,000 / 10,000 |artificial vectors with labels | 128 | Euclidean | 2 | 50% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-128d-2l-50a-euclidean-107.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-128d-2l-50a-euclidean-107.hdf5) | - | +| artificial-average-384d-6l-12a-euclidean-107 | 10,000,000 / 10,000 |artificial vectors with labels | 384 | Euclidean | 6 | 12% | 
[link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/rtificial-average-384d-6l-12a-euclidean-107.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/rtificial-average-384d-6l-12a-euclidean-107.hdf5) | - | +| artificial-average-768d-6l-6a-euclidean-107 | 10,000,000 / 10,000 |artificial vectors with labels | 768 | Euclidean | 6 | 6% | [link1](https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-768d-6l-6a-euclidean-107.hdf5), [link2](https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/artificial-average-768d-6l-6a-euclidean-107.hdf5) | - | + ## Contributing For the development of BigVectorBench, we welcome contributions from the community. If you are interested in contributing to this project, please follow the .[Guidelines for Contributing](./CONTRIBUTING.md). diff --git a/bigvectorbench/datasets.py b/bigvectorbench/datasets.py index 999e285..756b8be 100644 --- a/bigvectorbench/datasets.py +++ b/bigvectorbench/datasets.py @@ -57,6 +57,7 @@ def get_dataset_fn(dataset_name: str) -> str: """ if not os.path.exists("data"): os.mkdir("data") + return os.path.join("data", f"{dataset_name}.hdf5") @@ -77,10 +78,14 @@ def get_dataset(dataset_name: str) -> Tuple[h5py.File, int]: if dataset_name in ANN_DATASETS or dataset_name in RANDOM_DATASETS: dataset_url = f"https://ann-benchmarks.com/{dataset_name}.hdf5" elif dataset_name in BVB_DATASETS: - dataset_url = f"https://huggingface.co/datasets/Patrickcode/BigVectorBench/resolve/main/{dataset_name}.hdf5" + dataset_url = f"https://huggingface.co/datasets/Patrickcode/BigVectorBench/blob/main/{dataset_name}.hdf5" # dataset_url = f"https://hf-mirror.com/datasets/Patrickcode/BigVectorBench/resolve/main/{dataset_name}.hdf5" + elif dataset_name in ART_DATASETS: + dataset_url = f"https://huggingface.co/datasets/Patrickcode/BigVectorBench/blob/main/{dataset_name}.hdf5" + # dataset_url = f"https://hf-mirror.com/datasets/Patrickcode/BigVectorBench/blob/main/{dataset_name}.hdf5" else: - raise ValueError(f"Unknown dataset: {dataset_name}") + raise ValueError(f"Unknown dataset: {dataset_name},datasets should be in {DATASETS.keys()} or be created by create_datasets.py then added in ART_DATASETS.key()") + try: download(dataset_url, hdf5_filename) except Exception: @@ -931,7 +936,31 @@ def dbpedia_entities_openai3_text_embedding_3_large_1536_1M(out_fn, i, distance) } ) +def artificial_dataset(out_fn: str, dataset_name: str) -> None: + """ + bvb_dataset: Downloads a dataset from the BigVectorBench repository on Hugging Face Datasets Hub + """ + dataset_url = f"https://huggingface.co/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/{dataset_name}.hdf5" + # dataset_url = f"https://hf-mirror.com/datasets/AnnaZh/Bigvectorbench-artificial-datasets/resolve/main/{dataset_name}.hdf5" + download(dataset_url, out_fn) + +ART_DATASETS: Dict[str, Callable[[str], None]] = { + "deep1M-2filter-50a": lambda out_fn: artificial_dataset( + out_fn, "deep1M-2filter-50a" + ), + "msong-1filter-80a": lambda out_fn: artificial_dataset( + out_fn, "msong-1filter-80a" + ), + "sift10m-6filter-6a": lambda out_fn: artificial_dataset( + out_fn, "sift10m-6filter-6a" + ), + "tiny5m-6filter-12a": lambda out_fn: artificial_dataset( + out_fn, "tiny5m-6filter-12a" + ) +} + DATASETS: Dict[str, Callable[[str], None]] = {} DATASETS.update(RANDOM_DATASETS) DATASETS.update(ANN_DATASETS) DATASETS.update(BVB_DATASETS) 
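+# Register the artificial workloads alongside the other dataset families so that
+# get_dataset() can resolve them by name like any built-in dataset.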
+DATASETS.update(ART_DATASETS) \ No newline at end of file diff --git a/bigvectorbench/runner.py b/bigvectorbench/runner.py index 3f0eb9a..c35ddd7 100644 --- a/bigvectorbench/runner.py +++ b/bigvectorbench/runner.py @@ -132,7 +132,7 @@ def batch_query( """ exprs = None if filter_expr_func is not None: - exprs = [filter_expr(*labels) for labels in X_labels] + exprs = [filter_expr(*(labels.flatten())) for labels in X_labels] # TODO: consider using a dataclass to represent return value. if prepared_queries: algo.prepare_batch_query(X, count, exprs) @@ -207,7 +207,7 @@ def single_multi_vector_query(vs: np.ndarray): results = batch_query(X_test, X_test_label) else: results = [ - single_query(x, labels) + single_query(x, labels.flatten()) for x, labels in zip(X_test, X_test_label) ] else: @@ -267,7 +267,7 @@ def run_individual_insert( else: for i, (x, labels) in enumerate(zip(X_test, X_test_label)): start = time.time() - algo.insert(x, labels) + algo.insert(x, labels.flatten()) latencies.append(time.time() - start) if i % 1000 == 0: print(f"Processed {i}/{len(X_test)} inserts...") @@ -305,7 +305,7 @@ def run_individual_update( for i, (x, labels) in enumerate(zip(X_test, X_test_label)): idx = np.random.randint(num_entities) start = time.time() - algo.update(idx, x, labels) + algo.update(idx, x, labels.flatten()) latencies.append(time.time() - start) if i % 1000 == 0: print(f"Processed {i}/{len(X_test)} updates...") diff --git a/create_artificial_datasets.py b/create_artificial_datasets.py new file mode 100644 index 0000000..dbefadb --- /dev/null +++ b/create_artificial_datasets.py @@ -0,0 +1,572 @@ +import os +import random +import tarfile +from urllib.request import urlopen, urlretrieve, build_opener, install_opener +from typing import Any, Callable, Dict, Tuple +import gzip +import zipfile +import struct +import h5py +import numpy as np +from tqdm import tqdm +import time +import math +# from datasets import load_dataset +from sklearn import random_projection +from sklearn.model_selection import train_test_split +from sklearn.datasets import make_blobs +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.neighbors import NearestNeighbors +from scipy.sparse import lil_matrix +from implicit.datasets.lastfm import get_lastfm +from implicit.als import AlternatingLeastSquares +from implicit.nearest_neighbours import bm25_weight +from implicit.utils import augment_inner_product_matrix +import argparse + +def positive_int(input_str: str) -> int: + """ + Validates if the input string can be converted to a positive integer. + + Args: + input_str (str): The input string to validate and convert to a positive integer. + + Returns: + int: The validated positive integer. + + Raises: + argparse.ArgumentTypeError: If the input string cannot be converted to a positive integer. + """ + try: + i = int(input_str) + if i < 1: + raise ValueError + except ValueError as exc: + raise argparse.ArgumentTypeError( + f"{input_str} is not a positive integer" + ) from exc + + return i + +def parse_arguments() -> argparse.Namespace: + """ + Parses the command line arguments and returns the parsed arguments. + + Returns: + argparse.Namespace: The parsed arguments. 
+ """ + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--type", + metavar="TYPE", + help="the dataset to be generated as", + default="random", + choices={"random","average"}, + ) + parser.add_argument( + "--n", + default=10000, + type=positive_int, + help="the number of train data to be generated", + ) + parser.add_argument( + "--m", + default=1000, + type=positive_int, + help="the number of test data to be generated", + ) + parser.add_argument( + "--a", + default=80, + type=positive_int, + help="a'%' is the ratio of data to be filtered", + ) + parser.add_argument( + "--d", + default=128, + type=positive_int, + help="the dimension of data to be generated", + ) + parser.add_argument( + "--l", + default=1, + type=positive_int, + help="the number of labels for data to be generated", + ) + parser.add_argument( + "--path", + default="", + help="the creates dataset will be at 'data/path/***.hdf5'", + ) + parser.add_argument( + "--center", + default=100, + type=positive_int, + help="the number of centers for data to be generated", + ) + parser.add_argument( + "--metric", + default="inner_product", + help="the metric type for distance to be calculated", + ) + parser.add_argument( + "--maxlabel", + default=100000, + type=positive_int, + help="the max label value to be generated", + ) + parser.add_argument( + "--topk", + default=200, + type=positive_int, + help="the topk neighbors to be retriveled", + ) + + args = parser.parse_args() + + return args + +def inner_product_metric(u, v): + return -np.dot(u, v) + +def metric_mapping(_metric: str): + """ + Mapping metric type to milvus metric type + + Args: + _metric (str): metric type + + Returns: + str: milvus metric type + """ + _metric = _metric.lower() + _metric_type = {"angular": "cosine", "euclidean": "euclidean","inner_product":inner_product_metric}.get(_metric, None) + if _metric_type is None: + raise ValueError(f"[Milvus] Not support metric type: {_metric}!!!") + return _metric_type + + +def filter_knn_calculate( + distance:str, + train_vec: np.ndarray, + test_vec: np.ndarray, + train_label: np.ndarray, + test_label: np.ndarray, + topk:int, + ratio_request: float, +) -> None: + neighbors_ds = np.full((len(test_vec), topk), -1, dtype=np.int32) + distances_ds = np.full((len(test_vec), topk), -1.0, dtype=np.float32) + maxcnt = 0 + mincnt = train_vec.shape[0] + sumcnt = 0 + if ratio_request >= 0.2: + for i, qry in tqdm(enumerate(test_vec),desc="Processing"): + vec_set = [] + idx_set = [] + qry_label_left = [] + qry_label_right = [] + + n_samples_fit = min(int(topk*1.5/ratio_request), train_vec.shape[0]) + nn = NearestNeighbors(n_neighbors=n_samples_fit, metric=metric_mapping(distance), n_jobs=-1,algorithm='brute') + nn.fit(train_vec) + distances, indices = nn.kneighbors(np.array([qry])) + + neighbors_tmp = np.full((int(topk*1.5/ratio_request)), -1, dtype=np.int32) + distances_tmp = np.full((int(topk*1.5/ratio_request)), -1.0, dtype=np.float32) + labels_tmp = np.full((int(topk*1.5/ratio_request),train_label.shape[1]), 0, dtype=np.int32) + + neighbors_tmp[:n_samples_fit] = indices[0] + distances_tmp[:n_samples_fit] = distances[0] + labels_tmp[:n_samples_fit] = train_label[indices[0]] + + flags = np.zeros(n_samples_fit, dtype=int) + for j,j_test_label in enumerate(test_label[i]): + # qry_label_left.append(j_test_label[0]) + # qry_label_right.append(j_test_label[1]) + left = j_test_label[0] + right = j_test_label[1] + flags += (left <= labels_tmp[:,j]) & (labels_tmp[:,j] <= right) 
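+            # flags[k] now counts how many of the per-label range predicates candidate k
+            # satisfies; only candidates with flags[k] == train_label.shape[1] (every label
+            # within its requested range) are kept as valid filtered neighbors below.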
+ + cnt = 0 + lens = train_label.shape[1] + for j,flag in enumerate(flags): + if flag == lens: + vec_set.append(distances_tmp[j]) + idx_set.append(neighbors_tmp[j]) + cnt += 1 + + if cnt < topk: + print(f"Warning: {i}-th query has {cnt} neighbors") + + train_vec_scope = np.array(vec_set, dtype=np.float32) + train_idx_scope = np.array(idx_set, dtype=np.int32) + + if i % 100 == 0: + print(f"{i}-th query : train_vec_scope.shape: {train_vec_scope.shape}, train_idx_scope.shape: {train_idx_scope.shape}") + + n_samples_fit = min(topk, train_vec_scope.shape[0]) + if(n_samples_fit < 1): + print(f"Warning: {i}-th query has {n_samples_fit} neighbors") + continue + + if cnt > maxcnt: + maxcnt = cnt + if cnt < mincnt: + mincnt = cnt + sumcnt += cnt + + neighbors_ds[i, :topk] = train_idx_scope[:topk] + distances_ds[i, :topk] = train_vec_scope[:topk] + print(f"min filter ratio is {mincnt/(int(topk*2/ratio_request))}") + print(f"max filter ratio is {maxcnt/ (int(topk*2/ratio_request)) } ") + print(f"average filter ratio is {sumcnt/((int(topk*2/ratio_request))*test_vec.shape[0])}") + + + else: + for i, qry in tqdm(enumerate(test_vec),desc="Processing"): + vec_set = [] + idx_set = [] + qry_label_left = [] + qry_label_right = [] + lens = train_label.shape[1] + flags = np.zeros(train_label.shape[0], dtype=int) + for j,j_test_label in enumerate(test_label[i]): + # qry_label_left.append(j_test_label[0]) + # qry_label_right.append(j_test_label[1]) + left = j_test_label[0] + right = j_test_label[1] + flags += (left <= train_label[:,j]) & (train_label[:,j] <= right) + + cnt = 0 + + lens = train_label.shape[1] + for j,flag in enumerate(flags): + if flag == lens: + vec_set.append(train_vec[j]) + idx_set.append(j) + cnt += 1 + + if cnt < topk: + print(f"Warning: {i}-th query has {cnt} neighbors") + + train_vec_scope = np.array(vec_set, dtype=np.float32) + train_idx_scope = np.array(idx_set, dtype=np.int32) + + if i % 100 == 0: + print(f"{i}-th query : train_vec_scope.shape: {train_vec_scope.shape}, train_idx_scope.shape: {train_idx_scope.shape}") + + n_samples_fit = min(topk, train_vec_scope.shape[0]) + if(n_samples_fit < 1): + print(f"Warning: {i}-th query has {n_samples_fit} neighbors") + continue + + if cnt > maxcnt: + maxcnt = cnt + if cnt < mincnt: + mincnt = cnt + sumcnt += cnt + + nn = NearestNeighbors(n_neighbors=n_samples_fit, metric=metric_mapping(distance), n_jobs=-1,algorithm='brute') + nn.fit(train_vec_scope) + distances, indices = nn.kneighbors(np.array([qry])) + + neighbors_ds[i, :n_samples_fit] = train_idx_scope[indices[0]] + distances_ds[i, :n_samples_fit] = distances[0] + + print(f"min filter ratio is {mincnt/(train_vec.shape[0])}") + print(f"max filter ratio is {maxcnt/(train_vec.shape[0])}") + print(f"average filter ratio is {sumcnt/(train_vec.shape[0]*test_vec.shape[0])}") + + if distance == "inner_product": + distances_ds = -distances_ds + return neighbors_ds,distances_ds + +def write_groundtruth_output( + fn: str, + train_vec: np.ndarray, + test_vec: np.ndarray, + train_label: np.ndarray, + test_label: np.ndarray, + topk: int, + ratio_request: float, +) -> None: + with h5py.File(fn, "w") as f: + + neighbors,distances = filter_knn_calculate(train_vec,test_vec,train_label,test_label,topk,ratio_request) + + f.create_dataset("neighbors",data=neighbors,maxshape=(None, neighbors.shape[1]), chunks=(10000, neighbors.shape[1]), dtype=int) + f.create_dataset("distances",data=distances,maxshape=(None, distances.shape[1]), chunks=(10000, distances.shape[1]), dtype=float) + + f.close() + print(f"groundtruth 
is already:{fn}") + +def write_filter_output( + fn: str, + train_vec: np.ndarray, + test_vec: np.ndarray, + train_label: np.ndarray, + test_label: np.ndarray, + distance: str, + filter_expr_func: str, + label_names: list[str], + label_types: list[str], + label_ranges: list[str], + label_range_types: list[str], + point_type: str, + topk: int, + ratio_request: float, +) -> None: + with h5py.File(fn, "w") as f: + f.attrs["type"] = "filter-ann" + f.attrs["distance"] = distance + f.attrs["dimension"] = len(train_vec[0]) + f.attrs["point_type"] = point_type + f.attrs["label_names"] = label_names + f.attrs["label_types"] = label_types + f.attrs["label_ranges"] = label_ranges + f.attrs["label_range_types"] = label_range_types + f.attrs["filter_expr_func"] = filter_expr_func + + print(f"train size: {train_vec.shape[0]} * {train_vec.shape[1]}") + print(f"test size: {test_vec.shape[0]} * {test_vec.shape[1]}") + f.create_dataset("train_vec", data=train_vec, maxshape=(None, train_vec.shape[1]), chunks=(10000, train_vec.shape[1]), dtype=float) + f.create_dataset("test_vec", data=test_vec, maxshape=(None, test_vec.shape[1]), chunks=(10000, test_vec.shape[1]), dtype=float) + + test_label_scope = np.array(test_label, dtype=np.int32) + print(f"train size: {train_label.shape[0]} * {train_label.shape[1]}") + print(f"test size: {test_label_scope.shape[0]} * {test_label_scope.shape[1]} * {test_label_scope.shape[2]}") + f.create_dataset("train_label", data=train_label, maxshape=(None, train_label.shape[1]), chunks=(10000, train_label.shape[1]), dtype=int) + f.create_dataset("test_label", data=test_label_scope, maxshape=(None, None, test_label_scope.shape[2]), chunks=(10000, test_label_scope.shape[1],test_label_scope.shape[2]), dtype=int) + + neighbors,distances = filter_knn_calculate(distance,train_vec,test_vec,train_label,test_label,topk,ratio_request) + + f.create_dataset("neighbors",data=neighbors,maxshape=(None, neighbors.shape[1]), chunks=(10000, neighbors.shape[1]), dtype=int) + f.create_dataset("distances",data=distances,maxshape=(None, distances.shape[1]), chunks=(10000, distances.shape[1]), dtype=float) + + f.close() + + print(f"datafile is already:{fn}") + + + +def generate_random_ranges(generate_type, sum, num_ranges, min_val, max_val, train_labels, ratio_request): + label_ranges = [] + if generate_type=="average": + init_random_ratio = ratio_request + else: init_random_ratio = random.uniform(ratio_request * 0.95, min(1.0, ratio_request * 1.05)) + + if num_ranges == 1: + val = max_val * init_random_ratio + min_side = min_val + max_side = max_val - val + + lefts = np.random.randint(min_side,max_side,size=sum) + for i in tqdm(range(sum),desc="Processing"): + random_ranges = [] + left = lefts[i] + right = left + val + random_ranges.append((left, right)) + label_ranges.append(random_ranges) + + else: + random_ratio = [] + random_ratio.append(init_random_ratio) + + a_ratio = [] + a_ratio.append(1.0) + for i in range(num_ranges): + random_ratio_i = random_ratio[i]/a_ratio[i] + aver_random_ratio = math.pow(random_ratio_i, 1/(num_ranges-i)) + ai = random.uniform(max(aver_random_ratio * 0.7,math.pow(random_ratio_i,1/2)),min(1.0, (aver_random_ratio*1.3))) + # print(i,random_ratio_i,aver_random_ratio,ai) + if i==num_ranges-1: a_ratio.append(aver_random_ratio) + else:a_ratio.append(ai) + random_ratio.append(random_ratio_i) + + a_ratio_random = 1.0 + for i in range(num_ranges): + a_ratio_random *= a_ratio[i+1] + print(a_ratio) + print(a_ratio_random) + + range_set = [1] + for i in range(num_ranges): + if 
range_set[i] >= max_val/1000: + for j in range(num_ranges-i-1): + range_set.append(1) + break + range_set.append(range_set[i]*10) + print(range_set) + + lefts = [] + val_js = [] + for j in range(num_ranges): + val_j = max_val / range_set[j] * min(a_ratio[j+1],1) + min_j = min_val + max_j = max(1, int(max_val / range_set[j] - val_j)) + lefts.append(np.random.randint(min_j,max_j,size=sum)) + val_js.append(val_j) + + for i in tqdm(range(sum),desc="Processing"): + random_ranges = [] + for j in range(num_ranges): + left_j = lefts[j][i] + rigth_j = left_j + int(val_js[j]) + random_ranges.append((left_j,rigth_j)) + label_ranges.append(random_ranges) + + # for i in range(1,20): + # for j in range(num_ranges): + # print(label_ranges[i][j][0],label_ranges[i][j][1]) + + return label_ranges + + +def create_filter( + out_fn: str, + generate_type: str, + n_dims: int, + n_samples: int, + m_test:int, + centers: int, + n_filters: int, + max_labels:int, + ratio_request: float, + distance: str = "inner_product", + topk: int = 200, +) -> None: + """Gen random filter dataset with n_filters filters""" + print(f"now_dataset: {out_fn}") + X = make_blobs( + n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1 + )[0] + train_X, test_X = train_test_split(X, test_size=m_test,random_state=42) + # test_X = make_blobs( + # n_samples=m_test, n_features=n_dims, centers=centers, random_state=1 + # )[0] + + train_label_names = [f"label_{i}" for i in range(n_filters)] + train_label_types = ["int32" for i in range(n_filters)] + print(f"train_labels_names: {train_label_names}") + print(f"train_labels_types: {train_label_types}") + + train_label_min = 0 + train_label_max = max_labels + + # label 的设计默认将不同的label限制在不同的数量级中 + range_set = [1] + for i in range(n_filters): + if range_set[i] >= train_label_max/1000: + for j in range(n_filters-i-1): + range_set.append(1) + break + range_set.append(range_set[i]*10) + print(range_set) + + train_label = [] + for i in range(n_filters): + train_label_max_now = train_label_max/range_set[i] + train_label.append(np.random.randint(train_label_min, train_label_max_now, size=n_samples-m_test)) + train_labels = np.array(train_label) + + train_labels = train_labels.T + print(train_labels.shape) + + test_label_range_names = [[f"label_l_{i}",f"label_r_{i}"] for i in range(n_filters)] + test_label_range_types = [["int32","int32"] for i in range(n_filters)] + print(f"test_label_range_names: {test_label_range_names}") + print(f"test_label_range_types: {test_label_range_types}") + + filter_expr = " and ".join( + [ + f"{train_label_names[i]} >= " + "{" + f"{label_range[0]}" + "}" + " and " + f"{train_label_names[i]} <= " + "{" + f"{label_range[1]}" + "}" + for i,label_range in enumerate(test_label_range_names) + ] + ) + print(f"filter_expr: {filter_expr}") + + if n_filters > 1: + # filter_expr_func = '''def filter_expr(label_l_0, label_r_0, label_l_1, label_r_1, label_l_2, label_r_2, label_l_3, label_r_3, label_l_4, label_r_4, label_l_5, label_r_5): + # return f"label_0 >= {label_l_0} and label_0 <= {label_r_0} and label_1 >= {label_l_1} and label_1 <= {label_r_1} and label_2 >= {label_l_2} and label_2 <= {label_r_2} and label_3 >= {label_l_3} and label_3 <= {label_r_3} and label_4 >= {label_l_4} and label_4 <= {label_r_4} and label_5 >= {label_l_5} and label_5 <= {label_r_5}" + # ''' + filter_expr_func = '''def filter_expr(label_l_0, label_r_0, label_l_1, label_r_1): + return f"label_0 >= {label_l_0} and label_0 <= {label_r_0} and label_1 >= {label_l_1} and label_1 <= {label_r_1}" + ''' 
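+        # NOTE: this branch hard-codes a two-label filter expression (label_0 and label_1
+        # only); supporting more labels would need one label_l_i/label_r_i pair per filter
+        # in the signature and the returned expression, as in the commented-out
+        # six-label variant above.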
+ else: + filter_expr_func = '''def filter_expr(label_l_0, label_r_0): + return f"label_0 >= {label_l_0} and label_0 <= {label_r_0}" + ''' + + print(f"filter_expr_func: {filter_expr_func}") + test_label_min = 0 + test_label_max = max_labels + test_labels_range = generate_random_ranges(generate_type, m_test, n_filters, test_label_min, test_label_max,train_labels,ratio_request) + + write_filter_output( + out_fn, + train_X, + test_X, + train_labels, + test_labels_range, + distance, + filter_expr_func, + train_label_names, + train_label_types, + test_label_range_names, + test_label_range_types, + "float", + topk, + ratio_request, + ) + + +if __name__ == "__main__": + + args = parse_arguments() + + if not os.path.exists("data"): + os.mkdir("data") + + if not os.path.exists("data/" + args.path): + os.mkdir("data/" + args.path) + + out_fn = "data/" + args.path + f"/artificial-{args.type}-{args.d}d-{args.l}l-{args.a}a-{args.metric}-10{str(args.n).count('0')}.hdf5" + + create_filter( + out_fn, + args.type, + args.d, + args.n, + args.m, + args.center, + args.l, + args.maxlabel, + args.a/100, + args.metric, + args.topk, + ) + + # out_ground_truth_fn = args.path + f"{args.type}-{args.d}d-{args.l}l-{args.a}a-groundtruth.hdf5" + + # with h5py.File(out_fn, "r") as f: + # train_vec=f["train_vec"][:] + # test_vec=f["test_vec"][:] + # train_label=f["train_label"][:] + # test_label=f["test_label"][:] + # write_groundtruth_output( + # out_ground_truth_fn, + # train_vec, + # test_vec, + # train_label, + # test_label, + # args.topk, + # ratio_request, + # ) + # f.close() + + +
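+# NOTE: write_groundtruth_output() is only reachable from the commented-out block above
+# and, as written, calls filter_knn_calculate() without the leading `distance` argument
+# that write_filter_output() passes; that argument would need to be added before the
+# block is re-enabled.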