diff --git a/docs/ivf.md b/docs/ivf.md new file mode 100644 index 000000000..95370d7a3 --- /dev/null +++ b/docs/ivf.md @@ -0,0 +1,158 @@ +# IVF Index + +## Definition +IVF (Inverted File Index) improves search efficiency by **partitioning** data into buckets, thus reducing the search scope. + +## Working Principle +1. **Clustering Phase**: + First, perform a clustering operation on the entire high-dimensional vector dataset, dividing it into multiple non-overlapping clusters (also known as inverted lists). Commonly used clustering algorithms include K-means. The cluster centers are called centroids. Suppose there are $n$ vectors clustered into $m$ clusters, with each cluster having a centroid. +2. **Index Building Phase**: + Each vector is assigned to the cluster whose centroid is closest to it. The vector's information (such as the vector ID) is added to the corresponding inverted list. In this way, the IVF index is built, with each inverted list containing all the vectors belonging to that cluster. +3. **Search Phase**: + When a query vector is given, first calculate the distances between the query vector and all centroids, and find the $k$ closest centroids ($k$ is a retrieval parameter; see `scan_buckets_count` below). Then, perform an exact nearest-neighbor search only within the inverted lists corresponding to these $k$ centroids, significantly reducing the number of vectors to be searched. + +## Suitable Scenarios (Recommended for use if any 2 to 4 of the following conditions are met) +1. The vector dimension is not very high, usually less than 512 dimensions. High dimensions may lead to the "curse of dimensionality" problem (disadvantage). +2. Large-scale data scenarios, typically with over 100 million data points. +3. Memory-constrained scenarios, as its memory usage is lower than that of graph algorithms. +4. Large top-k recall requirements or complex filtering scenarios.
+ +## Usage +For examples, refer to [106_index_ivf.cpp](https://github.com/antgroup/vsag/blob/main/examples/cpp/106_index_ivf.cpp). + +## Detailed Explanation of Building Parameters + +### partition_strategy_type +- **Parameter Type**: string +- **Parameter Description**: Bucket partitioning strategy type +- **Optional Values**: "ivf", "gno_imi" +- **Default Value**: "ivf" + +### first_order_buckets_count +- **Parameter Type**: int +- **Parameter Description**: Only effective when `partition_strategy_type` is "gno_imi", representing the number of first-level buckets. +- **Optional Values**: 1 to INT_MAX +- **Default Value**: 10 + +### second_order_buckets_count +- **Parameter Type**: int +- **Parameter Description**: Only effective when `partition_strategy_type` is "gno_imi", representing the number of second-level buckets. +- **Optional Values**: 1 to INT_MAX +- **Default Value**: 10 + +### buckets_count +- **Parameter Type**: int +- **Parameter Description**: Only effective when `partition_strategy_type` is "ivf", representing the number of buckets.
+- **Optional Values**: 1 to INT_MAX +- **Default Value**: 10 + +### ivf_train_type +- **Parameter Type**: string +- **Parameter Description**: Clustering algorithm type +- **Optional Values**: "kmeans", "random" +- **Default Value**: "kmeans" + +### base_quantization_type +- **Parameter Type**: string +- **Parameter Description**: Coarse-ranking vector quantization type (encoding of in-bucket vectors) +- **Optional Values**: "fp32", "fp16", "bf16", "sq8", "sq8_uniform", "sq4_uniform", "pq", "rabitq", "pqfs" +- **Default Value**: "fp32" + +### base_io_type +- **Parameter Type**: string +- **Parameter Description**: Coarse-ranking vector IO type (storage access type of in-bucket vectors) +- **Optional Values**: "memory_io", "block_memory_io" +- **Default Value**: "memory_io" + +### base_pq_dim +- **Parameter Type**: int +- **Parameter Description**: Coarse-ranking vector PQ dimension, used for re-ranking +- **Optional Values**: 1 to dim +- **Default Value**: 1 + +### use_reorder +- **Parameter Type**: bool +- **Parameter Description**: Whether to use re-ranking +- **Optional Values**: true, false +- **Default Value**: false + +### precise_quantization_type +- **Parameter Type**: string +- **Parameter Description**: Fine-ranking vector quantization type, used for re-ranking +- **Optional Values**: "fp32", "fp16", "bf16", "sq8", "sq8_uniform", "sq4_uniform", "pq", "rabitq", "pqfs" +- **Default Value**: "fp32" + +### precise_io_type +- **Parameter Type**: string +- **Parameter Description**: Fine-ranking vector IO type, used for re-ranking +- **Optional Values**: "memory_io", "block_memory_io", "mmap_io", "buffer_io", "async_io", "reader_io" +- **Default Value**: "block_memory_io" + +### precise_file_path +- **Parameter Type**: string +- **Parameter Description**: Fine-ranking vector file path, used for re-ranking +- **Optional Values**: Any valid file path +- **Default Value**: "" + +## Examples for Build Parameter String +```json
+"index_param": { + "buckets_count": 50, + "base_quantization_type": "fp32", + "partition_strategy_type": "ivf", + "ivf_train_type": "kmeans" +} +``` +This means that the index is built using 50 buckets, the base quantization type is fp32, the partition strategy type is ivf, and the ivf train type is kmeans. + +```json +"index_param": { + "buckets_count": 50, + "base_quantization_type": "pqfs", + "partition_strategy_type": "ivf", + "ivf_train_type": "random", + "precise_quantization_type": "fp16", + "use_reorder": true, + "base_pq_dim": 32, + "precise_io_type": "async_io", + "precise_file_path": "./precise_codes" +} +``` +This means that the index is built using 50 buckets, the base quantization type is pqfs with pq dim = 32, the partition strategy type is ivf, and the ivf train type is random. This configuration also enables reordering: the precise quantization type is fp16, libaio's asynchronous I/O is used for precise operations, and the file for precise codes is './precise_codes'. + +## Detailed Explanation of Search Parameters + +### scan_buckets_count +- **Parameter Type**: int +- **Parameter Description**: Number of buckets to scan +- **Optional Values**: 1 to buckets_count +- **Default Value**: **must be provided (no default value)** + +### factor +- **Parameter Type**: float +- **Parameter Description**: Scan factor, used for reordering. For example, if topk=10 and factor=2.0, the IVF stage recalls 20 points, which are then reordered using the precise codes +- **Optional Values**: 1.0 to FLOAT_MAX +- **Default Value**: 2.0 + +### parallelism +- **Parameter Type**: int +- **Parameter Description**: Number of threads to use for parallel search per query +- **Optional Values**: 1 to INT_MAX +- **Default Value**: 1 (only the main search thread performs the search) + +### timeout_ms +- **Parameter Type**: double +- **Parameter Description**: Maximum time cost in milliseconds for each query, used to control the search time cost +- **Optional Values**: 1 to DOUBLE_MAX +- **Default
Value**: DOUBLE_MAX + +## Examples for Search Parameter String +```json +"ivf": { + "scan_buckets_count": 10, + "factor": 2.0, + "parallelism": 4, + "timeout_ms": 30.0 +} +``` +This means that the search will scan 10 buckets, the factor is 2.0, the parallelism is 4 (around 4 threads per query), and the maximum time cost is 30 ms (when the search time exceeds 30 ms, the search returns the current result). diff --git a/include/vsag/dataset.h b/include/vsag/dataset.h index 8a01a25f4..f76292c58 100644 --- a/include/vsag/dataset.h +++ b/include/vsag/dataset.h @@ -270,6 +270,32 @@ class Dataset : public std::enable_shared_from_this { */ virtual int64_t GetExtraInfoSize() const = 0; + + /** + * @brief Sets the statistics for the dataset. + * + * @param Statstics The statistics string. + * @return DatasetPtr A shared pointer to the dataset with updated statistics. + */ + virtual DatasetPtr + Statstics(const std::string& Statstics) = 0; + + /** + * @brief Retrieves all statistics of the dataset. + * + * @return std::string The statistics string. + */ + virtual std::string + GetStatstics() const = 0; + + /** + * @brief Retrieves the statistics of the dataset for the given keys. + * + * @param stat_keys The vector of stat keys. + * @return std::vector The vector of stat values.
+ */ + virtual std::vector + GetStatstics(const std::vector& stat_keys) const = 0; }; }; // namespace vsag diff --git a/src/algorithm/hgraph.cpp b/src/algorithm/hgraph.cpp index 8aacd2dc4..84ec0b163 100644 --- a/src/algorithm/hgraph.cpp +++ b/src/algorithm/hgraph.cpp @@ -985,12 +985,11 @@ HGraph::add_one_point(const void* data, int level, InnerIdType inner_id) { bool HGraph::graph_add_one(const void* data, int level, InnerIdType inner_id) { DistHeapPtr result = nullptr; - InnerSearchParam param{ - .topk = 1, - .ep = this->entry_point_id_, - .ef = 1, - .is_inner_id_allowed = nullptr, - }; + InnerSearchParam param; + param.topk = 1; + param.ep = this->entry_point_id_; + param.ef = 1; + param.is_inner_id_allowed = nullptr; LockGuard cur_lock(neighbors_mutex_, inner_id); auto flatten_codes = basic_flatten_codes_; @@ -1675,6 +1674,7 @@ HGraph::SearchWithRequest(const SearchRequest& request) const { search_param.ef = 1; search_param.is_inner_id_allowed = nullptr; search_param.search_alloc = search_allocator; + const auto* raw_query = get_data(query); for (auto i = static_cast(this->route_graphs_.size() - 1); i >= 0; --i) { auto result = this->search_one_graph( @@ -1703,6 +1703,11 @@ HGraph::SearchWithRequest(const SearchRequest& request) const { search_param.is_inner_id_allowed = ft; search_param.topk = static_cast(search_param.ef); search_param.consider_duplicate = true; + if (params.enable_time_record) { + search_param.time_cost = std::make_shared(); + search_param.time_cost->SetThreshold(params.timeout_ms); + (*search_param.stats)["is_timeout"] = false; + } auto search_result = this->search_one_graph( raw_query, this->bottom_graph_, this->basic_flatten_codes_, search_param); @@ -1734,6 +1739,7 @@ HGraph::SearchWithRequest(const SearchRequest& request) const { } search_result->Pop(); } + dataset_results->Statstics(search_param.stats->dump()); return std::move(dataset_results); } diff --git a/src/algorithm/hgraph_parameter.cpp b/src/algorithm/hgraph_parameter.cpp index 
402d94164..bf9c691c8 100644 --- a/src/algorithm/hgraph_parameter.cpp +++ b/src/algorithm/hgraph_parameter.cpp @@ -220,6 +220,11 @@ HGraphSearchParameters::FromJson(const std::string& json_string) { obj.use_extra_info_filter = params[INDEX_TYPE_HGRAPH][HGRAPH_USE_EXTRA_INFO_FILTER]; } + if (params[INDEX_TYPE_HGRAPH].contains(SEARCH_MAX_TIME_COST_MS)) { + obj.timeout_ms = params[INDEX_TYPE_HGRAPH][SEARCH_MAX_TIME_COST_MS]; + obj.enable_time_record = true; + } + return obj; } } // namespace vsag diff --git a/src/algorithm/hgraph_parameter.h b/src/algorithm/hgraph_parameter.h index 8abe64c6f..78bc3c271 100644 --- a/src/algorithm/hgraph_parameter.h +++ b/src/algorithm/hgraph_parameter.h @@ -79,6 +79,8 @@ class HGraphSearchParameters { int64_t ef_search{30}; bool use_reorder{false}; bool use_extra_info_filter{false}; + bool enable_time_record{false}; + double timeout_ms{std::numeric_limits::max()}; private: HGraphSearchParameters() = default; diff --git a/src/algorithm/ivf.cpp b/src/algorithm/ivf.cpp index ef44851de..ced145e18 100644 --- a/src/algorithm/ivf.cpp +++ b/src/algorithm/ivf.cpp @@ -626,6 +626,10 @@ IVF::create_search_param(const std::string& parameters, const FilterPtr& filter) param.factor = search_param.topk_factor; param.first_order_scan_ratio = search_param.first_order_scan_ratio; param.parallel_search_thread_count = search_param.parallel_search_thread_count; + if (search_param.enable_time_record) { + param.time_cost = std::make_shared(); + param.time_cost->SetThreshold(search_param.timeout_ms); + } return param; } @@ -633,7 +637,8 @@ DatasetPtr IVF::reorder(int64_t topk, DistHeapPtr& input, const float* query) const { auto [dataset_results, dists, labels] = create_fast_dataset(topk, allocator_); auto reorder_heap = Reorder::ReorderByFlatten(input, reorder_codes_, query, allocator_, topk); - for (int64_t j = topk - 1; j >= 0; --j) { + auto size = static_cast(reorder_heap->Size()); + for (int64_t j = size - 1; j >= 0; --j) { dists[j] = 
reorder_heap->Top().first; labels[j] = label_table_->GetLabelById(reorder_heap->Top().second); reorder_heap->Pop(); @@ -696,6 +701,9 @@ IVF::search(const DatasetPtr& query, const InnerSearchParam& param) const { Vector centroid(dim_, allocator_); Vector dist(allocator_); for (uint64_t i = 0; i < bucket_count; ++i) { + if (param.time_cost != nullptr and param.time_cost->CheckOvertime()) { + break; + } if (i % search_thread_count != thread_id) { continue; } diff --git a/src/algorithm/ivf_parameter.h b/src/algorithm/ivf_parameter.h index 94245c41d..546c2460b 100644 --- a/src/algorithm/ivf_parameter.h +++ b/src/algorithm/ivf_parameter.h @@ -85,6 +85,11 @@ class IVFSearchParameters { obj.parallel_search_thread_count = params[INDEX_TYPE_IVF][IVF_SEARCH_PARALLELISM]; } + if (params[INDEX_TYPE_IVF].contains(SEARCH_MAX_TIME_COST_MS)) { + obj.timeout_ms = params[INDEX_TYPE_IVF][SEARCH_MAX_TIME_COST_MS]; + obj.enable_time_record = true; + } + return obj; } @@ -93,6 +98,8 @@ class IVFSearchParameters { float topk_factor{2.0F}; float first_order_scan_ratio{1.0F}; int64_t parallel_search_thread_count{1}; + double timeout_ms{std::numeric_limits::max()}; + bool enable_time_record{false}; private: IVFSearchParameters() = default; diff --git a/src/dataset_impl.cpp b/src/dataset_impl.cpp index 8c991bddf..03b547956 100644 --- a/src/dataset_impl.cpp +++ b/src/dataset_impl.cpp @@ -15,6 +15,8 @@ #include "dataset_impl.h" +#include "typing.h" + namespace vsag { DatasetPtr @@ -29,4 +31,30 @@ DatasetImpl::MakeEmptyDataset() { return result; } +/** + * Returns, for each key in stat_keys (in the same order), the JSON-serialized value stored + * under that key in the statistics string, or "" when the key is absent. + * + * An empty statistics string (nothing has been set on this dataset) is not a valid JSON + * document, so it is not handed to the parser; every key is reported as absent instead. + */ +std::vector +DatasetImpl::GetStatstics(const std::vector& stat_keys) const { + std::vector result; + result.reserve(stat_keys.size()); + if (this->statstics_.empty()) { + result.resize(stat_keys.size()); + return result; + } + auto json = JsonType::parse(this->statstics_); + for (const auto& key : stat_keys) { + if (json.contains(key)) { + result.emplace_back(json[key].dump()); + } else { + result.emplace_back(""); + } + } + return result; +} + }; // namespace vsag diff --git a/src/dataset_impl.h b/src/dataset_impl.h index b3275728b..9c4960168 100644 --- a/src/dataset_impl.h
+++ b/src/dataset_impl.h @@ -267,6 +267,20 @@ class DatasetImpl : public Dataset { return 0; } + DatasetPtr + Statstics(const std::string& statstics) override { + this->statstics_ = statstics; + return shared_from_this(); + } + + std::vector + GetStatstics(const std::vector& stat_keys) const override; + + std::string + GetStatstics() const override { + return this->statstics_; + } + static DatasetPtr MakeEmptyDataset(); @@ -274,6 +288,8 @@ class DatasetImpl : public Dataset { bool owner_{true}; std::unordered_map data_; Allocator* allocator_ = nullptr; + + std::string statstics_; }; }; // namespace vsag diff --git a/src/impl/basic_searcher.cpp b/src/impl/basic_searcher.cpp index 54bdd7a9f..a8def0f97 100644 --- a/src/impl/basic_searcher.cpp +++ b/src/impl/basic_searcher.cpp @@ -298,6 +298,12 @@ BasicSearcher::search_impl(const GraphInterfacePtr& graph, hops++; auto current_node_pair = candidate_set->Top(); + if (inner_search_param.time_cost != nullptr and + inner_search_param.time_cost->CheckOvertime()) { + (*inner_search_param.stats)["is_timeout"] = true; + break; + } + if constexpr (mode == InnerSearchMode::KNN_SEARCH) { if ((-current_node_pair.first) > lower_bound && top_candidates->Size() == ef) { break; diff --git a/src/impl/basic_searcher.h b/src/impl/basic_searcher.h index 205c6d196..0a21dce7f 100644 --- a/src/impl/basic_searcher.h +++ b/src/impl/basic_searcher.h @@ -21,57 +21,15 @@ #include "impl/heap/distance_heap.h" #include "index/index_common_param.h" #include "index/iterator_filter.h" +#include "inner_search_param.h" #include "lock_strategy.h" +#include "utils/timer.h" #include "utils/visited_list.h" namespace vsag { static constexpr uint32_t OPTIMIZE_SEARCHER_SAMPLE_SIZE = 10000; -enum InnerSearchMode { KNN_SEARCH = 1, RANGE_SEARCH = 2 }; - -enum InnerSearchType { PURE = 1, WITH_FILTER = 2 }; - -class InnerSearchParam { -public: - int64_t topk{0}; - float radius{0.0f}; - InnerIdType ep{0}; - uint64_t ef{10}; - FilterPtr is_inner_id_allowed{nullptr}; - 
float skip_ratio{0.8F}; - InnerSearchMode search_mode{KNN_SEARCH}; - int range_search_limit_size{-1}; - int64_t parallel_search_thread_count{1}; - - // for ivf - int scan_bucket_size{1}; - float factor{2.0F}; - float first_order_scan_ratio{1.0F}; - Allocator* search_alloc{nullptr}; - std::vector executors; - mutable int64_t duplicate_id{-1}; - bool consider_duplicate{false}; - - InnerSearchParam& - operator=(const InnerSearchParam& other) { - if (this != &other) { - topk = other.topk; - radius = other.radius; - ep = other.ep; - ef = other.ef; - skip_ratio = other.skip_ratio; - search_mode = other.search_mode; - range_search_limit_size = other.range_search_limit_size; - is_inner_id_allowed = other.is_inner_id_allowed; - scan_bucket_size = other.scan_bucket_size; - factor = other.factor; - first_order_scan_ratio = other.first_order_scan_ratio; - } - return *this; - } -}; - constexpr float THRESHOLD_ERROR = 2e-6; class BasicSearcher { diff --git a/src/impl/inner_search_param.h b/src/impl/inner_search_param.h new file mode 100644 index 000000000..194820d17 --- /dev/null +++ b/src/impl/inner_search_param.h @@ -0,0 +1,78 @@ + +// Copyright 2024-present the vsag project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "attr/executor/executor.h" +#include "typing.h" +#include "utils/timer.h" +#include "vsag/filter.h" + +namespace vsag { + +enum InnerSearchMode { KNN_SEARCH = 1, RANGE_SEARCH = 2 }; + +enum InnerSearchType { PURE = 1, WITH_FILTER = 2 }; + +class InnerSearchParam { +public: + InnerSearchParam() { + stats = std::make_shared(); + } + +public: + int64_t topk{0}; + float radius{0.0f}; + InnerIdType ep{0}; + uint64_t ef{10}; + FilterPtr is_inner_id_allowed{nullptr}; + float skip_ratio{0.8F}; + InnerSearchMode search_mode{KNN_SEARCH}; + int range_search_limit_size{-1}; + int64_t parallel_search_thread_count{1}; + + // for ivf + int scan_bucket_size{1}; + float factor{2.0F}; + float first_order_scan_ratio{1.0F}; + Allocator* search_alloc{nullptr}; + std::vector executors; + mutable int64_t duplicate_id{-1}; + bool consider_duplicate{false}; + + // time record + std::shared_ptr time_cost{nullptr}; + + std::shared_ptr stats{nullptr}; + + InnerSearchParam& + operator=(const InnerSearchParam& other) { + if (this != &other) { + topk = other.topk; + radius = other.radius; + ep = other.ep; + ef = other.ef; + skip_ratio = other.skip_ratio; + search_mode = other.search_mode; + range_search_limit_size = other.range_search_limit_size; + is_inner_id_allowed = other.is_inner_id_allowed; + scan_bucket_size = other.scan_bucket_size; + factor = other.factor; + first_order_scan_ratio = other.first_order_scan_ratio; + } + return *this; + } +}; +} // namespace vsag diff --git a/src/inner_string_params.h b/src/inner_string_params.h index 7cd2ea905..e0806af9c 100644 --- a/src/inner_string_params.h +++ b/src/inner_string_params.h @@ -108,6 +108,7 @@ const char* const BUCKET_USE_RESIDUAL = "use_residual"; const char* const IVF_SEARCH_PARAM_SCAN_BUCKETS_COUNT = "scan_buckets_count"; const char* const IVF_SEARCH_PARAM_FACTOR = "factor"; const char* const IVF_SEARCH_PARALLELISM = "parallelism"; +const char* const SEARCH_MAX_TIME_COST_MS = "timeout_ms"; const char* const 
IVF_USE_REORDER_KEY = "use_reorder"; const char* const IVF_PRECISE_CODES_KEY = "precise_codes"; diff --git a/src/utils/timer.cpp b/src/utils/timer.cpp index 536eee979..19859de72 100644 --- a/src/utils/timer.cpp +++ b/src/utils/timer.cpp @@ -15,14 +15,44 @@ #include "timer.h" +#include + namespace vsag { -Timer::Timer(double& ref) : ref_(ref) { +Timer::Timer(double* ref) : ref_(ref) { start = std::chrono::steady_clock::now(); + this->threshold_ = std::numeric_limits::max(); +} + +Timer::Timer(double& ref) : Timer(&ref){}; + +Timer::Timer() : Timer(nullptr){}; + +double +Timer::Record() { + auto finish = std::chrono::steady_clock::now(); + std::chrono::duration duration = finish - start; + return duration.count(); +} + +bool +Timer::CheckOvertime() { + if (threshold_ == std::numeric_limits::max()) { + return false; + } + double duration = Record(); + return duration > threshold_; +} + +void +Timer::SetThreshold(double threshold) { + threshold_ = threshold; } Timer::~Timer() { auto finish = std::chrono::steady_clock::now(); std::chrono::duration duration = finish - start; - ref_ = duration.count(); + if (ref_ != nullptr) { + *ref_ = duration.count(); + } } } // namespace vsag diff --git a/src/utils/timer.h b/src/utils/timer.h index 35f0576aa..13fab2508 100644 --- a/src/utils/timer.h +++ b/src/utils/timer.h @@ -20,10 +20,25 @@ namespace vsag { class Timer { public: explicit Timer(double& ref); + + explicit Timer(double* ref); + + explicit Timer(); + ~Timer(); -public: - double& ref_; + double + Record(); + + void + SetThreshold(double threshold); + + bool + CheckOvertime(); + +private: + double* ref_{nullptr}; + double threshold_{std::numeric_limits::max()}; std::chrono::steady_clock::time_point start; }; } // namespace vsag diff --git a/tests/test_hgraph.cpp b/tests/test_hgraph.cpp index dded8e1fe..5698bac3a 100644 --- a/tests/test_hgraph.cpp +++ b/tests/test_hgraph.cpp @@ -1747,6 +1747,57 @@ TEST_CASE("[Daily] HGraph With Extra Info", "[ft][hgraph][daily]") { 
TestHGraphWithExtraInfo(test_index, resource); } +static void +TestHGraphSearchOverTime(const fixtures::HGraphTestIndexPtr& test_index, + const fixtures::HGraphResourcePtr& resource) { + using namespace fixtures; + auto origin_size = vsag::Options::Instance().block_size_limit(); + auto size = GENERATE(1024 * 1024 * 2); + constexpr const char* search_param = R"({ + "hgraph": { + "ef_search": 200, + "timeout_ms": 5.0 + } + })"; + for (auto metric_type : resource->metric_types) { + for (auto dim : resource->dims) { + for (auto& [base_quantization_str, recall] : resource->test_cases) { + INFO(fmt::format("metric_type: {}, dim: {}, base_quantization_str: {}, recall: {}", + metric_type, + dim, + base_quantization_str, + recall)); + if (HGraphTestIndex::IsRaBitQ(base_quantization_str) && + dim < fixtures::RABITQ_MIN_RACALL_DIM) { + continue; // Skip invalid RaBitQ configurations + } + vsag::Options::Instance().set_block_size_limit(size); + HGraphTestIndex::HGraphBuildParam build_param( + metric_type, dim, base_quantization_str); + auto param = HGraphTestIndex::GenerateHGraphBuildParametersString(build_param); + auto index = TestIndex::TestFactory(test_index->name, param, true); + auto dataset = HGraphTestIndex::pool.GetDatasetAndCreate( + dim, resource->base_count, metric_type); + TestIndex::TestBuildIndex(index, dataset, true); + TestIndex::TestSearchOvertime(index, dataset, search_param); + vsag::Options::Instance().set_block_size_limit(origin_size); + } + } + } +} + +TEST_CASE("[PR] HGraph Search Over Time", "[ft][hgraph][pr]") { + auto test_index = std::make_shared(); + auto resource = test_index->GetResource(true); + TestHGraphSearchOverTime(test_index, resource); +} + +TEST_CASE("[Daily] HGraph Search Over Time", "[ft][hgraph][daily]") { + auto test_index = std::make_shared(); + auto resource = test_index->GetResource(false); + TestHGraphSearchOverTime(test_index, resource); +} + static void TestHGraphDiskIOType(const fixtures::HGraphTestIndexPtr& test_index, const 
fixtures::HGraphResourcePtr& resource) { diff --git a/tests/test_index.cpp b/tests/test_index.cpp index 6360be64b..b1e056bbf 100644 --- a/tests/test_index.cpp +++ b/tests/test_index.cpp @@ -2103,4 +2103,29 @@ TestIndex::TestBuildDuplicateIndex(const IndexPtr& index, } } +void +TestIndex::TestSearchOvertime(const IndexPtr& index, + const TestDatasetPtr& dataset, + const std::string& search_param) { + auto queries = dataset->query_; + auto query_count = queries->GetNumElements(); + auto dim = queries->GetDim(); + for (auto i = 0; i < query_count; ++i) { + auto query = vsag::Dataset::Make(); + query->NumElements(1) + ->Dim(dim) + ->Float32Vectors(queries->GetFloat32Vectors() + i * dim) + ->SparseVectors(queries->GetSparseVectors() + i) + ->Paths(queries->GetPaths() + i) + ->Owner(false); + auto res = index->KnnSearch(query, 10, search_param); + REQUIRE(res.has_value()); + auto result = res.value(); + REQUIRE(result->GetStatstics() != "{}"); + auto stats = result->GetStatstics({"is_timeout"}); + REQUIRE(stats.size() == 1); + bool has_timeout_result = (stats[0] == "true" or stats[0] == "false"); + REQUIRE(has_timeout_result); + } +} } // namespace fixtures diff --git a/tests/test_index.h b/tests/test_index.h index f2533fa93..254251f95 100644 --- a/tests/test_index.h +++ b/tests/test_index.h @@ -278,6 +278,11 @@ class TestIndex { const std::string& search_param, bool with_update = true); + static void + TestSearchOvertime(const IndexPtr& index, + const TestDatasetPtr& dataset, + const std::string& search_param); + constexpr static float RECALL_THRESHOLD = 0.95; }; diff --git a/tests/test_ivf.cpp b/tests/test_ivf.cpp index ac74d6337..e69186ab8 100644 --- a/tests/test_ivf.cpp +++ b/tests/test_ivf.cpp @@ -566,6 +566,76 @@ TEST_CASE("[Daily] IVF Build", "[ft][ivf][daily]") { TestIVFBuild(test_index, resource); } +static void +TestIVFSearchOvertime(const fixtures::IVFTestIndexPtr& test_index, + const fixtures::IVFResourcePtr& resource) { + using namespace fixtures; + auto 
origin_size = vsag::Options::Instance().block_size_limit(); + auto size = GENERATE(1024 * 1024 * 2); + constexpr static const char* search_param_tmp2 = R"( + {{ + "ivf": {{ + "scan_buckets_count": {}, + "factor": 4.0, + "first_order_scan_ratio": 1.0, + "parallelism": {}, + "timeout_ms": 20.0 + }} + }})"; + for (auto metric_type : resource->metric_types) { + for (auto dim : resource->dims) { + for (auto train_type : resource->train_types) { + for (auto [base_quantization_str, recall] : resource->test_cases) { + auto count = std::min(300, static_cast(dim / 4)); + if (train_type == "kmeans") { + recall *= 0.8F; // Kmeans may not achieve high recall in random datasets + } + auto search_thread_count = GENERATE(1, 3); + auto search_param = + fmt::format(search_param_tmp2, std::max(200, count), search_thread_count); + INFO( + fmt::format("metric_type: {}, dim: {}, base_quantization_str: {}, " + "train_type: {}, recall: {}", + metric_type, + dim, + base_quantization_str, + train_type, + recall)); + vsag::Options::Instance().set_block_size_limit(size); + auto param = + IVFTestIndex::GenerateIVFBuildParametersString(metric_type, + dim, + base_quantization_str, + 300, + train_type, + false, + 1, + false, + 3); + auto index = IVFTestIndex::TestFactory(IVFTestIndex::name, param, true); + auto dataset = IVFTestIndex::pool.GetDatasetAndCreate( + dim, resource->base_count, metric_type); + IVFTestIndex::TestBuildIndex(index, dataset, true); + IVFTestIndex::TestSearchOvertime(index, dataset, search_param); + vsag::Options::Instance().set_block_size_limit(origin_size); + } + } + } + } +} + +TEST_CASE("[PR] IVF Search Overtime", "[ft][ivf][pr]") { + auto test_index = std::make_shared(); + auto resource = test_index->GetResource(true); + TestIVFSearchOvertime(test_index, resource); +} + +TEST_CASE("[Daily] IVF Search Overtime", "[ft][ivf][daily]") { + auto test_index = std::make_shared(); + auto resource = test_index->GetResource(false); + TestIVFSearchOvertime(test_index, resource); 
+} + static void TestIVFBuildWithLargeK(const fixtures::IVFTestIndexPtr& test_index, const fixtures::IVFResourcePtr& resource) {