Skip to content

Commit b4dca29

Browse files
authored
Add new vector-search-bench benchmarking crate (#7458)
## Summary Tracking issue: #7297 Adds a new `vector-search-bench` crate. Right now this is just utilities in preparation for adding proper vector search benchmarks (that are pulled from disk, not just in-memory). This just includes data downloading and file preparation, conversion (from parquet lists to vortex vector arrays), and some different compression and scan utilities. ## Testing Some basic unit tests but the real stress test will come later when we actually benchmark stuff. Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
1 parent 12f63a4 commit b4dca29

File tree

8 files changed

+741
-0
lines changed

8 files changed

+741
-0
lines changed

Cargo.lock

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ members = [
5959
"benchmarks/datafusion-bench",
6060
"benchmarks/duckdb-bench",
6161
"benchmarks/random-access-bench",
62+
"benchmarks/vector-search-bench",
6263
]
6364
exclude = ["java/testfiles", "wasm-test"]
6465
resolver = "2"
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
[package]
name = "vector-search-bench"
description = "Vector similarity search benchmarks for Vortex on public embedding datasets"
# Package metadata is inherited from the workspace so every crate stays in lock-step.
authors.workspace = true
categories.workspace = true
edition.workspace = true
homepage.workspace = true
include.workspace = true
keywords.workspace = true
license.workspace = true
readme.workspace = true
repository.workspace = true
rust-version.workspace = true
version.workspace = true
# Benchmark-only crate: never published to crates.io.
publish = false

[dependencies]
anyhow = { workspace = true }
arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-schema = { workspace = true }
clap = { workspace = true, features = ["derive"] }
futures = { workspace = true }
indicatif = { workspace = true }
parquet = { workspace = true, features = ["async"] }
rand = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tabled = { workspace = true, features = ["std"] }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
vortex = { workspace = true, features = ["files", "tokio", "unstable_encodings"] }
vortex-bench = { workspace = true, features = ["unstable_encodings"] }
vortex-btrblocks = { workspace = true, features = ["unstable_encodings"] }
vortex-tensor = { workspace = true }

[dev-dependencies]
# Used only by unit tests for scratch directories.
tempfile = { workspace = true }

[lints]
workspace = true
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Vector compression flavors exercised by the benchmark.
5+
//!
6+
//! Each [`VectorFlavor`] variant maps to a [`vortex::file::WriteStrategyBuilder`] configuration
7+
//! applied to the same input data.
8+
//!
9+
//! The benchmark writes one `.vortex` file per flavor per data file, then scans them all with the
10+
//! same query so the comparison is apples-to-apples with the Parquet files.
11+
//!
12+
//! Note that the handrolled `&[f32]` parquet baseline is **not** a flavor here.
13+
14+
use clap::ValueEnum;
15+
use vortex::array::ArrayId;
16+
use vortex::array::scalar_fn::ScalarFnVTable;
17+
use vortex::file::ALLOWED_ENCODINGS;
18+
use vortex::file::VortexWriteOptions;
19+
use vortex::file::WriteOptionsSessionExt;
20+
use vortex::file::WriteStrategyBuilder;
21+
use vortex::session::VortexSession;
22+
use vortex::utils::aliases::hash_set::HashSet;
23+
use vortex_bench::Format;
24+
use vortex_btrblocks::BtrBlocksCompressorBuilder;
25+
use vortex_tensor::scalar_fns::l2_denorm::L2Denorm;
26+
use vortex_tensor::scalar_fns::sorf_transform::SorfTransform;
27+
28+
/// Every [`VectorFlavor`] variant in CLI-help order.
// NOTE(review): keep this list in sync with the `VectorFlavor` enum — a variant added to the
// enum but missing here would silently be excluded from "run all flavors" code paths.
pub const ALL_VECTOR_FLAVORS: &[VectorFlavor] =
    &[VectorFlavor::Uncompressed, VectorFlavor::TurboQuant];
31+
32+
/// One write-side compression configuration we measure.
///
/// The `#[clap(name = ...)]` kebab-case names mirror the strings returned by `label()` so that
/// CLI arguments, metric names, and cache directories all agree.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, ValueEnum)]
pub enum VectorFlavor {
    /// Written with `BtrBlocksCompressorBuilder::empty()` — i.e. no compression passes.
    #[clap(name = "vortex-uncompressed")]
    Uncompressed,
    /// Written with `BtrBlocksCompressorBuilder::default().with_turboquant()`.
    #[clap(name = "vortex-turboquant")]
    TurboQuant,
    // TODO(connor): We will want to add `Default` here which is just the default compressor.
}
43+
44+
impl VectorFlavor {
45+
/// Stable kebab-cased label used in CLI args and metric names.
46+
pub fn label(&self) -> &'static str {
47+
match self {
48+
VectorFlavor::Uncompressed => "vortex-uncompressed",
49+
VectorFlavor::TurboQuant => "vortex-turboquant",
50+
}
51+
}
52+
53+
/// The `target.format` value emitted on measurements for this flavor. Both flavors produce
54+
/// `.vortex` files, so the compression label carries the flavor split.
55+
pub fn as_format(&self) -> Format {
56+
match self {
57+
VectorFlavor::Uncompressed => Format::OnDiskVortex,
58+
VectorFlavor::TurboQuant => Format::OnDiskVortex,
59+
}
60+
}
61+
62+
/// Subdirectory name under the per-dataset cache root used to store this flavor's `.vortex`
63+
/// files.
64+
pub fn dir_name(&self) -> &'static str {
65+
match self {
66+
VectorFlavor::Uncompressed => "vortex-uncompressed",
67+
VectorFlavor::TurboQuant => "vortex-turboquant",
68+
}
69+
}
70+
71+
/// Build the [`vortex::file::WriteStrategyBuilder`]-backed write options for this flavor.
72+
///
73+
/// TurboQuant produces `L2Denorm(SorfTransform(...))` which the default file
74+
/// `ALLOWED_ENCODINGS` set rejects on normalization — we extend the allow-list with the two
75+
/// scalar-fn array IDs the scheme actually emits.
76+
pub fn create_write_options(&self, session: &VortexSession) -> VortexWriteOptions {
77+
let strategy = match self {
78+
VectorFlavor::Uncompressed => {
79+
let compressor = BtrBlocksCompressorBuilder::empty().build();
80+
81+
WriteStrategyBuilder::default()
82+
.with_compressor(compressor)
83+
.build()
84+
}
85+
VectorFlavor::TurboQuant => {
86+
let compressor = BtrBlocksCompressorBuilder::default()
87+
.with_turboquant()
88+
.build();
89+
90+
let mut allowed: HashSet<ArrayId> = ALLOWED_ENCODINGS.clone();
91+
allowed.insert(L2Denorm.id());
92+
allowed.insert(SorfTransform.id());
93+
94+
WriteStrategyBuilder::default()
95+
.with_compressor(compressor)
96+
.with_allow_encodings(allowed)
97+
.build()
98+
}
99+
};
100+
101+
session.write_options().with_strategy(strategy)
102+
}
103+
}
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
//! Cosine-similarity filter [`Expression`]s used by the file-scan path.
5+
//!
6+
//! We can easily build a cosine similarity filter by hand:
7+
//!
8+
//! ```text
9+
//! gt(
10+
//! cosine_similarity(col("emb"), lit(query_scalar)),
11+
//! lit(threshold),
12+
//! )
13+
//! ```
14+
//!
15+
//! The query is wrapped as `Scalar::extension::<Vector>(Scalar::fixed_size_list(F32, ...))` so
16+
//! [`CosineSimilarity`] can treat it as a single-row `Vector` value during evaluation.
17+
//!
18+
//! At scan time the literal expands into a `ConstantArray` whose row count matches the chunk batch
19+
//! size.
20+
21+
use anyhow::Result;
22+
use vortex::array::expr::Expression;
23+
use vortex::array::expr::col;
24+
use vortex::array::expr::gt;
25+
use vortex::array::expr::lit;
26+
use vortex::array::extension::EmptyMetadata;
27+
use vortex::array::scalar::Scalar;
28+
use vortex::array::scalar_fn::EmptyOptions;
29+
use vortex::array::scalar_fn::ScalarFnVTableExt;
30+
use vortex::dtype::DType;
31+
use vortex::dtype::Nullability;
32+
use vortex::dtype::PType;
33+
use vortex_tensor::scalar_fns::cosine_similarity::CosineSimilarity;
34+
use vortex_tensor::vector::Vector;
35+
36+
/// Build the filter `cosine_similarity(emb, query) > threshold`.
37+
pub fn similarity_filter(query: &[f32], threshold: f32) -> Result<Expression> {
38+
// Empty queries short-circuit to a literal `false`, so scans return no rows instead of trying
39+
// to evaluate cosine similarity on a zero-dimensional vector.
40+
if query.is_empty() {
41+
return Ok(lit(false));
42+
}
43+
44+
let query_lit = lit(query_scalar(query)?);
45+
let cosine = CosineSimilarity.new_expr(EmptyOptions, [col("emb"), query_lit]);
46+
Ok(gt(cosine, lit(threshold)))
47+
}
48+
49+
/// Wrap a query vector as `Scalar::extension::<Vector>(Scalar::fixed_size_list(F32, ...))`.
50+
pub fn query_scalar(query: &[f32]) -> Result<Scalar> {
51+
let children: Vec<Scalar> = query
52+
.iter()
53+
.map(|&v| Scalar::primitive(v, Nullability::NonNullable))
54+
.collect();
55+
56+
let element_dtype = DType::Primitive(PType::F32, Nullability::NonNullable);
57+
let fsl = Scalar::fixed_size_list(element_dtype, children, Nullability::NonNullable);
58+
59+
Ok(Scalar::extension::<Vector>(EmptyMetadata, fsl))
60+
}
61+
62+
/// Project just the `emb` column. Used by the throughput-only scan path.
///
/// NOTE(review): the column name `"emb"` is also hard-coded in `similarity_filter`; keep the
/// two in sync if the dataset schema ever changes.
pub fn emb_projection() -> Expression {
    col("emb")
}
66+
67+
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared assertion: `query_scalar` output must carry an extension dtype, regardless of
    /// query length. (Previously this match was duplicated in two tests.)
    fn assert_extension_dtype(scalar: &Scalar) {
        match scalar.dtype() {
            DType::Extension(_) => {}
            other => panic!("expected Extension, got {other}"),
        }
    }

    #[test]
    fn query_scalar_accepts_empty_query() {
        let scalar = query_scalar(&[]).unwrap();
        assert_extension_dtype(&scalar);
    }

    #[test]
    fn query_scalar_builds_extension_dtype() {
        let scalar = query_scalar(&[1.0, 0.0, 0.0]).unwrap();
        assert_extension_dtype(&scalar);
    }

    #[test]
    fn similarity_filter_uses_gt_operator() {
        let expr = similarity_filter(&[1.0, 0.0, 0.0], 0.5).unwrap();
        // Quick sanity check: the printed form contains the comparison operator so future
        // refactors that change the expression structure get caught here.
        let printed = format!("{expr:?}");
        assert!(printed.contains("Gt") || printed.contains(">"), "{printed}");
    }

    #[test]
    fn similarity_filter_accepts_empty_query() {
        // The documented short-circuit: an empty query must build successfully (it collapses
        // to a literal `false` filter) rather than error.
        assert!(similarity_filter(&[], 0.5).is_ok());
    }
}

0 commit comments

Comments
 (0)