Skip to content

Commit 35109c9

Browse files
authored
Added PQ lookup (#4)
1 parent b2f3f8d commit 35109c9

14 files changed

Lines changed: 1195 additions & 501 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,6 @@ tools/data.txt
1414
# Generated by cargo mutants
1515
# Contains mutation testing data
1616
**/mutants.out*/
17+
18+
#example data files
19+
examples/data/*

Cargo.lock

Lines changed: 43 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[workspace]
22
resolver = "3"
3-
members = [ "nyas/diskann", "nyas/service", "nyas/system", "nyas/vecd"]
3+
members = ["examples", "nyas/diskann", "nyas/service", "nyas/system", "nyas/vecd"]
44

55
[workspace.package]
66
authors = ["Manish Kumar <manish@neocraft.tech>"]

examples/Cargo.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[package]
2+
name = "nyas-examples"
3+
authors.workspace = true
4+
edition.workspace = true
5+
homepage.workspace = true
6+
license.workspace = true
7+
readme.workspace = true
8+
repository.workspace = true
9+
rust-version.workspace = true
10+
version.workspace = true
11+
12+
[dependencies]
13+
diskann = { path = "../nyas/diskann" }
14+
system = { path = "../nyas/system" }
15+
tokio.workspace = true
16+
cpu-time = "1.0.0"
17+
18+
[[bin]]
19+
name = "sift10k_index"
20+
path = "src/sift10k_index.rs"

examples/src/sift10k_index.rs

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
use std::collections::HashSet;
2+
use std::path::Path;
3+
use std::time::{Duration, SystemTime};
4+
5+
use cpu_time::ProcessTime;
6+
use diskann::index_view::IndexView;
7+
use system::vector_data::VectorData;
8+
use system::vector_point::VectorPoint;
9+
use tokio::fs::File;
10+
use tokio::io::{self, AsyncReadExt, BufReader};
11+
12+
#[derive(Debug)]
13+
pub struct SiftDataset {
14+
pub dimension: u32,
15+
pub vectors: Vec<VectorData>,
16+
}
17+
18+
impl SiftDataset {
19+
pub async fn from_fvecs<P: AsRef<Path>>(path: P) -> io::Result<Self> {
20+
let file = File::open(path).await?;
21+
let mut reader = BufReader::new(file);
22+
23+
let mut vectors = Vec::new();
24+
let mut dimension = 0u32;
25+
let mut buffer = [0u8; 4];
26+
27+
if reader.read_exact(&mut buffer).await.is_ok() {
28+
dimension = u32::from_le_bytes(buffer);
29+
30+
// Read first vector
31+
let mut vec = vec![0f32; dimension as usize];
32+
for item in vec.iter_mut().take(dimension as usize) {
33+
reader.read_exact(&mut buffer).await?;
34+
*item = f32::from_le_bytes(buffer);
35+
}
36+
println!("vector: {:?}", vec);
37+
let vec_data = VectorData::from_f32(vec);
38+
vectors.push(vec_data);
39+
}
40+
41+
loop {
42+
match reader.read_exact(&mut buffer).await {
43+
Ok(_) => {
44+
let dim = u32::from_le_bytes(buffer);
45+
if dim != dimension {
46+
return Err(io::Error::new(
47+
io::ErrorKind::InvalidData,
48+
"Inconsistent vector dimensions",
49+
));
50+
}
51+
52+
let mut vec = vec![0f32; dimension as usize];
53+
for item in vec.iter_mut().take(dimension as usize) {
54+
reader.read_exact(&mut buffer).await?;
55+
*item = f32::from_le_bytes(buffer);
56+
}
57+
let vec_data = VectorData::from_f32(vec);
58+
vectors.push(vec_data);
59+
}
60+
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
61+
Err(e) => return Err(e),
62+
}
63+
}
64+
65+
Ok(SiftDataset { dimension, vectors })
66+
}
67+
68+
pub async fn from_bvecs<P: AsRef<Path>>(path: P) -> io::Result<Self> {
69+
let file = File::open(path).await?;
70+
let mut reader = BufReader::new(file);
71+
72+
let mut vectors = Vec::new();
73+
let mut dimension = 0u32;
74+
let mut buffer = [0u8; 4];
75+
76+
if reader.read_exact(&mut buffer).await.is_ok() {
77+
dimension = u32::from_le_bytes(buffer);
78+
79+
let mut vec = vec![0u8; dimension as usize];
80+
reader.read_exact(&mut vec).await?;
81+
let vec = vec.iter().map(|&x| x as f32).collect::<Vec<f32>>();
82+
let vec_data = VectorData::from_f32(vec);
83+
vectors.push(vec_data);
84+
}
85+
86+
loop {
87+
match reader.read_exact(&mut buffer).await {
88+
Ok(_) => {
89+
let dim = u32::from_le_bytes(buffer);
90+
if dim != dimension {
91+
return Err(io::Error::new(
92+
io::ErrorKind::InvalidData,
93+
"Inconsistent vector dimensions",
94+
));
95+
}
96+
97+
let mut vec = vec![0u8; dimension as usize];
98+
reader.read_exact(&mut vec).await?;
99+
let vec = vec.iter().map(|&x| x as f32).collect::<Vec<f32>>();
100+
let vec_data = VectorData::from_f32(vec);
101+
vectors.push(vec_data);
102+
}
103+
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
104+
Err(e) => return Err(e),
105+
}
106+
}
107+
108+
Ok(SiftDataset { dimension, vectors })
109+
}
110+
111+
pub async fn from_ivecs<P: AsRef<Path>>(path: P) -> io::Result<Vec<Vec<u32>>> {
112+
let file = File::open(path).await?;
113+
let mut reader = BufReader::new(file);
114+
115+
let mut results = Vec::new();
116+
let mut buffer = [0u8; 4];
117+
118+
loop {
119+
match reader.read_exact(&mut buffer).await {
120+
Ok(_) => {
121+
let k = u32::from_le_bytes(buffer);
122+
let mut neighbors = vec![0u32; k as usize];
123+
124+
for item in neighbors.iter_mut().take(k as usize) {
125+
reader.read_exact(&mut buffer).await?;
126+
*item = u32::from_le_bytes(buffer);
127+
}
128+
results.push(neighbors);
129+
}
130+
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
131+
Err(e) => return Err(e),
132+
}
133+
}
134+
135+
Ok(results)
136+
}
137+
138+
pub fn get_vector(&self, idx: usize) -> Option<&VectorData> {
139+
self.vectors.get(idx)
140+
}
141+
}
142+
143+
pub fn compute_recall(results: &[Vec<u32>], ground_truth: &[Vec<u32>], k: usize) -> f64 {
144+
assert_eq!(results.len(), ground_truth.len(), "Results and groundtruth must have same length");
145+
146+
let num_queries = results.len();
147+
let mut total_matches = 0;
148+
149+
for (result, truth) in results.iter().zip(ground_truth.iter()) {
150+
let truth_set: HashSet<u32> = truth.iter().take(k).copied().collect();
151+
152+
let matches = result.iter().take(k).filter(|&&idx| truth_set.contains(&idx)).count();
153+
println!("Matches: {}", matches);
154+
total_matches += matches;
155+
}
156+
157+
total_matches as f64 / (num_queries * k) as f64
158+
}
159+
160+
async fn compute_recall_at_k(
161+
index_view: &IndexView, query: &SiftDataset, ground_truth: &[Vec<u32>], k: usize,
162+
) {
163+
let mut results = Vec::new();
164+
for q in query.vectors.iter() {
165+
let res = index_view.search(q, k, 128).await;
166+
results.push(res);
167+
}
168+
169+
let recall = compute_recall(&results, ground_truth, k);
170+
println!("Recall for k: {}: {:?}", k, recall);
171+
}
172+
173+
#[tokio::main]
174+
async fn main() -> io::Result<()> {
175+
let base_folder = "examples/data/siftsmall";
176+
let start_cpu = ProcessTime::now();
177+
let start_wall = SystemTime::now();
178+
179+
let (base, query, ground_truth) = tokio::try_join!(
180+
SiftDataset::from_fvecs(format!("{}/siftsmall_base.fvecs", base_folder)),
181+
SiftDataset::from_fvecs(format!("{}/siftsmall_query.fvecs", base_folder)),
182+
SiftDataset::from_ivecs(format!("{}/siftsmall_groundtruth.ivecs", base_folder))
183+
)?;
184+
185+
println!("Base dataset: {} vectors of dimension {}", base.vectors.len(), base.dimension);
186+
println!("Query dataset: {} vectors of dimension {}", query.vectors.len(), query.dimension);
187+
println!("Ground truth: {} queries", ground_truth.len());
188+
189+
let index_name = "sift10k";
190+
let index_view = IndexView::new(index_name).await.expect("Failed to create IndexView");
191+
192+
let path = Path::new(index_name);
193+
194+
if !path.exists() {
195+
for (index, vector) in base.vectors.iter().enumerate() {
196+
index_view.insert(&VectorPoint::new(index as u32, vector.clone())).await.unwrap();
197+
}
198+
}
199+
200+
let index_cpu_time = start_cpu.elapsed();
201+
let index_wall_time = start_wall.elapsed().unwrap();
202+
println!("Indexing time: CPU {:?}, Wall {:?}", index_cpu_time, index_wall_time);
203+
204+
for k in [1, 10, 100] {
205+
compute_recall_at_k(&index_view, &query, &ground_truth, k).await;
206+
}
207+
208+
let cpu_time: Duration = start_cpu.elapsed();
209+
let wall_time = start_wall.elapsed().unwrap();
210+
211+
println!("CPU time: {:?}", cpu_time);
212+
println!("Wall time: {:?}", wall_time);
213+
214+
Ok(())
215+
}

nyas/diskann/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ memmap2.workspace = true
1515
rand.workspace = true
1616
tokio.workspace = true
1717
ordered-float.workspace = true
18-
dashmap.workspace = true
18+
dashmap = { version = "6.0.1", features = ["rayon"] }
1919
rayon.workspace = true
2020
serde.workspace = true
2121
rkyv.workspace = true

0 commit comments

Comments
 (0)