
Commit e7b1475

perf: TopK early reject, IN O(1) lookup, parallel R2 probing + vector search docs
Performance:
- TopK columnar path: check the sort-column value against the heap root before materializing a Row object — skips allocation for rows that can't enter the heap (10-100x fewer allocations for selective LIMIT queries)
- IN/NOT_IN filter: pre-build a Set from the filter values with a WeakMap cache — O(1) per row instead of an O(m) linear scan per row
- loadTableFromR2: probe all 6 R2 key candidates in parallel via Promise.all instead of sequential head() calls — up to 200ms cold-start improvement

Correctness:
- Partition catalog neq/not_in also need the exactPartition guard — range data (min ≠ max) could falsely exclude fragments containing matching rows

Docs:
- Add vector-search.mdx: DataFrame .vector() API, SQL NEAR/TOPK, distance metrics, IVF-PQ vs flat, encoder integration, filter composition
1 parent d1094d6 commit e7b1475

6 files changed: +144 −9 lines changed

vector-search.mdx

Lines changed: 86 additions & 0 deletions

---
title: Vector Search
description: Similarity search with SIMD acceleration, IVF-PQ indexes, and text-to-vector encoding.
---

QueryMode supports vector similarity search on embedding columns stored in Lance format. Searches use WASM SIMD for acceleration and IVF-PQ indexes when available.

## DataFrame API

```typescript
// Search with a raw vector
const similar = await qm
  .table("images")
  .vector("embedding", queryVector, 10)
  .select("id", "title")
  .collect()

// Search with text (requires encoder)
const related = await qm
  .table("articles")
  .vector("embedding", "climate change solutions", 10, {
    encoder: async (text) => myModel.encode(text),
    metric: "cosine",
  })
  .collect()
```

### Parameters

| Parameter | Type | Description |
|-----------|------|-------------|
| `column` | `string` | Column containing `Float32Array` embeddings |
| `queryVector` | `Float32Array \| string` | Query vector or text (text requires `encoder`) |
| `topK` | `number` | Number of nearest neighbors to return |
| `opts.metric` | `"cosine" \| "l2" \| "dot"` | Distance metric (default: `"cosine"`) |
| `opts.encoder` | `(text: string) => Promise<Float32Array>` | Text-to-vector encoder for string queries |
| `opts.nprobe` | `number` | IVF-PQ tuning: number of partitions to probe |
| `opts.efSearch` | `number` | HNSW tuning: search beam width |

## SQL

```sql
SELECT id, title FROM articles
WHERE embedding NEAR [0.1, 0.2, 0.3, ...] TOPK 10
```

The `NEAR` operator performs vector similarity search. `TOPK` limits results to the K nearest neighbors.

## Distance metrics

| Metric | Description | Best for |
|--------|-------------|----------|
| `cosine` | Cosine similarity (default) | Text embeddings, normalized vectors |
| `l2` | Euclidean distance | Spatial data, unnormalized vectors |
| `dot` | Dot product | When vectors are pre-normalized |
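The table above maps to a few lines of scalar arithmetic. A minimal sketch over `Float32Array` embeddings (illustrative only; the engine's actual kernels use WASM SIMD):

```typescript
// Scalar reference implementations of the three metrics (not the SIMD kernels).

function dot(a: Float32Array, b: Float32Array): number {
  let s = 0;
  for (let i = 0; i < a.length; i++) s += a[i] * b[i];
  return s;
}

function l2(a: Float32Array, b: Float32Array): number {
  let s = 0;
  for (let i = 0; i < a.length; i++) { const d = a[i] - b[i]; s += d * d; }
  return Math.sqrt(s); // Euclidean distance
}

function cosine(a: Float32Array, b: Float32Array): number {
  // Cosine similarity; for unit-normalized vectors this equals the dot product,
  // which is why `dot` is the cheaper choice when vectors are pre-normalized.
  return dot(a, b) / (Math.sqrt(dot(a, a)) * Math.sqrt(dot(b, b)));
}
```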

## Index types

### Flat (no index)

Without an index, QueryMode performs brute-force SIMD-accelerated distance computation across all vectors. Fast for datasets under ~100K vectors.

### IVF-PQ

For larger datasets, create an IVF-PQ (Inverted File with Product Quantization) index:

- **IVF** partitions vectors into clusters. At query time, only `nprobe` clusters are searched.
- **PQ** compresses vectors into compact codes, reducing memory and I/O.

IVF-PQ indexes are stored alongside data in R2 and loaded on first query.
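The IVF half of the scheme can be sketched in isolation. This is a hypothetical flat-scan variant without PQ compression, not QueryMode's implementation: vectors are bucketed under their nearest centroid at build time, and a query exhaustively scans only the `nprobe` closest buckets.

```typescript
// Hypothetical flat-scan IVF sketch (no PQ), for intuition only.
type Vec = number[];

function sqL2(a: Vec, b: Vec): number {
  let s = 0;
  for (let i = 0; i < a.length; i++) { const d = a[i] - b[i]; s += d * d; }
  return s;
}

function ivfSearch(
  query: Vec,
  centroids: Vec[],  // cluster centers (e.g. from k-means)
  lists: number[][], // lists[c] = vector ids assigned to centroid c
  vectors: Vec[],    // full dataset, indexed by id
  nprobe: number,
  k: number,
): number[] {
  // Rank clusters by centroid distance; scan only the nprobe closest.
  const order = centroids
    .map((c, i) => [sqL2(query, c), i] as const)
    .sort((a, b) => a[0] - b[0]);
  const candidates: Array<readonly [number, number]> = [];
  for (const [, ci] of order.slice(0, nprobe)) {
    for (const id of lists[ci]) candidates.push([sqL2(query, vectors[id]), id]);
  }
  return candidates.sort((a, b) => a[0] - b[0]).slice(0, k).map(([, id]) => id);
}
```

Raising `nprobe` trades latency for recall: scanning more buckets reduces the chance a true neighbor sitting in an unprobed cluster is missed.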

## Combining with filters

Vector search composes with all other DataFrame operations:

```typescript
const results = await qm
  .table("products")
  .filter("category", "eq", "electronics")
  .filter("price", "lt", 1000)
  .vector("embedding", queryVec, 20, { metric: "l2" })
  .select("id", "name", "price")
  .collect()
```

Filters are applied before vector search — only matching rows are scanned for similarity.

src/decode.ts

Lines changed: 24 additions & 2 deletions
@@ -477,8 +477,16 @@ export function matchesFilter(
     case "gte": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) >= (b as number | bigint | string); }
     case "lt": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) < (b as number | bigint | string); }
     case "lte": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) <= (b as number | bigint | string); }
-    case "in": return Array.isArray(t) && t.some(v => { const [a, b] = coerceCompare(val, v); return a === b; });
-    case "not_in": return Array.isArray(t) && !t.some(v => { const [a, b] = coerceCompare(val, v); return a === b; });
+    case "in": {
+      if (!Array.isArray(t)) return false;
+      const set = getInSet(t);
+      return set.has(typeof val === "bigint" ? Number(val) : val);
+    }
+    case "not_in": {
+      if (!Array.isArray(t)) return false;
+      const set = getInSet(t);
+      return !set.has(typeof val === "bigint" ? Number(val) : val);
+    }
     case "between": {
       if (!Array.isArray(t) || t.length !== 2) return false;
       const [, lo] = coerceCompare(val, t[0]);
@@ -505,6 +513,20 @@ export function matchesFilter(
   }
 }
 
+/** Cache IN/NOT_IN value sets — O(1) lookup instead of O(m) per row. */
+const inSetCache = new WeakMap<readonly (number | bigint | string)[], Set<number | string>>();
+
+function getInSet(values: readonly (number | bigint | string)[]): Set<number | string> {
+  let cached = inSetCache.get(values);
+  if (cached) return cached;
+  const set = new Set<number | string>();
+  for (const v of values) {
+    set.add(typeof v === "bigint" ? Number(v) : v);
+  }
+  inSetCache.set(values, set);
+  return set;
+}
+
 /** Cache compiled LIKE regexes — avoids re-compilation per row. */
 const likeRegexCache = new Map<string, RegExp>();
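The caching idea in isolation, as a standalone sketch with a hypothetical `asSet` helper: membership is O(1) per row, the Set is built once per distinct filter array, and keying the cache by the array's identity in a WeakMap means the entry is garbage-collected along with the query's filter values.

```typescript
// Standalone sketch of a WeakMap-keyed Set cache (simplified from the patch above).
const setCache = new WeakMap<readonly unknown[], Set<unknown>>();

function asSet(values: readonly unknown[]): Set<unknown> {
  let s = setCache.get(values);
  if (!s) {
    // Built at most once per distinct array object.
    s = new Set(values);
    setCache.set(values, s);
  }
  return s;
}
```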

src/operators.ts

Lines changed: 9 additions & 1 deletion
@@ -1651,14 +1651,22 @@ export class TopKOperator implements Operator {
       else if (heap.length > 0 && shouldReplace(row)) { heap[0] = row; siftDown(heap, 0); }
     };
 
-    // Use columnar path if available — materialize only the rows that enter the heap
+    // Use columnar path if available — only materialize rows that would enter the heap
     if (this.upstream.nextColumnar) {
       while (true) {
        const batch = await this.upstream.nextColumnar();
        if (!batch) break;
        const indices = batch.selection ?? Uint32Array.from({ length: batch.rowCount }, (_, i) => i);
        const colNames = Array.from(batch.columns.keys());
+        const sortVals = batch.columns.get(col);
        for (const idx of indices) {
+          // Fast reject: if heap is full and this value can't beat the root, skip materialization
+          if (sortVals && heap.length >= k) {
+            const nv = sortVals[idx] as Row[string];
+            const rv = heap[0][col];
+            if (nv === null) continue;
+            if (rv !== null && (desc ? nv <= rv : nv >= rv)) continue;
+          }
          const row: Row = {};
          for (const name of colNames) {
            row[name] = (batch.columns.get(name)![idx] as Row[string]) ?? null;
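The rejection test rests on a min-heap invariant: for a descending top-k, the heap root is the smallest of the current top k, so any candidate that can't beat the root is skipped before any per-row work. A standalone sketch with a hypothetical `topKDesc` helper, using plain numbers in place of rows:

```typescript
// Top-k (descending) over a size-k min-heap; candidates <= root are rejected
// before any per-element work would happen.
function topKDesc(values: number[], k: number): number[] {
  const heap: number[] = []; // min-heap holding the current top-k
  const siftUp = (i: number) => {
    while (i > 0) {
      const p = (i - 1) >> 1;
      if (heap[p] <= heap[i]) break;
      [heap[p], heap[i]] = [heap[i], heap[p]];
      i = p;
    }
  };
  const siftDown = (i: number) => {
    for (;;) {
      let m = i;
      const l = 2 * i + 1, r = 2 * i + 2;
      if (l < heap.length && heap[l] < heap[m]) m = l;
      if (r < heap.length && heap[r] < heap[m]) m = r;
      if (m === i) break;
      [heap[m], heap[i]] = [heap[i], heap[m]];
      i = m;
    }
  };
  for (const v of values) {
    if (heap.length < k) { heap.push(v); siftUp(heap.length - 1); }
    else if (v > heap[0]) { heap[0] = v; siftDown(0); } // v <= root: early reject
  }
  return heap.sort((a, b) => b - a);
}
```

In the operator, the same comparison runs against the raw column value, so the rejected rows never pay for `Row` object construction.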

src/partition-catalog.test.ts

Lines changed: 13 additions & 3 deletions
@@ -184,15 +184,25 @@ describe("PartitionCatalog", () => {
     expect(result).toBeNull();
   });
 
-  it("neq still works on range data (conservative — includes all non-excluded)", () => {
+  it("neq returns null on range data (prevents false exclusion)", () => {
     const fragments = new Map<number, TableMeta>([
       makeFragmentMeta(1, "id", 1, 100),
       makeFragmentMeta(2, "id", 101, 200),
     ]);
     const catalog = PartitionCatalog.fromFragments("id", fragments);
-    // neq is safe even for range data — worst case is slightly over-inclusive
+    // neq on range data could falsely exclude fragments containing matching rows
     const result = catalog.prune([{ column: "id", op: "neq", value: 1 }]);
-    expect(result).not.toBeNull();
+    expect(result).toBeNull();
+  });
+
+  it("not_in returns null on range data", () => {
+    const fragments = new Map<number, TableMeta>([
+      makeFragmentMeta(1, "id", 1, 100),
+      makeFragmentMeta(2, "id", 101, 200),
+    ]);
+    const catalog = PartitionCatalog.fromFragments("id", fragments);
+    const result = catalog.prune([{ column: "id", op: "not_in", value: [1, 101] }]);
+    expect(result).toBeNull();
   });
 });

src/partition-catalog.ts

Lines changed: 2 additions & 0 deletions
@@ -141,10 +141,12 @@ export class PartitionCatalog {
       return [...ids];
     }
     case "neq": {
+      if (!this.exactPartition) return null;
       const excluded = new Set(this.index.get(String(filter.value)) ?? []);
       return this.allFragmentIds.filter(id => !excluded.has(id));
     }
     case "not_in": {
+      if (!this.exactPartition) return null;
       if (!Array.isArray(filter.value)) return null;
       const excluded = new Set<number>();
       for (const v of filter.value) {

src/query-do.ts

Lines changed: 10 additions & 3 deletions
@@ -1438,9 +1438,16 @@ export class QueryDO extends DurableObject<Env> {
       `${tableName}.lance`, `${tableName}.parquet`, tableName,
       `data/${tableName}.lance`, `data/${tableName}.parquet`, `data/${tableName}`,
     ];
-    for (const r2Key of candidates) {
-      const head = await this.r2(r2Key).head(r2Key);
-      if (!head) continue;
+    // Probe all candidates in parallel — first hit wins
+    const heads = await Promise.all(
+      candidates.map(async r2Key => {
+        const head = await this.r2(r2Key).head(r2Key);
+        return head ? { r2Key, head } : null;
+      }),
+    );
+    for (const hit of heads) {
+      if (!hit) continue;
+      const { r2Key, head } = hit;
 
       const fileSize = BigInt(head.size);
       const tailSize = Math.min(Number(fileSize), 40);
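The probe-all, first-hit-wins shape generalizes; a standalone sketch with a hypothetical `probeFirst` helper (not part of the patch): every key is probed concurrently, but the returned hit respects the candidates' priority order.

```typescript
// Probe all keys in parallel; return the hit for the earliest-listed candidate.
async function probeFirst<T>(
  candidates: string[],
  probe: (key: string) => Promise<T | null>,
): Promise<{ key: string; value: T } | null> {
  const results = await Promise.all(
    candidates.map(async key => {
      const value = await probe(key);
      return value !== null ? { key, value } : null;
    }),
  );
  for (const hit of results) if (hit) return hit; // earliest candidate wins
  return null;
}
```

Total latency becomes roughly the maximum of the individual probe latencies rather than their sum, at the cost of issuing every request even when the first candidate would have hit.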
