
Commit 958ea66

feat: zero-config entry, observability, introspection, and composability examples
- Add fromJSON(), fromCSV(), demo() convenience factories for zero-config usage
- Add formatResultSummary() and formatExplain() observability formatters
- Add describe() and head() methods to DataFrame for data introspection
- Add onProgress callback to collect() for long-running query feedback
- Add per-phase timing (scanMs, pipelineMs, metaMs) to local executor
- Add sort support to MaterializedExecutor
- Export parseCsvFull from csv-reader for reuse
- Improve TABLE_NOT_FOUND and INVALID_FORMAT error messages
- Add 4 composability examples: ML scoring, adaptive search, custom spill, Next.js/Vinext
- Fix README: remove non-existent .whereBetween(), link CI benchmarks
- 25 new tests (convenience + format)
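For illustration, a minimal self-contained sketch of the kind of one-line summary the new formatResultSummary() formatter could produce from the per-phase timings (scanMs, pipelineMs, metaMs) named above — the result shape and exact output format here are assumptions, not the library's actual API:

```typescript
// Hypothetical result shape — field names are assumptions based on the
// commit message, not the library's real types.
interface ResultSummary {
  rows: unknown[];
  timings: { scanMs: number; pipelineMs: number; metaMs: number };
}

// One-line summary: row count plus the per-phase timings.
function formatResultSummary(result: ResultSummary): string {
  const { scanMs, pipelineMs, metaMs } = result.timings;
  const totalMs = scanMs + pipelineMs + metaMs;
  return `${result.rows.length} rows in ${totalMs}ms (scan ${scanMs}ms, pipeline ${pipelineMs}ms, meta ${metaMs}ms)`;
}

console.log(formatResultSummary({
  rows: [{ id: 1 }, { id: 2 }],
  timings: { scanMs: 12, pipelineMs: 5, metaMs: 1 },
}));
// → "2 rows in 18ms (scan 12ms, pipeline 5ms, meta 1ms)"
```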
1 parent e4e1af7 commit 958ea66

16 files changed, +1282 -16 lines

README.md

Lines changed: 52 additions & 1 deletion
@@ -2,6 +2,38 @@
> **Experimental** — early prototype, not production-ready. Architecture and API will change.

## Quickstart

```bash
pnpm add querymode
```

```typescript
import { QueryMode } from "querymode/local"

// Zero-config: demo data, no files needed
const demo = QueryMode.demo()
const top5 = await demo
  .filter("category", "eq", "Electronics")
  .sort("amount", "desc")
  .limit(5)
  .collect()

console.log(top5.rows)

// Or query your own files — Parquet, Lance, CSV, JSON, Arrow
const qm = QueryMode.local()
const result = await qm
  .table("./data/events.parquet")
  .filter("status", "eq", "active")
  .filter("amount", "gte", 100)
  .filter("amount", "lte", 500)
  .select("id", "amount", "region")
  .sort("amount", "desc")
  .limit(20)
  .collect()
```
A pluggable columnar query library — not a query engine you push data to, but a query capability your code uses directly. No data materialization, no engine boundary, no SQL transpilation.

## Why "mode" not "engine"
@@ -167,6 +199,25 @@ Traditional engines give you a fixed query language. You can't put a window func
With QueryMode, operators are building blocks. Your code assembles the pipeline, controls the memory budget, decides when to spill. The query engine isn't a service you call — it's a library your code composes.

### Beyond traditional engines

These examples show what's possible when operators are composable building blocks, not a fixed plan:

| Example | What it shows | Why DuckDB/Polars can't |
|---------|---------------|-------------------------|
| [`examples/ml-scoring-pipeline.ts`](examples/ml-scoring-pipeline.ts) | Custom scoring runs **inside** the pipeline between Filter and TopK | UDFs serialize data across the engine boundary |
| [`examples/adaptive-search.ts`](examples/adaptive-search.ts) | Vector search with adaptive threshold — recompose if too few results | Fixed query planner can't dynamically widen search |
| [`examples/custom-spill-backend.ts`](examples/custom-spill-backend.ts) | Pluggable spill storage (memory, R2, S3) at a 4KB budget | DuckDB: disk only. Polars: no spill at all |
| [`examples/nextjs-api-route.ts`](examples/nextjs-api-route.ts) | Next.js/Vinext API route — query Parquet files, deploy to edge | DuckDB needs a sidecar process, can't run in Workers |

Run any example:

```bash
npx tsx examples/ml-scoring-pipeline.ts
npx tsx examples/adaptive-search.ts
npx tsx examples/custom-spill-backend.ts
npx tsx examples/nextjs-api-route.ts
```
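The composability these examples rely on can be sketched in a few self-contained lines: a custom operator wraps an upstream operator in a pull-based pipeline and transforms each batch it pulls. The `Operator` interface below is a simplified stand-in for illustration, not the exact shape in `src/operators.ts`:

```typescript
// Simplified stand-in for a pull-based operator interface — an assumption
// for illustration, not the library's real types.
type Row = Record<string, unknown>;
interface Operator {
  next(): Promise<Row[] | null>; // pull one batch; null when exhausted
  close(): Promise<void>;
}

// A custom operator composes like any stock one: wrap an upstream,
// transform each batch, pass it along. Here: add a derived _score column.
class ScoreOperator implements Operator {
  constructor(private upstream: Operator) {}
  async next(): Promise<Row[] | null> {
    const batch = await this.upstream.next();
    if (!batch) return null;
    return batch.map(r => ({ ...r, _score: (r.amount as number) * 2 }));
  }
  async close() { await this.upstream.close(); }
}

// Minimal source: emits all rows as a single batch, then null.
class ArraySource implements Operator {
  private done = false;
  constructor(private rows: Row[]) {}
  async next() { if (this.done) return null; this.done = true; return this.rows; }
  async close() {}
}

// Drain: pull batches until the pipeline is exhausted.
async function drain(op: Operator): Promise<Row[]> {
  const out: Row[] = [];
  for (let b = await op.next(); b; b = await op.next()) out.push(...b);
  await op.close();
  return out;
}

const rows = await drain(new ScoreOperator(new ArraySource([{ amount: 3 }, { amount: 5 }])));
console.log(rows); // [{ amount: 3, _score: 6 }, { amount: 5, _score: 10 }]
```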
## What exists

- **TypeScript orchestration** — Durable Object lifecycle, R2 range reads, footer caching, request routing

@@ -181,7 +232,7 @@ With QueryMode, operators are building blocks. Your code assembles the pipeline,

- **Local mode** — same API reads Lance/Parquet files from disk or HTTP (Node/Bun)
- **Fragment DO pool** — fan-out parallel scanning for multi-fragment datasets (max 20 slots per datacenter)
- **112 unit tests + 26 conformance tests** — unit tests cover footer parsing, column decoding, Parquet/Thrift, merging, aggregates, VIP cache, WASM integration; conformance tests validate every operator against DuckDB at 1M-5M row scale

```diff
-- **CI benchmarks** — head-to-head QueryMode (Miniflare) vs DuckDB (native) on every push, results posted to GitHub Actions summary
+- **CI benchmarks** — head-to-head QueryMode (Miniflare) vs DuckDB (native) on every push, results posted to [GitHub Actions summary](https://github.com/teamchong/querymode/actions/workflows/ci.yml)
```
## What doesn't exist yet

examples/adaptive-search.ts

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
```typescript
/**
 * Adaptive Vector Search — dynamically widen search when too few results match.
 *
 * Traditional query planners have a fixed execution plan. Here, the pipeline
 * is recomposed at runtime: if the initial distance threshold yields too few
 * results, we widen it and search again. Impossible with a fixed query planner.
 *
 * Pipeline (per iteration):
 *   MockOperator(vectors) → VectorDistanceOperator(_distance) → TopKOperator(top-50) → FilterOperator(_distance < threshold)
 */

import {
  FilterOperator,
  TopKOperator,
  drainPipeline,
  type Operator,
  type RowBatch,
} from "../src/operators.js";
import { cosineDistance } from "../src/hnsw.js";
import type { Row } from "../src/types.js";

// ─── Mock vector data ───────────────────────────────────────────────────

class MockOperator implements Operator {
  private rows: Row[];
  private cursor = 0;
  private batchSize: number;
  constructor(rows: Row[], batchSize = 100) {
    this.rows = rows;
    this.batchSize = batchSize;
  }
  async next(): Promise<RowBatch | null> {
    if (this.cursor >= this.rows.length) return null;
    const batch = this.rows.slice(this.cursor, this.cursor + this.batchSize);
    this.cursor += this.batchSize;
    return batch;
  }
  async close() {}
}

/**
 * Custom operator: compute cosine distance for each row and add a _distance column.
 * This is the kind of operator you can compose freely in QueryMode.
 */
class VectorDistanceOperator implements Operator {
  private upstream: Operator;
  private column: string;
  private queryVec: Float32Array;
  constructor(upstream: Operator, column: string, queryVec: Float32Array) {
    this.upstream = upstream;
    this.column = column;
    this.queryVec = queryVec;
  }
  async next(): Promise<RowBatch | null> {
    const batch = await this.upstream.next();
    if (!batch) return null;
    return batch.map(row => ({
      ...row,
      _distance: cosineDistance(row[this.column] as Float32Array, this.queryVec),
    }));
  }
  async close() { await this.upstream.close(); }
}

// Deterministic PRNG
let prngState = 12345;
function xorshift32(): number {
  prngState ^= prngState << 13;
  prngState ^= prngState >>> 17;
  prngState ^= prngState << 5;
  return (prngState >>> 0) / 0xFFFFFFFF;
}

// Generate 200 items with 8-dim embeddings
const DIM = 8;
const items: Row[] = Array.from({ length: 200 }, (_, i) => {
  const embedding = new Float32Array(DIM);
  for (let d = 0; d < DIM; d++) embedding[d] = xorshift32() * 2 - 1;
  // Normalize
  let norm = 0;
  for (let d = 0; d < DIM; d++) norm += embedding[d] * embedding[d];
  norm = Math.sqrt(norm);
  for (let d = 0; d < DIM; d++) embedding[d] /= norm;
  return { id: i + 1, label: `item_${i + 1}`, embedding };
});

// Query vector (normalized)
const queryVec = new Float32Array(DIM);
for (let d = 0; d < DIM; d++) queryVec[d] = xorshift32() * 2 - 1;
let qNorm = 0;
for (let d = 0; d < DIM; d++) qNorm += queryVec[d] * queryVec[d];
qNorm = Math.sqrt(qNorm);
for (let d = 0; d < DIM; d++) queryVec[d] /= qNorm;

// ─── Adaptive search ────────────────────────────────────────────────────

const MIN_RESULTS = 10;
const INITIAL_THRESHOLD = 0.3;
const WIDEN_STEP = 0.15;
const MAX_THRESHOLD = 1.5;

async function adaptiveSearch(): Promise<Row[]> {
  let threshold = INITIAL_THRESHOLD;

  while (threshold <= MAX_THRESHOLD) {
    console.log(`  Searching with distance threshold: ${threshold.toFixed(2)}`);

    // Recompose pipeline with current threshold
    const source = new MockOperator(items);
    const withDistance = new VectorDistanceOperator(source, "embedding", queryVec);
    const topK = new TopKOperator(withDistance, "_distance", false, 50);
    const filtered = new FilterOperator(topK, [
      { column: "_distance", op: "lt", value: threshold },
    ]);

    const results = await drainPipeline(filtered);
    console.log(`  → Found ${results.length} results`);

    if (results.length >= MIN_RESULTS) {
      return results;
    }

    // Widen threshold and recompose
    threshold += WIDEN_STEP;
    console.log(`  Too few results, widening threshold...\n`);
  }

  // Final pass with no distance filter — just top 50
  console.log("  Final pass: returning all top-50 results");
  const source = new MockOperator(items);
  const withDistance = new VectorDistanceOperator(source, "embedding", queryVec);
  const topK = new TopKOperator(withDistance, "_distance", false, 50);
  return drainPipeline(topK);
}

async function main() {
  console.log("Adaptive Vector Search\n");
  console.log("Pipeline is recomposed at runtime based on result quality.\n");

  const results = await adaptiveSearch();

  console.log(`\nFinal results: ${results.length} items`);
  for (const row of results.slice(0, 5)) {
    console.log(`  ${row.label} | distance: ${(row._distance as number).toFixed(4)}`);
  }
  console.log(`\nKey insight: the pipeline was dynamically recomposed based on result count.`);
  console.log("A fixed query planner cannot do this — the plan IS the execution.");
}

main().catch(console.error);
```

examples/custom-spill-backend.ts

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
```typescript
/**
 * Custom Spill Backend — plug your own storage into sort and join operators.
 *
 * This example implements an InMemorySpillBackend that satisfies the SpillBackend
 * interface. It's used with ExternalSortOperator and HashJoinOperator at a tiny
 * 4KB memory budget to force spilling.
 *
 * DuckDB: disk-only spill. Polars: no spill at all.
 * QueryMode: any SpillBackend — R2, disk, memory, S3, you decide.
 */

import {
  ExternalSortOperator,
  HashJoinOperator,
  drainPipeline,
  type Operator,
  type RowBatch,
} from "../src/operators.js";
import type { SpillBackend } from "../src/r2-spill.js";
import type { Row } from "../src/types.js";

// ─── In-memory spill backend ────────────────────────────────────────────

class InMemorySpillBackend implements SpillBackend {
  private runs = new Map<string, Row[]>();
  private nextId = 0;
  bytesWritten = 0;
  bytesRead = 0;

  async writeRun(rows: Row[]): Promise<string> {
    const id = `mem-run-${this.nextId++}`;
    const copy = rows.map(r => ({ ...r }));
    this.runs.set(id, copy);
    const size = rows.length * 100; // rough estimate
    this.bytesWritten += size;
    return id;
  }

  async *streamRun(spillId: string): AsyncGenerator<Row> {
    const rows = this.runs.get(spillId);
    if (!rows) throw new Error(`Spill run not found: ${spillId}`);
    const size = rows.length * 100;
    this.bytesRead += size;
    for (const row of rows) yield row;
  }

  async cleanup(): Promise<void> {
    this.runs.clear();
  }
}

// ─── Mock data source ───────────────────────────────────────────────────

class MockOperator implements Operator {
  private rows: Row[];
  private cursor = 0;
  private batchSize: number;
  constructor(rows: Row[], batchSize = 50) {
    this.rows = rows;
    this.batchSize = batchSize;
  }
  async next(): Promise<RowBatch | null> {
    if (this.cursor >= this.rows.length) return null;
    const batch = this.rows.slice(this.cursor, this.cursor + this.batchSize);
    this.cursor += this.batchSize;
    return batch;
  }
  async close() {}
}

// ─── Demo ───────────────────────────────────────────────────────────────

async function main() {
  console.log("Custom Spill Backend Demo\n");

  // Generate 200 orders and 100 users
  const orders: Row[] = Array.from({ length: 200 }, (_, i) => ({
    order_id: i + 1,
    user_id: (i % 100) + 1,
    amount: ((i * 7 + 13) % 1000) + 1,
  }));

  const users: Row[] = Array.from({ length: 100 }, (_, i) => ({
    user_id: i + 1,
    name: `User_${i + 1}`,
  }));

  const spill = new InMemorySpillBackend();
  const TINY_BUDGET = 4 * 1024; // 4KB — forces spilling

  // 1. External sort with custom spill backend
  console.log("1. External sort (4KB budget, forces spill to memory backend):");
  const sortSource = new MockOperator(orders);
  const sorted = new ExternalSortOperator(
    sortSource, "amount", true, 0, TINY_BUDGET, spill,
  );
  const sortedRows = await drainPipeline(sorted);
  console.log(`  Sorted ${sortedRows.length} rows by amount desc`);
  console.log(`  Spill stats: ${spill.bytesWritten} bytes written, ${spill.bytesRead} bytes read`);
  console.log(`  Top 3: ${sortedRows.slice(0, 3).map(r => r.amount).join(", ")}\n`);

  // Reset spill
  await spill.cleanup();
  spill.bytesWritten = 0;
  spill.bytesRead = 0;

  // 2. Hash join with custom spill backend
  console.log("2. Hash join (4KB budget, forces Grace hash partitioning):");
  const leftSource = new MockOperator(orders);
  const rightSource = new MockOperator(users);
  const joined = new HashJoinOperator(
    leftSource, rightSource, "user_id", "user_id", "inner",
    TINY_BUDGET, spill,
  );
  const joinedRows = await drainPipeline(joined);
  console.log(`  Joined ${joinedRows.length} rows`);
  console.log(`  Spill stats: ${spill.bytesWritten} bytes written, ${spill.bytesRead} bytes read`);
  console.log(`  Sample: order #${joinedRows[0]?.order_id} → ${joinedRows[0]?.name}\n`);

  await spill.cleanup();

  console.log("Key insight: spill storage is pluggable — R2, disk, memory, S3.");
  console.log("DuckDB: disk only. Polars: no spill at all.");
  console.log("QueryMode: implement SpillBackend and plug it in.");
}

main().catch(console.error);
```
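For contrast with the in-memory backend above, here is a hedged sketch of a disk-backed variant implementing the same three methods this example exercises (writeRun, streamRun, cleanup), using JSON-lines files under a temp directory. The real SpillBackend interface in `src/r2-spill.ts` may carry additional members; this mirrors only what is visible in the example:

```typescript
import { promises as fsp, mkdtempSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

type Row = Record<string, unknown>;

// Disk-backed spill sketch: one JSON-lines file per run. The interface shape
// (writeRun/streamRun/cleanup) is copied from the in-memory example above;
// anything beyond that is an assumption.
class DiskSpillBackend {
  private dir = mkdtempSync(join(tmpdir(), "qm-spill-"));
  private nextId = 0;

  async writeRun(rows: Row[]): Promise<string> {
    const id = `run-${this.nextId++}`;
    const lines = rows.map(r => JSON.stringify(r)).join("\n");
    await fsp.writeFile(join(this.dir, id), lines, "utf8");
    return id;
  }

  async *streamRun(spillId: string): AsyncGenerator<Row> {
    const text = await fsp.readFile(join(this.dir, spillId), "utf8");
    for (const line of text.split("\n")) {
      if (line) yield JSON.parse(line) as Row;
    }
  }

  async cleanup(): Promise<void> {
    await fsp.rm(this.dir, { recursive: true, force: true });
  }
}

// Round-trip a run through disk and back.
const spill = new DiskSpillBackend();
const id = await spill.writeRun([{ a: 1 }, { a: 2 }]);
const back: Row[] = [];
for await (const row of spill.streamRun(id)) back.push(row);
console.log(back); // [{ a: 1 }, { a: 2 }]
await spill.cleanup();
```

Swapping this in for InMemorySpillBackend would move spill runs from heap to disk without touching the sort or join operators themselves.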
