Skip to content

Commit c40da6b

Browse files
committed
fix: IN/NOT_IN bigint cross-type, WindowSpec.column extraction, null vs empty string in DISTINCT/SET/WINDOW; perf: bitmap bitwise, columnar Map.get() hoist, streaming aggs, lazy IN eval; docs accuracy
1 parent 3d409b8 commit c40da6b

File tree

11 files changed

+116
-87
lines changed

11 files changed

+116
-87
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,8 @@ Operation Operator class What it does
9393
───────── ────────────── ────────────
9494
9595
Filtering
96-
predicate FilterOperator eq, neq, gt, gte, lt, lte, in
96+
predicate FilterOperator 14 ops: eq, neq, gt, gte, lt, lte, in, not_in,
97+
between, not_between, like, not_like, is_null, is_not_null
9798
membership SubqueryInOperator Semi-join filter against a value set
9899
99100
Projection
@@ -125,7 +126,7 @@ Set operations
125126
126127
Limiting
127128
limit/offset LimitOperator Row limiting with offset
128-
sample (planned) Random sampling
129+
sample DataFrame.sample() Random sampling (Fisher-Yates)
129130
130131
Similarity
131132
vector near (planned) NEAR topK as composable operator — currently in scan layer
@@ -252,7 +253,7 @@ npx tsx examples/nextjs-api-route.ts
252253
- **Multi-format support** — Lance, Parquet, and Iceberg tables
253254
- **Local mode** — same API reads Lance/Parquet files from disk or HTTP (Node/Bun)
254255
- **Fragment DO pool** — fan-out parallel scanning for multi-fragment datasets (max 100 slots per datacenter)
255-
- **580+ tests** — unit tests cover footer parsing, column decoding, Parquet/Thrift, merging, aggregates, VIP cache, WASM integration, SQL, partition catalog, materialized executor, toCode decompiler; 110+ conformance tests validate every operator against DuckDB at 1M-5M row scale
256+
- **600+ tests** — unit tests cover footer parsing, column decoding, Parquet/Thrift, merging, aggregates, VIP cache, WASM integration, SQL, partition catalog, materialized executor, toCode decompiler; 110+ conformance tests validate every operator against DuckDB at 1M-5M row scale
256257
- **CI benchmarks** — head-to-head QueryMode (Miniflare) vs DuckDB (native) on every push, results posted to [GitHub Actions summary](https://github.com/teamchong/querymode/actions/workflows/ci.yml)
257258

258259
## What doesn't exist yet

docs/src/content/docs/architecture.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,6 @@ Tests run in two runtimes:
179179
|---------|------|-------|
180180
| **workerd** (real CF Workers) | Operators, DOs, decode, format parsing ||
181181
| **Node** | DuckDB conformance (1M-5M rows), fixture files, SQL, infra ||
182-
| **Total** | All runtimes | **580+ tests** |
182+
| **Total** | All runtimes | **600+ tests** |
183183

184184
Conformance tests validate every operator against DuckDB at scale. CI benchmarks compare QueryMode vs DuckDB on every push.

docs/src/content/docs/dataframe-api.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,8 @@ df1.except(df2)
120120
df.where("age", "gt", 25) // alias for filter()
121121
df.whereIn("status", ["active", "trial"]) // shorthand for in
122122
df.whereNotIn("role", ["banned"]) // shorthand for not_in
123-
df.whereBetween("score", [80, 100]) // shorthand for between
124-
df.whereNotBetween("age", [0, 17]) // shorthand for not_between
123+
df.whereBetween("score", 80, 100) // shorthand for between
124+
df.whereNotBetween("age", 0, 17) // shorthand for not_between
125125
df.whereLike("name", "%alice%") // shorthand for like
126126
df.whereNotLike("email", "%spam%") // shorthand for not_like
127127
df.filterIn("region", subquery) // IN subquery (semi-join)

docs/src/content/docs/deployment.mdx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ pnpm build && wrangler deploy
108108
{ "column": "amount", "op": "gt", "value": 100 }
109109
],
110110
"projections": ["id", "amount", "region"],
111-
"orderBy": { "column": "amount", "desc": true },
111+
"sortColumn": "amount",
112+
"sortDirection": "desc",
112113
"limit": 20
113114
}
114115
```

src/decode.ts

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -228,29 +228,16 @@ export function decodePage(
228228
rowCount = 0,
229229
dataOffsetInPage?: number,
230230
): (number | bigint | string | null)[] {
231-
let nulls: Set<number> | null = null;
231+
let nullBitmap: Uint8Array | null = null;
232+
let nulls: { has(i: number): boolean; size: number } | null = null;
232233
if (nullCount > 0 && rowCount > 0) {
233234
const bitmapBytes = Math.ceil(rowCount / 8);
234235
if (buf.byteLength < bitmapBytes) return [];
235-
const bytes = new Uint8Array(buf, 0, bitmapBytes);
236-
nulls = new Set<number>();
237-
// Fast path: skip bytes that are 0xFF (all valid) — avoids per-bit work
238-
for (let b = 0; b < bitmapBytes; b++) {
239-
const byte = bytes[b];
240-
if (byte === 0xFF) continue; // all 8 bits valid, skip
241-
const base = b << 3;
242-
if (byte === 0x00) {
243-
// all 8 bits null — add them all
244-
const end = Math.min(base + 8, rowCount);
245-
for (let i = base; i < end; i++) nulls.add(i);
246-
} else {
247-
// mixed — check each bit
248-
const end = Math.min(8, rowCount - base);
249-
for (let bit = 0; bit < end; bit++) {
250-
if (((byte >> bit) & 1) === 0) nulls.add(base + bit);
251-
}
252-
}
253-
}
236+
nullBitmap = new Uint8Array(buf, 0, bitmapBytes);
237+
nulls = {
238+
has(i: number): boolean { return (nullBitmap![i >> 3] & (1 << (i & 7))) === 0; },
239+
get size() { return nullCount; },
240+
};
254241
// Lance v2 uses alignment padding between bitmap and data.
255242
// dataOffsetInPage gives the exact data start; otherwise strip only bitmap bytes.
256243
const stripBytes = dataOffsetInPage ?? bitmapBytes;
@@ -483,12 +470,19 @@ export function matchesFilter(
483470
case "in": {
484471
if (!Array.isArray(t)) return false;
485472
const set = getInSet(t);
486-
return set.has(val);
473+
if (set.has(val)) return true;
474+
// bigint/number cross-type: 5n !== 5 for Set.has, so coerce
475+
if (typeof val === "bigint") return set.has(Number(val));
476+
if (typeof val === "number") return set.has(BigInt(val));
477+
return false;
487478
}
488479
case "not_in": {
489480
if (!Array.isArray(t)) return false;
490481
const set = getInSet(t);
491-
return !set.has(val);
482+
if (set.has(val)) return false;
483+
if (typeof val === "bigint") return !set.has(Number(val));
484+
if (typeof val === "number") return !set.has(BigInt(val));
485+
return true;
492486
}
493487
case "between": {
494488
if (!Array.isArray(t) || t.length !== 2) return false;

src/index.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,14 @@ export {
2121
ComputedColumnOperator, HashJoinOperator, ExternalSortOperator,
2222
InMemorySortOperator, WindowOperator, DistinctOperator, SetOperator,
2323
LimitOperator, SubqueryInOperator,
24-
drainPipeline, buildEdgePipeline,
24+
drainPipeline, buildEdgePipeline, FsSpillBackend,
2525
} from "./operators.js";
2626
export type { Operator, RowBatch } from "./operators.js";
2727
export { QueryModeError } from "./errors.js";
2828
export type { ErrorCode } from "./errors.js";
2929
export { LocalExecutor } from "./local-executor.js";
3030
export { bigIntReplacer } from "./decode.js";
31+
export { R2SpillBackend } from "./r2-spill.js";
3132
export { createFromJSON, createFromCSV, createDemo } from "./convenience.js";
3233
export { formatResultSummary, formatExplain, formatBytes } from "./format.js";
3334
export type { LocalTimingInfo } from "./format.js";

src/operators.ts

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -929,7 +929,8 @@ export class WindowOperator implements Operator {
929929
key = "";
930930
for (let p = 0; p < win.partitionBy.length; p++) {
931931
if (p > 0) key += "\x00";
932-
key += String(rows[i][win.partitionBy[p]] ?? "");
932+
const v = rows[i][win.partitionBy[p]];
933+
key += v === null || v === undefined ? "\x01NULL\x01" : String(v);
933934
}
934935
} else {
935936
key = "__all__";
@@ -1141,7 +1142,8 @@ export class DistinctOperator implements Operator {
11411142
for (let g = 0; g < cols.length; g++) {
11421143
if (g > 0) key += "\x00";
11431144
const vals = batch.columns.get(cols[g]);
1144-
key += String(vals ? (vals[idx] ?? "") : "");
1145+
const v = vals ? vals[idx] : null;
1146+
key += v === null || v === undefined ? "\x01NULL\x01" : String(v);
11451147
}
11461148
if (!this.seen.has(key)) {
11471149
this.seen.add(key);
@@ -1165,7 +1167,8 @@ export class DistinctOperator implements Operator {
11651167
const keyCols = this.columns.length > 0 ? this.columns : Object.keys(row);
11661168
for (let g = 0; g < keyCols.length; g++) {
11671169
if (g > 0) key += "\x00";
1168-
key += String(row[keyCols[g]] ?? "");
1170+
const v = row[keyCols[g]];
1171+
key += v === null || v === undefined ? "\x01NULL\x01" : String(v);
11691172
}
11701173
if (!this.seen.has(key)) {
11711174
this.seen.add(key);
@@ -1205,12 +1208,15 @@ export class SetOperator implements Operator {
12051208
if (mode !== "union_all") this.seen = new Set();
12061209
}
12071210

1211+
private _sortedKeys: string[] | null = null;
12081212
private rowKey(row: Row): string {
1209-
const keys = Object.keys(row).sort();
1213+
if (!this._sortedKeys) this._sortedKeys = Object.keys(row).sort();
1214+
const keys = this._sortedKeys;
12101215
let result = "";
12111216
for (let i = 0; i < keys.length; i++) {
12121217
if (i > 0) result += "\x00";
1213-
result += keys[i] + "=" + String(row[keys[i]] ?? "");
1218+
const v = row[keys[i]];
1219+
result += keys[i] + "=" + (v === null || v === undefined ? "\x01NULL\x01" : String(v));
12141220
}
12151221
return result;
12161222
}
@@ -1476,29 +1482,27 @@ export class LimitOperator implements Operator {
14761482
const batch = await this.upstream.nextColumnar();
14771483
if (!batch) return null;
14781484

1479-
let indices = batch.selection
1480-
? Array.from(batch.selection)
1481-
: Array.from({ length: batch.rowCount }, (_, i) => i);
1485+
let sel = batch.selection ?? identityIndices(batch.rowCount);
14821486

14831487
// Handle offset: skip rows
14841488
if (this.skipped < this.offset) {
14851489
const toSkip = this.offset - this.skipped;
1486-
if (toSkip >= indices.length) {
1487-
this.skipped += indices.length;
1490+
if (toSkip >= sel.length) {
1491+
this.skipped += sel.length;
14881492
continue;
14891493
}
1490-
indices = indices.slice(toSkip);
1494+
sel = sel.subarray(toSkip);
14911495
this.skipped = this.offset;
14921496
}
14931497

14941498
// Apply limit
1495-
if (indices.length > this.remaining) {
1496-
indices = indices.slice(0, this.remaining);
1499+
if (sel.length > this.remaining) {
1500+
sel = sel.slice(0, this.remaining);
14971501
}
1498-
this.remaining -= indices.length;
1502+
this.remaining -= sel.length;
14991503

1500-
if (indices.length > 0) {
1501-
return { columns: batch.columns, rowCount: batch.rowCount, selection: new Uint32Array(indices) };
1504+
if (sel.length > 0) {
1505+
return { columns: batch.columns, rowCount: batch.rowCount, selection: sel };
15021506
}
15031507
return null;
15041508
}

src/partial-agg.ts

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -211,14 +211,13 @@ export function computePartialAggColumnar(
211211
const states = aggregates.map((agg) =>
212212
initPartialAggState(agg.fn, agg.column, { percentileTarget: agg.percentileTarget }),
213213
);
214+
const aggArrays = aggregates.map(a => a.column === "*" ? null : (batch.columns.get(a.column) ?? null));
214215
for (const idx of indices) {
215216
for (let i = 0; i < aggregates.length; i++) {
216-
const col = aggregates[i].column;
217-
if (col === "*") {
217+
if (!aggArrays[i]) {
218218
states[i].count++;
219219
} else {
220-
const vals = batch.columns.get(col);
221-
const val = vals ? vals[idx] : null;
220+
const val = aggArrays[i]![idx];
222221
if (typeof val === "number") {
223222
updatePartialAgg(states[i], val, val);
224223
} else if (typeof val === "bigint") {
@@ -238,13 +237,14 @@ export function computePartialAggColumnar(
238237

239238
const groups = new Map<string, PartialAggState[]>();
240239
const groupCols = query.groupBy;
240+
const groupArrays = groupCols.map(c => batch.columns.get(c) ?? null);
241+
const aggArrays2 = aggregates.map(a => a.column === "*" ? null : (batch.columns.get(a.column) ?? null));
241242

242243
for (const idx of indices) {
243244
let key = "";
244245
for (let g = 0; g < groupCols.length; g++) {
245246
if (g > 0) key += "\x00";
246-
const vals = batch.columns.get(groupCols[g]);
247-
const v = vals ? vals[idx] : null;
247+
const v = groupArrays[g] ? groupArrays[g]![idx] : null;
248248
key += v === null || v === undefined ? "\x01NULL\x01" : String(v);
249249
}
250250
let states = groups.get(key);
@@ -255,12 +255,10 @@ export function computePartialAggColumnar(
255255
groups.set(key, states);
256256
}
257257
for (let i = 0; i < aggregates.length; i++) {
258-
const col = aggregates[i].column;
259-
if (col === "*") {
258+
if (!aggArrays2[i]) {
260259
states[i].count++;
261260
} else {
262-
const vals = batch.columns.get(col);
263-
const val = vals ? vals[idx] : null;
261+
const val = aggArrays2[i]![idx];
264262
if (typeof val === "number") {
265263
updatePartialAgg(states[i], val, val);
266264
} else if (typeof val === "bigint") {

src/sql/compiler.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,15 @@ function compileWindowFunction(expr: SqlExpr & { kind: "call" }, alias?: string)
216216
alias: alias ?? `${fn}_result`,
217217
};
218218

219+
if (expr.args.length > 0) {
220+
const colArg = expr.args[0];
221+
if (colArg.kind === "column") {
222+
spec.column = colArg.name;
223+
} else if (colArg.kind === "star") {
224+
spec.column = "*";
225+
}
226+
}
227+
219228
if ((fn === "lag" || fn === "lead") && expr.args.length > 1) {
220229
const args: WindowSpec["args"] = {};
221230
if (expr.args[1]?.kind === "value" && expr.args[1].value.type === "integer") {

src/sql/evaluator.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@ export function evaluateExpr(expr: SqlExpr, row: Row): unknown {
2828
case "in_list": {
2929
const val = evaluateExpr(expr.expr, row);
3030
if (val === null) return null;
31-
const values = expr.values.map(v => evaluateExpr(v, row));
32-
const found = values.some(v => looseEqual(val, v));
31+
let found = false;
32+
for (const v of expr.values) {
33+
if (looseEqual(val, evaluateExpr(v, row))) { found = true; break; }
34+
}
3335
return expr.negated ? !found : found;
3436
}
3537

0 commit comments

Comments
 (0)