Skip to content

Commit 8d64f44

Browse files
committed
fix: uniform page skip in query-do and local-executor via canSkipPageMultiCol
Three sites were doing per-column page skip with canSkipPage, which could cause row misalignment when filters on column A skip pages but column B doesn't. Now all use canSkipPageMultiCol for uniform cross-column skip that also handles OR filterGroups.
1 parent 0af10bb commit 8d64f44

File tree

2 files changed

+35
-35
lines changed

2 files changed

+35
-35
lines changed

src/local-executor.ts

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@ import type { AppendResult, ColumnMeta, DataType, DiffResult, ExplainResult, Pag
1010
import { parseFooter, parseColumnMetaFromProtobuf, FOOTER_SIZE } from "./footer.js";
1111
import { parseManifest } from "./manifest.js";
1212
import { detectFormat, getParquetFooterLength, parseParquetFooter, parquetMetaToTableMeta } from "./parquet.js";
13-
import { assembleRows, canSkipPage } from "./decode.js";
13+
import { assembleRows } from "./decode.js";
1414
import { coalesceRanges, autoCoalesceGap } from "./coalesce.js";
1515
import { instantiateWasm, type WasmEngine } from "./wasm-engine.js";
1616

1717
const textEncoder = new TextEncoder();
1818
import { VipCache } from "./vip-cache.js";
1919
import { QueryModeError } from "./errors.js";
2020
import { parseLanceV2Columns, lanceV2ToColumnMeta, computeLanceV2Stats } from "./lance-v2.js";
21-
import { buildPipeline, drainPipeline, DEFAULT_MEMORY_BUDGET, type FragmentSource, type PipelineOptions } from "./operators.js";
21+
import { buildPipeline, drainPipeline, DEFAULT_MEMORY_BUDGET, canSkipPageMultiCol, type FragmentSource, type PipelineOptions } from "./operators.js";
2222

2323
/**
2424
* Executor for local mode (Node/Bun).
@@ -219,22 +219,28 @@ export class LocalExecutor implements QueryExecutor {
219219
const ranges: { column: string; offset: number; length: number }[] = [];
220220
const colDetails: ExplainResult["columns"] = [];
221221

222-
// Use first column for row estimation (all columns have same page structure)
222+
// Uniform page-level skip across all columns to match actual query behavior
223+
const maxPages = projectedColumns.reduce((m, c) => Math.max(m, c.pages.length), 0);
224+
const keptPages = new Set<number>();
225+
for (let pi = 0; pi < maxPages; pi++) {
226+
if (!query.vectorSearch && canSkipPageMultiCol(projectedColumns, pi, query.filters, query.filterGroups)) {
227+
pagesSkipped += projectedColumns.length;
228+
} else {
229+
keptPages.add(pi);
230+
}
231+
}
232+
pagesTotal = maxPages * projectedColumns.length;
233+
223234
const firstCol = projectedColumns[0];
224235
for (const col of projectedColumns) {
225236
let colBytes = 0;
226237
let colPages = 0;
227238
for (let pi = 0; pi < col.pages.length; pi++) {
239+
if (!keptPages.has(pi)) continue;
228240
const page = col.pages[pi];
229-
pagesTotal++;
230-
if (!query.vectorSearch && canSkipPage(page, query.filters, col.name)) {
231-
pagesSkipped++;
232-
continue;
233-
}
234241
colPages++;
235242
colBytes += page.byteLength;
236243
ranges.push({ column: col.name, offset: Number(page.byteOffset), length: page.byteLength });
237-
// Count estimated rows from first projected column's non-skipped pages
238244
if (col === firstCol) estimatedRows += page.rowCount;
239245
}
240246
colDetails.push({ name: col.name, dtype: col.dtype as DataType, pages: colPages, bytes: colBytes });
@@ -531,12 +537,20 @@ export class LocalExecutor implements QueryExecutor {
531537
const pageRanges: { column: string; offset: bigint; length: number }[] = [];
532538
let pagesSkipped = 0;
533539

540+
// Uniform page-level skip: decide once per page index across all columns to avoid row misalignment.
541+
const maxPages = projectedColumns.reduce((m, c) => Math.max(m, c.pages.length), 0);
542+
const keptPageIndices: number[] = [];
543+
for (let pi = 0; pi < maxPages; pi++) {
544+
if (canSkipPageMultiCol(projectedColumns, pi, query.filters, query.filterGroups)) {
545+
pagesSkipped += projectedColumns.length;
546+
continue;
547+
}
548+
keptPageIndices.push(pi);
549+
}
534550
for (const col of projectedColumns) {
535-
for (const page of col.pages) {
536-
if (canSkipPage(page, query.filters, col.name)) {
537-
pagesSkipped++;
538-
continue;
539-
}
551+
for (const pi of keptPageIndices) {
552+
const page = col.pages[pi];
553+
if (!page) continue;
540554
pageRanges.push({ column: col.name, offset: page.byteOffset, length: page.byteLength });
541555
}
542556
}

src/query-do.ts

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -126,21 +126,14 @@ class EdgeScanOperator implements Operator {
126126
this.cols = cols;
127127

128128
// Determine which pages to keep — must be uniform across all columns to avoid row misalignment.
129-
// A page is skipped only if any AND filter eliminates it (same logic as canSkipPageMultiCol).
129+
// Uses canSkipPageMultiCol which handles both AND filters and OR filterGroups.
130130
const maxPages = cols.reduce((m, c) => Math.max(m, c.pages.length), 0);
131131
const keptPageIndices: number[] = [];
132132
for (let pi = 0; pi < maxPages; pi++) {
133-
let skip = false;
134-
if (!query.vectorSearch) {
135-
for (const f of query.filters) {
136-
const col = cols.find(c => c.name === f.column);
137-
if (!col) continue;
138-
const page = col.pages[pi];
139-
if (!page) continue;
140-
if (canSkipPage(page, [f], f.column)) { skip = true; break; }
141-
}
133+
if (!query.vectorSearch && canSkipPageMultiCol(cols, pi, query.filters, query.filterGroups)) {
134+
this.pagesSkipped += cols.length;
135+
continue;
142136
}
143-
if (skip) { this.pagesSkipped += cols.length; continue; }
144137
keptPageIndices.push(pi);
145138
}
146139

@@ -1004,17 +997,10 @@ export class QueryDO extends DurableObject<Env> {
1004997
const maxPages = cols.reduce((m, c) => Math.max(m, c.pages.length), 0);
1005998
const keptPageIndices: number[] = [];
1006999
for (let pi = 0; pi < maxPages; pi++) {
1007-
let skip = false;
1008-
if (!query.vectorSearch) {
1009-
for (const f of query.filters) {
1010-
const col = cols.find(c => c.name === f.column);
1011-
if (!col) continue;
1012-
const page = col.pages[pi];
1013-
if (!page) continue;
1014-
if (canSkipPage(page, [f], f.column)) { skip = true; break; }
1015-
}
1000+
if (!query.vectorSearch && canSkipPageMultiCol(cols, pi, query.filters, query.filterGroups)) {
1001+
pagesSkipped += cols.length;
1002+
continue;
10161003
}
1017-
if (skip) { pagesSkipped += cols.length; continue; }
10181004
keptPageIndices.push(pi);
10191005
}
10201006

0 commit comments

Comments
 (0)