Skip to content

Commit 32f3b72

Browse files
committed
fix: bigint/number filter comparison, page skip alignment, Parquet V2 decompression
- Fix bigint===number always false: coerce types in matchesFilter(), canSkipPage(), and query-do inline Parquet filters (int64 columns decode as bigint but JSON filter values are numbers)
- Fix page-buffer index misalignment: when canSkipPage() skips pages, track non-skipped page infos per column so buffer indices stay aligned with col.pages metadata (was using wrong encoding/rowCount for decode)
- Fix DATA_PAGE_V2 decompression: subtract rep/def level byte lengths from uncompressedSize before passing to decompressor (per Parquet spec, levels are uncompressed in V2 pages)
- Fix Snappy: return actual bytes written on early termination instead of zero-padded full buffer
- Fix RLE bitWidth=32: mask was 0 due to JS 32-bit shift overflow
- Add SQL buffer length check (64KB max) to prevent WASM memory corruption
- Add gzip retry with increasing capacity (4x, 16x, 64x) for high compression ratios
- Add bounds checking in parseWasmResult to prevent reading past buffer
1 parent bdf131d commit 32f3b72

File tree

4 files changed

+76
-32
lines changed

4 files changed

+76
-32
lines changed

src/decode.ts

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@ export function canSkipPage(page: PageInfo, filters: QueryDescriptor["filters"],
99
if (filter.column !== columnName) continue;
1010
if (page.minValue === undefined || page.maxValue === undefined) continue;
1111

12-
const { minValue: min, maxValue: max } = page;
13-
const val = filter.value;
12+
let { minValue: min, maxValue: max } = page;
13+
let val = filter.value;
1414
if (typeof val === "object") continue;
1515

16+
// Coerce bigint↔number for cross-type comparisons
17+
if (typeof min === "bigint" && typeof val === "number") val = BigInt(Math.trunc(val as number));
18+
else if (typeof min === "number" && typeof val === "bigint") { min = BigInt(Math.trunc(min)); max = BigInt(Math.trunc(max as number)); }
19+
1620
switch (filter.op) {
1721
case "gt": if (max <= val) return true; break;
1822
case "gte": if (max < val) return true; break;
@@ -282,20 +286,27 @@ function vectorSearch(
282286

283287
// --- Filters ---
284288

289+
/** Coerce bigint↔number for cross-type comparisons (JSON filter values are numbers, int64 columns decode as bigint). */
290+
function coerceCompare(a: unknown, b: unknown): [unknown, unknown] {
291+
if (typeof a === "bigint" && typeof b === "number") return [a, BigInt(Math.trunc(b))];
292+
if (typeof a === "number" && typeof b === "bigint") return [BigInt(Math.trunc(a)), b];
293+
return [a, b];
294+
}
295+
285296
export function matchesFilter(
286297
val: number | bigint | string | boolean | Float32Array | null,
287298
filter: QueryDescriptor["filters"][0],
288299
): boolean {
289300
if (val === null) return false;
290301
const t = filter.value;
291302
switch (filter.op) {
292-
case "eq": return val === t;
293-
case "neq": return val !== t;
294-
case "gt": return val > (t as number | bigint | string);
295-
case "gte": return val >= (t as number | bigint | string);
296-
case "lt": return val < (t as number | bigint | string);
297-
case "lte": return val <= (t as number | bigint | string);
298-
case "in": return Array.isArray(t) && t.includes(val as number | bigint | string);
303+
case "eq": { const [a, b] = coerceCompare(val, t); return a === b; }
304+
case "neq": { const [a, b] = coerceCompare(val, t); return a !== b; }
305+
case "gt": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) > (b as number | bigint | string); }
306+
case "gte": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) >= (b as number | bigint | string); }
307+
case "lt": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) < (b as number | bigint | string); }
308+
case "lte": { const [a, b] = coerceCompare(val, t); return (a as number | bigint | string) <= (b as number | bigint | string); }
309+
case "in": return Array.isArray(t) && t.some(v => { const [a, b] = coerceCompare(val, v); return a === b; });
299310
default: return true;
300311
}
301312
}

src/parquet-decode.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ export function decompressSnappy(input: Uint8Array): Uint8Array {
8080
}
8181
}
8282

83+
if (outPos < uncompressedLen) {
84+
// Decompression terminated early — return only the valid portion
85+
return output.subarray(0, outPos);
86+
}
8387
return output;
8488
}
8589

@@ -93,7 +97,7 @@ function decodeRleBitPacked(
9397
): { values: number[]; bytesRead: number } {
9498
const values: number[] = [];
9599
let pos = offset;
96-
const mask = (1 << bitWidth) - 1;
100+
const mask = bitWidth >= 32 ? 0xFFFFFFFF : (1 << bitWidth) - 1;
97101

98102
while (pos < bytes.length && values.length < maxValues) {
99103
const { value: header, bytesRead } = readVarint(bytes, pos);
@@ -433,9 +437,12 @@ export function decodeParquetColumnChunk(
433437
dataStart += header.defLevelsByteLength;
434438

435439
// Actual data (may be compressed)
440+
// V2 spec: uncompressed_page_size includes rep/def level bytes which are NOT compressed,
441+
// so subtract them to get the actual data payload uncompressed size.
436442
let dataPayload: Uint8Array = pageData.subarray(dataStart);
437443
if (header.isCompressed) {
438-
dataPayload = decompressPage(dataPayload, pageEncoding.compression, header.uncompressedSize, wasm);
444+
const dataUncompressedSize = header.uncompressedSize - header.repLevelsByteLength - header.defLevelsByteLength;
445+
dataPayload = decompressPage(dataPayload, pageEncoding.compression, dataUncompressedSize, wasm);
439446
}
440447

441448
// Decode def levels to find nulls

src/query-do.ts

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -517,14 +517,19 @@ export class QueryDO implements DurableObject {
517517
}
518518
}
519519

520-
// Build per-page ranges, applying page-level skip
520+
// Build per-page ranges, applying page-level skip.
521+
// Track non-skipped page infos per column so buffer indices stay aligned.
521522
const ranges: { column: string; offset: number; length: number }[] = [];
523+
const columnPageInfos = new Map<string, typeof cols[0]["pages"]>();
522524
let pagesSkipped = 0;
523525
for (const col of cols) {
526+
const keptPages: typeof col.pages = [];
524527
for (const page of col.pages) {
525528
if (!query.vectorSearch && canSkipPage(page, query.filters, col.name)) { pagesSkipped++; continue; }
529+
keptPages.push(page);
526530
ranges.push({ column: col.name, offset: Number(page.byteOffset), length: page.byteLength });
527531
}
532+
columnPageInfos.set(col.name, keptPages);
528533
}
529534

530535
// Cache-before-fetch: check WASM buffer pool for each range
@@ -613,10 +618,13 @@ export class QueryDO implements DurableObject {
613618
const pages = columnData.get(col.name);
614619
if (!pages?.length) { decodedColumns.set(col.name, []); continue; }
615620

621+
// Use non-skipped page infos (aligned with columnData buffers, not col.pages)
622+
const keptPageInfos = columnPageInfos.get(col.name) ?? col.pages;
623+
616624
// Concatenate all page buffers for this column (may span multiple row groups)
617625
const allValues: (number | bigint | string | boolean | null)[] = [];
618626
for (let pi = 0; pi < pages.length; pi++) {
619-
const pageInfo = col.pages[pi];
627+
const pageInfo = keptPageInfos[pi];
620628
const encoding = pageInfo?.encoding ?? { compression: "UNCOMPRESSED" };
621629

622630
// Include dictionary page if present: fetch it from R2 and prepend
@@ -673,19 +681,23 @@ export class QueryDO implements DurableObject {
673681
rows.push(row);
674682
}
675683

676-
// Apply filters in JS
684+
// Apply filters in JS (coerce bigint↔number for cross-type comparison)
677685
if (query.filters.length > 0) {
678686
rows = rows.filter(row => {
679687
for (const f of query.filters) {
680688
const val = row[f.column];
681689
if (val == null) return false;
690+
// Coerce bigint↔number: int64 columns decode as bigint, JSON filter values are numbers
691+
let cv = val, cf = f.value;
692+
if (typeof cv === "bigint" && typeof cf === "number") cf = BigInt(Math.trunc(cf));
693+
else if (typeof cv === "number" && typeof cf === "bigint") cv = BigInt(Math.trunc(cv));
682694
switch (f.op) {
683-
case "eq": if (val !== f.value) return false; break;
684-
case "neq": if (val === f.value) return false; break;
685-
case "gt": if (!(val > f.value)) return false; break;
686-
case "gte": if (!(val >= f.value)) return false; break;
687-
case "lt": if (!(val < f.value)) return false; break;
688-
case "lte": if (!(val <= f.value)) return false; break;
695+
case "eq": if (cv !== cf) return false; break;
696+
case "neq": if (cv === cf) return false; break;
697+
case "gt": if (!(cv > (cf as number | bigint | string))) return false; break;
698+
case "gte": if (!(cv >= (cf as number | bigint | string))) return false; break;
699+
case "lt": if (!(cv < (cf as number | bigint | string))) return false; break;
700+
case "lte": if (!(cv <= (cf as number | bigint | string))) return false; break;
689701
}
690702
}
691703
return true;
@@ -722,7 +734,8 @@ export class QueryDO implements DurableObject {
722734
for (const col of cols) {
723735
const pages = columnData.get(col.name);
724736
if (!pages?.length) continue;
725-
if (!this.wasmEngine.registerColumn(query.table, col.name, col.dtype, pages, col.pages, col.listDimension)) {
737+
const keptPageInfos = columnPageInfos.get(col.name) ?? col.pages;
738+
if (!this.wasmEngine.registerColumn(query.table, col.name, col.dtype, pages, keptPageInfos, col.listDimension)) {
726739
throw new Error(`WASM OOM: failed to register column "${col.name}" for table "${query.table}"`);
727740
}
728741
}

src/wasm-engine.ts

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -171,19 +171,24 @@ export class WasmEngine {
171171
return new Uint8Array(this.exports.memory.buffer, outPtr, written).slice();
172172
}
173173

174-
/** Decompress GZIP data using the Zig std.compress.gzip implementation. */
174+
/** Decompress GZIP data using the Zig std.compress.gzip implementation. Retries with larger buffer if needed. */
175175
decompressGzip(compressed: Uint8Array): Uint8Array {
176176
const inPtr = this.exports.alloc(compressed.length);
177177
if (!inPtr) throw new Error("WASM OOM allocating gzip input");
178178
new Uint8Array(this.exports.memory.buffer, inPtr, compressed.length).set(compressed);
179179

180-
const capacity = compressed.length * 4;
181-
const outPtr = this.exports.alloc(capacity);
182-
if (!outPtr) throw new Error("WASM OOM allocating gzip output");
180+
// Try increasing capacities: 4x, 16x, 64x (handles high compression ratios)
181+
for (const multiplier of [4, 16, 64]) {
182+
const capacity = compressed.length * multiplier;
183+
const outPtr = this.exports.alloc(capacity);
184+
if (!outPtr) throw new Error("WASM OOM allocating gzip output");
183185

184-
const written = this.exports.gzip_decompress(inPtr, compressed.length, outPtr, capacity);
185-
if (written === 0 && compressed.length > 0) throw new Error("gzip decompression failed");
186-
return new Uint8Array(this.exports.memory.buffer, outPtr, written).slice();
186+
const written = this.exports.gzip_decompress(inPtr, compressed.length, outPtr, capacity);
187+
if (written > 0) return new Uint8Array(this.exports.memory.buffer, outPtr, written).slice();
188+
// written === 0 means the capacity may have been too small — retry with a larger buffer
189+
if (written === 0 && compressed.length === 0) return new Uint8Array(0);
190+
}
191+
throw new Error("gzip decompression failed (output exceeds 64x compressed size)");
187192
}
188193

189194
/** Decompress LZ4 block data (Parquet hadoop codec). */
@@ -485,7 +490,11 @@ export class WasmEngine {
485490
}
486491

487492
executeQuery(query: QueryDescriptor): Row[] | null {
493+
const MAX_SQL_LENGTH = 64 * 1024; // 64KB — WASM SQL input buffer is fixed-size
488494
const sqlBytes = textEncoder.encode(queryToSql(query));
495+
if (sqlBytes.length > MAX_SQL_LENGTH) {
496+
throw new Error(`SQL query too large (${sqlBytes.length} bytes, max ${MAX_SQL_LENGTH})`);
497+
}
489498
const sqlBufPtr = this.exports.getSqlInputBuffer();
490499
new Uint8Array(this.exports.memory.buffer, sqlBufPtr, sqlBytes.length).set(sqlBytes);
491500
this.exports.setSqlInputLength(sqlBytes.length);
@@ -855,17 +864,21 @@ function parseWasmResult(memoryBuffer: ArrayBuffer, ptr: number, size: number):
855864

856865
const rows: Row[] = Array.from({ length: numRows }, () => ({}));
857866
let dp = dataStart;
867+
const end = ptr + size;
858868

859869
for (const col of columns) {
860870
for (let r = 0; r < numRows; r++) {
871+
if (dp + 1 > end) return rows.slice(0, r > 0 ? r : 0); // truncated result
861872
switch (col.type) {
862-
case WasmColumnType.Int64: rows[r][col.name] = view.getBigInt64(dp, true); dp += 8; break;
863-
case WasmColumnType.Float64: rows[r][col.name] = view.getFloat64(dp, true); dp += 8; break;
864-
case WasmColumnType.Int32: rows[r][col.name] = view.getInt32(dp, true); dp += 4; break;
865-
case WasmColumnType.Float32: rows[r][col.name] = view.getFloat32(dp, true); dp += 4; break;
873+
case WasmColumnType.Int64: if (dp + 8 > end) return rows; rows[r][col.name] = view.getBigInt64(dp, true); dp += 8; break;
874+
case WasmColumnType.Float64: if (dp + 8 > end) return rows; rows[r][col.name] = view.getFloat64(dp, true); dp += 8; break;
875+
case WasmColumnType.Int32: if (dp + 4 > end) return rows; rows[r][col.name] = view.getInt32(dp, true); dp += 4; break;
876+
case WasmColumnType.Float32: if (dp + 4 > end) return rows; rows[r][col.name] = view.getFloat32(dp, true); dp += 4; break;
866877
case WasmColumnType.Bool: rows[r][col.name] = buf[dp] !== 0; dp += 1; break;
867878
case WasmColumnType.String: {
879+
if (dp + 4 > end) return rows;
868880
const len = view.getUint32(dp, true); dp += 4;
881+
if (dp + len > end) return rows;
869882
rows[r][col.name] = textDecoder.decode(buf.subarray(dp, dp + len)); dp += len;
870883
break;
871884
}

0 commit comments

Comments
 (0)