Commit 06d6a3c

refactor: dedup concatQMCBBatches via concatColumnarBatches (-79 lines) + docs for SQL semantics and error handling
concatQMCBBatches now delegates to concatColumnarBatches + encodeColumnarBatch instead of reimplementing the same per-dtype concat logic. All 470 tests pass. SQL docs: NULL three-valued logic, type coercion rules, operator precedence, comments. Error handling docs: QueryModeError codes, catching patterns, error wrapping.
1 parent 611d1ac commit 06d6a3c

File tree

4 files changed: +160 −79 lines changed
docs/astro.config.mjs

Lines changed: 1 addition & 0 deletions

```diff
@@ -29,6 +29,7 @@ export default defineConfig({
       { label: "Columnar Format", slug: "columnar-format" },
       { label: "Lazy Evaluation", slug: "lazy-evaluation" },
       { label: "Performance", slug: "performance" },
+      { label: "Error Handling", slug: "error-handling" },
       { label: "Write Path", slug: "write-path" },
       { label: "Deployment", slug: "deployment" },
     ],
```
Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@

---
title: Error Handling
description: Structured errors with codes, causes, and actionable messages.
---

All errors thrown by QueryMode are `QueryModeError` instances with a structured `code` field. Low-level errors (ENOENT, parse failures, OOM) are wrapped automatically with context.

## Error codes

| Code | When | Example message |
|------|------|-----------------|
| `TABLE_NOT_FOUND` | File doesn't exist, R2 key missing | "Table not found: events.lance" |
| `COLUMN_NOT_FOUND` | Column name not in schema | "Column not found: foo" |
| `INVALID_FORMAT` | File can't be parsed as any supported format | "Invalid table format: data.xyz" |
| `SCHEMA_MISMATCH` | Column exists but type doesn't match operation | "Column not found in events: age" |
| `INVALID_FILTER` | Bad filter op or value type | "Invalid filter: unknown op 'regex'" |
| `INVALID_AGGREGATE` | Bad aggregate function or missing column | "Invalid aggregate: sum requires numeric column" |
| `MEMORY_EXCEEDED` | Operator exceeds memory budget | "Memory budget exceeded querying events" |
| `NETWORK_TIMEOUT` | R2 or RPC call timed out | "Network timeout on events: R2 read timed out" |
| `QUERY_TIMEOUT` | Total query time exceeded | "Query timeout on events" |
| `QUERY_FAILED` | Catch-all for unclassified errors | "Query failed on events: ..." |
## Catching errors

```typescript
import { QueryModeError } from "querymode"

try {
  await qm.table("missing.lance").collect()
} catch (err) {
  if (err instanceof QueryModeError) {
    switch (err.code) {
      case "TABLE_NOT_FOUND":
        console.log("Table doesn't exist:", err.message)
        break
      case "MEMORY_EXCEEDED":
        console.log("Try adding filters or reducing projections")
        break
      default:
        console.log(`${err.code}: ${err.message}`)
    }
    // Original error is preserved
    if (err.cause) console.log("Caused by:", err.cause)
  }
}
```

## Error wrapping

`QueryModeError.from()` wraps any error with context:

```typescript
try {
  await riskyOperation()
} catch (err) {
  throw QueryModeError.from(err, { table: "events", operation: "scan" })
  // Automatically classifies: ENOENT → TABLE_NOT_FOUND,
  // "footer" in message → INVALID_FORMAT, "OOM" → MEMORY_EXCEEDED, etc.
}
```

Already-wrapped `QueryModeError` instances pass through unchanged.
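The classification rules described above could look roughly like this. This is a hedged sketch, not QueryMode's actual implementation: the class name `SketchError`, the message template, and the exact match strings beyond those named in the docs are all assumptions.

```typescript
// Hypothetical sketch of the QueryModeError.from() classification pattern.
// Only the codes and the ENOENT/"footer"/"OOM" triggers come from the docs
// above; everything else here is assumed for illustration.
type ErrorCode = "TABLE_NOT_FOUND" | "INVALID_FORMAT" | "MEMORY_EXCEEDED" | "QUERY_FAILED"

class SketchError extends Error {
  code: ErrorCode
  cause?: unknown

  constructor(code: ErrorCode, message: string, cause?: unknown) {
    super(message)
    this.code = code
    if (cause !== undefined) this.cause = cause
  }

  static from(err: unknown, ctx: { table: string; operation: string }): SketchError {
    // Already-wrapped errors pass through unchanged
    if (err instanceof SketchError) return err
    const msg = err instanceof Error ? err.message : String(err)
    let code: ErrorCode = "QUERY_FAILED" // catch-all for unclassified errors
    if (msg.includes("ENOENT")) code = "TABLE_NOT_FOUND"
    else if (msg.includes("footer")) code = "INVALID_FORMAT"
    else if (msg.includes("OOM")) code = "MEMORY_EXCEEDED"
    return new SketchError(code, `${ctx.operation} failed on ${ctx.table}: ${msg}`, err)
  }
}
```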
## SQL parse errors

SQL syntax errors throw a standard `Error` (not a `QueryModeError`) with the parse position:

```typescript
try {
  await qm.sql("SELECT FROM").collect()
} catch (err) {
  // "Expected column or expression at position 7"
  console.log(err.message)
}
```

docs/src/content/docs/sql.mdx

Lines changed: 82 additions & 0 deletions

@@ -123,6 +123,88 @@ SELECT * FROM images WHERE embedding NEAR [0.1, 0.2, 0.3] TOPK 10

The `NEAR` operator performs vector similarity search on the specified column. `TOPK` limits results to the K nearest neighbors. Uses IVF-PQ index when available, falls back to flat SIMD scan.

## NULL semantics

QueryMode follows SQL three-valued logic. Expressions involving NULL propagate NULL rather than returning true or false:

```sql
-- NULL comparisons → NULL (row excluded from results)
SELECT * FROM users WHERE age = NULL   -- no rows (use IS NULL instead)
SELECT * FROM users WHERE NULL > 5     -- no rows

-- AND with NULL
SELECT * FROM t WHERE NULL AND true    -- NULL (excluded)
SELECT * FROM t WHERE NULL AND false   -- false (excluded)

-- OR with NULL
SELECT * FROM t WHERE NULL OR true     -- true (included)
SELECT * FROM t WHERE NULL OR false    -- NULL (excluded)

-- NOT IN with NULL elements
SELECT * FROM t WHERE id NOT IN (1, 2, NULL)   -- NULL for all rows (per SQL standard)

-- BETWEEN with NULL
SELECT * FROM t WHERE NULL BETWEEN 1 AND 10    -- NULL (excluded)

-- IS NULL / IS NOT NULL (never return NULL)
SELECT * FROM t WHERE email IS NULL    -- works correctly
```

Aggregates also follow SQL NULL rules:

| Expression | Result |
|-----------|--------|
| `SUM(col)` where all values are NULL | NULL |
| `COUNT(col)` where all values are NULL | 0 |
| `COUNT(*)` | counts all rows, NULL or not |
| `MIN(col)` / `MAX(col)` on empty group | NULL |
| `AVG(col)` where all values are NULL | NULL |
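The propagation rules above can be modeled with a nullable boolean, where `null` stands for SQL NULL ("unknown"). This is an illustrative sketch, not QueryMode's actual evaluator:

```typescript
// Three-valued logic: null stands for SQL NULL ("unknown").
type Bool3 = boolean | null

// AND is false if either side is false; otherwise null if either is null.
function and3(a: Bool3, b: Bool3): Bool3 {
  if (a === false || b === false) return false
  if (a === null || b === null) return null
  return true
}

// OR is true if either side is true; otherwise null if either is null.
function or3(a: Bool3, b: Bool3): Bool3 {
  if (a === true || b === true) return true
  if (a === null || b === null) return null
  return false
}

// A WHERE clause keeps a row only when the predicate is exactly true,
// which is why NULL results exclude the row.
const keepsRow = (p: Bool3) => p === true
```

Note that `NULL AND false` is `false` (not `NULL`) because a definite `false` on either side decides the conjunction regardless of the unknown; symmetrically, `NULL OR true` is `true`.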
## Type coercion

Comparisons between different types follow these rules:

| Left type | Right type | Behavior |
|-----------|------------|----------|
| number | number | Direct comparison |
| bigint | number | number promoted to bigint (when integer) |
| string | number | Numeric comparison if string is numeric, else string comparison |
| any | NULL | Result is NULL |

`CAST` converts between types explicitly:

```sql
SELECT CAST(age AS text) AS age_str    -- number → string
SELECT CAST('42' AS int) AS age        -- string → number
SELECT CAST(id AS bigint) AS big_id    -- number → bigint
```
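The coercion table above could be expressed as a comparison helper along these lines. This is a sketch under the stated rules only; the function name and return convention (`-1`/`0`/`1`, or `null` for SQL NULL) are assumptions, not the engine's actual code:

```typescript
// Hypothetical comparator implementing the coercion table above.
// Returns -1/0/1, or null when either operand is NULL.
function compare3(a: any, b: any): number | null {
  if (a === null || b === null) return null // any vs NULL → NULL

  // bigint vs number: promote the number when it is an integer
  if (typeof a === "bigint" && typeof b === "number" && Number.isInteger(b)) b = BigInt(b)
  else if (typeof b === "bigint" && typeof a === "number" && Number.isInteger(a)) a = BigInt(a)

  // string vs number: numeric comparison if the string parses as a number,
  // else fall back to string comparison
  if (typeof a === "string" && typeof b === "number") {
    const n = Number(a)
    if (!Number.isNaN(n)) a = n
    else b = String(b)
  } else if (typeof b === "string" && typeof a === "number") {
    const n = Number(b)
    if (!Number.isNaN(n)) b = n
    else a = String(a)
  }

  return a < b ? -1 : a > b ? 1 : 0
}
```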
## Operator precedence

From highest to lowest:

1. Parentheses `()`
2. Unary `NOT`, `-`
3. Multiplication `*`, division `/`, modulo `%`
4. Addition `+`, subtraction `-`
5. Comparison `=`, `!=`, `<>`, `<`, `>`, `<=`, `>=`
6. `IS NULL`, `IS NOT NULL`, `BETWEEN`, `IN`, `LIKE`
7. `AND`
8. `OR`

```sql
-- AND binds tighter than OR
WHERE a = 1 OR b = 2 AND c = 3
-- is parsed as: WHERE a = 1 OR (b = 2 AND c = 3)
```
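One common way to encode a table like this is a numeric binding-power map, as a precedence-climbing parser would use. A sketch under the ordering above; the map and its name are hypothetical, not the parser's internals:

```typescript
// Higher number = binds tighter. Levels mirror the list above
// (parentheses and unary operators are handled separately by a parser).
const PRECEDENCE: Record<string, number> = {
  "OR": 1,
  "AND": 2,
  "IS NULL": 3, "IS NOT NULL": 3, "BETWEEN": 3, "IN": 3, "LIKE": 3,
  "=": 4, "!=": 4, "<>": 4, "<": 4, ">": 4, "<=": 4, ">=": 4,
  "+": 5, "-": 5,
  "*": 6, "/": 6, "%": 6,
}

// AND binds tighter than OR, so `a = 1 OR b = 2 AND c = 3`
// groups as `a = 1 OR (b = 2 AND c = 3)`.
const andBindsTighter = PRECEDENCE["AND"] > PRECEDENCE["OR"]
```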
## Comments

```sql
-- Line comment (ignored)
SELECT * FROM users /* block comment */ WHERE age > 25
```

## How it works

src/columnar.ts

Lines changed: 2 additions & 79 deletions

```diff
@@ -420,85 +420,8 @@ export function concatQMCBBatches(batches: ArrayBuffer[]): ArrayBuffer | null {
   const decoded = batches.map(b => decodeColumnarBatch(b)).filter((b): b is ColumnarBatch => b !== null);
   if (decoded.length === 0) return null;

-  const totalRows = decoded.reduce((s, b) => s + b.rowCount, 0);
-  const numCols = decoded[0].columns.length;
-
-  const columns: ColumnarColumn[] = [];
-
-  for (let ci = 0; ci < numCols; ci++) {
-    const dtype = decoded[0].columns[ci].dtype;
-    const name = decoded[0].columns[ci].name;
-    const bpe = bytesPerElement(dtype);
-
-    if (bpe > 0) {
-      // Fixed-width numeric: memcpy concat
-      const buf = new ArrayBuffer(totalRows * bpe);
-      const out = new Uint8Array(buf);
-      let offset = 0;
-      for (const batch of decoded) {
-        const src = new Uint8Array(batch.columns[ci].data);
-        out.set(src, offset);
-        offset += src.length;
-      }
-      columns.push({ name, dtype, data: buf, rowCount: totalRows });
-    } else if (dtype === DTYPE_BOOL) {
-      const buf = new ArrayBuffer(Math.ceil(totalRows / 8));
-      const out = new Uint8Array(buf);
-      let row = 0;
-      for (const batch of decoded) {
-        const src = new Uint8Array(batch.columns[ci].data);
-        for (let r = 0; r < batch.rowCount; r++) {
-          if (src[r >> 3] & (1 << (r & 7))) out[row >> 3] |= 1 << (row & 7);
-          row++;
-        }
-      }
-      columns.push({ name, dtype, data: buf, rowCount: totalRows });
-    } else if (dtype === DTYPE_UTF8) {
-      let totalStrLen = 0;
-      for (const batch of decoded) {
-        const col = batch.columns[ci];
-        totalStrLen += col.offsets ? col.offsets[batch.rowCount] : col.data.byteLength;
-      }
-
-      const offsets = new Uint32Array(totalRows + 1);
-      const strBuf = new Uint8Array(totalStrLen);
-      let strOffset = 0;
-      let row = 0;
-
-      for (const batch of decoded) {
-        const col = batch.columns[ci];
-        const srcOffsets = col.offsets!;
-        const srcData = new Uint8Array(col.data);
-        for (let r = 0; r < batch.rowCount; r++) {
-          offsets[row] = strOffset;
-          const start = srcOffsets[r];
-          const end = srcOffsets[r + 1];
-          if (end > start) {
-            strBuf.set(srcData.subarray(start, end), strOffset);
-            strOffset += end - start;
-          }
-          row++;
-        }
-      }
-      offsets[totalRows] = strOffset;
-      columns.push({ name, dtype, data: (strBuf.buffer as ArrayBuffer).slice(0, strOffset), rowCount: totalRows, offsets });
-    } else if (dtype === DTYPE_F32VEC) {
-      const dim = decoded[0].columns[ci].vectorDim || 0;
-      const buf = new ArrayBuffer(totalRows * dim * 4);
-      const out = new Uint8Array(buf);
-      let offset = 0;
-      for (const batch of decoded) {
-        const src = new Uint8Array(batch.columns[ci].data);
-        out.set(src, offset);
-        offset += src.length;
-      }
-      columns.push({ name, dtype, data: buf, rowCount: totalRows, vectorDim: dim });
-    } else {
-      columns.push({ name, dtype, data: new ArrayBuffer(0), rowCount: totalRows });
-    }
-  }
-
-  return encodeColumnarBatch({ columns, rowCount: totalRows });
+  const merged = concatColumnarBatches(decoded);
+  return merged ? encodeColumnarBatch(merged) : null;
 }

 // ============================================================================
```
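The deleted `DTYPE_BOOL` branch above is the subtle part of the old code: bit-packed booleans can't be concatenated with a plain `memcpy` because each batch's packing restarts at bit 0. Isolated as a standalone helper, the idea looks like this (a sketch extracted from the deleted code, not the library's API):

```typescript
// Concatenate bit-packed boolean columns (LSB-first within each byte),
// re-aligning bits since each batch's packing restarts at bit 0.
function concatPackedBools(parts: { bits: Uint8Array; rowCount: number }[]): Uint8Array {
  const totalRows = parts.reduce((s, p) => s + p.rowCount, 0)
  const out = new Uint8Array(Math.ceil(totalRows / 8))
  let row = 0
  for (const { bits, rowCount } of parts) {
    for (let r = 0; r < rowCount; r++) {
      // Read bit r of the source batch, write it at the global row position
      if (bits[r >> 3] & (1 << (r & 7))) out[row >> 3] |= 1 << (row & 7)
      row++
    }
  }
  return out
}
```

Fixed-width numeric and `f32` vector columns, by contrast, are byte-aligned, which is why the deleted code (and presumably `concatColumnarBatches`) can concatenate them with straight `Uint8Array.set` copies.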
