Skip to content

Commit fdf96a4

Browse files
committed
fix: append null bitmap support — stop coercing null int/float/string to 0/""
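rowsToColumnArrays now builds a validity bitmap (bit set = valid) for columns containing null/undefined values instead of silently treating them as real 0, 0n, or "" values. Null slots still hold a placeholder in the data buffer (0, 0n, "", or an unset bit for bool), but the bitmap marks those rows invalid; the bitmap is omitted when a column has no nulls. buildFragment copies the bitmap into WASM memory and passes its pointer to the WASM fragmentAdd*Column exports (previously always 0). FragmentColumn gains a nullBitmap?: Uint8Array field. Capacity estimate includes bitmap bytes. 8 new tests cover every dtype rowsToColumnArrays emits (int64 from number and from bigint, float64, utf8, bool) plus the no-null, all-null, and mixed-column cases.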
1 parent b3b9802 commit fdf96a4

File tree

2 files changed

+121
-16
lines changed

2 files changed

+121
-16
lines changed

src/wasm-engine.integration.test.ts

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* Skips gracefully if binary not found.
55
*/
66
import { describe, it, expect, beforeAll } from "vitest";
7-
import { instantiateWasm, queryToSql, type WasmEngine } from "./wasm-engine.js";
7+
import { instantiateWasm, queryToSql, rowsToColumnArrays, type WasmEngine } from "./wasm-engine.js";
88
import type { QueryDescriptor } from "./client.js";
99
import type { PageInfo } from "./types.js";
1010
import * as fs from "node:fs/promises";
@@ -408,3 +408,85 @@ describe("queryToSql", () => {
408408
expect(sql).toContain("TOPK 10");
409409
});
410410
});
411+
412+
// Pure-TS tests — no WASM binary required
413+
describe("rowsToColumnArrays", () => {
414+
it("builds null bitmap for int64 columns with null values", () => {
415+
const rows = [{ x: 10 }, { x: null }, { x: 30 }];
416+
const cols = rowsToColumnArrays(rows);
417+
expect(cols.length).toBe(1);
418+
expect(cols[0].dtype).toBe("int64");
419+
expect(cols[0].nullBitmap).toBeDefined();
420+
// Validity bitmap: row 0 valid, row 1 null, row 2 valid → bits 0,2 set = 0b101 = 5
421+
expect(cols[0].nullBitmap![0]).toBe(0b00000101);
422+
// Null positions have 0n in data buffer (bitmap marks them invalid)
423+
const data = new BigInt64Array(cols[0].values);
424+
expect(data[0]).toBe(10n);
425+
expect(data[1]).toBe(0n);
426+
expect(data[2]).toBe(30n);
427+
});
428+
429+
it("builds null bitmap for float64 columns with null values", () => {
430+
const rows = [{ v: 1.5 }, { v: undefined }, { v: 3.5 }];
431+
const cols = rowsToColumnArrays(rows);
432+
expect(cols[0].dtype).toBe("float64");
433+
expect(cols[0].nullBitmap).toBeDefined();
434+
expect(cols[0].nullBitmap![0]).toBe(0b00000101);
435+
});
436+
437+
it("builds null bitmap for string columns with null values", () => {
438+
const rows = [{ s: "hello" }, { s: null }, { s: "world" }];
439+
const cols = rowsToColumnArrays(rows);
440+
expect(cols[0].dtype).toBe("utf8");
441+
expect(cols[0].nullBitmap).toBeDefined();
442+
expect(cols[0].nullBitmap![0]).toBe(0b00000101);
443+
});
444+
445+
it("builds null bitmap for bool columns with null values", () => {
446+
const rows = [{ b: true }, { b: null }, { b: false }];
447+
const cols = rowsToColumnArrays(rows);
448+
expect(cols[0].dtype).toBe("bool");
449+
expect(cols[0].nullBitmap).toBeDefined();
450+
expect(cols[0].nullBitmap![0]).toBe(0b00000101);
451+
});
452+
453+
it("builds null bitmap for bigint columns with null values", () => {
454+
const rows = [{ n: 100n }, { n: null }, { n: 300n }];
455+
const cols = rowsToColumnArrays(rows);
456+
expect(cols[0].dtype).toBe("int64");
457+
expect(cols[0].nullBitmap).toBeDefined();
458+
expect(cols[0].nullBitmap![0]).toBe(0b00000101);
459+
});
460+
461+
it("omits null bitmap when all values are non-null", () => {
462+
const rows = [{ x: 1 }, { x: 2 }, { x: 3 }];
463+
const cols = rowsToColumnArrays(rows);
464+
expect(cols[0].nullBitmap).toBeUndefined();
465+
});
466+
467+
it("handles all-null columns by skipping (no non-null sample)", () => {
468+
const rows = [{ x: null }, { x: null }];
469+
const cols = rowsToColumnArrays(rows);
470+
// Column skipped because no non-null sample to infer type
471+
expect(cols.length).toBe(0);
472+
});
473+
474+
it("handles mixed types in same row — each column gets own bitmap", () => {
475+
const rows = [
476+
{ id: 1, name: "alice", score: 95.0 },
477+
{ id: null, name: "bob", score: null },
478+
{ id: 3, name: null, score: 85.0 },
479+
];
480+
const cols = rowsToColumnArrays(rows);
481+
const idCol = cols.find(c => c.name === "id")!;
482+
const nameCol = cols.find(c => c.name === "name")!;
483+
const scoreCol = cols.find(c => c.name === "score")!;
484+
485+
// id: valid at 0,2 → 0b101
486+
expect(idCol.nullBitmap![0]).toBe(0b00000101);
487+
// name: valid at 0,1 → 0b011
488+
expect(nameCol.nullBitmap![0]).toBe(0b00000011);
489+
// score: valid at 0,2 → 0b101
490+
expect(scoreCol.nullBitmap![0]).toBe(0b00000101);
491+
});
492+
});

src/wasm-engine.ts

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -814,9 +814,9 @@ export class WasmEngine {
814814
* Returns the raw Lance binary bytes ready to write to R2/disk.
815815
*/
816816
buildFragment(columns: FragmentColumn[]): Uint8Array {
817-
// Estimate capacity: sum of all values plus overhead
817+
// Estimate capacity: sum of all values + null bitmaps + overhead
818818
let totalBytes = 0;
819-
for (const col of columns) totalBytes += col.values.byteLength;
819+
for (const col of columns) totalBytes += col.values.byteLength + (col.nullBitmap?.byteLength ?? 0);
820820
const capacity = totalBytes + columns.length * 256 + 4096; // metadata overhead
821821

822822
if (!this.exports.fragmentBegin(capacity)) {
@@ -827,6 +827,15 @@ export class WasmEngine {
827827
const { ptr: namePtr, len: nameLen } = this.writeString(col.name);
828828
if (!namePtr) throw new Error(`WASM OOM writing column name "${col.name}"`);
829829

830+
// Write null bitmap to WASM if present
831+
let nullablePtr = 0;
832+
if (col.nullBitmap) {
833+
nullablePtr = this.safeAlloc(col.nullBitmap.byteLength);
834+
if (!nullablePtr) throw new Error("WASM OOM writing null bitmap");
835+
new Uint8Array(this.exports.memory.buffer, nullablePtr, col.nullBitmap.byteLength)
836+
.set(col.nullBitmap);
837+
}
838+
830839
let result = 0;
831840
switch (col.dtype) {
832841
case "int64": {
@@ -835,7 +844,7 @@ export class WasmEngine {
835844
if (!dataPtr) throw new Error("WASM OOM");
836845
new Uint8Array(this.exports.memory.buffer, dataPtr, col.values.byteLength)
837846
.set(new Uint8Array(col.values instanceof ArrayBuffer ? col.values : col.values.slice(0)));
838-
result = this.exports.fragmentAddInt64Column(namePtr, nameLen, dataPtr, i64.length, 0);
847+
result = this.exports.fragmentAddInt64Column(namePtr, nameLen, dataPtr, i64.length, nullablePtr);
839848
break;
840849
}
841850
case "int32": {
@@ -844,7 +853,7 @@ export class WasmEngine {
844853
if (!dataPtr) throw new Error("WASM OOM");
845854
new Uint8Array(this.exports.memory.buffer, dataPtr, col.values.byteLength)
846855
.set(new Uint8Array(col.values instanceof ArrayBuffer ? col.values : col.values.slice(0)));
847-
result = this.exports.fragmentAddInt32Column(namePtr, nameLen, dataPtr, i32.length, 0);
856+
result = this.exports.fragmentAddInt32Column(namePtr, nameLen, dataPtr, i32.length, nullablePtr);
848857
break;
849858
}
850859
case "float64": {
@@ -853,7 +862,7 @@ export class WasmEngine {
853862
if (!dataPtr) throw new Error("WASM OOM");
854863
new Uint8Array(this.exports.memory.buffer, dataPtr, col.values.byteLength)
855864
.set(new Uint8Array(col.values instanceof ArrayBuffer ? col.values : col.values.slice(0)));
856-
result = this.exports.fragmentAddFloat64Column(namePtr, nameLen, dataPtr, f64.length, 0);
865+
result = this.exports.fragmentAddFloat64Column(namePtr, nameLen, dataPtr, f64.length, nullablePtr);
857866
break;
858867
}
859868
case "float32": {
@@ -862,7 +871,7 @@ export class WasmEngine {
862871
if (!dataPtr) throw new Error("WASM OOM");
863872
new Uint8Array(this.exports.memory.buffer, dataPtr, col.values.byteLength)
864873
.set(new Uint8Array(col.values instanceof ArrayBuffer ? col.values : col.values.slice(0)));
865-
result = this.exports.fragmentAddFloat32Column(namePtr, nameLen, dataPtr, f32.length, 0);
874+
result = this.exports.fragmentAddFloat32Column(namePtr, nameLen, dataPtr, f32.length, nullablePtr);
866875
break;
867876
}
868877
case "utf8": case "string": {
@@ -901,7 +910,7 @@ export class WasmEngine {
901910
new Uint8Array(this.exports.memory.buffer, offsetsPtr, offsets.byteLength)
902911
.set(new Uint8Array(offsets.buffer));
903912
result = this.exports.fragmentAddStringColumn(
904-
namePtr, nameLen, dataPtr, totalStrBytes, offsetsPtr / 4, count, 0,
913+
namePtr, nameLen, dataPtr, totalStrBytes, offsetsPtr / 4, count, nullablePtr,
905914
);
906915
break;
907916
}
@@ -913,7 +922,7 @@ export class WasmEngine {
913922
const byteCount = col.values.byteLength;
914923
// rowCount must be provided for exact count; byteCount * 8 overestimates when rows % 8 != 0
915924
const rowCount = col.rowCount ?? byteCount * 8;
916-
result = this.exports.fragmentAddBoolColumn(namePtr, nameLen, dataPtr, byteCount, rowCount, 0);
925+
result = this.exports.fragmentAddBoolColumn(namePtr, nameLen, dataPtr, byteCount, rowCount, nullablePtr);
917926
break;
918927
}
919928
default:
@@ -1041,6 +1050,8 @@ export interface FragmentColumn {
10411050
values: ArrayBufferLike;
10421051
/** Required for bool columns (byteCount*8 overestimates when rows%8!=0). */
10431052
rowCount?: number;
1053+
/** Validity bitmap — bit i set means row i is valid (non-null). Absent = all valid. */
1054+
nullBitmap?: Uint8Array;
10441055
}
10451056

10461057
/**
@@ -1056,33 +1067,45 @@ export function rowsToColumnArrays(rows: Record<string, unknown>[]): FragmentCol
10561067
const sample = rows.find(r => r[colName] != null)?.[colName];
10571068
if (sample === undefined) continue;
10581069

1070+
// Build null bitmap for columns with null/undefined values
1071+
const bitmapBytes = Math.ceil(rows.length / 8);
1072+
let hasNull = false;
1073+
const nullBitmap = new Uint8Array(bitmapBytes);
1074+
for (let i = 0; i < rows.length; i++) {
1075+
if (rows[i][colName] != null) {
1076+
nullBitmap[i >> 3] |= 1 << (i & 7); // bit set = valid
1077+
} else {
1078+
hasNull = true;
1079+
}
1080+
}
1081+
10591082
if (typeof sample === "number") {
10601083
if (Number.isInteger(sample)) {
10611084
const arr = new BigInt64Array(rows.length);
1062-
for (let i = 0; i < rows.length; i++) { const v = rows[i][colName]; arr[i] = typeof v === "bigint" ? v as bigint : BigInt(Math.trunc(Number(v ?? 0))); }
1063-
result.push({ name: colName, dtype: "int64", values: arr.buffer });
1085+
for (let i = 0; i < rows.length; i++) { const v = rows[i][colName]; arr[i] = v != null ? (typeof v === "bigint" ? v as bigint : BigInt(Math.trunc(Number(v)))) : 0n; }
1086+
result.push({ name: colName, dtype: "int64", values: arr.buffer, ...(hasNull && { nullBitmap }) });
10641087
} else {
10651088
const arr = new Float64Array(rows.length);
10661089
for (let i = 0; i < rows.length; i++) { const v = rows[i][colName]; arr[i] = v != null ? v as number : 0; }
1067-
result.push({ name: colName, dtype: "float64", values: arr.buffer });
1090+
result.push({ name: colName, dtype: "float64", values: arr.buffer, ...(hasNull && { nullBitmap }) });
10681091
}
10691092
} else if (typeof sample === "bigint") {
10701093
const arr = new BigInt64Array(rows.length);
10711094
for (let i = 0; i < rows.length; i++) { const v = rows[i][colName]; arr[i] = v != null ? v as bigint : 0n; }
1072-
result.push({ name: colName, dtype: "int64", values: arr.buffer });
1095+
result.push({ name: colName, dtype: "int64", values: arr.buffer, ...(hasNull && { nullBitmap }) });
10731096
} else if (typeof sample === "boolean") {
10741097
const byteCount = Math.ceil(rows.length / 8);
10751098
const boolBuf = new Uint8Array(byteCount);
10761099
for (let i = 0; i < rows.length; i++) {
10771100
if (rows[i][colName]) boolBuf[i >> 3] |= 1 << (i & 7);
10781101
}
1079-
result.push({ name: colName, dtype: "bool", values: boolBuf.buffer, rowCount: rows.length });
1102+
result.push({ name: colName, dtype: "bool", values: boolBuf.buffer, rowCount: rows.length, ...(hasNull && { nullBitmap }) });
10801103
} else if (typeof sample === "string") {
10811104
const enc = textEncoder;
10821105
const encoded: Uint8Array[] = new Array(rows.length);
10831106
let totalLen = 0;
10841107
for (let i = 0; i < rows.length; i++) {
1085-
encoded[i] = enc.encode(String(rows[i][colName] ?? ""));
1108+
encoded[i] = enc.encode(rows[i][colName] != null ? String(rows[i][colName]) : "");
10861109
totalLen += 4 + encoded[i].length;
10871110
}
10881111
const buf = new Uint8Array(totalLen);
@@ -1094,7 +1117,7 @@ export function rowsToColumnArrays(rows: Record<string, unknown>[]): FragmentCol
10941117
buf.set(encoded[i], off);
10951118
off += encoded[i].length;
10961119
}
1097-
result.push({ name: colName, dtype: "utf8", values: buf.buffer });
1120+
result.push({ name: colName, dtype: "utf8", values: buf.buffer, ...(hasNull && { nullBitmap }) });
10981121
}
10991122
}
11001123
return result;

0 commit comments

Comments
 (0)