Skip to content

Commit 5d970b0

Browse files
authored
Inline thrift readFieldBegin (#159)
1 parent 56bc3ed commit 5d970b0

File tree

4 files changed

+48
-85
lines changed

4 files changed

+48
-85
lines changed

package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
"dataset",
1111
"hyperparam",
1212
"hyparquet",
13+
"geoparquet",
14+
"llm",
1315
"ml",
1416
"parquet",
1517
"parquetjs",
@@ -55,10 +57,10 @@
5557
"test": "vitest run"
5658
},
5759
"devDependencies": {
58-
"@types/node": "25.2.3",
60+
"@types/node": "25.3.0",
5961
"@vitest/coverage-v8": "4.0.18",
6062
"eslint": "9.39.2",
61-
"eslint-plugin-jsdoc": "62.5.4",
63+
"eslint-plugin-jsdoc": "62.7.0",
6264
"hyparquet-compressors": "1.1.1",
6365
"typescript": "5.9.3",
6466
"vitest": "4.0.18"

src/thrift.js

Lines changed: 41 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
1+
/**
2+
* @import {DataReader, ThriftObject, ThriftType} from '../src/types.d.ts'
3+
*/
4+
15
// TCompactProtocol types
2-
export const CompactType = {
3-
STOP: 0,
4-
TRUE: 1,
5-
FALSE: 2,
6-
BYTE: 3,
7-
I16: 4,
8-
I32: 5,
9-
I64: 6,
10-
DOUBLE: 7,
11-
BINARY: 8,
12-
LIST: 9,
13-
SET: 10,
14-
MAP: 11,
15-
STRUCT: 12,
16-
UUID: 13,
17-
}
6+
const STOP = 0
7+
const TRUE = 1
8+
const FALSE = 2
9+
const BYTE = 3
10+
const I16 = 4
11+
const I32 = 5
12+
const I64 = 6
13+
const DOUBLE = 7
14+
const BINARY = 8
15+
const LIST = 9
16+
const STRUCT = 12
1817

1918
/**
2019
* Parse TCompactProtocol
@@ -23,20 +22,17 @@ export const CompactType = {
2322
* @returns {{ [key: `field_${number}`]: any }}
2423
*/
2524
export function deserializeTCompactProtocol(reader) {
26-
let lastFid = 0
2725
/** @type {ThriftObject} */
2826
const value = {}
27+
let fid = 0
2928

3029
while (reader.offset < reader.view.byteLength) {
3130
// Parse each field based on its type and add to the result object
32-
const [type, fid, newLastFid] = readFieldBegin(reader, lastFid)
33-
lastFid = newLastFid
34-
35-
if (type === CompactType.STOP) {
36-
break
37-
}
38-
39-
// Handle the field based on its type
31+
const byte = reader.view.getUint8(reader.offset++)
32+
const type = byte & 0x0f
33+
if (type === STOP) break
34+
const delta = byte >> 4
35+
fid = delta ? fid + delta : readZigZag(reader)
4036
value[`field_${fid}`] = readElement(reader, type)
4137
}
4238

@@ -46,73 +42,59 @@ export function deserializeTCompactProtocol(reader) {
4642
/**
4743
* Read a single element based on its type
4844
*
49-
* @import {DataReader, ThriftObject, ThriftType} from '../src/types.d.ts'
5045
* @param {DataReader} reader
5146
* @param {number} type
5247
* @returns {ThriftType}
5348
*/
5449
function readElement(reader, type) {
5550
switch (type) {
56-
case CompactType.TRUE:
51+
case TRUE:
5752
return true
58-
case CompactType.FALSE:
53+
case FALSE:
5954
return false
60-
case CompactType.BYTE:
61-
// read byte directly
55+
case BYTE:
6256
return reader.view.getInt8(reader.offset++)
63-
case CompactType.I16:
64-
case CompactType.I32:
57+
case I16:
58+
case I32:
6559
return readZigZag(reader)
66-
case CompactType.I64:
60+
case I64:
6761
return readZigZagBigInt(reader)
68-
case CompactType.DOUBLE: {
62+
case DOUBLE: {
6963
const value = reader.view.getFloat64(reader.offset, true)
7064
reader.offset += 8
7165
return value
7266
}
73-
case CompactType.BINARY: {
67+
case BINARY: {
7468
const stringLength = readVarInt(reader)
7569
const strBytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, stringLength)
7670
reader.offset += stringLength
7771
return strBytes
7872
}
79-
case CompactType.LIST: {
73+
case LIST: {
8074
const byte = reader.view.getUint8(reader.offset++)
8175
const elemType = byte & 0x0f
8276
let listSize = byte >> 4
8377
if (listSize === 15) {
8478
listSize = readVarInt(reader)
8579
}
86-
const boolType = elemType === CompactType.TRUE || elemType === CompactType.FALSE
80+
const boolType = elemType === TRUE || elemType === FALSE
8781
const values = new Array(listSize)
8882
for (let i = 0; i < listSize; i++) {
89-
values[i] = boolType ? readElement(reader, CompactType.BYTE) === 1 : readElement(reader, elemType)
83+
values[i] = boolType ? readElement(reader, BYTE) === 1 : readElement(reader, elemType)
9084
}
9185
return values
9286
}
93-
case CompactType.STRUCT: {
94-
/** @type {ThriftObject} */
95-
const structValues = {}
96-
let lastFid = 0
97-
while (true) {
98-
const [fieldType, fid, newLastFid] = readFieldBegin(reader, lastFid)
99-
lastFid = newLastFid
100-
if (fieldType === CompactType.STOP) {
101-
break
102-
}
103-
structValues[`field_${fid}`] = readElement(reader, fieldType)
104-
}
105-
return structValues
106-
}
107-
// TODO: MAP, SET, UUID
87+
case STRUCT:
88+
// main function handles struct parsing
89+
return deserializeTCompactProtocol(reader)
10890
default:
91+
// MAP, SET, UUID not used by parquet
10992
throw new Error(`thrift unhandled type: ${type}`)
11093
}
11194
}
11295

11396
/**
114-
* Var int aka Unsigned LEB128.
115-
* Reads groups of 7 low bits until high bit is 0.
97+
* Read varint aka Unsigned LEB128.
11698
*
11799
* @param {DataReader} reader
118100
* @returns {number}
@@ -121,6 +103,7 @@ export function readVarInt(reader) {
121103
let result = 0
122104
let shift = 0
123105
while (true) {
106+
// Read groups of 7 low bits until high bit is 0
124107
const byte = reader.view.getUint8(reader.offset++)
125108
result |= (byte & 0x7f) << shift
126109
if (!(byte & 0x80)) {
@@ -150,46 +133,24 @@ function readVarBigInt(reader) {
150133
}
151134

152135
/**
153-
* Values of type int32 and int64 are transformed to a zigzag int.
154-
* A zigzag int folds positive and negative numbers into the positive number space.
136+
* Read a zigzag number.
137+
* Zigzag folds positive and negative numbers into the positive number space.
155138
*
156139
* @param {DataReader} reader
157140
* @returns {number}
158141
*/
159142
export function readZigZag(reader) {
160143
const zigzag = readVarInt(reader)
161-
// convert zigzag to int
162144
return zigzag >>> 1 ^ -(zigzag & 1)
163145
}
164146

165147
/**
166-
* A zigzag int folds positive and negative numbers into the positive number space.
167-
* This version returns a BigInt.
148+
* Read a zigzag bigint.
168149
*
169150
* @param {DataReader} reader
170151
* @returns {bigint}
171152
*/
172153
export function readZigZagBigInt(reader) {
173154
const zigzag = readVarBigInt(reader)
174-
// convert zigzag to int
175155
return zigzag >> 1n ^ -(zigzag & 1n)
176156
}
177-
178-
/**
179-
* Read field type and field id
180-
*
181-
* @param {DataReader} reader
182-
* @param {number} lastFid
183-
* @returns {[number, number, number]} [type, fid, newLastFid]
184-
*/
185-
function readFieldBegin(reader, lastFid) {
186-
const byte = reader.view.getUint8(reader.offset++)
187-
const type = byte & 0x0f
188-
if (type === CompactType.STOP) {
189-
// STOP also ends a struct
190-
return [0, 0, lastFid]
191-
}
192-
const delta = byte >> 4
193-
const fid = delta ? lastFid + delta : readZigZag(reader)
194-
return [type, fid, fid]
195-
}

src/types.d.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -408,8 +408,8 @@ interface PageLocation {
408408

409409
export interface ColumnIndex {
410410
null_pages: boolean[]
411-
min_values: MinMaxType[]
412-
max_values: MinMaxType[]
411+
min_values: Uint8Array[]
412+
max_values: Uint8Array[]
413413
boundary_order: BoundaryOrder
414414
null_counts?: bigint[]
415415
repetition_level_histograms?: bigint[]

test/snappy.test.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ describe('snappy uncompress', () => {
3434
},
3535
// from datapage_v2.snappy.parquet
3636
{ compressed: [2, 4, 0, 3], expected: new Uint8Array([0, 3]) },
37-
{ compressed: [ 6, 20, 2, 0, 0, 0, 3, 23], expected: new Uint8Array([2, 0, 0, 0, 3, 23]) },
37+
{ compressed: [6, 20, 2, 0, 0, 0, 3, 23], expected: new Uint8Array([2, 0, 0, 0, 3, 23]) },
3838
]
3939

4040
for (const { compressed, expected } of testCases) {

0 commit comments

Comments
 (0)