@@ -18,33 +18,28 @@ import { flatten } from './utils.js'
1818 * @returns {AsyncRowGroup } resolves to column data
1919 */
2020export function readRowGroup ( options , { metadata } , groupPlan ) {
21- const { file, compressors, utf8 } = options
22-
2321 /** @type {AsyncColumn[] } */
2422 const asyncColumns = [ ]
25- /** @type {ParquetParsers } */
26- const parsers = { ...DEFAULT_PARSERS , ...options . parsers }
2723
2824 // read column data
29- for ( const chunkPlan of groupPlan . chunks ) {
30- const { codec , path_in_schema : pathInSchema , type } = chunkPlan . columnMetadata
25+ for ( const chunk of groupPlan . chunks ) {
26+ const { data_page_offset , dictionary_page_offset , path_in_schema : pathInSchema } = chunk . columnMetadata
3127 const schemaPath = getSchemaPath ( metadata . schema , pathInSchema )
3228 const columnDecoder = {
3329 pathInSchema,
34- type,
3530 element : schemaPath [ schemaPath . length - 1 ] . element ,
3631 schemaPath,
37- codec,
38- parsers,
39- compressors,
40- utf8,
32+ parsers : { ...DEFAULT_PARSERS , ...options . parsers } ,
33+ ...options ,
34+ ...chunk . columnMetadata ,
4135 }
36+ let { startByte, endByte } = chunk . range
4237
4338 // non-offset-index case
44- if ( ! ( 'offsetIndex' in chunkPlan ) ) {
39+ if ( ! ( 'offsetIndex' in chunk ) ) {
4540 asyncColumns . push ( {
4641 pathInSchema,
47- data : Promise . resolve ( file . slice ( chunkPlan . range . startByte , chunkPlan . range . endByte ) )
42+ data : Promise . resolve ( options . file . slice ( startByte , endByte ) )
4843 . then ( buffer => {
4944 const reader = { view : new DataView ( buffer ) , offset : 0 }
5045 return readColumn ( reader , groupPlan , columnDecoder , options . onPage )
@@ -57,37 +52,30 @@ export function readRowGroup(options, { metadata }, groupPlan) {
5752 asyncColumns . push ( {
5853 pathInSchema,
5954 // fetch offset index
60- data : Promise . resolve ( file . slice ( chunkPlan . offsetIndex . startByte , chunkPlan . offsetIndex . endByte ) )
55+ data : Promise . resolve ( options . file . slice ( chunk . offsetIndex . startByte , chunk . offsetIndex . endByte ) )
6156 . then ( async arrayBuffer => {
62- const offsetIndex = readOffsetIndex ( { view : new DataView ( arrayBuffer ) , offset : 0 } )
6357 // use offset index to read only necessary pages
6458 const { selectStart, selectEnd } = groupPlan
65- const pages = offsetIndex . page_locations
66- let startByte = NaN
67- let endByte = NaN
59+ const pages = readOffsetIndex ( { view : new DataView ( arrayBuffer ) , offset : 0 } ) . page_locations
6860 let skipped = 0
61+ // include dictionary if present, handle polars missing dictionary_page_offset
62+ const hasDict = dictionary_page_offset || data_page_offset < pages [ 0 ] . offset
6963 for ( let i = 0 ; i < pages . length ; i ++ ) {
7064 const page = pages [ i ]
7165 const pageStart = Number ( page . first_row_index )
7266 const pageEnd = i + 1 < pages . length
7367 ? Number ( pages [ i + 1 ] . first_row_index )
7468 : groupPlan . groupRows // last page extends to end of row group
7569 // check if page overlaps with [selectStart, selectEnd)
76- if ( pageStart < selectEnd && pageEnd > selectStart ) {
77- if ( Number . isNaN ( startByte ) ) {
78- startByte = Number ( page . offset )
79- skipped = pageStart
80- }
70+ if ( ! skipped && ! hasDict && pageEnd > selectStart ) {
71+ startByte = Number ( page . offset )
72+ skipped = pageStart
73+ }
74+ if ( pageStart < selectEnd ) {
8175 endByte = Number ( page . offset ) + page . compressed_page_size
8276 }
8377 }
84- // include dictionary page so readColumn can decode dictionary-encoded values
85- const dictOffset = chunkPlan . columnMetadata . dictionary_page_offset
86- if ( dictOffset !== undefined ) {
87- startByte = Number ( dictOffset )
88- skipped = 0
89- }
90- const buffer = await file . slice ( startByte , endByte )
78+ const buffer = await options . file . slice ( startByte , endByte )
9179 const reader = { view : new DataView ( buffer ) , offset : 0 }
9280 // adjust row selection for skipped pages
9381 const adjustedGroupPlan = skipped ? {
0 commit comments