Skip to content

Commit 2de3db0

Browse files
committed
Fix polars missing dictionary_page_offset
1 parent 706d261 commit 2de3db0

8 files changed

+163
-32
lines changed

src/plan.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns
6060
startByte: offsetIndexStart,
6161
endByte: offsetIndexStart + chunk.offset_index_length,
6262
},
63-
bounds: { startByte, endByte },
63+
range: { startByte, endByte },
6464
})
6565
} else {
6666
chunks.push({

src/rowgroup.js

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,33 +18,28 @@ import { flatten } from './utils.js'
1818
* @returns {AsyncRowGroup} resolves to column data
1919
*/
2020
export function readRowGroup(options, { metadata }, groupPlan) {
21-
const { file, compressors, utf8 } = options
22-
2321
/** @type {AsyncColumn[]} */
2422
const asyncColumns = []
25-
/** @type {ParquetParsers} */
26-
const parsers = { ...DEFAULT_PARSERS, ...options.parsers }
2723

2824
// read column data
29-
for (const chunkPlan of groupPlan.chunks) {
30-
const { codec, path_in_schema: pathInSchema, type } = chunkPlan.columnMetadata
25+
for (const chunk of groupPlan.chunks) {
26+
const { data_page_offset, dictionary_page_offset, path_in_schema: pathInSchema } = chunk.columnMetadata
3127
const schemaPath = getSchemaPath(metadata.schema, pathInSchema)
3228
const columnDecoder = {
3329
pathInSchema,
34-
type,
3530
element: schemaPath[schemaPath.length - 1].element,
3631
schemaPath,
37-
codec,
38-
parsers,
39-
compressors,
40-
utf8,
32+
parsers: { ...DEFAULT_PARSERS, ...options.parsers },
33+
...options,
34+
...chunk.columnMetadata,
4135
}
36+
let { startByte, endByte } = chunk.range
4237

4338
// non-offset-index case
44-
if (!('offsetIndex' in chunkPlan)) {
39+
if (!('offsetIndex' in chunk)) {
4540
asyncColumns.push({
4641
pathInSchema,
47-
data: Promise.resolve(file.slice(chunkPlan.range.startByte, chunkPlan.range.endByte))
42+
data: Promise.resolve(options.file.slice(startByte, endByte))
4843
.then(buffer => {
4944
const reader = { view: new DataView(buffer), offset: 0 }
5045
return readColumn(reader, groupPlan, columnDecoder, options.onPage)
@@ -57,37 +52,30 @@ export function readRowGroup(options, { metadata }, groupPlan) {
5752
asyncColumns.push({
5853
pathInSchema,
5954
// fetch offset index
60-
data: Promise.resolve(file.slice(chunkPlan.offsetIndex.startByte, chunkPlan.offsetIndex.endByte))
55+
data: Promise.resolve(options.file.slice(chunk.offsetIndex.startByte, chunk.offsetIndex.endByte))
6156
.then(async arrayBuffer => {
62-
const offsetIndex = readOffsetIndex({ view: new DataView(arrayBuffer), offset: 0 })
6357
// use offset index to read only necessary pages
6458
const { selectStart, selectEnd } = groupPlan
65-
const pages = offsetIndex.page_locations
66-
let startByte = NaN
67-
let endByte = NaN
59+
const pages = readOffsetIndex({ view: new DataView(arrayBuffer), offset: 0 }).page_locations
6860
let skipped = 0
61+
// include the dictionary page if present; Polars omits dictionary_page_offset, so also detect a dictionary when data_page_offset precedes the first page listed in the offset index
62+
const hasDict = dictionary_page_offset || data_page_offset < pages[0].offset
6963
for (let i = 0; i < pages.length; i++) {
7064
const page = pages[i]
7165
const pageStart = Number(page.first_row_index)
7266
const pageEnd = i + 1 < pages.length
7367
? Number(pages[i + 1].first_row_index)
7468
: groupPlan.groupRows // last page extends to end of row group
7569
// check if page overlaps with [selectStart, selectEnd)
76-
if (pageStart < selectEnd && pageEnd > selectStart) {
77-
if (Number.isNaN(startByte)) {
78-
startByte = Number(page.offset)
79-
skipped = pageStart
80-
}
70+
if (!skipped && !hasDict && pageEnd > selectStart) {
71+
startByte = Number(page.offset)
72+
skipped = pageStart
73+
}
74+
if (pageStart < selectEnd) {
8175
endByte = Number(page.offset) + page.compressed_page_size
8276
}
8377
}
84-
// include dictionary page so readColumn can decode dictionary-encoded values
85-
const dictOffset = chunkPlan.columnMetadata.dictionary_page_offset
86-
if (dictOffset !== undefined) {
87-
startByte = Number(dictOffset)
88-
skipped = 0
89-
}
90-
const buffer = await file.slice(startByte, endByte)
78+
const buffer = await options.file.slice(startByte, endByte)
9179
const reader = { view: new DataView(buffer), offset: 0 }
9280
// adjust row selection for skipped pages
9381
const adjustedGroupPlan = skipped ? {

src/types.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ interface ChunkFull {
458458
interface ChunkOffsetIndexed {
459459
columnMetadata: ColumnMetaData
460460
offsetIndex: ByteRange
461-
bounds: ByteRange
461+
range: ByteRange
462462
}
463463

464464
export interface ColumnDecoder {
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[
2+
[
3+
"alice",
4+
1
5+
],
6+
[
7+
"bob",
8+
2
9+
],
10+
[
11+
"charlie",
12+
3
13+
]
14+
]
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
{
2+
"version": 1,
3+
"schema": [
4+
{
5+
"name": "root",
6+
"num_children": 2
7+
},
8+
{
9+
"type": "BYTE_ARRAY",
10+
"repetition_type": "OPTIONAL",
11+
"name": "name",
12+
"converted_type": "UTF8",
13+
"logical_type": {
14+
"type": "STRING"
15+
}
16+
},
17+
{
18+
"type": "INT64",
19+
"repetition_type": "OPTIONAL",
20+
"name": "value"
21+
}
22+
],
23+
"num_rows": 3,
24+
"row_groups": [
25+
{
26+
"columns": [
27+
{
28+
"file_offset": 91,
29+
"meta_data": {
30+
"type": "BYTE_ARRAY",
31+
"encodings": [
32+
"PLAIN",
33+
"RLE",
34+
"RLE_DICTIONARY"
35+
],
36+
"path_in_schema": [
37+
"name"
38+
],
39+
"codec": "UNCOMPRESSED",
40+
"num_values": 3,
41+
"total_uncompressed_size": 87,
42+
"total_compressed_size": 87,
43+
"data_page_offset": 4,
44+
"statistics": {
45+
"null_count": 0,
46+
"max_value": "charlie",
47+
"min_value": "alice"
48+
}
49+
},
50+
"offset_index_offset": 337,
51+
"offset_index_length": 10,
52+
"column_index_offset": 279,
53+
"column_index_length": 27
54+
},
55+
{
56+
"file_offset": 226,
57+
"meta_data": {
58+
"type": "INT64",
59+
"encodings": [
60+
"PLAIN",
61+
"RLE",
62+
"RLE_DICTIONARY"
63+
],
64+
"path_in_schema": [
65+
"value"
66+
],
67+
"codec": "UNCOMPRESSED",
68+
"num_values": 3,
69+
"total_uncompressed_size": 88,
70+
"total_compressed_size": 88,
71+
"data_page_offset": 138,
72+
"statistics": {
73+
"null_count": 0,
74+
"max_value": 3,
75+
"min_value": 1
76+
}
77+
},
78+
"offset_index_offset": 347,
79+
"offset_index_length": 11,
80+
"column_index_offset": 306,
81+
"column_index_length": 31
82+
}
83+
],
84+
"total_byte_size": 175,
85+
"num_rows": 3,
86+
"file_offset": 4,
87+
"total_compressed_size": 175,
88+
"ordinal": 0
89+
}
90+
],
91+
"key_value_metadata": [
92+
{
93+
"key": "ARROW:schema",
94+
"value": "/////6kAAAAEAAAA8v///xQAAAAEAAEAAAAKAAsACAAKAAQA+P///wwAAAAIAAgAAAAEAAIAAABAAAAABAAAALT///8oAAAAEAAAAAgAAAABAgAAAAAAAPT///9AAAAAAQAAAAgACQAEAAgABQAAAHZhbHVlAAAA7P///ywAAAAgAAAAGAAAAAEUAAAQABIABAAQABEACAAAAAwAAAAAAPz///8EAAQABAAAAG5hbWUA"
95+
}
96+
],
97+
"created_by": "Polars",
98+
"metadata_length": 464
99+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[
2+
[
3+
{
4+
"page_locations": [
5+
{
6+
"offset": 44,
7+
"compressed_page_size": 47,
8+
"first_row_index": 0
9+
}
10+
]
11+
},
12+
{
13+
"page_locations": [
14+
{
15+
"offset": 175,
16+
"compressed_page_size": 51,
17+
"first_row_index": 0
18+
}
19+
]
20+
}
21+
]
22+
]
830 Bytes
Binary file not shown.

test/read.test.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,14 @@ describe('parquetRead', () => {
257257
expect(counting.bytes).toBe(892)
258258
})
259259

260+
it('uses OffsetIndex when dictionary_page_offset is missing (polars)', async () => {
261+
// Polars writes RLE_DICTIONARY columns without setting dictionary_page_offset
262+
const file = await asyncBufferFromFile('test/files/offset_index_no_dict_offset.parquet')
263+
const allRows = await parquetReadObjects({ file })
264+
const rows = await parquetReadObjects({ file, rowEnd: 1, useOffsetIndex: true })
265+
expect(rows).toEqual(allRows.slice(0, 1))
266+
})
267+
260268
it('reads only required row groups on the boundary', async () => {
261269
const originalFile = await asyncBufferFromFile('test/files/alpha.parquet')
262270
const metadata = await parquetMetadataAsync(originalFile)

0 commit comments

Comments
 (0)