Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
321a8d3
Finding where to start
sdf-jkl Jan 7, 2026
8acfb4e
Seems to work
sdf-jkl Jan 9, 2026
ed2a182
Fix
sdf-jkl Jan 9, 2026
145ecec
Merge branch 'main' into bitmask-skip-page
sdf-jkl Jan 9, 2026
5395dbf
Fix async err?
sdf-jkl Jan 9, 2026
df9a493
Fix complexity from O(n^2) to O(logn)
sdf-jkl Jan 12, 2026
014dbc9
Merge branch 'bitmask-skip-page' of https://github.com/sdf-jkl/arrow-…
sdf-jkl Jan 12, 2026
55e0126
Pass pagelocation instead offsetindexmetadata
sdf-jkl Jan 12, 2026
6919196
Fix clippy
sdf-jkl Jan 12, 2026
83dfb46
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Jan 14, 2026
87e21d9
Only add page_offsets if the policy is bitmask
sdf-jkl Jan 14, 2026
b41a94b
Add assert row values to end to end tests
sdf-jkl Jan 14, 2026
700d550
Add cursor page awarness test
sdf-jkl Jan 14, 2026
48d93af
Merge branch 'main' into bitmask-skip-page
Dandandan Jan 14, 2026
8d658b0
Add more tests
Jan 15, 2026
ee0a75f
Merge remote-tracking branch 'sdf-jkl/bitmask-skip-page' into bitmask…
Jan 15, 2026
6d35513
Merge pull request #1 from hhhizzz/bitmask-skip-page
sdf-jkl Jan 15, 2026
c1876e4
cargo fmt
sdf-jkl Jan 15, 2026
b6baf2d
Merge branch 'main' of https://github.com/sdf-jkl/arrow-rs into bitma…
sdf-jkl Jan 22, 2026
2aed549
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Jan 22, 2026
6639ac7
fix clippy
sdf-jkl Jan 22, 2026
2fb401f
Fix PageIndexPolicy::from() to Required
sdf-jkl Jan 22, 2026
bc07ac0
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Jan 23, 2026
d200ee9
Change page_aware logic
sdf-jkl Jan 23, 2026
717a1cb
Make mask use page offsets with smallest pages
sdf-jkl Jan 23, 2026
03c8bdb
cargo fmt
sdf-jkl Jan 23, 2026
1158af0
clippy
sdf-jkl Jan 23, 2026
d260f94
Use all page boundaries to cap next mask chunk
sdf-jkl Jan 25, 2026
eea54f1
fix clippy
alamb Jan 26, 2026
a4b04c9
Add documentation
alamb Jan 26, 2026
27174e3
Reduce test replication
alamb Jan 26, 2026
f619b8f
Remove redundant sync test
sdf-jkl Jan 27, 2026
0e0265f
Remove redundant async test
sdf-jkl Jan 27, 2026
b3b2851
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Jan 27, 2026
e6cae42
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Feb 3, 2026
22d938c
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Feb 5, 2026
e23ba8e
Merge branch 'bitmask-skip-page' of https://github.com/sdf-jkl/arrow-…
sdf-jkl Feb 5, 2026
9c14271
clippy
sdf-jkl Feb 5, 2026
8759692
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Feb 10, 2026
48cd54c
Add tests
sdf-jkl Feb 11, 2026
ddef11a
cargo fmt
sdf-jkl Feb 11, 2026
eebc2f2
clippy
sdf-jkl Feb 11, 2026
c6e5425
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Feb 15, 2026
290abdf
Replace all page offsets with just the skipped ones
sdf-jkl Feb 16, 2026
7563d6d
replace binary search with linear lookup
sdf-jkl Feb 16, 2026
2111012
Make MaskCursor next skipped page aware
sdf-jkl Feb 16, 2026
7f74fc6
Merge branch 'main' into bitmask-skip-page
sdf-jkl Mar 4, 2026
8776f60
remove redundant test
sdf-jkl Mar 4, 2026
16a3549
Merge branch 'main' of https://github.com/apache/arrow-rs into bitmas…
sdf-jkl Mar 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1368,12 +1368,15 @@ impl ParquetRecordBatchReader {
if batch_size == 0 {
return Ok(None);
}
let page_boundaries = self.read_plan.page_boundaries();
match self.read_plan.row_selection_cursor_mut() {
RowSelectionCursor::Mask(mask_cursor) => {
// Stream the record batch reader using contiguous segments of the selection
// mask, avoiding the need to materialize intermediate `RowSelector` ranges.
while !mask_cursor.is_empty() {
let Some(mask_chunk) = mask_cursor.next_mask_chunk(batch_size) else {
let Some(mask_chunk) =
mask_cursor.next_mask_chunk(batch_size, page_boundaries.as_deref())
else {
return Ok(None);
};

Expand Down
19 changes: 19 additions & 0 deletions parquet/src/arrow/arrow_reader/read_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use crate::errors::{ParquetError, Result};
use arrow_array::Array;
use arrow_select::filter::prep_null_mask_filter;
use std::collections::VecDeque;
use std::sync::Arc;

/// A builder for [`ReadPlan`]
#[derive(Clone, Debug)]
Expand All @@ -37,6 +38,8 @@ pub struct ReadPlanBuilder {
selection: Option<RowSelection>,
/// Policy to use when materializing the row selection
row_selection_policy: RowSelectionPolicy,
/// Precomputed page boundary row indices for mask chunking
page_boundaries: Option<Arc<[usize]>>,
}

impl ReadPlanBuilder {
Expand All @@ -46,6 +49,7 @@ impl ReadPlanBuilder {
batch_size,
selection: None,
row_selection_policy: RowSelectionPolicy::default(),
page_boundaries: None,
}
}

Expand Down Expand Up @@ -175,6 +179,12 @@ impl ReadPlanBuilder {
Ok(self)
}

/// Set page boundary rows directly for mask chunking
pub(crate) fn with_page_boundaries(mut self, boundaries: Option<Arc<[usize]>>) -> Self {
self.page_boundaries = boundaries;
self
}

/// Create the final `ReadPlan` for the scan
pub fn build(mut self) -> ReadPlan {
// If selection is empty, truncate
Expand All @@ -189,6 +199,7 @@ impl ReadPlanBuilder {
batch_size,
selection,
row_selection_policy: _,
page_boundaries: _,
} = self;

let selection = selection.map(|s| s.trim());
Expand All @@ -209,6 +220,7 @@ impl ReadPlanBuilder {
ReadPlan {
batch_size,
row_selection_cursor,
page_boundaries: self.page_boundaries,
}
}
}
Expand Down Expand Up @@ -307,6 +319,8 @@ pub struct ReadPlan {
batch_size: usize,
/// Row ranges to be selected from the data source
row_selection_cursor: RowSelectionCursor,
/// Precomputed page boundary row indices for mask chunking
page_boundaries: Option<Arc<[usize]>>,
Comment thread
sdf-jkl marked this conversation as resolved.
}

impl ReadPlan {
Expand All @@ -330,6 +344,11 @@ impl ReadPlan {
pub fn batch_size(&self) -> usize {
self.batch_size
}

/// Return a shared handle to the page boundary row indices used for mask
/// chunking, or `None` if no boundaries were set.
///
/// Only the `Arc` reference count is bumped; the underlying slice is not copied.
pub fn page_boundaries(&self) -> Option<Arc<[usize]>> {
    self.page_boundaries.as_ref().map(Arc::clone)
}
}

#[cfg(test)]
Expand Down
149 changes: 141 additions & 8 deletions parquet/src/arrow/arrow_reader/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,64 @@ impl RowSelection {
})
}

/// Returns true if selectors should be forced, preventing mask materialisation
pub(crate) fn should_force_selectors(
/// Returns row offsets for the starts of skipped pages across projected columns
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function to retrieve row offsets for skipped pages across projected columns.

fn skipped_page_row_offsets(
    &self,
    projection: &ProjectionMask,
    columns: &[OffsetIndexMetaData],
) -> Vec<usize> {
    // Collects `first_row_index` of every page (in any projected column) whose
    // byte offset does not appear as the start of a range from `scan_ranges`.
    let mut skipped_page_rows = Vec::new();

    let projected_columns = columns
        .iter()
        .enumerate()
        .filter(|(leaf_idx, _)| projection.leaf_included(*leaf_idx))
        .map(|(_, column)| column);

    for column in projected_columns {
        let locations = column.page_locations();
        if locations.is_empty() {
            continue;
        }

        let selected_ranges = self.scan_ranges(locations);
        // Walk the selected range starts in lockstep with the pages; both are
        // consumed front to back, so each range start is inspected at most once
        // per column.
        let mut selected_starts = selected_ranges.iter().map(|range| range.start).peekable();

        for page in locations {
            let page_start = page.offset as u64;

            // Discard range starts that lie before this page.
            while selected_starts
                .next_if(|&start| start < page_start)
                .is_some()
            {}

            // A range starting exactly at this page's offset marks the page as
            // selected (and is consumed); otherwise the page is skipped.
            if selected_starts.next_if_eq(&page_start).is_none() {
                skipped_page_rows.push(page.first_row_index as usize);
            }
        }
    }

    // Different columns may skip pages starting at the same row; report each
    // row offset once, in ascending order.
    skipped_page_rows.sort_unstable();
    skipped_page_rows.dedup();
    skipped_page_rows
}

/// Computes the row offsets at which skipped pages start, for use as
/// boundaries in page-aware mask chunking.
///
/// Returns `None` when no offset index is supplied, or when the computed list
/// of skipped page starts is empty.
pub(crate) fn page_aware_mask_boundaries(
    &self,
    projection: &ProjectionMask,
    offset_index: Option<&[OffsetIndexMetaData]>,
) -> Option<Vec<usize>> {
    let columns = offset_index?;
    let offsets = self.skipped_page_row_offsets(projection, columns);
    if offsets.is_empty() {
        None
    } else {
        Some(offsets)
    }
}

/// Returns true if bitmasks should be page aware
pub(crate) fn requires_page_aware_mask(
&self,
projection: &ProjectionMask,
offset_index: Option<&[OffsetIndexMetaData]>,
Expand Down Expand Up @@ -770,6 +826,9 @@ pub struct MaskCursor {
mask: BooleanBuffer,
/// Current absolute offset into the selection
position: usize,
/// Index of the next page boundary candidate. This advances monotonically
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make MaskCursor know next boundary index like the idea in sdf-jkl#2

/// as `position` advances.
next_boundary_idx: usize,
}

impl MaskCursor {
Expand All @@ -778,8 +837,13 @@ impl MaskCursor {
self.position >= self.mask.len()
}

/// Advance through the mask representation, producing the next chunk summary
pub fn next_mask_chunk(&mut self, batch_size: usize) -> Option<MaskChunk> {
/// Advance through the mask representation, producing the next chunk summary.
/// Optionally clips chunk boundaries to the next page boundary.
pub fn next_mask_chunk(
&mut self,
batch_size: usize,
page_boundaries: Option<&[usize]>,
) -> Option<MaskChunk> {
let (initial_skip, chunk_rows, selected_rows, mask_start, end_position) = {
let mask = &self.mask;

Expand All @@ -791,6 +855,7 @@ impl MaskCursor {
let mut cursor = start_position;
let mut initial_skip = 0;

// Skip unselected rows
while cursor < mask.len() && !mask.value(cursor) {
initial_skip += 1;
cursor += 1;
Expand All @@ -800,10 +865,23 @@ impl MaskCursor {
let mut chunk_rows = 0;
let mut selected_rows = 0;

// Advance until enough rows have been selected to satisfy the batch size,
// or until the mask is exhausted. This mirrors the behaviour of the legacy
// `RowSelector` queue-based iteration.
while cursor < mask.len() && selected_rows < batch_size {
let max_chunk_rows = page_boundaries
Comment thread
sdf-jkl marked this conversation as resolved.
Copy link
Copy Markdown
Contributor Author

@sdf-jkl sdf-jkl Feb 16, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This way we can avoid binary search in every MaskChunk

.and_then(|boundaries| {
while self.next_boundary_idx < boundaries.len()
&& boundaries[self.next_boundary_idx] <= mask_start
{
self.next_boundary_idx += 1;
}
boundaries
.get(self.next_boundary_idx)
.and_then(|&start| (start > mask_start).then_some(start - mask_start))
})
.unwrap_or(usize::MAX);

// Advance until enough rows have been selected to satisfy batch_size,
// the mask is exhausted, or the next page boundary is reached.
while cursor < mask.len() && selected_rows < batch_size && chunk_rows < max_chunk_rows {
// Increment counters
chunk_rows += 1;
if mask.value(cursor) {
selected_rows += 1;
Expand Down Expand Up @@ -906,6 +984,7 @@ impl RowSelectionCursor {
Self::Mask(MaskCursor {
mask: boolean_mask_from_selectors(&selectors),
position: 0,
next_boundary_idx: 0,
})
}

Expand Down Expand Up @@ -1537,6 +1616,60 @@ mod tests {
assert_eq!(ranges, vec![10..20, 20..30, 30..40]);
}

#[test]
fn test_page_aware_mask_boundaries_skipped_pages_only() {
    // Rows 10..20 and 30..50 are selected; everything else is skipped.
    let selection: RowSelection = vec![
        RowSelector::skip(10),
        RowSelector::select(10),
        RowSelector::skip(10),
        RowSelector::select(20),
        RowSelector::skip(20),
    ]
    .into();

    // Six pages; page `i` has byte offset and first row index both equal to `i * 10`.
    let page_locations: Vec<PageLocation> = (0..6i64)
        .map(|page| PageLocation {
            offset: page * 10,
            compressed_page_size: 10,
            first_row_index: page * 10,
        })
        .collect();

    let offset_index = [OffsetIndexMetaData {
        page_locations,
        unencoded_byte_array_data_bytes: None,
    }];

    let offsets =
        selection.page_aware_mask_boundaries(&ProjectionMask::all(), Some(offset_index.as_slice()));

    // Only the pages starting at rows 0, 20 and 50 are skipped by the selection.
    assert_eq!(offsets, Some(vec![0, 20, 50]));
}

#[test]
fn test_from_ranges() {
let ranges = [1..3, 4..6, 6..6, 8..8, 9..10];
Expand Down
Loading
Loading