Skip to content

Commit bf63ec5

Browse files
sonhmaiDandandan
authored and committed
doc: add example of RowFilter usage (apache#9115)
# Which issue does this PR close? - Closes apache#9096. # Rationale for this change The RowFilter API exists and can evaluate predicates while decoding, but it has no examples. # What changes are included in this PR? - Added a rustdoc example and blog link to `ParquetRecordBatchReaderBuilder::with_row_filter`. - Added a running example in `parquet/examples/read_with_row_filter.rs` # Are these changes tested? Yes ``` cargo run -p parquet --example read_with_row_filter cargo test -p parquet --doc ``` # Are there any user-facing changes? Yes, doc only. No API changes.
1 parent ac1afae commit bf63ec5

File tree

3 files changed

+84
-0
lines changed

3 files changed

+84
-0
lines changed

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,11 @@ name = "async_read_parquet"
156156
required-features = ["arrow", "async"]
157157
path = "./examples/async_read_parquet.rs"
158158

159+
[[example]]
160+
name = "read_with_row_filter"
161+
required-features = ["arrow"]
162+
path = "./examples/read_with_row_filter.rs"
163+
159164
[[example]]
160165
name = "read_with_rowgroup"
161166
required-features = ["arrow", "async"]
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_array::Int32Array;
19+
use arrow_cast::pretty::print_batches;
20+
use parquet::arrow::ProjectionMask;
21+
use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
22+
use parquet::errors::Result;
23+
use std::fs::File;
24+
25+
// RowFilter / with_row_filter usage. For background and more
26+
// context, see <https://arrow.apache.org/blog/2025/12/11/parquet-late-materialization-deep-dive/>
27+
fn main() -> Result<()> {
28+
let testdata = arrow::util::test_util::parquet_test_data();
29+
let path = format!("{testdata}/alltypes_plain.parquet");
30+
let file = File::open(&path)?;
31+
let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
32+
let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
33+
34+
// Create predicate: column id > 4. This col has index 0.
35+
// Projection mask ensures only predicate columns are read to evaluate the filter.
36+
let projection_mask = ProjectionMask::leaves(&schema_desc, [0]);
37+
let predicate = ArrowPredicateFn::new(projection_mask, |batch| {
38+
let id_col = batch.column(0);
39+
arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
40+
});
41+
42+
let row_filter = RowFilter::new(vec![Box::new(predicate)]);
43+
let reader = builder.with_row_filter(row_filter).build()?;
44+
45+
let filtered_batches: Vec<_> = reader.map(|b| b.unwrap()).collect();
46+
print_batches(&filtered_batches)?;
47+
48+
Ok(())
49+
}

parquet/src/arrow/arrow_reader/mod.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,36 @@ impl<T> ArrowReaderBuilder<T> {
304304
///
305305
/// It is recommended to enable reading the page index if using this functionality, to allow
306306
/// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
307+
///
308+
/// For a running example see `parquet/examples/read_with_row_filter.rs`.
309+
/// See <https://arrow.apache.org/blog/2025/12/11/parquet-late-materialization-deep-dive/>
310+
/// for a technical explanation of late materialization.
311+
///
312+
/// # Example
313+
/// ```rust
314+
/// # use std::fs::File;
315+
/// # use arrow_array::Int32Array;
316+
/// # use parquet::arrow::ProjectionMask;
317+
/// # use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
318+
/// # fn main() -> Result<(), parquet::errors::ParquetError> {
319+
/// # let testdata = arrow::util::test_util::parquet_test_data();
320+
/// # let path = format!("{testdata}/alltypes_plain.parquet");
321+
/// # let file = File::open(&path)?;
322+
/// let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
323+
/// let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
324+
///
325+
/// // Create predicate: column id > 4. This col has index 0.
326+
/// let projection = ProjectionMask::leaves(&schema_desc, [0]);
327+
/// let predicate = ArrowPredicateFn::new(projection, |batch| {
328+
/// let id_col = batch.column(0);
329+
/// arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
330+
/// });
331+
///
332+
/// let row_filter = RowFilter::new(vec![Box::new(predicate)]);
333+
/// let _reader = builder.with_row_filter(row_filter).build()?;
334+
/// # Ok(())
335+
/// # }
336+
/// ```
307337
pub fn with_row_filter(self, filter: RowFilter) -> Self {
308338
Self {
309339
filter: Some(filter),

0 commit comments

Comments
 (0)