From bc8e06f611f3d4b8012231b6a8d727933a6bf5a2 Mon Sep 17 00:00:00 2001 From: sonhmai <14060682+sonhmai@users.noreply.github.com> Date: Thu, 8 Jan 2026 13:38:40 +0700 Subject: [PATCH] doc: add example of RowFilter usage --- parquet/Cargo.toml | 5 +++ parquet/examples/read_with_row_filter.rs | 49 ++++++++++++++++++++++++ parquet/src/arrow/arrow_reader/mod.rs | 30 +++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 parquet/examples/read_with_row_filter.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 50f69fea5441..fa125628ce85 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -156,6 +156,11 @@ name = "async_read_parquet" required-features = ["arrow", "async"] path = "./examples/async_read_parquet.rs" +[[example]] +name = "read_with_row_filter" +required-features = ["arrow"] +path = "./examples/read_with_row_filter.rs" + [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] diff --git a/parquet/examples/read_with_row_filter.rs b/parquet/examples/read_with_row_filter.rs new file mode 100644 index 000000000000..e0efb536b418 --- /dev/null +++ b/parquet/examples/read_with_row_filter.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use arrow_array::Int32Array;
+use arrow_cast::pretty::print_batches;
+use parquet::arrow::ProjectionMask;
+use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
+use parquet::errors::Result;
+use std::fs::File;
+
+// RowFilter / with_row_filter usage. For background and more
+// context, see <https://arrow.apache.org/blog/2022/12/26/querying-parquet-with-millisecond-latency/>
+fn main() -> Result<()> {
+    let testdata = arrow::util::test_util::parquet_test_data();
+    let path = format!("{testdata}/alltypes_plain.parquet");
+    let file = File::open(&path)?;
+    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+    let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
+
+    // Create predicate: column id > 4. This col has index 0.
+    // Projection mask ensures only predicate columns are read to evaluate the filter.
+    let projection_mask = ProjectionMask::leaves(&schema_desc, [0]);
+    let predicate = ArrowPredicateFn::new(projection_mask, |batch| {
+        let id_col = batch.column(0);
+        arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
+    });
+
+    let row_filter = RowFilter::new(vec![Box::new(predicate)]);
+    let reader = builder.with_row_filter(row_filter).build()?;
+
+    let filtered_batches: Vec<_> = reader.map(|b| b.unwrap()).collect();
+    print_batches(&filtered_batches)?;
+
+    Ok(())
+}
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a626076ebdd7..459555d90a1c 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -304,6 +304,36 @@ impl ArrowReaderBuilder {
     ///
     /// It is recommended to enable reading the page index if using this functionality, to allow
     /// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
+    ///
+    /// For a running example see `parquet/examples/read_with_row_filter.rs`.
+    /// See <https://arrow.apache.org/blog/2022/12/26/querying-parquet-with-millisecond-latency/>
+    /// for a technical explanation of late materialization.
+    ///
+    /// # Example
+    /// ```rust
+    /// # use std::fs::File;
+    /// # use arrow_array::Int32Array;
+    /// # use parquet::arrow::ProjectionMask;
+    /// # use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
+    /// # fn main() -> Result<(), parquet::errors::ParquetError> {
+    /// # let testdata = arrow::util::test_util::parquet_test_data();
+    /// # let path = format!("{testdata}/alltypes_plain.parquet");
+    /// # let file = File::open(&path)?;
+    /// let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
+    /// let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
+    ///
+    /// // Create predicate: column id > 4. This col has index 0.
+    /// let projection = ProjectionMask::leaves(&schema_desc, [0]);
+    /// let predicate = ArrowPredicateFn::new(projection, |batch| {
+    ///     let id_col = batch.column(0);
+    ///     arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
+    /// });
+    ///
+    /// let row_filter = RowFilter::new(vec![Box::new(predicate)]);
+    /// let _reader = builder.with_row_filter(row_filter).build()?;
+    /// # Ok(())
+    /// # }
+    /// ```
     pub fn with_row_filter(self, filter: RowFilter) -> Self {
         Self {
             filter: Some(filter),