Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 71 additions & 21 deletions parquet/benches/arrow_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use parquet::arrow::array_reader::{
ListArrayReader, make_byte_array_reader, make_byte_view_array_reader,
make_fixed_len_byte_array_reader,
};
use parquet::arrow::arrow_reader::DEFAULT_BATCH_SIZE;
use parquet::basic::Type;
use parquet::data_type::{ByteArray, FixedLenByteArrayType};
use parquet::util::{DataPageBuilder, DataPageBuilderImpl, InMemoryPageIterator};
Expand Down Expand Up @@ -709,15 +710,23 @@ fn create_primitive_array_reader(
use parquet::arrow::array_reader::PrimitiveArrayReader;
match column_desc.physical_type() {
Type::INT32 => {
let reader =
PrimitiveArrayReader::<Int32Type>::new(Box::new(page_iterator), column_desc, None)
.unwrap();
let reader = PrimitiveArrayReader::<Int32Type>::new(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap();
Box::new(reader)
}
Type::INT64 => {
let reader =
PrimitiveArrayReader::<Int64Type>::new(Box::new(page_iterator), column_desc, None)
.unwrap();
let reader = PrimitiveArrayReader::<Int64Type>::new(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap();
Box::new(reader)
}
_ => unreachable!(),
Expand All @@ -730,9 +739,13 @@ fn create_f16_by_bytes_reader(
) -> Box<dyn ArrayReader> {
let physical_type = column_desc.physical_type();
match physical_type {
Type::FIXED_LEN_BYTE_ARRAY => {
make_fixed_len_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
}
Type::FIXED_LEN_BYTE_ARRAY => make_fixed_len_byte_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap(),
_ => unimplemented!(),
}
}
Expand All @@ -743,12 +756,20 @@ fn create_decimal_by_bytes_reader(
) -> Box<dyn ArrayReader> {
let physical_type = column_desc.physical_type();
match physical_type {
Type::BYTE_ARRAY => {
make_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
}
Type::FIXED_LEN_BYTE_ARRAY => {
make_fixed_len_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
}
Type::BYTE_ARRAY => make_byte_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap(),
Type::FIXED_LEN_BYTE_ARRAY => make_fixed_len_byte_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap(),
_ => unimplemented!(),
}
}
Expand All @@ -757,28 +778,52 @@ fn create_fixed_len_byte_array_reader(
page_iterator: impl PageIterator + 'static,
column_desc: ColumnDescPtr,
) -> Box<dyn ArrayReader> {
make_fixed_len_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
make_fixed_len_byte_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap()
}

fn create_byte_array_reader(
page_iterator: impl PageIterator + 'static,
column_desc: ColumnDescPtr,
) -> Box<dyn ArrayReader> {
make_byte_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
make_byte_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap()
}

fn create_byte_view_array_reader(
page_iterator: impl PageIterator + 'static,
column_desc: ColumnDescPtr,
) -> Box<dyn ArrayReader> {
make_byte_view_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
make_byte_view_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap()
}

fn create_string_view_byte_array_reader(
page_iterator: impl PageIterator + 'static,
column_desc: ColumnDescPtr,
) -> Box<dyn ArrayReader> {
make_byte_view_array_reader(Box::new(page_iterator), column_desc, None).unwrap()
make_byte_view_array_reader(
Box::new(page_iterator),
column_desc,
None,
DEFAULT_BATCH_SIZE,
)
.unwrap()
}

fn create_string_byte_array_dictionary_reader(
Expand All @@ -788,8 +833,13 @@ fn create_string_byte_array_dictionary_reader(
use parquet::arrow::array_reader::make_byte_array_dictionary_reader;
let arrow_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));

make_byte_array_dictionary_reader(Box::new(page_iterator), column_desc, Some(arrow_type))
.unwrap()
make_byte_array_dictionary_reader(
Box::new(page_iterator),
column_desc,
Some(arrow_type),
DEFAULT_BATCH_SIZE,
)
.unwrap()
}

fn create_string_list_reader(
Expand Down
58 changes: 48 additions & 10 deletions parquet/src/arrow/array_reader/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use crate::arrow::array_reader::{
NullArrayReader, PrimitiveArrayReader, RowGroups, StructArrayReader,
make_byte_array_dictionary_reader, make_byte_array_reader,
};
use crate::arrow::arrow_reader::DEFAULT_BATCH_SIZE;
use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics;
use crate::arrow::schema::{ParquetField, ParquetFieldType, VirtualColumnType};
use crate::basic::Type as PhysicalType;
Expand Down Expand Up @@ -96,18 +97,30 @@ pub struct ArrayReaderBuilder<'a> {
parquet_metadata: Option<&'a ParquetMetaData>,
/// metrics
metrics: &'a ArrowReaderMetrics,
/// Batch size for pre-allocating internal buffers
batch_size: usize,
}

impl<'a> ArrayReaderBuilder<'a> {
/// Create a new `ArrayReaderBuilder`
pub fn new(row_groups: &'a dyn RowGroups, metrics: &'a ArrowReaderMetrics) -> Self {
Self {
row_groups,
cache_options: None,
parquet_metadata: None,
metrics,
batch_size: DEFAULT_BATCH_SIZE,
}
}

/// Set the batch size used to pre-allocate internal buffers.
///
/// This avoids reallocations when reading the first batch of data.
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
self.batch_size = batch_size;
self
}

/// Add cache options to the builder
pub fn with_cache_options(mut self, cache_options: Option<&'a CacheOptions<'a>>) -> Self {
self.cache_options = cache_options;
Expand Down Expand Up @@ -414,55 +427,78 @@ impl<'a> ArrayReaderBuilder<'a> {
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _,
PhysicalType::INT32 => {
if let Some(DataType::Null) = arrow_type {
Box::new(NullArrayReader::<Int32Type>::new(
page_iterator,
column_desc,
self.batch_size,
)?) as _
} else {
Box::new(PrimitiveArrayReader::<Int32Type>::new(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _
}
}
PhysicalType::INT64 => Box::new(PrimitiveArrayReader::<Int64Type>::new(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _,
PhysicalType::INT96 => Box::new(PrimitiveArrayReader::<Int96Type>::new(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _,
PhysicalType::FLOAT => Box::new(PrimitiveArrayReader::<FloatType>::new(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _,
PhysicalType::DOUBLE => Box::new(PrimitiveArrayReader::<DoubleType>::new(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?) as _,
PhysicalType::BYTE_ARRAY => match arrow_type {
Some(DataType::Dictionary(_, _)) => {
make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)?
Some(DataType::Dictionary(_, _)) => make_byte_array_dictionary_reader(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?,
Some(DataType::Utf8View | DataType::BinaryView) => make_byte_view_array_reader(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?,
_ => {
make_byte_array_reader(page_iterator, column_desc, arrow_type, self.batch_size)?
}
Some(DataType::Utf8View | DataType::BinaryView) => {
make_byte_view_array_reader(page_iterator, column_desc, arrow_type)?
}
_ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?,
},
PhysicalType::FIXED_LEN_BYTE_ARRAY => match arrow_type {
Some(DataType::Dictionary(_, _)) => {
make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)?
}
_ => make_fixed_len_byte_array_reader(page_iterator, column_desc, arrow_type)?,
Some(DataType::Dictionary(_, _)) => make_byte_array_dictionary_reader(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?,
_ => make_fixed_len_byte_array_reader(
page_iterator,
column_desc,
arrow_type,
self.batch_size,
)?,
},
};
Ok(Some(reader))
Expand Down Expand Up @@ -533,6 +569,7 @@ mod tests {

let metrics = ArrowReaderMetrics::disabled();
let array_reader = ArrayReaderBuilder::new(&file_reader, &metrics)
.with_batch_size(DEFAULT_BATCH_SIZE)
.build_array_reader(fields.as_ref(), &mask)
.unwrap();

Expand Down Expand Up @@ -566,6 +603,7 @@ mod tests {

let metrics = ArrowReaderMetrics::disabled();
let array_reader = ArrayReaderBuilder::new(&file_reader, &metrics)
.with_batch_size(DEFAULT_BATCH_SIZE)
.with_parquet_metadata(file_reader.metadata())
.build_array_reader(fields.as_ref(), &mask)
.unwrap();
Expand Down
Loading
Loading