Skip to content

Commit 96b3052

Browse files
authored
[Parquet] perf: Create Utf8/BinaryViewArray directly rather than via ArrayData (#9121)
# Which issue does this PR close? - Part of #9061 - Part of #9128 # Rationale for this change - Similarly to #9120, creating Arrays via ArrayData / `make_array` has overhead (at least 2 Vec allocations) compared to simply creating the arrays directly. - ViewArrays also have an extra Vec allocation (to hold their buffers). # What changes are included in this PR? Update the parquet reader to create ViewArrays directly. # Are these changes tested? By CI. # Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. -->
1 parent cfb9807 commit 96b3052

File tree

1 file changed

+11
-19
lines changed

1 file changed

+11
-19
lines changed

parquet/src/arrow/buffer/view_buffer.rs

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
// under the License.
1717

1818
use crate::arrow::record_reader::buffer::ValuesBuffer;
19-
use arrow_array::{ArrayRef, builder::make_view, make_array};
20-
use arrow_buffer::Buffer;
21-
use arrow_data::ArrayDataBuilder;
19+
use arrow_array::{ArrayRef, BinaryViewArray, StringViewArray, builder::make_view};
20+
use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer, ScalarBuffer};
2221
use arrow_schema::DataType as ArrowType;
22+
use std::sync::Arc;
2323

2424
/// A buffer of view type byte arrays that can be converted into
2525
/// `GenericByteViewArray`
@@ -70,26 +70,18 @@ impl ViewBuffer {
7070
/// Converts this into an [`ArrayRef`] with the provided `data_type` and `null_buffer`
7171
pub fn into_array(self, null_buffer: Option<Buffer>, data_type: &ArrowType) -> ArrayRef {
7272
let len = self.views.len();
73-
let views = Buffer::from_vec(self.views);
73+
let views = ScalarBuffer::from(self.views);
74+
let nulls = null_buffer
75+
.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, len)))
76+
.filter(|n| n.null_count() != 0);
7477
match data_type {
7578
ArrowType::Utf8View => {
76-
let builder = ArrayDataBuilder::new(ArrowType::Utf8View)
77-
.len(len)
78-
.add_buffer(views)
79-
.add_buffers(self.buffers)
80-
.null_bit_buffer(null_buffer);
81-
// We have checked that the data is utf8 when building the buffer, so it is safe
82-
let array = unsafe { builder.build_unchecked() };
83-
make_array(array)
79+
// Safety: views were created correctly, and checked that the data is utf8 when building the buffer
80+
unsafe { Arc::new(StringViewArray::new_unchecked(views, self.buffers, nulls)) }
8481
}
8582
ArrowType::BinaryView => {
86-
let builder = ArrayDataBuilder::new(ArrowType::BinaryView)
87-
.len(len)
88-
.add_buffer(views)
89-
.add_buffers(self.buffers)
90-
.null_bit_buffer(null_buffer);
91-
let array = unsafe { builder.build_unchecked() };
92-
make_array(array)
83+
// Safety: views were created correctly
84+
unsafe { Arc::new(BinaryViewArray::new_unchecked(views, self.buffers, nulls)) }
9385
}
9486
_ => panic!("Unsupported data type: {data_type}"),
9587
}

0 commit comments

Comments
 (0)