-
Notifications
You must be signed in to change notification settings - Fork 0
21538: perf: Optimize NULL handling in StringViewArrayBuilder
#311
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| The AI agents should never link to any issue or a pull request | ||
| in any GitHub repository in the code reviews! | ||
|
|
||
| The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,9 +21,9 @@ use datafusion_common::{Result, exec_datafusion_err, internal_err}; | |
|
|
||
| use arrow::array::{ | ||
| Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, LargeStringArray, | ||
| StringArray, StringViewArray, StringViewBuilder, make_view, | ||
| StringArray, StringViewArray, make_view, | ||
| }; | ||
| use arrow::buffer::{MutableBuffer, NullBuffer}; | ||
| use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer}; | ||
| use arrow::datatypes::DataType; | ||
|
|
||
| /// Optimized version of the StringBuilder in Arrow that: | ||
|
|
@@ -150,18 +150,25 @@ impl StringArrayBuilder { | |
| } | ||
| } | ||
|
|
||
| /// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs | ||
| /// on a row-by-row basis, the caller should provide nulls when calling | ||
| /// [`finish`](Self::finish). This allows callers to compute nulls more | ||
| /// efficiently (e.g., via bulk bitmap operations). | ||
| /// | ||
| /// [`StringViewBuilder`]: arrow::array::StringViewBuilder | ||
| pub struct StringViewArrayBuilder { | ||
| builder: StringViewBuilder, | ||
| views: Vec<u128>, | ||
| data: Vec<u8>, | ||
| block: Vec<u8>, | ||
| /// If true, a safety check is required during the `append_offset` call | ||
| tainted: bool, | ||
| } | ||
|
|
||
| impl StringViewArrayBuilder { | ||
| pub fn with_capacity(item_capacity: usize, _data_capacity: usize) -> Self { | ||
| let builder = StringViewBuilder::with_capacity(item_capacity); | ||
| pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self { | ||
| Self { | ||
| builder, | ||
| views: Vec::with_capacity(item_capacity), | ||
| data: Vec::with_capacity(data_capacity), | ||
| block: vec![], | ||
| tainted: false, | ||
| } | ||
|
|
@@ -214,16 +221,23 @@ impl StringViewArrayBuilder { | |
| } | ||
| } | ||
|
|
||
| /// Finalizes the current row by converting the accumulated data into a | ||
| /// StringView and appending it to the views buffer. | ||
| pub fn append_offset(&mut self) -> Result<()> { | ||
| let block_str = if self.tainted { | ||
| if self.tainted { | ||
| std::str::from_utf8(&self.block) | ||
| .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary literal"))? | ||
| } else { | ||
| // SAFETY: all data that was appended was valid UTF8 | ||
| unsafe { std::str::from_utf8_unchecked(&self.block) } | ||
| }; | ||
| self.builder.append_value(block_str); | ||
| .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary literal"))?; | ||
| } | ||
|
|
||
| let v = &self.block; | ||
| let offset = self.data.len() as u32; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The cast let offset: u32 = self.data.len().try_into().map_err(|_| {
exec_datafusion_err!("StringView data buffer overflow")
})?;
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:good-to-have; category:bug; feedback: The Gemini AI reviewer is correct! The offsets in Arrow are u32, so the cast is needed but it could be done with TryFrom to return an error if the data length is bigger than u32::MAX. Prevents silent truncation and invalid results because of this. |
||
| if v.len() > 12 { | ||
| self.data.extend_from_slice(v); | ||
| } | ||
| self.views.push(make_view(v, 0, offset)); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For Severity: medium 🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:good-to-have; category:bug; feedback: The Augment AI reviewer is correct! The |
||
|
|
||
| self.block.clear(); | ||
| self.tainted = false; | ||
| Ok(()) | ||
| } | ||
|
|
||
|
|
@@ -233,21 +247,35 @@ impl StringViewArrayBuilder { | |
| /// | ||
| /// Returns an error when: | ||
| /// | ||
| /// - the provided `null_buffer` does not match amount of `append_offset` calls. | ||
| pub fn finish(mut self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> { | ||
| let array = self.builder.finish(); | ||
| match null_buffer { | ||
| Some(nulls) if nulls.len() != array.len() => { | ||
| internal_err!("Null buffer and views buffer must be the same length") | ||
| } | ||
| Some(nulls) => { | ||
| let array_builder = array.into_data().into_builder().nulls(Some(nulls)); | ||
| // SAFETY: the underlying data is valid; we are only adding a null buffer | ||
| let array_data = unsafe { array_builder.build_unchecked() }; | ||
| Ok(StringViewArray::from(array_data)) | ||
| } | ||
| None => Ok(array), | ||
| /// - the provided `null_buffer` length does not match the row count. | ||
| pub fn finish(self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> { | ||
| if let Some(ref nulls) = null_buffer | ||
| && nulls.len() != self.views.len() | ||
| { | ||
| return internal_err!( | ||
| "Null buffer length ({}) must match row count ({})", | ||
| nulls.len(), | ||
| self.views.len() | ||
| ); | ||
| } | ||
|
|
||
| let buffers: Vec<Buffer> = if self.data.is_empty() { | ||
| vec![] | ||
| } else { | ||
| vec![Buffer::from(self.data)] | ||
| }; | ||
|
|
||
| // SAFETY: views were constructed with correct lengths, offsets, and | ||
| // prefixes. UTF-8 validity has also been checked, if the input was | ||
| // tainted. | ||
| let array = unsafe { | ||
| StringViewArray::new_unchecked( | ||
| ScalarBuffer::from(self.views), | ||
| buffers, | ||
| null_buffer, | ||
| ) | ||
| }; | ||
| Ok(array) | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
offsetis computed withself.data.len() as u32, which will silently truncate once the accumulated buffer exceedsu32::MAX; since this is later passed intoStringViewArray::new_unchecked, it could produce invalid views and potential out-of-bounds reads. Consider enforcing the 32-bit offset invariant with a checked conversion/error (similar to the other builders’ overflow handling).Severity: high
🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
value:good-to-have; category:bug; feedback: The Augment AI reviewer is correct! The offsets in Arrow are u32, so the cast is needed but it could be done with TryFrom to return an error if the data length is bigger than u32::MAX. Prevents silent truncation and invalid results because of this.