-
Notifications
You must be signed in to change notification settings - Fork 0
21420: perf: Optimize split_part for Utf8View
#304
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| The AI agents should never link to any issue or a pull request | ||
| in any GitHub repository in the code reviews! | ||
|
|
||
| The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,3 +39,13 @@ When creating a PR, you MUST follow the [PR template](.github/pull_request_templ | |
|
|
||
| See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start) | ||
| for the recommended pre-PR test commands. | ||
| The AI agents should never link to any issue or a pull request | ||
| in any GitHub repository in the code reviews! | ||
|
|
||
| The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! | ||
|
|
||
| The AI agents should never link to any issue or a pull request | ||
| in any GitHub repository in the code reviews! | ||
|
|
||
| The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unrelated duplicate config rules in performance optimization PRLow Severity This PR includes AI agent configuration rules in Reviewed by Cursor Bugbot for commit 014d100. Configure here. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,9 +17,11 @@ | |
|
|
||
| use crate::utils::utf8_to_str_type; | ||
| use arrow::array::{ | ||
| Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType, | ||
| StringLikeArrayBuilder, StringViewBuilder, new_null_array, | ||
| Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array, | ||
| StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder, | ||
| make_view, new_null_array, | ||
| }; | ||
| use arrow::buffer::ScalarBuffer; | ||
| use arrow::datatypes::DataType; | ||
| use datafusion_common::ScalarValue; | ||
| use datafusion_common::cast::as_int64_array; | ||
|
|
@@ -279,12 +281,9 @@ fn split_part_scalar( | |
| } | ||
|
|
||
| let result = match string_array.data_type() { | ||
| DataType::Utf8View => split_part_scalar_impl( | ||
| string_array.as_string_view(), | ||
| delimiter, | ||
| position, | ||
| StringViewBuilder::with_capacity(string_array.len()), | ||
| ), | ||
| DataType::Utf8View => { | ||
| split_part_scalar_view(string_array.as_string_view(), delimiter, position) | ||
| } | ||
| DataType::Utf8 => { | ||
| let arr = string_array.as_string::<i32>(); | ||
| // Conservative under-estimate for data capacity: split_part output | ||
|
|
@@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>( | |
| } | ||
| } | ||
|
|
||
| /// Zero-copy scalar fast path for `StringViewArray` inputs. | ||
| /// | ||
| /// Instead of copying substring bytes into a new buffer, constructs | ||
| /// `StringView` entries that point back into the original array's data | ||
| /// buffers. | ||
| fn split_part_scalar_view( | ||
| string_view_array: &StringViewArray, | ||
| delimiter: &str, | ||
| position: i64, | ||
| ) -> Result<ArrayRef> { | ||
| let len = string_view_array.len(); | ||
| let mut views_buf = Vec::with_capacity(len); | ||
| let views = string_view_array.views(); | ||
|
|
||
| if delimiter.is_empty() { | ||
| // PostgreSQL: empty delimiter treats input as a single field. | ||
| let empty_view = make_view(b"", 0, 0); | ||
| let return_input = position == 1 || position == -1; | ||
| for i in 0..len { | ||
| if string_view_array.is_null(i) { | ||
| views_buf.push(0); | ||
| } else if return_input { | ||
| views_buf.push(views[i]); | ||
| } else { | ||
| views_buf.push(empty_view); | ||
| } | ||
| } | ||
| } else if position > 0 { | ||
| let idx: usize = (position - 1).try_into().map_err(|_| { | ||
| exec_datafusion_err!( | ||
| "split_part index {position} exceeds maximum supported value" | ||
| ) | ||
| })?; | ||
| let finder = memmem::Finder::new(delimiter.as_bytes()); | ||
| split_view_loop(string_view_array, views, &mut views_buf, |s| { | ||
| split_nth_finder(s, &finder, delimiter.len(), idx) | ||
| }); | ||
| } else { | ||
| let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| { | ||
| exec_datafusion_err!( | ||
| "split_part index {position} exceeds minimum supported value" | ||
| ) | ||
| })?; | ||
| let finder_rev = memmem::FinderRev::new(delimiter.as_bytes()); | ||
| split_view_loop(string_view_array, views, &mut views_buf, |s| { | ||
| rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx) | ||
| }); | ||
| } | ||
|
|
||
| let views_buf = ScalarBuffer::from(views_buf); | ||
|
|
||
| // Nulls pass through unchanged, so we can use the input's null array. | ||
| let nulls = string_view_array.nulls().cloned(); | ||
|
|
||
| // Safety: each view is either copied unchanged from the input, or built | ||
| // by `substr_view` from a substring that is a contiguous sub-range of the | ||
| // original string value stored in the input's data buffers. | ||
| unsafe { | ||
| Ok(Arc::new(StringViewArray::new_unchecked( | ||
| views_buf, | ||
| string_view_array.data_buffers().to_vec(), | ||
| nulls, | ||
| )) as ArrayRef) | ||
|
Comment on lines
+481
to
+489
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: cd datafusion && find . -name "split_part.rs" -type fRepository: martin-augment/datafusion Length of output: 139 🏁 Script executed: cat -n datafusion/functions/src/string/split_part.rs | sed -n '470,510p'Repository: martin-augment/datafusion Length of output: 1787 🏁 Script executed: cat -n datafusion/functions/src/string/split_part.rs | sed -n '280,290p'Repository: martin-augment/datafusion Length of output: 636 🏁 Script executed: cat -n datafusion/functions/src/string/split_part.rs | sed -n '420,450p'Repository: martin-augment/datafusion Length of output: 1334 🏁 Script executed: cat -n datafusion/functions/src/string/split_part.rs | sed -n '432,520p'Repository: martin-augment/datafusion Length of output: 3782 🏁 Script executed: # Let's search for the make_view function to understand how views are created
rg -n "fn make_view" datafusion/functions/src/string/split_part.rs -A 10Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's check if there are any tests for split_part with Utf8View
rg -n "split_part.*View\|Utf8View.*split_part" datafusion/functions/src/string/split_part.rs -B 2 -A 5Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's look at the StringViewArray implementation to understand buffer management
find . -path "*/arrow*" -name "*.rs" | head -20Repository: martin-augment/datafusion Length of output: 511 🏁 Script executed: # Search for StringViewArray in the codebase to understand how buffers are managed
fd StringViewArray datafusion -t f 2>/dev/null | head -5Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Look at how StringViewArray::new_unchecked is used elsewhere
rg "StringViewArray::new_unchecked" --type rs | head -20Repository: martin-augment/datafusion Length of output: 95 🏁 Script executed: rg "StringViewArray::new_unchecked" --type-listRepository: martin-augment/datafusion Length of output: 458 🏁 Script executed: rg "StringViewArray::new_unchecked" datafusion 2>/dev/null | head -20Repository: martin-augment/datafusion Length of output: 492 🏁 Script executed: # Look at make_view function
rg -n "fn make_view" datafusion/functions/src/string/split_part.rs -A 8Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check the full split_view_loop and how it uses substr_view
cat -n datafusion/functions/src/string/split_part.rs | sed -n '509,550p'Repository: martin-augment/datafusion Length of output: 1644 🏁 Script executed: # Let's look at the entire file to understand structure better
wc -l datafusion/functions/src/string/split_part.rsRepository: martin-augment/datafusion Length of output: 118 🏁 Script executed: # Check if there are any optimizations or checks for inline-only results elsewhere
rg -n "data_buffers\|buffer_index" datafusion/functions/src/string/split_part.rs -B 1 -A 1Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Find the make_view function definition
rg -n "fn make_view" datafusion/functions/src/string/ -A 15Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check ByteView struct and buffer_index behavior
rg -n "struct ByteView\|buffer_index" datafusion/functions/src/string/split_part.rs -B 2 -A 2Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Look at how other StringViewArray::new_unchecked calls handle buffers (from earlier search results)
cat -n datafusion/functions/src/string/common.rs | sed -n '1,50p' | grep -A 20 "StringViewArray::new_unchecked"Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Get more context around the StringViewArray::new_unchecked call in common.rs
rg -n "StringViewArray::new_unchecked" datafusion/functions/src/string/common.rs -B 10 -A 5Repository: martin-augment/datafusion Length of output: 649 🏁 Script executed: # Check the unicode/substr.rs usage
rg -n "StringViewArray::new_unchecked" datafusion/functions/src/unicode/substr.rs -B 10 -A 5Repository: martin-augment/datafusion Length of output: 649 🏁 Script executed: # Look for any buffer optimization or filtering in the codebase
rg -n "is_empty\|filter.*buffer\|retain" datafusion/functions/src/string/split_part.rs | head -20Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Search for make_view function more broadly
rg -n "fn make_view" datafusion --type rs -A 10 2>/dev/null | head -40Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Search in arrow crate or related files
find . -name "*.rs" -exec grep -l "fn make_view" {} \; 2>/dev/null | head -5Repository: martin-augment/datafusion Length of output: 115 🏁 Script executed: # Let's look for ByteView definition and buffer_index meaning
rg -n "buffer_index.*0\|inline" datafusion/functions/src/string/split_part.rs | head -20Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check what buffer_index = 0 means in StringView/ByteView context
rg -n "buffer_index" datafusion -A 2 -B 2 --type rs 2>/dev/null | head -60Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's check if there's any optimization in Arrow's StringViewArray that handles unused buffers
find . -path "*/arrow*" -name "*.rs" -exec grep -l "StringViewArray\|data_buffers" {} \; 2>/dev/null | head -10Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check the comments and safety documentation around buffer_index
rg -n "buffer_index = 0\|inline.*buffer\|12 bytes" datafusion/functions/src/string/ -B 3 -A 3 | head -60Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Search for make_view definition or import at the top of split_part.rs
cat -n datafusion/functions/src/string/split_part.rs | head -100 | grep -i "make_view\|use\|import"Repository: martin-augment/datafusion Length of output: 791 🏁 Script executed: # Look at the top of the file for imports and module structure
head -50 datafusion/functions/src/string/split_part.rsRepository: martin-augment/datafusion Length of output: 2279 🏁 Script executed: # Check if make_view is defined inline within split_part.rs itself
rg "make_view" datafusion/functions/src/string/split_part.rs -nRepository: martin-augment/datafusion Length of output: 270 🏁 Script executed: # Let's check arrow crate directly
find . -path "*arrow-array*" -name "*.rs" -exec grep -l "StringView" {} \; 2>/dev/null | head -5Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if this is using arrow_string crate
rg -n "arrow.string\|arrow_string" datafusion/functions/src/string/split_part.rs -B 2 -A 2Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's understand what make_view does by looking at arrow crate
# First find arrow crate location
find . -name "arrow" -type d | grep -E "^.*arrow$|/arrow$" | head -5Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Look for tests or examples using StringViewArray and inlined data
rg -n "inline\|buffer_index.*0\|data_buffers" datafusion/functions/src/string/common.rs -B 3 -A 3 | head -80Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if there's a way to construct StringViewArray with empty buffers
rg -n "StringViewArray.*new\|data_buffers" datafusion -A 2 -B 2 --type rs 2>/dev/null | grep -A 5 -B 5 "to_vec\|Vec::new" | head -100Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's look for the definition of StringViewArray and understand if it requires buffers even when unused
rg -n "pub.*fn data_buffers\|pub.*fn new_unchecked" datafusion --type rs 2>/dev/null | head -20Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Test case: check if there are tests for split_part that might show the expected behavior
rg -n "split_part" datafusion/functions/src/string/split_part.rs | grep -i "test\|#\[test\]" | head -10Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Find test module
tail -200 datafusion/functions/src/string/split_part.rs | head -100Repository: martin-augment/datafusion Length of output: 3420 🏁 Script executed: # Let's look at the arrow crate to understand make_view signature and buffer_index semantics
find . -name "*.rs" -path "*/arrow*" -exec grep -l "pub.*fn make_view" {} \; 2>/dev/null | head -5Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if there's documentation about StringView inline behavior
rg -n "inline\|12 bytes\|small string" datafusion/functions/src/unicode/substr.rs -B 2 -A 2 | head -60Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Look at substr.rs to see how it handles the same pattern
cat -n datafusion/functions/src/unicode/substr.rs | sed -n '200,350p'Repository: martin-augment/datafusion Length of output: 6304 🏁 Script executed: # Check if there's any existing code that filters empty buffers
rg -n "filter.*buffer\|empty.*buffer\|retain" datafusion/functions/src/string/ --type rs | head -20Repository: martin-augment/datafusion Length of output: 95 🏁 Script executed: # Search for any memory optimization patterns in similar code
rg -n "data_buffers()" datafusion/functions/src/ -B 5 -A 5 --type rs | head -100Repository: martin-augment/datafusion Length of output: 95 🏁 Script executed: # Let's check the make_and_append_view function in substr.rs to see the full pattern
rg -n "fn make_and_append_view" datafusion/functions/src/unicode/substr.rs -A 20Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's understand ByteView and buffer_index semantics better
rg -n "struct ByteView\|buffer_index" datafusion -A 3 --type rs 2>/dev/null | head -60Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if inline views (buffer_index=0) need the data_buffers
rg -n "buffer_index.*=.*0\|inline.*buffer" datafusion/functions/src/ -B 2 -A 2 --type rs | head -80Repository: martin-augment/datafusion Length of output: 95 🏁 Script executed: # Let's look for any comments or documentation about view inlining
rg -n "12 bytes\|inline\|small string optimization" datafusion/functions/src/string/split_part.rs -B 2 -A 2Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if there's any test showing the concern - large input split into small parts
rg -n "long\|large.*small\|bytes.*split" datafusion/functions/benches/split_part.rs | head -20Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's examine the benchmark file to see if there are tests for the scenario mentioned
cat -n datafusion/functions/benches/split_part.rs | head -100Repository: martin-augment/datafusion Length of output: 4499 🏁 Script executed: # Check if there's documentation or issues about StringView memory management
rg -n "StringView\|buffer.*retain\|memory.*issue" datafusion/functions/src/string/ --type rs -B 1 -A 1 | head -100Repository: martin-augment/datafusion Length of output: 95 🏁 Script executed: # Look at the actual implementation to confirm the inline behavior (buffer_index = 0)
rg -n "make_view.*0.*0\|inline.*view" datafusion/functions/src/string/split_part.rs -B 2 -A 2Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Check if there are any TODOs or FIXMEs related to buffer management
rg -n "TODO\|FIXME\|XXX\|HACK" datafusion/functions/src/string/split_part.rsRepository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Verify the make_view function signature from arrow
python3 << 'EOF'
# Check if we can find documentation about make_view and buffer_index semantics
import subprocess
result = subprocess.run(['rg', '-n', 'pub.*fn make_view', 'datafusion', '--type', 'rs'],
capture_output=True, text=True)
print("make_view definitions:")
print(result.stdout if result.stdout else "Not found in datafusion")
print(result.stderr if result.stderr else "")
# Also check in arrow if available
result2 = subprocess.run(['rg', '-n', 'fn make_view', '.'],
capture_output=True, text=True)
print("\nAll make_view occurrences:")
lines = result2.stdout.split('\n')[:20]
for line in lines:
print(line)
EOFRepository: martin-augment/datafusion Length of output: 248 🏁 Script executed: # Let's look at the complete bench file to see if there are scenarios matching the concern
cat -n datafusion/functions/benches/split_part.rs | tail -100Repository: martin-augment/datafusion Length of output: 3905 🏁 Script executed: # Let's check what happens in other places where similar patterns occur
cat -n datafusion/functions/src/string/common.rs | sed -n '200,250p'Repository: martin-augment/datafusion Length of output: 2178 🏁 Script executed: # Let's trace through the logic more carefully - check split_view_loop to see all paths
cat -n datafusion/functions/src/string/split_part.rs | sed -n '509,536p'Repository: martin-augment/datafusion Length of output: 1120 🏁 Script executed: # Critical check: does buffer_index = 0 mean inline in StringView?
python3 << 'EOF'
# Let's search for documentation or comments about buffer_index in views
import subprocess
# Search for buffer_index documentation/comments
result = subprocess.run(['rg', '-n', 'buffer_index', '.', '-B', '3', '-A', '3'],
capture_output=True, text=True)
lines = result.stdout.split('\n')
# Filter to relevant lines only
relevant = [line for line in lines if '0' in line and ('inline' in line.lower() or 'view' in line.lower())]
print("Lines mentioning buffer_index with 0 and inline/view:")
for line in relevant[:30]:
if line.strip():
print(line)
EOFRepository: martin-augment/datafusion Length of output: 3920 🏁 Script executed: # Let's verify the actual concern - check if there's a scenario where all results could be inline
cat -n datafusion/functions/src/string/split_part.rs | sed -n '454,490p'Repository: martin-augment/datafusion Length of output: 1826 🏁 Script executed: # Check if there's any filtering or optimization of buffers anywhere in the code
rg -n "filter.*buffer\|drain.*buffer\|empty.*buffer\|Vec::new.*buffer" datafusion/functions/src/ -B 2 -A 2 | head -60Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Let's check if StringViewArray can be created with empty buffers or fewer buffers
rg -n "Vec::new()\|Vec::with_capacity" datafusion/functions/src/string/split_part.rs -B 1 -A 1Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Search for documentation on inline vs external buffer views
rg -n "buffer_index.*==.*0\|inline.*string\|small.*string" datafusion/functions/src/string/ -B 2 -A 2 | head -80Repository: martin-augment/datafusion Length of output: 51 🏁 Script executed: # Final check: Are there any related issues or TODOs about buffer retention?
rg -n "TODO\|FIXME\|buffer.*retain\|memory.*issue\|optimization" datafusion/functions/src/string/ | grep -i "buffer\|memory\|inline"Repository: martin-augment/datafusion Length of output: 51 Don't retain source Line 487 always clones |
||
| } | ||
| } | ||
|
|
||
| /// Creates a `StringView` referencing a substring of an existing view's buffer. | ||
| /// For substrings ≤ 12 bytes, creates an inline view instead. | ||
| #[inline] | ||
| fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 { | ||
| if substr.len() > 12 { | ||
| let view = ByteView::from(*original_view); | ||
| make_view( | ||
| substr.as_bytes(), | ||
| view.buffer_index, | ||
| view.offset + start_offset, | ||
| ) | ||
| } else { | ||
| make_view(substr.as_bytes(), 0, 0) | ||
| } | ||
| } | ||
|
|
||
| /// Applies `split_fn` to each non-null string and appends the resulting view to | ||
| /// `views_buf`. | ||
| #[inline(always)] | ||
| fn split_view_loop<F>( | ||
| string_view_array: &StringViewArray, | ||
| views: &[u128], | ||
| views_buf: &mut Vec<u128>, | ||
| split_fn: F, | ||
| ) where | ||
| F: Fn(&str) -> Option<&str>, | ||
| { | ||
| let empty_view = make_view(b"", 0, 0); | ||
| for (i, raw_view) in views.iter().enumerate() { | ||
| if string_view_array.is_null(i) { | ||
| views_buf.push(0); | ||
| continue; | ||
| } | ||
| let string = string_view_array.value(i); | ||
| match split_fn(string) { | ||
| Some(substr) => { | ||
| let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Severity: medium 🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:valid-but-wont-fix; category:bug; feedback: The Augment AI reviewer is not correct! Arrow types use u32 for offsets, so there is no point to use bigger types for offset related functionality. |
||
| views_buf.push(substr_view(raw_view, substr, start_offset as u32)); | ||
| } | ||
| None => views_buf.push(empty_view), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| fn split_part_impl<'a, StringArrType, DelimiterArrType, B>( | ||
| string_array: &StringArrType, | ||
| delimiter_array: &DelimiterArrType, | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The added instructions for AI agents are duplicated. Please remove the redundant lines to keep the documentation clean.