diff --git a/.cursor/rules.md b/.cursor/rules.md new file mode 100644 index 0000000000000..00385e6f5f759 --- /dev/null +++ b/.cursor/rules.md @@ -0,0 +1,5 @@ +The AI agents should never link to any issue or a pull request +in any GitHub repository in the code reviews! + +The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! + diff --git a/AGENTS.md b/AGENTS.md index 9dff7f6f1ffd1..b2772e9342837 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -39,3 +39,13 @@ When creating a PR, you MUST follow the [PR template](.github/pull_request_templ See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start) for the recommended pre-PR test commands. +The AI agents should never link to any issue or a pull request +in any GitHub repository in the code reviews! + +The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! + +The AI agents should never link to any issue or a pull request +in any GitHub repository in the code reviews! + +The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! 
+ diff --git a/datafusion/functions/benches/split_part.rs b/datafusion/functions/benches/split_part.rs index 0f4998effc2ac..d578339368768 100644 --- a/datafusion/functions/benches/split_part.rs +++ b/datafusion/functions/benches/split_part.rs @@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) { ); } + // Utf8View, very long parts (256 bytes), position 1 + { + let strings = gen_string_array(N_ROWS, 5, 256, ".", true); + let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into()))); + let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1))); + bench_split_part( + &mut group, + &split_part_func, + &config_options, + "scalar_utf8view_very_long_parts", + "pos_first", + strings, + delimiter, + position, + ); + } + // ── Array delimiter and position ───────────────── // Utf8, single-char delimiter, array args diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 972a10c26474e..12ccdfc429a3c 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -17,9 +17,11 @@ use crate::utils::utf8_to_str_type; use arrow::array::{ - Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType, - StringLikeArrayBuilder, StringViewBuilder, new_null_array, + Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array, + StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder, + make_view, new_null_array, }; +use arrow::buffer::ScalarBuffer; use arrow::datatypes::DataType; use datafusion_common::ScalarValue; use datafusion_common::cast::as_int64_array; @@ -279,12 +281,9 @@ fn split_part_scalar( } let result = match string_array.data_type() { - DataType::Utf8View => split_part_scalar_impl( - string_array.as_string_view(), - delimiter, - position, - StringViewBuilder::with_capacity(string_array.len()), - ), + DataType::Utf8View => { + split_part_scalar_view(string_array.as_string_view(), 
delimiter, position) + } DataType::Utf8 => { let arr = string_array.as_string::<i32>(); // Conservative under-estimate for data capacity: split_part output @@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>( } } +/// Zero-copy scalar fast path for `StringViewArray` inputs. +/// +/// Instead of copying substring bytes into a new buffer, constructs +/// `StringView` entries that point back into the original array's data +/// buffers. +fn split_part_scalar_view( + string_view_array: &StringViewArray, + delimiter: &str, + position: i64, +) -> Result<ArrayRef> { + let len = string_view_array.len(); + let mut views_buf = Vec::with_capacity(len); + let views = string_view_array.views(); + + if delimiter.is_empty() { + // PostgreSQL: empty delimiter treats input as a single field. + let empty_view = make_view(b"", 0, 0); + let return_input = position == 1 || position == -1; + for i in 0..len { + if string_view_array.is_null(i) { + views_buf.push(0); + } else if return_input { + views_buf.push(views[i]); + } else { + views_buf.push(empty_view); + } + } + } else if position > 0 { + let idx: usize = (position - 1).try_into().map_err(|_| { + exec_datafusion_err!( + "split_part index {position} exceeds maximum supported value" + ) + })?; + let finder = memmem::Finder::new(delimiter.as_bytes()); + split_view_loop(string_view_array, views, &mut views_buf, |s| { + split_nth_finder(s, &finder, delimiter.len(), idx) + }); + } else { + let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| { + exec_datafusion_err!( + "split_part index {position} exceeds minimum supported value" + ) + })?; + let finder_rev = memmem::FinderRev::new(delimiter.as_bytes()); + split_view_loop(string_view_array, views, &mut views_buf, |s| { + rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx) + }); + } + + let views_buf = ScalarBuffer::from(views_buf); + + // Nulls pass through unchanged, so we can use the input's null array. 
+ let nulls = string_view_array.nulls().cloned(); + + // Safety: each view is either copied unchanged from the input, or built + // by `substr_view` from a substring that is a contiguous sub-range of the + // original string value stored in the input's data buffers. + unsafe { + Ok(Arc::new(StringViewArray::new_unchecked( + views_buf, + string_view_array.data_buffers().to_vec(), + nulls, + )) as ArrayRef) + } +} + +/// Creates a `StringView` referencing a substring of an existing view's buffer. +/// For substrings ≤ 12 bytes, creates an inline view instead. +#[inline] +fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 { + if substr.len() > 12 { + let view = ByteView::from(*original_view); + make_view( + substr.as_bytes(), + view.buffer_index, + view.offset + start_offset, + ) + } else { + make_view(substr.as_bytes(), 0, 0) + } +} + +/// Applies `split_fn` to each non-null string and appends the resulting view to +/// `views_buf`. +#[inline(always)] +fn split_view_loop<F>( + string_view_array: &StringViewArray, + views: &[u128], + views_buf: &mut Vec<u128>, + split_fn: F, +) where + F: Fn(&str) -> Option<&str>, +{ + let empty_view = make_view(b"", 0, 0); + for (i, raw_view) in views.iter().enumerate() { + if string_view_array.is_null(i) { + views_buf.push(0); + continue; + } + let string = string_view_array.value(i); + match split_fn(string) { + Some(substr) => { + let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize; + views_buf.push(substr_view(raw_view, substr, start_offset as u32)); + } + None => views_buf.push(empty_view), + } + } +} + fn split_part_impl<'a, StringArrType, DelimiterArrType, B>( string_array: &StringArrType, delimiter_array: &DelimiterArrType, diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 086f37d6c3354..5afec8a41c9e7 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ 
b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -954,6 +954,71 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2)); ---- Utf8View +# SPLIT_PART with Utf8View column (exercises the array fast path) +query T +SELECT split_part(column1_utf8view, 'ph', 1) FROM test; +---- +Andrew +Xiangpeng +Ra +(empty) +NULL + +query T +SELECT split_part(column1_utf8view, 'ph', 2) FROM test; +---- +(empty) +(empty) +ael +(empty) +NULL + +# Negative position +query T +SELECT split_part(column1_utf8view, 'ph', -1) FROM test; +---- +Andrew +Xiangpeng +ael +(empty) +NULL + +# Delimiter not found returns full string +query T +SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test; +---- +Andrew +Xiangpeng +Raphael +(empty) +NULL + +# Empty delimiter with column +query T +SELECT split_part(column1_utf8view, '', 1) FROM test; +---- +Andrew +Xiangpeng +Raphael +(empty) +NULL + +# Single-char delimiter with column +query T +SELECT split_part(column1_utf8view, 'a', 1) FROM test; +---- +Andrew +Xi +R +(empty) +NULL + +# Verify array path also returns Utf8View +query T +SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1; +---- +Utf8View + ## Ensure no casts for STRPOS query TT EXPLAIN SELECT