martin-augment · martin-augment · Apr 6, 2026 · Apr 7, 2026 · gemini-code-assist · Apr 7, 2026
diff --git a/.cursor/rules.md b/.cursor/rules.md
@@ -0,0 +1,5 @@
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
diff --git a/AGENTS.md b/AGENTS.md
@@ -39,3 +39,13 @@ When creating a PR, you MUST follow the [PR template](.github/pull_request_templ
 
 See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start)
 for the recommended pre-PR test commands.
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
-
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
-
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
diff --git a/datafusion/functions/benches/split_part.rs b/datafusion/functions/benches/split_part.rs
@@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) {
         );
     }
 
+    // Utf8View, very long parts (256 bytes), position 1
+    {
+        let strings = gen_string_array(N_ROWS, 5, 256, ".", true);
+        let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into())));
+        let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1)));
+        bench_split_part(
+            &mut group,
+            &split_part_func,
+            &config_options,
+            "scalar_utf8view_very_long_parts",
+            "pos_first",
+            strings,
+            delimiter,
+            position,
+        );
+    }
+
     // ── Array delimiter and position ─────────────────
 
     // Utf8, single-char delimiter, array args

diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs
@@ -17,9 +17,11 @@
 
 use crate::utils::utf8_to_str_type;
 use arrow::array::{
-    Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType,
-    StringLikeArrayBuilder, StringViewBuilder, new_null_array,
+    Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array,
+    StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder,
+    make_view, new_null_array,
 };
+use arrow::buffer::ScalarBuffer;
 use arrow::datatypes::DataType;
 use datafusion_common::ScalarValue;
 use datafusion_common::cast::as_int64_array;
@@ -279,12 +281,9 @@ fn split_part_scalar(
     }
 
     let result = match string_array.data_type() {
-        DataType::Utf8View => split_part_scalar_impl(
-            string_array.as_string_view(),
-            delimiter,
-            position,
-            StringViewBuilder::with_capacity(string_array.len()),
-        ),
+        DataType::Utf8View => {
+            split_part_scalar_view(string_array.as_string_view(), delimiter, position)
+        }
         DataType::Utf8 => {
             let arr = string_array.as_string::<i32>();
             // Conservative under-estimate for data capacity: split_part output
@@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>(
     }
 }
 
+/// Zero-copy `split_part` for a `StringViewArray` with a scalar `delimiter`
+/// and `position`: each output view is an input view, an empty view, or a
+/// view built by `substr_view`; the input's data buffers are shared as-is.
+/// Errors if `position` does not fit in `usize`; assumes the caller already
+/// rejected `position == 0` (TODO confirm against the call site).
+fn split_part_scalar_view(
+    string_view_array: &StringViewArray,
+    delimiter: &str,
+    position: i64,
+) -> Result<ArrayRef> {
+    let len = string_view_array.len();
+    let mut views_buf = Vec::with_capacity(len);
+    let views = string_view_array.views();
+
+    if delimiter.is_empty() {
+        // PostgreSQL semantics: with an empty delimiter the whole input is field 1 (and -1).
+        let empty_view = make_view(b"", 0, 0);
+        let return_input = position == 1 || position == -1;
+        for i in 0..len {
+            if string_view_array.is_null(i) {
+                views_buf.push(0); // placeholder; the row is masked by the reused null buffer
+            } else if return_input {
+                views_buf.push(views[i]); // reuse the input view untouched
+            } else {
+                views_buf.push(empty_view);
+            }
+        }
+    } else if position > 0 {
+        let idx: usize = (position - 1).try_into().map_err(|_| { // zero-based field index
+            exec_datafusion_err!(
+                "split_part index {position} exceeds maximum supported value"
+            )
+        })?;
+        let finder = memmem::Finder::new(delimiter.as_bytes());
+        split_view_loop(string_view_array, views, &mut views_buf, |s| {
+            split_nth_finder(s, &finder, delimiter.len(), idx)
+        });
+    } else {
+        let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| { // |position| - 1
+            exec_datafusion_err!(
+                "split_part index {position} exceeds minimum supported value"
+            )
+        })?;
+        let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
+        split_view_loop(string_view_array, views, &mut views_buf, |s| {
+            rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx)
+        });
+    }
+
+    let views_buf = ScalarBuffer::from(views_buf);
+
+    // Null rows stay null, so the input's null buffer is reused unchanged.
+    let nulls = string_view_array.nulls().cloned();
+
+    // SAFETY: every view is copied unchanged from the input, is an empty or
+    // zero (null-slot) view, or is built by `substr_view` from a contiguous
+    // sub-range of a string value held in the input's data buffers.
+    unsafe {
+        Ok(Arc::new(StringViewArray::new_unchecked(
+            views_buf,
+            string_view_array.data_buffers().to_vec(),
+            nulls,
+        )) as ArrayRef)
+    }
+}
+
+/// Builds the view for `substr`, which begins `start_offset` bytes into `original_view`'s value.
+/// Over 12 bytes: reuses that view's buffer index at a shifted offset; otherwise an inline view.
+#[inline]
+fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 {
+    if substr.len() > 12 {
+        let view = ByteView::from(*original_view); // decode to read buffer_index / offset
+        make_view(
+            substr.as_bytes(),
+            view.buffer_index,
+            view.offset + start_offset,
+        )
+    } else {
+        make_view(substr.as_bytes(), 0, 0)
+    }
+}
+
+/// Pushes one view per row onto `views_buf`: `0` for nulls, an empty view when `split_fn`
+/// returns `None`, else a view of the returned substring (expected to be a slice of its input).
+#[inline(always)]
+fn split_view_loop<F>(
+    string_view_array: &StringViewArray,
+    views: &[u128],
+    views_buf: &mut Vec<u128>,
+    split_fn: F,
+) where
+    F: Fn(&str) -> Option<&str>,
+{
+    let empty_view = make_view(b"", 0, 0);
+    for (i, raw_view) in views.iter().enumerate() {
+        if string_view_array.is_null(i) {
+            views_buf.push(0); // placeholder value for a null row
+            continue;
+        }
+        let string = string_view_array.value(i);
+        match split_fn(string) {
+            Some(substr) => { // locate `substr` inside `string` by pointer distance
+                let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize;
+                views_buf.push(substr_view(raw_view, substr, start_offset as u32));
+            }
+            None => views_buf.push(empty_view),
+        }
+    }
+}
+
 fn split_part_impl<'a, StringArrType, DelimiterArrType, B>(
     string_array: &StringArrType,
     delimiter_array: &DelimiterArrType,

diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt
@@ -954,6 +954,71 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2));
 ----
 Utf8View
 
+# SPLIT_PART with a Utf8View column and scalar delimiter/position (exercises the zero-copy view fast path)
+query T
+SELECT split_part(column1_utf8view, 'ph', 1) FROM test;
+----
+Andrew
+Xiangpeng
+Ra
+(empty)
+NULL
+
+query T
+SELECT split_part(column1_utf8view, 'ph', 2) FROM test;
+----
+(empty)
+(empty)
+ael
+(empty)
+NULL
+
+# Negative position
+query T
+SELECT split_part(column1_utf8view, 'ph', -1) FROM test;
+----
+Andrew
+Xiangpeng
+ael
+(empty)
+NULL
+
+# Delimiter not found returns full string
+query T
+SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test;
+----
+Andrew
+Xiangpeng
+Raphael
+(empty)
+NULL
+
+# Empty delimiter with column
+query T
+SELECT split_part(column1_utf8view, '', 1) FROM test;
+----
+Andrew
+Xiangpeng
+Raphael
+(empty)
+NULL
+
+# Single-char delimiter with column
+query T
+SELECT split_part(column1_utf8view, 'a', 1) FROM test;
+----
+Andrew
+Xi
+R
+(empty)
+NULL
+
+# Verify that a Utf8View column input also returns Utf8View
+query T
+SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1;
+----
+Utf8View
+
 ## Ensure no casts for STRPOS
 query TT
 EXPLAIN SELECT