Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .cursor/rules.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!

The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!

10 changes: 10 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,13 @@ When creating a PR, you MUST follow the [PR template](.github/pull_request_templ

See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start)
for the recommended pre-PR test commands.
The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!

The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!

The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!

The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!

Comment on lines +42 to +51
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The added instructions for AI agents are duplicated. Please remove the redundant lines to keep the documentation clean.

Suggested change
The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!
The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!
The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
The AI agents should never link to any issue or a pull request
in any GitHub repository in the code reviews!
The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unrelated duplicate config rules in performance optimization PR

Low Severity

This PR includes AI agent configuration rules in AGENTS.md and a new .cursor/rules.md. These changes appear unrelated to the PR's performance optimization goal, and the rules in AGENTS.md are duplicated.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 014d100. Configure here.

17 changes: 17 additions & 0 deletions datafusion/functions/benches/split_part.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) {
);
}

// Utf8View, very long parts (256 bytes), position 1
{
let strings = gen_string_array(N_ROWS, 5, 256, ".", true);
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into())));
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1)));
bench_split_part(
&mut group,
&split_part_func,
&config_options,
"scalar_utf8view_very_long_parts",
"pos_first",
strings,
delimiter,
position,
);
}

// ── Array delimiter and position ─────────────────

// Utf8, single-char delimiter, array args
Expand Down
125 changes: 117 additions & 8 deletions datafusion/functions/src/string/split_part.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

use crate::utils::utf8_to_str_type;
use arrow::array::{
Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType,
StringLikeArrayBuilder, StringViewBuilder, new_null_array,
Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array,
StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder,
make_view, new_null_array,
};
use arrow::buffer::ScalarBuffer;
use arrow::datatypes::DataType;
use datafusion_common::ScalarValue;
use datafusion_common::cast::as_int64_array;
Expand Down Expand Up @@ -279,12 +281,9 @@ fn split_part_scalar(
}

let result = match string_array.data_type() {
DataType::Utf8View => split_part_scalar_impl(
string_array.as_string_view(),
delimiter,
position,
StringViewBuilder::with_capacity(string_array.len()),
),
DataType::Utf8View => {
split_part_scalar_view(string_array.as_string_view(), delimiter, position)
}
DataType::Utf8 => {
let arr = string_array.as_string::<i32>();
// Conservative under-estimate for data capacity: split_part output
Expand Down Expand Up @@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>(
}
}

/// Zero-copy scalar fast path for `StringViewArray` inputs.
///
/// Instead of copying substring bytes into a new buffer, constructs
/// `StringView` entries that point back into the original array's data
/// buffers.
fn split_part_scalar_view(
string_view_array: &StringViewArray,
delimiter: &str,
position: i64,
) -> Result<ArrayRef> {
let len = string_view_array.len();
let mut views_buf = Vec::with_capacity(len);
let views = string_view_array.views();

if delimiter.is_empty() {
// PostgreSQL: empty delimiter treats input as a single field.
let empty_view = make_view(b"", 0, 0);
let return_input = position == 1 || position == -1;
for i in 0..len {
if string_view_array.is_null(i) {
views_buf.push(0);
} else if return_input {
views_buf.push(views[i]);
} else {
views_buf.push(empty_view);
}
}
} else if position > 0 {
let idx: usize = (position - 1).try_into().map_err(|_| {
exec_datafusion_err!(
"split_part index {position} exceeds maximum supported value"
)
})?;
let finder = memmem::Finder::new(delimiter.as_bytes());
split_view_loop(string_view_array, views, &mut views_buf, |s| {
split_nth_finder(s, &finder, delimiter.len(), idx)
});
} else {
let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| {
exec_datafusion_err!(
"split_part index {position} exceeds minimum supported value"
)
})?;
let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
split_view_loop(string_view_array, views, &mut views_buf, |s| {
rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx)
});
}

let views_buf = ScalarBuffer::from(views_buf);

// Nulls pass through unchanged, so we can use the input's null array.
let nulls = string_view_array.nulls().cloned();

// Safety: each view is either copied unchanged from the input, or built
// by `substr_view` from a substring that is a contiguous sub-range of the
// original string value stored in the input's data buffers.
unsafe {
Ok(Arc::new(StringViewArray::new_unchecked(
views_buf,
string_view_array.data_buffers().to_vec(),
nulls,
)) as ArrayRef)
Comment on lines +481 to +489
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

cd datafusion && find . -name "split_part.rs" -type f

Repository: martin-augment/datafusion

Length of output: 139


🏁 Script executed:

cat -n datafusion/functions/src/string/split_part.rs | sed -n '470,510p'

Repository: martin-augment/datafusion

Length of output: 1787


🏁 Script executed:

cat -n datafusion/functions/src/string/split_part.rs | sed -n '280,290p'

Repository: martin-augment/datafusion

Length of output: 636


🏁 Script executed:

cat -n datafusion/functions/src/string/split_part.rs | sed -n '420,450p'

Repository: martin-augment/datafusion

Length of output: 1334


🏁 Script executed:

cat -n datafusion/functions/src/string/split_part.rs | sed -n '432,520p'

Repository: martin-augment/datafusion

Length of output: 3782


🏁 Script executed:

# Let's search for the make_view function to understand how views are created
rg -n "fn make_view" datafusion/functions/src/string/split_part.rs -A 10

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's check if there are any tests for split_part with Utf8View
rg -n "split_part.*View\|Utf8View.*split_part" datafusion/functions/src/string/split_part.rs -B 2 -A 5

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's look at the StringViewArray implementation to understand buffer management
find . -path "*/arrow*" -name "*.rs" | head -20

Repository: martin-augment/datafusion

Length of output: 511


🏁 Script executed:

# Search for StringViewArray in the codebase to understand how buffers are managed
fd StringViewArray datafusion -t f 2>/dev/null | head -5

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Look at how StringViewArray::new_unchecked is used elsewhere
rg "StringViewArray::new_unchecked" --type rs | head -20

Repository: martin-augment/datafusion

Length of output: 95


🏁 Script executed:

rg "StringViewArray::new_unchecked" --type-list

Repository: martin-augment/datafusion

Length of output: 458


🏁 Script executed:

rg "StringViewArray::new_unchecked" datafusion 2>/dev/null | head -20

Repository: martin-augment/datafusion

Length of output: 492


🏁 Script executed:

# Look at make_view function
rg -n "fn make_view" datafusion/functions/src/string/split_part.rs -A 8

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check the full split_view_loop and how it uses substr_view
cat -n datafusion/functions/src/string/split_part.rs | sed -n '509,550p'

Repository: martin-augment/datafusion

Length of output: 1644


🏁 Script executed:

# Let's look at the entire file to understand structure better
wc -l datafusion/functions/src/string/split_part.rs

Repository: martin-augment/datafusion

Length of output: 118


🏁 Script executed:

# Check if there are any optimizations or checks for inline-only results elsewhere
rg -n "data_buffers\|buffer_index" datafusion/functions/src/string/split_part.rs -B 1 -A 1

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Find the make_view function definition
rg -n "fn make_view" datafusion/functions/src/string/ -A 15

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check ByteView struct and buffer_index behavior
rg -n "struct ByteView\|buffer_index" datafusion/functions/src/string/split_part.rs -B 2 -A 2

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Look at how other StringViewArray::new_unchecked calls handle buffers (from earlier search results)
cat -n datafusion/functions/src/string/common.rs | sed -n '1,50p' | grep -A 20 "StringViewArray::new_unchecked"

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Get more context around the StringViewArray::new_unchecked call in common.rs
rg -n "StringViewArray::new_unchecked" datafusion/functions/src/string/common.rs -B 10 -A 5

Repository: martin-augment/datafusion

Length of output: 649


🏁 Script executed:

# Check the unicode/substr.rs usage
rg -n "StringViewArray::new_unchecked" datafusion/functions/src/unicode/substr.rs -B 10 -A 5

Repository: martin-augment/datafusion

Length of output: 649


🏁 Script executed:

# Look for any buffer optimization or filtering in the codebase
rg -n "is_empty\|filter.*buffer\|retain" datafusion/functions/src/string/split_part.rs | head -20

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Search for make_view function more broadly
rg -n "fn make_view" datafusion --type rs -A 10 2>/dev/null | head -40

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Search in arrow crate or related files
find . -name "*.rs" -exec grep -l "fn make_view" {} \; 2>/dev/null | head -5

Repository: martin-augment/datafusion

Length of output: 115


🏁 Script executed:

# Let's look for ByteView definition and buffer_index meaning
rg -n "buffer_index.*0\|inline" datafusion/functions/src/string/split_part.rs | head -20

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check what buffer_index = 0 means in StringView/ByteView context
rg -n "buffer_index" datafusion -A 2 -B 2 --type rs 2>/dev/null | head -60

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's check if there's any optimization in Arrow's StringViewArray that handles unused buffers
find . -path "*/arrow*" -name "*.rs" -exec grep -l "StringViewArray\|data_buffers" {} \; 2>/dev/null | head -10

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check the comments and safety documentation around buffer_index
rg -n "buffer_index = 0\|inline.*buffer\|12 bytes" datafusion/functions/src/string/ -B 3 -A 3 | head -60

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Search for make_view definition or import at the top of split_part.rs
cat -n datafusion/functions/src/string/split_part.rs | head -100 | grep -i "make_view\|use\|import"

Repository: martin-augment/datafusion

Length of output: 791


🏁 Script executed:

# Look at the top of the file for imports and module structure
head -50 datafusion/functions/src/string/split_part.rs

Repository: martin-augment/datafusion

Length of output: 2279


🏁 Script executed:

# Check if make_view is defined inline within split_part.rs itself
rg "make_view" datafusion/functions/src/string/split_part.rs -n

Repository: martin-augment/datafusion

Length of output: 270


🏁 Script executed:

# Let's check arrow crate directly
find . -path "*arrow-array*" -name "*.rs" -exec grep -l "StringView" {} \; 2>/dev/null | head -5

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if this is using arrow_string crate
rg -n "arrow.string\|arrow_string" datafusion/functions/src/string/split_part.rs -B 2 -A 2

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's understand what make_view does by looking at arrow crate
# First find arrow crate location
find . -name "arrow" -type d | grep -E "^.*arrow$|/arrow$" | head -5

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Look for tests or examples using StringViewArray and inlined data
rg -n "inline\|buffer_index.*0\|data_buffers" datafusion/functions/src/string/common.rs -B 3 -A 3 | head -80

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if there's a way to construct StringViewArray with empty buffers
rg -n "StringViewArray.*new\|data_buffers" datafusion -A 2 -B 2 --type rs 2>/dev/null | grep -A 5 -B 5 "to_vec\|Vec::new" | head -100

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's look for the definition of StringViewArray and understand if it requires buffers even when unused
rg -n "pub.*fn data_buffers\|pub.*fn new_unchecked" datafusion --type rs 2>/dev/null | head -20

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Test case: check if there are tests for split_part that might show the expected behavior
rg -n "split_part" datafusion/functions/src/string/split_part.rs | grep -i "test\|#\[test\]" | head -10

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Find test module
tail -200 datafusion/functions/src/string/split_part.rs | head -100

Repository: martin-augment/datafusion

Length of output: 3420


🏁 Script executed:

# Let's look at the arrow crate to understand make_view signature and buffer_index semantics
find . -name "*.rs" -path "*/arrow*" -exec grep -l "pub.*fn make_view" {} \; 2>/dev/null | head -5

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if there's documentation about StringView inline behavior
rg -n "inline\|12 bytes\|small string" datafusion/functions/src/unicode/substr.rs -B 2 -A 2 | head -60

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Look at substr.rs to see how it handles the same pattern
cat -n datafusion/functions/src/unicode/substr.rs | sed -n '200,350p'

Repository: martin-augment/datafusion

Length of output: 6304


🏁 Script executed:

# Check if there's any existing code that filters empty buffers
rg -n "filter.*buffer\|empty.*buffer\|retain" datafusion/functions/src/string/ --type rs | head -20

Repository: martin-augment/datafusion

Length of output: 95


🏁 Script executed:

# Search for any memory optimization patterns in similar code
rg -n "data_buffers()" datafusion/functions/src/ -B 5 -A 5 --type rs | head -100

Repository: martin-augment/datafusion

Length of output: 95


🏁 Script executed:

# Let's check the make_and_append_view function in substr.rs to see the full pattern
rg -n "fn make_and_append_view" datafusion/functions/src/unicode/substr.rs -A 20

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's understand ByteView and buffer_index semantics better
rg -n "struct ByteView\|buffer_index" datafusion -A 3 --type rs 2>/dev/null | head -60

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if inline views (buffer_index=0) need the data_buffers
rg -n "buffer_index.*=.*0\|inline.*buffer" datafusion/functions/src/ -B 2 -A 2 --type rs | head -80

Repository: martin-augment/datafusion

Length of output: 95


🏁 Script executed:

# Let's look for any comments or documentation about view inlining
rg -n "12 bytes\|inline\|small string optimization" datafusion/functions/src/string/split_part.rs -B 2 -A 2

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if there's any test showing the concern - large input split into small parts
rg -n "long\|large.*small\|bytes.*split" datafusion/functions/benches/split_part.rs | head -20

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's examine the benchmark file to see if there are tests for the scenario mentioned
cat -n datafusion/functions/benches/split_part.rs | head -100

Repository: martin-augment/datafusion

Length of output: 4499


🏁 Script executed:

# Check if there's documentation or issues about StringView memory management
rg -n "StringView\|buffer.*retain\|memory.*issue" datafusion/functions/src/string/ --type rs -B 1 -A 1 | head -100

Repository: martin-augment/datafusion

Length of output: 95


🏁 Script executed:

# Look at the actual implementation to confirm the inline behavior (buffer_index = 0)
rg -n "make_view.*0.*0\|inline.*view" datafusion/functions/src/string/split_part.rs -B 2 -A 2

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Check if there are any TODOs or FIXMEs related to buffer management
rg -n "TODO\|FIXME\|XXX\|HACK" datafusion/functions/src/string/split_part.rs

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Verify the make_view function signature from arrow
python3 << 'EOF'
# Check if we can find documentation about make_view and buffer_index semantics
import subprocess
result = subprocess.run(['rg', '-n', 'pub.*fn make_view', 'datafusion', '--type', 'rs'], 
                       capture_output=True, text=True)
print("make_view definitions:")
print(result.stdout if result.stdout else "Not found in datafusion")
print(result.stderr if result.stderr else "")

# Also check in arrow if available
result2 = subprocess.run(['rg', '-n', 'fn make_view', '.'], 
                        capture_output=True, text=True)
print("\nAll make_view occurrences:")
lines = result2.stdout.split('\n')[:20]
for line in lines:
    print(line)
EOF

Repository: martin-augment/datafusion

Length of output: 248


🏁 Script executed:

# Let's look at the complete bench file to see if there are scenarios matching the concern
cat -n datafusion/functions/benches/split_part.rs | tail -100

Repository: martin-augment/datafusion

Length of output: 3905


🏁 Script executed:

# Let's check what happens in other places where similar patterns occur
cat -n datafusion/functions/src/string/common.rs | sed -n '200,250p'

Repository: martin-augment/datafusion

Length of output: 2178


🏁 Script executed:

# Let's trace through the logic more carefully - check split_view_loop to see all paths
cat -n datafusion/functions/src/string/split_part.rs | sed -n '509,536p'

Repository: martin-augment/datafusion

Length of output: 1120


🏁 Script executed:

# Critical check: does buffer_index = 0 mean inline in StringView?
python3 << 'EOF'
# Let's search for documentation or comments about buffer_index in views
import subprocess

# Search for buffer_index documentation/comments
result = subprocess.run(['rg', '-n', 'buffer_index', '.', '-B', '3', '-A', '3'], 
                       capture_output=True, text=True)
lines = result.stdout.split('\n')
# Filter to relevant lines only
relevant = [line for line in lines if '0' in line and ('inline' in line.lower() or 'view' in line.lower())]
print("Lines mentioning buffer_index with 0 and inline/view:")
for line in relevant[:30]:
    if line.strip():
        print(line)
EOF

Repository: martin-augment/datafusion

Length of output: 3920


🏁 Script executed:

# Let's verify the actual concern - check if there's a scenario where all results could be inline
cat -n datafusion/functions/src/string/split_part.rs | sed -n '454,490p'

Repository: martin-augment/datafusion

Length of output: 1826


🏁 Script executed:

# Check if there's any filtering or optimization of buffers anywhere in the code
rg -n "filter.*buffer\|drain.*buffer\|empty.*buffer\|Vec::new.*buffer" datafusion/functions/src/ -B 2 -A 2 | head -60

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Let's check if StringViewArray can be created with empty buffers or fewer buffers
rg -n "Vec::new()\|Vec::with_capacity" datafusion/functions/src/string/split_part.rs -B 1 -A 1

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Search for documentation on inline vs external buffer views
rg -n "buffer_index.*==.*0\|inline.*string\|small.*string" datafusion/functions/src/string/ -B 2 -A 2 | head -80

Repository: martin-augment/datafusion

Length of output: 51


🏁 Script executed:

# Final check: Are there any related issues or TODOs about buffer retention?
rg -n "TODO\|FIXME\|buffer.*retain\|memory.*issue\|optimization" datafusion/functions/src/string/ | grep -i "buffer\|memory\|inline"

Repository: martin-augment/datafusion

Length of output: 51


Don't retain source Utf8View buffers when the result is fully inline.

Line 487 always clones string_view_array.data_buffers(), even when substr_view in lines 496–506 has inlined every emitted part. On workloads with long Utf8View inputs split into short tokens (< 12 bytes), this retains the original large buffers for the output array's entire lifetime even though no rows actually reference them. Consider special-casing the all-inline scenario to avoid unnecessary buffer retention. This pattern also appears in substr.rs (lines 320–325) and common.rs (lines 236–240).

}
}

/// Creates a `StringView` referencing a substring of an existing view's buffer.
/// For substrings ≤ 12 bytes, creates an inline view instead.
#[inline]
fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 {
if substr.len() > 12 {
let view = ByteView::from(*original_view);
make_view(
substr.as_bytes(),
view.buffer_index,
view.offset + start_offset,
)
} else {
make_view(substr.as_bytes(), 0, 0)
}
}

/// Applies `split_fn` to each non-null string and appends the resulting view to
/// `views_buf`.
#[inline(always)]
fn split_view_loop<F>(
string_view_array: &StringViewArray,
views: &[u128],
views_buf: &mut Vec<u128>,
split_fn: F,
) where
F: Fn(&str) -> Option<&str>,
{
let empty_view = make_view(b"", 0, 0);
for (i, raw_view) in views.iter().enumerate() {
if string_view_array.is_null(i) {
views_buf.push(0);
continue;
}
let string = string_view_array.value(i);
match split_fn(string) {
Some(substr) => {
let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

split_view_loop computes start_offset via raw pointer subtraction and then truncates it with as u32; this relies on split_fn always returning a slice of string and the offset always fitting in u32, otherwise the derived view could become invalid (especially since the result is built with StringViewArray::new_unchecked). Consider adding a defensive assertion and/or documenting this invariant more explicitly to prevent accidental unsoundness if this helper is reused/modified later.

Severity: medium

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:valid-but-wont-fix; category:bug; feedback: The Augment AI reviewer is not correct! Arrow types use u32 for offsets, so there is no point to use bigger types for offset related functionality.

views_buf.push(substr_view(raw_view, substr, start_offset as u32));
}
None => views_buf.push(empty_view),
}
}
}

fn split_part_impl<'a, StringArrType, DelimiterArrType, B>(
string_array: &StringArrType,
delimiter_array: &DelimiterArrType,
Expand Down
65 changes: 65 additions & 0 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,71 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2));
----
Utf8View

# SPLIT_PART with Utf8View column (exercises the array fast path)
query T
SELECT split_part(column1_utf8view, 'ph', 1) FROM test;
----
Andrew
Xiangpeng
Ra
(empty)
NULL

query T
SELECT split_part(column1_utf8view, 'ph', 2) FROM test;
----
(empty)
(empty)
ael
(empty)
NULL

# Negative position
query T
SELECT split_part(column1_utf8view, 'ph', -1) FROM test;
----
Andrew
Xiangpeng
ael
(empty)
NULL

# Delimiter not found returns full string
query T
SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test;
----
Andrew
Xiangpeng
Raphael
(empty)
NULL

# Empty delimiter with column
query T
SELECT split_part(column1_utf8view, '', 1) FROM test;
----
Andrew
Xiangpeng
Raphael
(empty)
NULL

# Single-char delimiter with column
query T
SELECT split_part(column1_utf8view, 'a', 1) FROM test;
----
Andrew
Xi
R
(empty)
NULL

# Verify array path also returns Utf8View
query T
SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1;
----
Utf8View

## Ensure no casts for STRPOS
query TT
EXPLAIN SELECT
Expand Down
Loading