martin-augment · martin-augment · Apr 6, 2026 · Apr 6, 2026 · Apr 7, 2026 · gemini-code-assist
diff --git a/.cursor/rules.md b/.cursor/rules.md
@@ -0,0 +1,5 @@
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
diff --git a/AGENTS.md b/AGENTS.md
@@ -39,3 +39,13 @@ When creating a PR, you MUST follow the [PR template](.github/pull_request_templ
 
 See the [Testing Quick Start](docs/source/contributor-guide/testing.md#testing-quick-start)
 for the recommended pre-PR test commands.
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
-
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
-
-The AI agents should never link to any issue or a pull request
-in any GitHub repository in the code reviews!
-
-The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+The AI agents should never link to any issue or a pull request
+in any GitHub repository in the code reviews!
+
+The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md!
+
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
@@ -59,7 +59,7 @@ regex_expressions = ["regex"]
 # enable string functions
 string_expressions = ["uuid"]
 # enable unicode functions
-unicode_expressions = ["unicode-segmentation"]
+unicode_expressions = []
 
 [lib]
 name = "datafusion_functions"
@@ -87,7 +87,6 @@ num-traits = { workspace = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
 sha2 = { workspace = true, optional = true }
-unicode-segmentation = { version = "^1.13.2", optional = true }
 uuid = { workspace = true, features = ["v4"], optional = true }
 
 [dev-dependencies]

diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
@@ -78,6 +78,16 @@ impl LeftRightSlicer for RightSlicer {
     }
 }
 
+/// Returns the byte offset of the `n`th codepoint in `string`, or
+/// `string.len()` if the string has fewer than `n` codepoints.
+#[inline]
+pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
+    string
+        .char_indices()
+        .nth(n)
+        .map_or(string.len(), |(i, _)| i)
+}
+
 /// Calculate the byte length of the substring of `n` chars from string `string`
 #[inline]
 fn left_right_byte_length(string: &str, n: i64) -> usize {
@@ -88,11 +98,9 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
             .map(|(index, _)| index)
             .unwrap_or(0),
         Ordering::Equal => 0,
-        Ordering::Greater => string
-            .char_indices()
-            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
-            .map(|(index, _)| index)
-            .unwrap_or(string.len()),
+        Ordering::Greater => {
+            byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
+        }
     }
 }
 

diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
@@ -24,7 +24,6 @@ use arrow::array::{
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,7 @@ impl ScalarUDFImpl for LPadFunc {
     }
 }
 
-use super::common::{try_as_scalar_i64, try_as_scalar_str};
+use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
 
 /// Optimized lpad for constant target_len and fill arguments.
 fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -270,22 +269,19 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     let data_capacity = string_array.len().saturating_mul(target_len * 4);
-    let data_capacity = string_array.len().saturating_mul(target_len * 4);
+    let data_capacity = string_array.len().saturating_mul(target_len.saturating_mul(4));
-    let data_capacity = string_array.len().saturating_mul(target_len * 4);
+    let data_capacity = string_array.len().saturating_mul(target_len.saturating_mul(4));
     let mut builder =
         GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
-    let mut graphemes_buf = Vec::new();
 
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                graphemes_buf.clear();
-                graphemes_buf.extend(string.graphemes(true));
+                let char_count = string.chars().count();
 
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
+                if target_len < char_count {
+                    builder
+                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
-                let char_count = string.chars().count();
-
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
+                let mut char_count = 0;
+                let mut truncate_offset = None;
+                for (i, (byte_idx, _)) in string.char_indices().enumerate() {
+                    if i == target_len {
+                        truncate_offset = Some(byte_idx);
+                    }
+                    char_count += 1;
+                }
+
+                if let Some(offset) = truncate_offset {
+                    builder.append_value(&string[..offset]);
-                let char_count = string.chars().count();
-
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
+                let mut char_count = 0;
+                let mut truncate_offset = None;
+                for (i, (byte_idx, _)) in string.char_indices().enumerate() {
+                    if i == target_len {
+                        truncate_offset = Some(byte_idx);
+                    }
+                    char_count += 1;
+                }
+
+                if let Some(offset) = truncate_offset {
+                    builder.append_value(&string[..offset]);
                 } else if fill_chars.is_empty() {
                     builder.append_value(string);
                 } else {
-                    let pad_chars = target_len - graphemes_buf.len();
+                    let pad_chars = target_len - char_count;
                     let pad_bytes = char_byte_offsets[pad_chars];
                     builder.write_str(&padding_buf[..pad_bytes])?;
                     builder.append_value(string);
@@ -378,7 +374,6 @@ where
 {
     let array = if let Some(fill_array) = fill_array {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
         let mut fill_chars_buf = Vec::new();
 
         for ((string, target_len), fill) in string_array
@@ -407,8 +402,7 @@ where
                 }
 
                 if string.is_ascii() && fill.is_ascii() {
-                    // ASCII fast path: byte length == character length,
-                    // so we skip expensive grapheme segmentation.
+                    // ASCII fast path: byte length == character length.
                     let str_len = string.len();
                     if target_len < str_len {
                         builder.append_value(&string[..target_len]);
@@ -428,21 +422,19 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    // Reuse buffers by clearing and refilling
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
-                        for l in 0..target_len - graphemes_buf.len() {
+                        for l in 0..target_len - char_count {
                             let c =
                                 *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
                             builder.write_char(c)?;
@@ -458,7 +450,6 @@ where
         builder.finish()
     } else {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
 
         for (string, target_len) in string_array.iter().zip(length_array.iter()) {
             if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -491,16 +482,14 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    // Reuse buffer by clearing and refilling
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
-
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    let char_count = string.chars().count();
+
+                    if target_len < char_count {
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else {
-                        for _ in 0..(target_len - graphemes_buf.len()) {
+                        for _ in 0..(target_len - char_count) {
                             builder.write_str(" ")?;
                         }
                         builder.append_value(string);

diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
@@ -24,7 +24,6 @@ use arrow::array::{
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,7 @@ impl ScalarUDFImpl for RPadFunc {
     }
 }
 
-use super::common::{try_as_scalar_i64, try_as_scalar_str};
+use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
 
 /// Optimized rpad for constant target_len and fill arguments.
 fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -271,22 +270,19 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     let data_capacity = string_array.len().saturating_mul(target_len * 4);
     let mut builder =
         GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
-    let mut graphemes_buf = Vec::new();
 
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                graphemes_buf.clear();
-                graphemes_buf.extend(string.graphemes(true));
+                let char_count = string.chars().count();
 
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
+                if target_len < char_count {
+                    builder
+                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
                 } else if fill_chars.is_empty() {
-
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
-                } else if fill_chars.is_empty() {
+                let mut char_count = 0;
+                let mut truncate_offset = None;
+                for (i, (byte_idx, _)) in string.char_indices().enumerate() {
+                    if i == target_len {
+                        truncate_offset = Some(byte_idx);
+                    }
+                    char_count += 1;
+                }
+
+                if let Some(offset) = truncate_offset {
+                    builder.append_value(&string[..offset]);
-
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
-                } else if fill_chars.is_empty() {
+                let mut char_count = 0;
+                let mut truncate_offset = None;
+                for (i, (byte_idx, _)) in string.char_indices().enumerate() {
+                    if i == target_len {
+                        truncate_offset = Some(byte_idx);
+                    }
+                    char_count += 1;
+                }
+
+                if let Some(offset) = truncate_offset {
+                    builder.append_value(&string[..offset]);
                     builder.append_value(string);
                 } else {
-                    let pad_chars = target_len - graphemes_buf.len();
+                    let pad_chars = target_len - char_count;
                     let pad_bytes = char_byte_offsets[pad_chars];
                     builder.write_str(string)?;
                     builder.write_str(&padding_buf[..pad_bytes])?;
@@ -377,7 +373,6 @@ where
 {
     let array = if let Some(fill_array) = fill_array {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
         let mut fill_chars_buf = Vec::new();
 
         for ((string, target_len), fill) in string_array
@@ -406,8 +401,7 @@ where
                 }
 
                 if string.is_ascii() && fill.is_ascii() {
-                    // ASCII fast path: byte length == character length,
-                    // so we skip expensive grapheme segmentation.
+                    // ASCII fast path: byte length == character length.
                     let str_len = string.len();
                     if target_len < str_len {
                         builder.append_value(&string[..target_len]);
@@ -428,21 +422,20 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
                         builder.write_str(string)?;
-                        for l in 0..target_len - graphemes_buf.len() {
+                        for l in 0..target_len - char_count {
                             let c =
                                 *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
                             builder.write_char(c)?;
@@ -458,7 +451,6 @@ where
         builder.finish()
     } else {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
 
         for (string, target_len) in string_array.iter().zip(length_array.iter()) {
             if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -492,16 +484,15 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else {
                         builder.write_str(string)?;
-                        for _ in 0..(target_len - graphemes_buf.len()) {
+                        for _ in 0..(target_len - char_count) {
                             builder.write_str(" ")?;
                         }
                         builder.append_value("");