Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 34 additions & 14 deletions datafusion/functions/benches/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,19 @@ use arrow::array::OffsetSizeTrait;
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::create_string_array_with_len;
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
use datafusion_common::DataFusionError;
use datafusion_common::config::ConfigOptions;
use datafusion_common::{DataFusionError, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::unicode;
use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;

fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
fn create_args_array_from_to<O: OffsetSizeTrait>(
size: usize,
str_len: usize,
) -> Vec<ColumnarValue> {
let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
// Create simple from/to strings for translation
let from_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 3));
let to_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 2));

Expand All @@ -42,6 +44,19 @@ fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarV
]
}

/// Builds `translate()` arguments where the input column is an array but the
/// `from`/`to` mappings are constant scalars ("aeiou" -> "AEIOU"), exercising
/// the scalar fast path.
fn create_args_scalar_from_to<O: OffsetSizeTrait>(
    size: usize,
    str_len: usize,
) -> Vec<ColumnarValue> {
    let input = create_string_array_with_len::<O>(size, 0.1, str_len);

    vec![
        ColumnarValue::Array(Arc::new(input)),
        ColumnarValue::Scalar(ScalarValue::from("aeiou")),
        ColumnarValue::Scalar(ScalarValue::from("AEIOU")),
    ]
}

fn invoke_translate_with_args(
args: Vec<ColumnarValue>,
number_rows: usize,
Expand Down Expand Up @@ -69,17 +84,22 @@ fn criterion_benchmark(c: &mut Criterion) {
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));

for str_len in [8, 32] {
let args = create_args::<i32>(size, str_len);
group.bench_function(
format!("translate_string [size={size}, str_len={str_len}]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(invoke_translate_with_args(args_cloned, size))
})
},
);
for str_len in [8, 32, 128, 1024] {
let args = create_args_array_from_to::<i32>(size, str_len);
group.bench_function(format!("array_from_to [str_len={str_len}]"), |b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(invoke_translate_with_args(args_cloned, size))
})
});

let args = create_args_scalar_from_to::<i32>(size, str_len);
group.bench_function(format!("scalar_from_to [str_len={str_len}]"), |b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(invoke_translate_with_args(args_cloned, size))
})
});
}

group.finish();
Expand Down
188 changes: 182 additions & 6 deletions datafusion/functions/src/unicode/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ use datafusion_macros::user_doc;

#[user_doc(
doc_section(label = "String Functions"),
description = "Translates characters in a string to specified translation characters.",
syntax_example = "translate(str, chars, translation)",
description = "Performs character-wise substitution based on a mapping.",
syntax_example = "translate(str, from, to)",
sql_example = r#"```sql
> select translate('twice', 'wic', 'her');
+--------------------------------------------------+
Expand All @@ -46,10 +46,10 @@ use datafusion_macros::user_doc;
+--------------------------------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
argument(name = "chars", description = "Characters to translate."),
argument(name = "from", description = "The characters to be replaced."),
argument(
name = "translation",
description = "Translation characters. Translation characters replace only characters at the same position in the **chars** string."
name = "to",
description = "The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed."
)
)]
#[derive(Debug, PartialEq, Eq, Hash)]
Expand All @@ -71,6 +71,7 @@ impl TranslateFunc {
vec![
Exact(vec![Utf8View, Utf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8]),
Exact(vec![LargeUtf8, Utf8, Utf8]),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new signature arm Exact(vec![LargeUtf8, Utf8, Utf8]) looks inconsistent with invoke_translate’s DataType::LargeUtf8 branch, which downcasts from/to via as_string::<i64>(). If from/to are arrays (as allowed by the signature), this will likely panic due to offset-size mismatch (Utf8/i32 vs LargeUtf8/i64).

Severity: high

Other Locations
  • datafusion/functions/src/unicode/translate.rs:194
  • datafusion/functions/src/unicode/translate.rs:195

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Augment AI reviewer is correct! There is no need to use LargeUtf8 for the from and to arguments, since there is no alphabet that long (u64::MAX graphemes). u32::MAX is more than enough for any alphabet. There is a bug in the `as_string::<i64>()` calls though! They should use `as_string::<i32>()` since the from/to arguments are Utf8 type.

],
Volatility::Immutable,
),
Expand Down Expand Up @@ -99,6 +100,65 @@ impl ScalarUDFImpl for TranslateFunc {
&self,
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
// When from and to are scalars, pre-build the translation map once
if let (Some(from_str), Some(to_str)) = (
try_as_scalar_str(&args.args[1]),
try_as_scalar_str(&args.args[2]),
) {
let from_graphemes: Vec<&str> = from_str.graphemes(true).collect();
let to_graphemes: Vec<&str> = to_str.graphemes(true).collect();

let mut from_map: HashMap<&str, usize> = HashMap::new();
for (index, c) in from_graphemes.iter().enumerate() {
// Ignore characters that already exist in from_map
from_map.entry(*c).or_insert(index);
}

let ascii_table = build_ascii_translate_table(from_str, to_str);

let string_array = match &args.args[0] {
ColumnarValue::Array(arr) => Arc::clone(arr),
ColumnarValue::Scalar(s) => s.to_array_of_size(args.number_rows)?,
};

let result = match string_array.data_type() {
DataType::Utf8View => {
let arr = string_array.as_string_view();
translate_with_map::<i32, _>(
arr,
&from_map,
&to_graphemes,
ascii_table.as_ref(),
)
}
DataType::Utf8 => {
let arr = string_array.as_string::<i32>();
translate_with_map::<i32, _>(
arr,
&from_map,
&to_graphemes,
ascii_table.as_ref(),
)
}
DataType::LargeUtf8 => {
let arr = string_array.as_string::<i64>();
translate_with_map::<i64, _>(
arr,
&from_map,
&to_graphemes,
ascii_table.as_ref(),
)
}
other => {
return exec_err!(
"Unsupported data type {other:?} for function translate"
);
}
}?;

return Ok(ColumnarValue::Array(result));
}

make_scalar_function(invoke_translate, vec![])(&args.args)
}

Expand All @@ -107,6 +167,14 @@ impl ScalarUDFImpl for TranslateFunc {
}
}

/// If `cv` is a non-null scalar string, return its value.
fn try_as_scalar_str(cv: &ColumnarValue) -> Option<&str> {
    if let ColumnarValue::Scalar(scalar) = cv {
        scalar.try_as_str().flatten()
    } else {
        None
    }
}

fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8View => {
Expand Down Expand Up @@ -170,7 +238,7 @@ where
// Build from_map using reusable buffer
from_graphemes.extend(from.graphemes(true));
for (index, c) in from_graphemes.iter().enumerate() {
// Ignore characters that already exist in from_map, else insert
// Ignore characters that already exist in from_map
from_map.entry(*c).or_insert(index);
}

Expand Down Expand Up @@ -199,6 +267,99 @@ where
Ok(Arc::new(result) as ArrayRef)
}

/// Sentinel value in the ASCII translate table indicating the character should
/// be deleted (the `from` character has no corresponding `to` character). Any
/// value > 127 works since valid ASCII is 0–127.
const ASCII_DELETE: u8 = 0xFF;

/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
/// translation. Each entry maps an input byte to its replacement byte, or to
/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
/// either string contains non-ASCII characters.
fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
    if !(from.is_ascii() && to.is_ascii()) {
        return None;
    }

    // Start from the identity mapping; untouched bytes translate to themselves.
    let mut table: [u8; 128] = std::array::from_fn(|i| i as u8);
    // Tracks which `from` bytes were already mapped: the FIRST occurrence of a
    // duplicated `from` character wins, matching the grapheme-map behavior.
    let mut seen = [false; 128];

    for (pos, src) in from.bytes().enumerate() {
        let slot = src as usize;
        if seen[slot] {
            continue;
        }
        seen[slot] = true;
        // A `from` byte with no counterpart in `to` marks the char for removal.
        table[slot] = match to.as_bytes().get(pos) {
            Some(&replacement) => replacement,
            None => ASCII_DELETE,
        };
    }

    Some(table)
}

/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
/// translation map instead of rebuilding it for every row. When an ASCII byte
/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
/// inputs fallback to using the map.
Comment on lines +303 to +306
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Typo in doc comment.

Line 305: "ASCII input rows the lookup table" → "ASCII input rows use the lookup table".

📝 Fix
-/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
+/// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
/// translation map instead of rebuilding it for every row. When an ASCII byte
/// lookup table is provided, ASCII input rows the lookup table; non-ASCII
/// inputs fallback to using the map.
/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
/// translation map instead of rebuilding it for every row. When an ASCII byte
/// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
/// inputs fallback to using the map.
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@datafusion/functions/src/unicode/translate.rs` around lines 303 - 306, Fix
the typo in the doc comment above the optimized translate implementation in
translate.rs: change the phrase "ASCII input rows the lookup table" to "ASCII
input rows use the lookup table" in the block describing the ASCII byte lookup
table behavior for constant `from`/`to` optimized translate.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:good-to-have; category:typo; feedback: The CodeRabbit AI reviewer is correct! The is a typo in the documentation that should be fixed.

fn translate_with_map<'a, T: OffsetSizeTrait, V>(
string_array: V,
from_map: &HashMap<&str, usize>,
to_graphemes: &[&str],
ascii_table: Option<&[u8; 128]>,
) -> Result<ArrayRef>
where
V: ArrayAccessor<Item = &'a str>,
{
let mut string_graphemes: Vec<&str> = Vec::new();
let mut result_graphemes: Vec<&str> = Vec::new();
let mut ascii_buf: Vec<u8> = Vec::new();

let result = ArrayIter::new(string_array)
.map(|string| {
string.map(|s| {
// Fast path: byte-level table lookup for ASCII strings
if let Some(table) = ascii_table
&& s.is_ascii()
{
ascii_buf.clear();
for &b in s.as_bytes() {
let mapped = table[b as usize];
if mapped != ASCII_DELETE {
ascii_buf.push(mapped);
}
}
// ascii_buf contains only ASCII bytes, so it is valid
// UTF-8.
return String::from_utf8(ascii_buf.clone()).unwrap();
}

// Slow path: grapheme-based translation
string_graphemes.clear();
result_graphemes.clear();

string_graphemes.extend(s.graphemes(true));
for c in &string_graphemes {
match from_map.get(*c) {
Some(n) => {
if let Some(replacement) = to_graphemes.get(*n) {
result_graphemes.push(*replacement);
}
}
None => result_graphemes.push(*c),
}
}
Comment on lines +340 to +353
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The string_graphemes vector seems unnecessary. You can iterate directly over the graphemes of the input string s to build result_graphemes. This avoids populating an intermediate vector for each row, which should be more efficient.

You would also need to remove the declaration of string_graphemes at line 316.

                result_graphemes.clear();

                for c in s.graphemes(true) {
                    match from_map.get(c) {
                        Some(n) => {
                            if let Some(replacement) = to_graphemes.get(*n) {
                                result_graphemes.push(*replacement);
                            }
                        }
                        None => result_graphemes.push(c),
                    }
                }

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Gemini AI reviewer is correct! There is no need to store the graphemes in a local variable that makes an unnecessary allocation. Just iterating over the graphemes is enough for the logic needs. Prevents a heap allocation that is not really necessary.


result_graphemes.concat()
})
})
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}

#[cfg(test)]
mod tests {
use arrow::array::{Array, StringArray};
Expand Down Expand Up @@ -284,6 +445,21 @@ mod tests {
Utf8,
StringArray
);
// Non-ASCII input with ASCII scalar from/to: exercises the
// grapheme fallback within translate_with_map.
test_function!(
TranslateFunc::new(),
vec![
ColumnarValue::Scalar(ScalarValue::from("café")),
ColumnarValue::Scalar(ScalarValue::from("ae")),
ColumnarValue::Scalar(ScalarValue::from("AE"))
],
Ok(Some("cAfé")),
&str,
Utf8,
StringArray
);

#[cfg(not(feature = "unicode_expressions"))]
test_function!(
TranslateFunc::new(),
Expand Down
8 changes: 4 additions & 4 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2068,17 +2068,17 @@ to_hex(int)

### `translate`

Translates characters in a string to specified translation characters.
Performs character-wise substitution based on a mapping.

```sql
translate(str, chars, translation)
translate(str, from, to)
```

#### Arguments

- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
- **chars**: Characters to translate.
- **translation**: Translation characters. Translation characters replace only characters at the same position in the **chars** string.
- **from**: The characters to be replaced.
- **to**: The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed.

#### Example

Expand Down