Skip to content

Commit 08dcc0b

Browse files
Jefffreyalamb
andauthored
fix: cast Binary/String dictionary to view (#8912)
# Which issue does this PR close? - Closes #8841 # Rationale for this change Be able to successfully cast from Dictionary type to View types. # What changes are included in this PR? Add checks on which array types can use the fast path that was previously erroring. Also do a little refactoring in surrounding code. # Are these changes tested? Added new tests. # Are there any user-facing changes? No. Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 6ff8cc4 commit 08dcc0b

File tree

2 files changed

+137
-105
lines changed

2 files changed

+137
-105
lines changed

arrow-cast/src/cast/dictionary.rs

Lines changed: 82 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -28,111 +28,92 @@ pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
2828
) -> Result<ArrayRef, ArrowError> {
2929
use DataType::*;
3030

31-
match to_type {
32-
Dictionary(to_index_type, to_value_type) => {
33-
let dict_array = array
34-
.as_any()
35-
.downcast_ref::<DictionaryArray<K>>()
36-
.ok_or_else(|| {
37-
ArrowError::ComputeError(
38-
"Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39-
)
40-
})?;
31+
let array = array.as_dictionary::<K>();
32+
let from_child_type = array.values().data_type();
33+
match (from_child_type, to_type) {
34+
(_, Dictionary(to_index_type, to_value_type)) => {
35+
dictionary_to_dictionary_cast(array, to_index_type, to_value_type, cast_options)
36+
}
37+
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data
38+
// copy of the value buffer. Fast path which avoids copying underlying values buffer.
39+
// TODO: handle LargeUtf8/LargeBinary -> View (need to check offsets can fit)
40+
// TODO: handle cross types (String -> BinaryView, Binary -> StringView)
41+
// (need to validate utf8?)
42+
(Utf8, Utf8View) => view_from_dict_values::<K, Utf8Type, StringViewType>(
43+
array.keys(),
44+
array.values().as_string::<i32>(),
45+
),
46+
(Binary, BinaryView) => view_from_dict_values::<K, BinaryType, BinaryViewType>(
47+
array.keys(),
48+
array.values().as_binary::<i32>(),
49+
),
50+
_ => unpack_dictionary(array, to_type, cast_options),
51+
}
52+
}
4153

42-
let keys_array: ArrayRef =
43-
Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44-
let values_array = dict_array.values();
45-
let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46-
let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
54+
fn dictionary_to_dictionary_cast<K: ArrowDictionaryKeyType>(
55+
array: &DictionaryArray<K>,
56+
to_index_type: &DataType,
57+
to_value_type: &DataType,
58+
cast_options: &CastOptions,
59+
) -> Result<ArrayRef, ArrowError> {
60+
use DataType::*;
4761

48-
// Failure to cast keys (because they don't fit in the
49-
// target type) results in NULL values;
50-
if cast_keys.null_count() > keys_array.null_count() {
51-
return Err(ArrowError::ComputeError(format!(
52-
"Could not convert {} dictionary indexes from {:?} to {:?}",
53-
cast_keys.null_count() - keys_array.null_count(),
54-
keys_array.data_type(),
55-
to_index_type
56-
)));
57-
}
62+
let keys_array: ArrayRef = Arc::new(PrimitiveArray::<K>::from(array.keys().to_data()));
63+
let values_array = array.values();
64+
let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
65+
let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
5866

59-
let data = cast_keys.into_data();
60-
let builder = data
61-
.into_builder()
62-
.data_type(to_type.clone())
63-
.child_data(vec![cast_values.into_data()]);
67+
// Failure to cast keys (because they don't fit in the
68+
// target type) results in NULL values;
69+
if cast_keys.null_count() > keys_array.null_count() {
70+
return Err(ArrowError::ComputeError(format!(
71+
"Could not convert {} dictionary indexes from {:?} to {:?}",
72+
cast_keys.null_count() - keys_array.null_count(),
73+
keys_array.data_type(),
74+
to_index_type
75+
)));
76+
}
6477

65-
// Safety
66-
// Cast keys are still valid
67-
let data = unsafe { builder.build_unchecked() };
78+
let data = cast_keys.into_data();
79+
let builder = data
80+
.into_builder()
81+
.data_type(Dictionary(
82+
Box::new(to_index_type.clone()),
83+
Box::new(to_value_type.clone()),
84+
))
85+
.child_data(vec![cast_values.into_data()]);
6886

69-
// create the appropriate array type
70-
let new_array: ArrayRef = match **to_index_type {
71-
Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72-
Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73-
Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74-
Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75-
UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76-
UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77-
UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78-
UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79-
_ => {
80-
return Err(ArrowError::CastError(format!(
81-
"Unsupported type {to_index_type} for dictionary index"
82-
)));
83-
}
84-
};
87+
// Safety
88+
// Cast keys are still valid
89+
let data = unsafe { builder.build_unchecked() };
8590

86-
Ok(new_array)
91+
// create the appropriate array type
92+
let new_array: ArrayRef = match to_index_type {
93+
Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
94+
Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
95+
Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
96+
Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
97+
UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
98+
UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
99+
UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
100+
UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
101+
_ => {
102+
return Err(ArrowError::CastError(format!(
103+
"Unsupported type {to_index_type} for dictionary index"
104+
)));
87105
}
88-
Utf8View => {
89-
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90-
// we handle it here to avoid the copy.
91-
let dict_array = array
92-
.as_dictionary::<K>()
93-
.downcast_dict::<StringArray>()
94-
.ok_or_else(|| {
95-
ArrowError::ComputeError(
96-
"Internal Error: Cannot cast Utf8View to StringArray of expected type"
97-
.to_string(),
98-
)
99-
})?;
106+
};
100107

101-
let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102-
dict_array.values(),
103-
dict_array.keys(),
104-
)?;
105-
Ok(Arc::new(string_view))
106-
}
107-
BinaryView => {
108-
// `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109-
// we handle it here to avoid the copy.
110-
let dict_array = array
111-
.as_dictionary::<K>()
112-
.downcast_dict::<BinaryArray>()
113-
.ok_or_else(|| {
114-
ArrowError::ComputeError(
115-
"Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116-
.to_string(),
117-
)
118-
})?;
119-
120-
let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121-
dict_array.values(),
122-
dict_array.keys(),
123-
)?;
124-
Ok(Arc::new(binary_view))
125-
}
126-
_ => unpack_dictionary::<K>(array, to_type, cast_options),
127-
}
108+
Ok(new_array)
128109
}
129110

130-
fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131-
array: &GenericByteArray<V>,
111+
fn view_from_dict_values<K: ArrowDictionaryKeyType, V: ByteArrayType, T: ByteViewType>(
132112
keys: &PrimitiveArray<K>,
133-
) -> Result<GenericByteViewArray<T>, ArrowError> {
134-
let value_buffer = array.values();
135-
let value_offsets = array.value_offsets();
113+
values: &GenericByteArray<V>,
114+
) -> Result<ArrayRef, ArrowError> {
115+
let value_buffer = values.values();
116+
let value_offsets = values.value_offsets();
136117
let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137118
builder.append_block(value_buffer.clone());
138119
for i in keys.iter() {
@@ -157,21 +138,17 @@ fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArra
157138
}
158139
}
159140
}
160-
Ok(builder.finish())
141+
Ok(Arc::new(builder.finish()))
161142
}
162143

163-
// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164-
pub(crate) fn unpack_dictionary<K>(
165-
array: &dyn Array,
144+
// Unpack a dictionary into a flattened array of type to_type
145+
pub(crate) fn unpack_dictionary<K: ArrowDictionaryKeyType>(
146+
array: &DictionaryArray<K>,
166147
to_type: &DataType,
167148
cast_options: &CastOptions,
168-
) -> Result<ArrayRef, ArrowError>
169-
where
170-
K: ArrowDictionaryKeyType,
171-
{
172-
let dict_array = array.as_dictionary::<K>();
173-
let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174-
take(cast_dict_values.as_ref(), dict_array.keys(), None)
149+
) -> Result<ArrayRef, ArrowError> {
150+
let cast_dict_values = cast_with_options(array.values(), to_type, cast_options)?;
151+
take(cast_dict_values.as_ref(), array.keys(), None)
175152
}
176153

177154
/// Pack a data type into a dictionary array passing the values through a primitive array

arrow-cast/src/cast/mod.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12168,4 +12168,59 @@ mod tests {
1216812168
// 03:12:44 = 3*3600 + 12*60 + 44 = 10800 + 720 + 44 = 11564
1216912169
assert_eq!(cast_array.value(0), 11564);
1217012170
}
12171+
#[test]
12172+
fn test_string_dicts_to_binary_view() {
12173+
let expected = BinaryViewArray::from_iter(vec![
12174+
VIEW_TEST_DATA[1],
12175+
VIEW_TEST_DATA[0],
12176+
None,
12177+
VIEW_TEST_DATA[3],
12178+
None,
12179+
VIEW_TEST_DATA[1],
12180+
VIEW_TEST_DATA[4],
12181+
]);
12182+
12183+
let values_arrays: [ArrayRef; _] = [
12184+
Arc::new(StringArray::from_iter(VIEW_TEST_DATA)),
12185+
Arc::new(StringViewArray::from_iter(VIEW_TEST_DATA)),
12186+
Arc::new(LargeStringArray::from_iter(VIEW_TEST_DATA)),
12187+
];
12188+
for values in values_arrays {
12189+
let keys =
12190+
Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
12191+
let string_dict_array =
12192+
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
12193+
12194+
let casted = cast(&string_dict_array, &DataType::BinaryView).unwrap();
12195+
assert_eq!(casted.as_ref(), &expected);
12196+
}
12197+
}
12198+
12199+
#[test]
12200+
fn test_binary_dicts_to_string_view() {
12201+
let expected = StringViewArray::from_iter(vec![
12202+
VIEW_TEST_DATA[1],
12203+
VIEW_TEST_DATA[0],
12204+
None,
12205+
VIEW_TEST_DATA[3],
12206+
None,
12207+
VIEW_TEST_DATA[1],
12208+
VIEW_TEST_DATA[4],
12209+
]);
12210+
12211+
let values_arrays: [ArrayRef; _] = [
12212+
Arc::new(BinaryArray::from_iter(VIEW_TEST_DATA)),
12213+
Arc::new(BinaryViewArray::from_iter(VIEW_TEST_DATA)),
12214+
Arc::new(LargeBinaryArray::from_iter(VIEW_TEST_DATA)),
12215+
];
12216+
for values in values_arrays {
12217+
let keys =
12218+
Int8Array::from_iter([Some(1), Some(0), None, Some(3), None, Some(1), Some(4)]);
12219+
let string_dict_array =
12220+
DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
12221+
12222+
let casted = cast(&string_dict_array, &DataType::Utf8View).unwrap();
12223+
assert_eq!(casted.as_ref(), &expected);
12224+
}
12225+
}
1217112226
}

0 commit comments

Comments
 (0)