Skip to content

Commit 3b097a5

Browse files
authored
arrow-ipc: Add ListView support (#9006)
# Which issue does this PR close? - Closes #9002 # Rationale for this change See the issue. # What changes are included in this PR? Add tests and support for ListView/LargeListView support for arrow-ipc. # Are these changes tested? Yes, various test cases added. # Are there any user-facing changes? No breaking changes, strictly new support for types. @alamb @vegarsti
1 parent 15b18c1 commit 3b097a5

File tree

3 files changed

+257
-2
lines changed

3 files changed

+257
-2
lines changed

arrow-ipc/src/convert.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,20 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat
425425
}
426426
DataType::LargeList(Arc::new(children.get(0).into()))
427427
}
428+
crate::Type::ListView => {
429+
let children = field.children().unwrap();
430+
if children.len() != 1 {
431+
panic!("expect a listview to have one child")
432+
}
433+
DataType::ListView(Arc::new(children.get(0).into()))
434+
}
435+
crate::Type::LargeListView => {
436+
let children = field.children().unwrap();
437+
if children.len() != 1 {
438+
panic!("expect a large listview to have one child")
439+
}
440+
DataType::LargeListView(Arc::new(children.get(0).into()))
441+
}
428442
crate::Type::FixedSizeList => {
429443
let children = field.children().unwrap();
430444
if children.len() != 1 {
@@ -769,7 +783,24 @@ pub(crate) fn get_fb_field_type<'a>(
769783
children: Some(fbb.create_vector(&[child])),
770784
}
771785
}
772-
ListView(_) | LargeListView(_) => unimplemented!("ListView/LargeListView not implemented"),
786+
ListView(list_type) => {
787+
let child = build_field(fbb, dictionary_tracker, list_type);
788+
FBFieldType {
789+
type_type: crate::Type::ListView,
790+
type_: crate::ListViewBuilder::new(fbb).finish().as_union_value(),
791+
children: Some(fbb.create_vector(&[child])),
792+
}
793+
}
794+
LargeListView(list_type) => {
795+
let child = build_field(fbb, dictionary_tracker, list_type);
796+
FBFieldType {
797+
type_type: crate::Type::LargeListView,
798+
type_: crate::LargeListViewBuilder::new(fbb)
799+
.finish()
800+
.as_union_value(),
801+
children: Some(fbb.create_vector(&[child])),
802+
}
803+
}
773804
LargeList(list_type) => {
774805
let child = build_field(fbb, dictionary_tracker, list_type);
775806
FBFieldType {

arrow-ipc/src/reader.rs

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,16 @@ impl RecordBatchDecoder<'_> {
122122
let values = self.create_array(list_field, variadic_counts)?;
123123
self.create_list_array(list_node, data_type, &list_buffers, values)
124124
}
125+
ListView(list_field) | LargeListView(list_field) => {
126+
let list_node = self.next_node(field)?;
127+
let list_buffers = [
128+
self.next_buffer()?, // null buffer
129+
self.next_buffer()?, // offsets
130+
self.next_buffer()?, // sizes
131+
];
132+
let values = self.create_array(list_field, variadic_counts)?;
133+
self.create_list_view_array(list_node, data_type, &list_buffers, values)
134+
}
125135
FixedSizeList(list_field, _) => {
126136
let list_node = self.next_node(field)?;
127137
let list_buffers = [self.next_buffer()?];
@@ -322,6 +332,30 @@ impl RecordBatchDecoder<'_> {
322332
self.create_array_from_builder(builder)
323333
}
324334

335+
fn create_list_view_array(
336+
&self,
337+
field_node: &FieldNode,
338+
data_type: &DataType,
339+
buffers: &[Buffer],
340+
child_array: ArrayRef,
341+
) -> Result<ArrayRef, ArrowError> {
342+
assert!(matches!(data_type, ListView(_) | LargeListView(_)));
343+
344+
let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone());
345+
let length = field_node.length() as usize;
346+
let child_data = child_array.into_data();
347+
348+
self.create_array_from_builder(
349+
ArrayData::builder(data_type.clone())
350+
.len(length)
351+
.add_buffer(buffers[1].clone()) // offsets
352+
.add_buffer(buffers[2].clone()) // sizes
353+
.add_child_data(child_data)
354+
.null_bit_buffer(null_buffer)
355+
.null_count(field_node.null_count() as usize),
356+
)
357+
}
358+
325359
fn create_struct_array(
326360
&self,
327361
struct_node: &FieldNode,

arrow-ipc/src/writer.rs

Lines changed: 191 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,36 @@ fn get_list_array_buffers<O: OffsetSizeTrait>(data: &ArrayData) -> (Buffer, Arra
17051705
(offsets, child_data)
17061706
}
17071707

1708+
/// Returns the offsets, sizes, and child data buffers for a ListView array.
1709+
///
1710+
/// Unlike List arrays, ListView arrays store both offsets and sizes explicitly,
1711+
/// and offsets can be non-monotonic. When slicing, we simply pass through the
1712+
/// offsets and sizes without re-encoding, and do not slice the child data.
1713+
fn get_list_view_array_buffers<O: OffsetSizeTrait>(
1714+
data: &ArrayData,
1715+
) -> (Buffer, Buffer, ArrayData) {
1716+
if data.is_empty() {
1717+
return (
1718+
MutableBuffer::new(0).into(),
1719+
MutableBuffer::new(0).into(),
1720+
data.child_data()[0].slice(0, 0),
1721+
);
1722+
}
1723+
1724+
let offsets = &data.buffers()[0];
1725+
let sizes = &data.buffers()[1];
1726+
1727+
let element_size = std::mem::size_of::<O>();
1728+
let offsets_slice =
1729+
offsets.slice_with_length(data.offset() * element_size, data.len() * element_size);
1730+
let sizes_slice =
1731+
sizes.slice_with_length(data.offset() * element_size, data.len() * element_size);
1732+
1733+
let child_data = data.child_data()[0].clone();
1734+
1735+
(offsets_slice, sizes_slice, child_data)
1736+
}
1737+
17081738
/// Returns the sliced views [`Buffer`] for a BinaryView/Utf8View array.
17091739
///
17101740
/// The views buffer is sliced to only include views in the valid range based on
@@ -1901,6 +1931,52 @@ fn write_array_data(
19011931
write_options,
19021932
)?;
19031933
return Ok(offset);
1934+
} else if matches!(
1935+
data_type,
1936+
DataType::ListView(_) | DataType::LargeListView(_)
1937+
) {
1938+
assert_eq!(array_data.buffers().len(), 2); // offsets + sizes
1939+
assert_eq!(array_data.child_data().len(), 1);
1940+
1941+
let (offsets, sizes, child_data) = match data_type {
1942+
DataType::ListView(_) => get_list_view_array_buffers::<i32>(array_data),
1943+
DataType::LargeListView(_) => get_list_view_array_buffers::<i64>(array_data),
1944+
_ => unreachable!(),
1945+
};
1946+
1947+
offset = write_buffer(
1948+
offsets.as_slice(),
1949+
buffers,
1950+
arrow_data,
1951+
offset,
1952+
compression_codec,
1953+
compression_context,
1954+
write_options.alignment,
1955+
)?;
1956+
1957+
offset = write_buffer(
1958+
sizes.as_slice(),
1959+
buffers,
1960+
arrow_data,
1961+
offset,
1962+
compression_codec,
1963+
compression_context,
1964+
write_options.alignment,
1965+
)?;
1966+
1967+
offset = write_array_data(
1968+
&child_data,
1969+
buffers,
1970+
arrow_data,
1971+
nodes,
1972+
offset,
1973+
child_data.len(),
1974+
child_data.null_count(),
1975+
compression_codec,
1976+
compression_context,
1977+
write_options,
1978+
)?;
1979+
return Ok(offset);
19041980
} else if let DataType::FixedSizeList(_, fixed_size) = data_type {
19051981
assert_eq!(array_data.child_data().len(), 1);
19061982
let fixed_size = *fixed_size as usize;
@@ -2043,7 +2119,9 @@ mod tests {
20432119
use arrow_array::builder::MapBuilder;
20442120
use arrow_array::builder::StringViewBuilder;
20452121
use arrow_array::builder::UnionBuilder;
2046-
use arrow_array::builder::{GenericListBuilder, ListBuilder, StringBuilder};
2122+
use arrow_array::builder::{
2123+
GenericListBuilder, GenericListViewBuilder, ListBuilder, StringBuilder,
2124+
};
20472125
use arrow_array::builder::{PrimitiveRunBuilder, UInt32Builder};
20482126
use arrow_array::types::*;
20492127
use arrow_buffer::ScalarBuffer;
@@ -3212,6 +3290,118 @@ mod tests {
32123290
roundtrip_ensure_sliced_smaller(in_batch, 1000);
32133291
}
32143292

3293+
fn generate_list_view_data<O: OffsetSizeTrait>() -> GenericListViewArray<O> {
3294+
let mut builder = GenericListViewBuilder::<O, _>::new(UInt32Builder::new());
3295+
3296+
for i in 0u32..100_000 {
3297+
if i.is_multiple_of(10_000) {
3298+
builder.append(false);
3299+
continue;
3300+
}
3301+
for value in [i, i, i] {
3302+
builder.values().append_value(value);
3303+
}
3304+
builder.append(true);
3305+
}
3306+
3307+
builder.finish()
3308+
}
3309+
3310+
#[test]
3311+
fn encode_list_view_arrays() {
3312+
let val_inner = Field::new_list_field(DataType::UInt32, true);
3313+
let val_field = Field::new("val", DataType::ListView(Arc::new(val_inner)), true);
3314+
let schema = Arc::new(Schema::new(vec![val_field]));
3315+
3316+
let values = Arc::new(generate_list_view_data::<i32>());
3317+
3318+
let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
3319+
let out_batch = deserialize_file(serialize_file(&in_batch));
3320+
assert_eq!(in_batch, out_batch);
3321+
}
3322+
3323+
#[test]
3324+
fn encode_large_list_view_arrays() {
3325+
let val_inner = Field::new_list_field(DataType::UInt32, true);
3326+
let val_field = Field::new("val", DataType::LargeListView(Arc::new(val_inner)), true);
3327+
let schema = Arc::new(Schema::new(vec![val_field]));
3328+
3329+
let values = Arc::new(generate_list_view_data::<i64>());
3330+
3331+
let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
3332+
let out_batch = deserialize_file(serialize_file(&in_batch));
3333+
assert_eq!(in_batch, out_batch);
3334+
}
3335+
3336+
#[test]
3337+
fn check_sliced_list_view_array() {
3338+
let inner = Field::new_list_field(DataType::UInt32, true);
3339+
let field = Field::new("val", DataType::ListView(Arc::new(inner)), true);
3340+
let schema = Arc::new(Schema::new(vec![field]));
3341+
let values = Arc::new(generate_list_view_data::<i32>());
3342+
3343+
for (offset, len) in [(999, 1), (0, 13), (47, 12), (values.len() - 13, 13)] {
3344+
let in_batch = RecordBatch::try_new(schema.clone(), vec![values.clone()])
3345+
.unwrap()
3346+
.slice(offset, len);
3347+
let out_batch = deserialize_file(serialize_file(&in_batch));
3348+
assert_eq!(in_batch, out_batch);
3349+
}
3350+
}
3351+
3352+
#[test]
3353+
fn check_sliced_large_list_view_array() {
3354+
let inner = Field::new_list_field(DataType::UInt32, true);
3355+
let field = Field::new("val", DataType::LargeListView(Arc::new(inner)), true);
3356+
let schema = Arc::new(Schema::new(vec![field]));
3357+
let values = Arc::new(generate_list_view_data::<i64>());
3358+
3359+
for (offset, len) in [(999, 1), (0, 13), (47, 12), (values.len() - 13, 13)] {
3360+
let in_batch = RecordBatch::try_new(schema.clone(), vec![values.clone()])
3361+
.unwrap()
3362+
.slice(offset, len);
3363+
let out_batch = deserialize_file(serialize_file(&in_batch));
3364+
assert_eq!(in_batch, out_batch);
3365+
}
3366+
}
3367+
3368+
fn generate_nested_list_view_data<O: OffsetSizeTrait>() -> GenericListViewArray<O> {
3369+
let inner_builder = UInt32Builder::new();
3370+
let middle_builder = GenericListViewBuilder::<O, _>::new(inner_builder);
3371+
let mut outer_builder = GenericListViewBuilder::<O, _>::new(middle_builder);
3372+
3373+
for i in 0u32..10_000 {
3374+
if i.is_multiple_of(1_000) {
3375+
outer_builder.append(false);
3376+
continue;
3377+
}
3378+
3379+
for _ in 0..3 {
3380+
for value in [i, i + 1, i + 2] {
3381+
outer_builder.values().values().append_value(value);
3382+
}
3383+
outer_builder.values().append(true);
3384+
}
3385+
outer_builder.append(true);
3386+
}
3387+
3388+
outer_builder.finish()
3389+
}
3390+
3391+
#[test]
3392+
fn encode_nested_list_views() {
3393+
let inner_int = Arc::new(Field::new_list_field(DataType::UInt32, true));
3394+
let inner_list_field = Arc::new(Field::new_list_field(DataType::ListView(inner_int), true));
3395+
let list_field = Field::new("val", DataType::ListView(inner_list_field), true);
3396+
let schema = Arc::new(Schema::new(vec![list_field]));
3397+
3398+
let values = Arc::new(generate_nested_list_view_data::<i32>());
3399+
3400+
let in_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
3401+
let out_batch = deserialize_file(serialize_file(&in_batch));
3402+
assert_eq!(in_batch, out_batch);
3403+
}
3404+
32153405
#[test]
32163406
fn test_decimal128_alignment16_is_sufficient() {
32173407
const IPC_ALIGNMENT: usize = 16;

0 commit comments

Comments
 (0)