Skip to content

Commit 037bb18

Browse files
committed
Merge remote-tracking branch 'origin/main' into parquet_bench
2 parents 87cff69 + ce4edd5 commit 037bb18

File tree

6 files changed

+202
-19
lines changed

6 files changed

+202
-19
lines changed

arrow-array/src/array/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -824,20 +824,20 @@ pub fn make_array(data: ArrayData) -> ArrayRef {
824824
DataType::UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)) as ArrayRef,
825825
DataType::UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)) as ArrayRef,
826826
DataType::UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)) as ArrayRef,
827-
dt => panic!("Unexpected dictionary key type {dt}"),
827+
dt => unimplemented!("Unexpected dictionary key type {dt}"),
828828
},
829829
DataType::RunEndEncoded(run_ends_type, _) => match run_ends_type.data_type() {
830830
DataType::Int16 => Arc::new(RunArray::<Int16Type>::from(data)) as ArrayRef,
831831
DataType::Int32 => Arc::new(RunArray::<Int32Type>::from(data)) as ArrayRef,
832832
DataType::Int64 => Arc::new(RunArray::<Int64Type>::from(data)) as ArrayRef,
833-
dt => panic!("Unexpected data type for run_ends array {dt}"),
833+
dt => unimplemented!("Unexpected data type for run_ends array {dt}"),
834834
},
835835
DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef,
836836
DataType::Decimal32(_, _) => Arc::new(Decimal32Array::from(data)) as ArrayRef,
837837
DataType::Decimal64(_, _) => Arc::new(Decimal64Array::from(data)) as ArrayRef,
838838
DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef,
839839
DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef,
840-
dt => panic!("Unexpected data type {dt}"),
840+
dt => unimplemented!("Unexpected data type {dt}"),
841841
}
842842
}
843843

arrow-array/src/builder/mod.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
594594
LargeBinaryDictionaryBuilder::with_capacity(capacity, 256, 1024);
595595
Box::new(dict_builder)
596596
}
597-
t => panic!("Dictionary value type {t} is not currently supported"),
597+
t => unimplemented!("Dictionary value type {t} is not currently supported"),
598598
}
599599
};
600600
}
@@ -604,10 +604,12 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
604604
DataType::Int32 => dict_builder!(Int32Type),
605605
DataType::Int64 => dict_builder!(Int64Type),
606606
_ => {
607-
panic!("Data type {t} with key type {key_type} is not currently supported")
607+
unimplemented!(
608+
"Data type {t} with key type {key_type} is not currently supported"
609+
)
608610
}
609611
}
610612
}
611-
t => panic!("Data type {t} is not currently supported"),
613+
t => unimplemented!("Data type {t} is not currently supported"),
612614
}
613615
}

arrow-array/src/ffi.rs

Lines changed: 143 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ pub unsafe fn export_array_into_raw(
140140
Ok(())
141141
}
142142

143-
// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
144-
// This is set by the Arrow specification
143+
/// returns the number of bits that buffer `i` (in the C data interface) is expected to have.
144+
/// This is set by the Arrow specification
145145
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
146146
if let Some(primitive) = data_type.primitive_width() {
147147
return match i {
@@ -180,6 +180,10 @@ fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
180180
| (DataType::List(_), 1)
181181
| (DataType::Map(_, _), 1) => i32::BITS as _,
182182
(DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _,
183+
// List views have two i32 buffers, offsets and sizes
184+
(DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS as _,
185+
// Large list views have two i64 buffers, offsets and sizes
186+
(DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) => i64::BITS as _,
183187
(DataType::List(_), _) | (DataType::Map(_, _), _) => {
184188
return Err(ArrowError::CDataInterface(format!(
185189
"The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented."
@@ -351,6 +355,8 @@ impl ImportedArrowArray<'_> {
351355
DataType::List(field)
352356
| DataType::FixedSizeList(field, _)
353357
| DataType::LargeList(field)
358+
| DataType::ListView(field)
359+
| DataType::LargeListView(field)
354360
| DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()),
355361
DataType::Struct(fields) => {
356362
assert!(fields.len() == self.array.num_children());
@@ -471,6 +477,14 @@ impl ImportedArrowArray<'_> {
471477
debug_assert_eq!(bits % 8, 0);
472478
(length + 1) * (bits / 8)
473479
}
480+
(DataType::ListView(_), 1)
481+
| (DataType::ListView(_), 2)
482+
| (DataType::LargeListView(_), 1)
483+
| (DataType::LargeListView(_), 2) => {
484+
let bits = bit_width(data_type, i)?;
485+
debug_assert_eq!(bits % 8, 0);
486+
length * (bits / 8)
487+
}
474488
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
475489
if self.array.is_empty() {
476490
return Ok(0);
@@ -553,7 +567,7 @@ mod tests_to_then_from_ffi {
553567
use std::collections::HashMap;
554568
use std::mem::ManuallyDrop;
555569

556-
use arrow_buffer::NullBuffer;
570+
use arrow_buffer::{ArrowNativeType, NullBuffer};
557571
use arrow_schema::Field;
558572

559573
use crate::builder::UnionBuilder;
@@ -783,6 +797,71 @@ mod tests_to_then_from_ffi {
783797
test_generic_list::<i64>()
784798
}
785799

800+
fn test_generic_list_view<Offset: OffsetSizeTrait + ArrowNativeType>() -> Result<()> {
801+
// Construct a value array
802+
let value_data = ArrayData::builder(DataType::Int16)
803+
.len(8)
804+
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
805+
.build()
806+
.unwrap();
807+
808+
// Construct a buffer for value offsets, for the nested array:
809+
// [[0, 1, 2], [3, 4, 5], [6, 7]]
810+
let value_offsets = [0_usize, 3, 6]
811+
.iter()
812+
.map(|i| Offset::from_usize(*i).unwrap())
813+
.collect::<Buffer>();
814+
815+
let sizes_buffer = [3_usize, 3, 2]
816+
.iter()
817+
.map(|i| Offset::from_usize(*i).unwrap())
818+
.collect::<Buffer>();
819+
820+
// Construct a list array from the above two
821+
let list_view_dt = GenericListViewArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new(
822+
Field::new_list_field(DataType::Int16, false),
823+
));
824+
825+
let list_data = ArrayData::builder(list_view_dt)
826+
.len(3)
827+
.add_buffer(value_offsets)
828+
.add_buffer(sizes_buffer)
829+
.add_child_data(value_data)
830+
.build()
831+
.unwrap();
832+
833+
let original = GenericListViewArray::<Offset>::from(list_data.clone());
834+
835+
// export it
836+
let (array, schema) = to_ffi(&original.to_data())?;
837+
838+
// (simulate consumer) import it
839+
let data = unsafe { from_ffi(array, &schema) }?;
840+
let array = make_array(data);
841+
842+
// downcast
843+
let array = array
844+
.as_any()
845+
.downcast_ref::<GenericListViewArray<Offset>>()
846+
.unwrap();
847+
848+
assert_eq!(&array.value(0), &original.value(0));
849+
assert_eq!(&array.value(1), &original.value(1));
850+
assert_eq!(&array.value(2), &original.value(2));
851+
852+
Ok(())
853+
}
854+
855+
#[test]
856+
fn test_list_view() -> Result<()> {
857+
test_generic_list_view::<i32>()
858+
}
859+
860+
#[test]
861+
fn test_large_list_view() -> Result<()> {
862+
test_generic_list_view::<i64>()
863+
}
864+
786865
fn test_generic_binary<Offset: OffsetSizeTrait>() -> Result<()> {
787866
// create an array natively
788867
let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
@@ -1315,6 +1394,7 @@ mod tests_from_ffi {
13151394
use std::ptr::NonNull;
13161395
use std::sync::Arc;
13171396

1397+
use arrow_buffer::NullBuffer;
13181398
#[cfg(not(feature = "force_validate"))]
13191399
use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer};
13201400
#[cfg(feature = "force_validate")]
@@ -1325,6 +1405,7 @@ mod tests_from_ffi {
13251405
use arrow_schema::{DataType, Field};
13261406

13271407
use super::Result;
1408+
13281409
use crate::builder::GenericByteViewBuilder;
13291410
use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType};
13301411
use crate::{
@@ -1528,6 +1609,65 @@ mod tests_from_ffi {
15281609
test_round_trip(&data)
15291610
}
15301611

1612+
#[test]
1613+
fn test_list_view() -> Result<()> {
1614+
// Construct a value array
1615+
let value_data = ArrayData::builder(DataType::Int16)
1616+
.len(8)
1617+
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
1618+
.build()
1619+
.unwrap();
1620+
1621+
// Construct a buffer for value offsets, for the nested array:
1622+
// [[0, 1, 2], [3, 4, 5], [6, 7]]
1623+
let value_offsets = Buffer::from(vec![0_i32, 3, 6]);
1624+
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2]);
1625+
1626+
// Construct a list array from the above two
1627+
let list_view_dt =
1628+
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, false)));
1629+
1630+
let list_view_data = ArrayData::builder(list_view_dt)
1631+
.len(3)
1632+
.add_buffer(value_offsets)
1633+
.add_buffer(sizes_buffer)
1634+
.add_child_data(value_data)
1635+
.build()
1636+
.unwrap();
1637+
1638+
test_round_trip(&list_view_data)
1639+
}
1640+
1641+
#[test]
1642+
fn test_list_view_with_nulls() -> Result<()> {
1643+
// Construct a value array
1644+
let value_data = ArrayData::builder(DataType::Int16)
1645+
.len(8)
1646+
.add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
1647+
.build()
1648+
.unwrap();
1649+
1650+
// Construct a buffer for value offsets, for the nested array:
1651+
// [[0, 1, 2], [3, 4, 5], [6, 7], null]
1652+
let value_offsets = Buffer::from(vec![0_i32, 3, 6, 8]);
1653+
let sizes_buffer = Buffer::from(vec![3_i32, 3, 2, 0]);
1654+
1655+
// Construct a list array from the above two
1656+
let list_view_dt =
1657+
DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16, true)));
1658+
1659+
let list_view_data = ArrayData::builder(list_view_dt)
1660+
.len(4)
1661+
.add_buffer(value_offsets)
1662+
.add_buffer(sizes_buffer)
1663+
.add_child_data(value_data)
1664+
.nulls(Some(NullBuffer::from(vec![true, true, true, false])))
1665+
.build()
1666+
.unwrap();
1667+
1668+
test_round_trip(&list_view_data)
1669+
}
1670+
15311671
#[test]
15321672
#[cfg(not(feature = "force_validate"))]
15331673
fn test_empty_string_with_non_zero_offset() -> Result<()> {

arrow-data/src/data.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1793,7 +1793,7 @@ impl DataTypeLayout {
17931793
},
17941794
],
17951795
can_contain_null_mask: true,
1796-
variadic: true,
1796+
variadic: false,
17971797
}
17981798
}
17991799
}

arrow-pyarrow-integration-testing/tests/test_sql.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727

2828
import arrow_pyarrow_integration_testing as rust
2929

30-
PYARROW_PRE_14 = int(pa.__version__.split('.')[0]) < 14
30+
PYARROW_MAJOR_VER = int(pa.__version__.split(".")[0])
31+
PYARROW_PRE_14 = PYARROW_MAJOR_VER < 14
32+
PYARROW_PRE_16 = PYARROW_MAJOR_VER < 16
3133

3234

3335
@contextlib.contextmanager
@@ -112,8 +114,16 @@ def assert_pyarrow_leak():
112114
),
113115
]
114116

115-
_unsupported_pyarrow_types = [
116-
]
117+
if PYARROW_MAJOR_VER >= 16:
118+
_supported_pyarrow_types.extend(
119+
[
120+
pa.list_view(pa.uint64()),
121+
pa.large_list_view(pa.uint64()),
122+
pa.list_view(pa.string()),
123+
pa.large_list_view(pa.string()),
124+
]
125+
)
126+
117127

118128
# As of pyarrow 14, pyarrow implements the Arrow PyCapsule interface
119129
# (https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
@@ -158,12 +168,6 @@ def test_type_roundtrip_pycapsule(pyarrow_type):
158168
assert restored == pyarrow_type
159169
assert restored is not pyarrow_type
160170

161-
162-
@pytest.mark.parametrize("pyarrow_type", _unsupported_pyarrow_types, ids=str)
163-
def test_type_roundtrip_raises(pyarrow_type):
164-
with pytest.raises(pa.ArrowException):
165-
rust.round_trip_type(pyarrow_type)
166-
167171
@pytest.mark.parametrize('pyarrow_type', _supported_pyarrow_types, ids=str)
168172
def test_field_roundtrip(pyarrow_type):
169173
pyarrow_field = pa.field("test", pyarrow_type, nullable=True)
@@ -337,6 +341,21 @@ def test_list_array():
337341
del a
338342
del b
339343

344+
345+
@pytest.mark.skipif(PYARROW_PRE_16, reason="requires pyarrow 16")
346+
def test_list_view_array():
347+
"""
348+
Python -> Rust -> Python
349+
"""
350+
a = pa.array([[], None, [1, 2], [4, 5, 6]], pa.list_view(pa.int64()))
351+
b = rust.round_trip_array(a)
352+
b.validate(full=True)
353+
assert a.to_pylist() == b.to_pylist()
354+
assert a.type == b.type
355+
del a
356+
del b
357+
358+
340359
def test_map_array():
341360
"""
342361
Python -> Rust -> Python

arrow-schema/src/ffi.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
456456
let c_child = c_schema.child(0);
457457
DataType::LargeList(Arc::new(Field::try_from(c_child)?))
458458
}
459+
"+vl" => {
460+
let c_child = c_schema.child(0);
461+
DataType::ListView(Arc::new(Field::try_from(c_child)?))
462+
}
463+
"+vL" => {
464+
let c_child = c_schema.child(0);
465+
DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
466+
}
459467
"+s" => {
460468
let fields = c_schema.children().map(Field::try_from);
461469
DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
@@ -657,6 +665,8 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
657665
let children = match dtype {
658666
DataType::List(child)
659667
| DataType::LargeList(child)
668+
| DataType::ListView(child)
669+
| DataType::LargeListView(child)
660670
| DataType::FixedSizeList(child, _)
661671
| DataType::Map(child, _) => {
662672
vec![FFI_ArrowSchema::try_from(child.as_ref())?]
@@ -746,6 +756,8 @@ fn get_format_string(dtype: &DataType) -> Result<Cow<'static, str>, ArrowError>
746756
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
747757
DataType::List(_) => Ok("+l".into()),
748758
DataType::LargeList(_) => Ok("+L".into()),
759+
DataType::ListView(_) => Ok("+vl".into()),
760+
DataType::LargeListView(_) => Ok("+vL".into()),
749761
DataType::Struct(_) => Ok("+s".into()),
750762
DataType::Map(_, _) => Ok("+m".into()),
751763
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
@@ -874,6 +886,16 @@ mod tests {
874886
DataType::Int16,
875887
false,
876888
))));
889+
round_trip_type(DataType::ListView(Arc::new(Field::new(
890+
"a",
891+
DataType::Int16,
892+
false,
893+
))));
894+
round_trip_type(DataType::LargeListView(Arc::new(Field::new(
895+
"a",
896+
DataType::Int16,
897+
false,
898+
))));
877899
round_trip_type(DataType::Struct(Fields::from(vec![Field::new(
878900
"a",
879901
DataType::Utf8,

0 commit comments

Comments
 (0)