Skip to content

Commit 5ded4c0

Browse files
committed
Merge branch 'main' into avro-row-encoder
2 parents ce72c5e + 1db1a88 commit 5ded4c0

File tree

33 files changed

+2863
-715
lines changed

33 files changed

+2863
-715
lines changed

arrow-arith/src/numeric.rs

Lines changed: 161 additions & 475 deletions
Large diffs are not rendered by default.

arrow-array/src/array/boolean_array.rs

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -389,24 +389,21 @@ impl From<Vec<Option<bool>>> for BooleanArray {
389389

390390
impl From<ArrayData> for BooleanArray {
391391
fn from(data: ArrayData) -> Self {
392+
let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
392393
assert_eq!(
393-
data.data_type(),
394-
&DataType::Boolean,
395-
"BooleanArray expected ArrayData with type {} got {}",
394+
data_type,
396395
DataType::Boolean,
397-
data.data_type()
396+
"BooleanArray expected ArrayData with type Boolean got {data_type:?}",
398397
);
399398
assert_eq!(
400-
data.buffers().len(),
399+
buffers.len(),
401400
1,
402401
"BooleanArray data should contain a single buffer only (values buffer)"
403402
);
404-
let values = BooleanBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
403+
let buffer = buffers.pop().expect("checked above");
404+
let values = BooleanBuffer::new(buffer, offset, len);
405405

406-
Self {
407-
values,
408-
nulls: data.nulls().cloned(),
409-
}
406+
Self { values, nulls }
410407
}
411408
}
412409

arrow-array/src/array/byte_array.rs

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::array::{get_offsets, print_long_array};
18+
use crate::array::{get_offsets_from_buffer, print_long_array};
1919
use crate::builder::GenericByteBuilder;
2020
use crate::iterator::ArrayIter;
2121
use crate::types::ByteArrayType;
@@ -542,30 +542,34 @@ impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
542542

543543
impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
544544
fn from(data: ArrayData) -> Self {
545+
let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
545546
assert_eq!(
546-
data.data_type(),
547-
&Self::DATA_TYPE,
547+
data_type,
548+
Self::DATA_TYPE,
548549
"{}{}Array expects DataType::{}",
549550
T::Offset::PREFIX,
550551
T::PREFIX,
551552
Self::DATA_TYPE
552553
);
553554
assert_eq!(
554-
data.buffers().len(),
555+
buffers.len(),
555556
2,
556557
"{}{}Array data should contain 2 buffers only (offsets and values)",
557558
T::Offset::PREFIX,
558559
T::PREFIX,
559560
);
561+
// buffers are ordered [offsets, values], so pop in reverse order
562+
let value_data = buffers.pop().expect("checked above");
563+
let offset_buffer = buffers.pop().expect("checked above");
564+
560565
// SAFETY:
561566
// ArrayData is valid, and verified type above
562-
let value_offsets = unsafe { get_offsets(&data) };
563-
let value_data = data.buffers()[1].clone();
567+
let value_offsets = unsafe { get_offsets_from_buffer(offset_buffer, offset, len) };
564568
Self {
565569
value_offsets,
566570
value_data,
567-
data_type: T::DATA_TYPE,
568-
nulls: data.nulls().cloned(),
571+
data_type,
572+
nulls,
569573
}
570574
}
571575
}

arrow-array/src/array/byte_view_array.rs

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -988,14 +988,20 @@ impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T>
988988

989989
impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
990990
fn from(data: ArrayData) -> Self {
991-
let (_data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
992-
let views = buffers.remove(0); // need to maintain order of remaining buffers
993-
let buffers = Arc::from(buffers);
994-
let views = ScalarBuffer::new(views, offset, len);
991+
let (data_type, len, nulls, offset, buffers, _child_data) = data.into_parts();
992+
assert_eq!(
993+
data_type,
994+
T::DATA_TYPE,
995+
"Mismatched data type, expected {}, got {data_type}",
996+
T::DATA_TYPE
997+
);
998+
let mut buffers = buffers.into_iter();
999+
// first buffer is views, remaining are data buffers
1000+
let views = ScalarBuffer::new(buffers.next().unwrap(), offset, len);
9951001
Self {
996-
data_type: T::DATA_TYPE,
1002+
data_type,
9971003
views,
998-
buffers,
1004+
buffers: Arc::from_iter(buffers),
9991005
nulls,
10001006
phantom: Default::default(),
10011007
}
@@ -1205,9 +1211,11 @@ mod tests {
12051211
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
12061212
};
12071213
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
1208-
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
1214+
use arrow_data::{ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN};
1215+
use arrow_schema::DataType;
12091216
use rand::prelude::StdRng;
12101217
use rand::{Rng, SeedableRng};
1218+
use std::str::from_utf8;
12111219

12121220
const BLOCK_SIZE: u32 = 8;
12131221

@@ -1814,4 +1822,46 @@ mod tests {
18141822

18151823
assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
18161824
}
1825+
1826+
#[should_panic(expected = "Mismatched data type, expected Utf8View, got BinaryView")]
1827+
#[test]
1828+
fn invalid_casting_from_array_data() {
1829+
// Converting BinaryView data into a StringViewArray must panic on the data type mismatch (the data also holds invalid UTF-8)
1830+
let array_data = binary_view_array_with_invalid_utf8_data().into_data();
1831+
let _ = StringViewArray::from(array_data);
1832+
}
1833+
1834+
#[should_panic(expected = "invalid utf-8 sequence")]
1835+
#[test]
1836+
fn invalid_array_data() {
1837+
let (views, buffers, nulls) = binary_view_array_with_invalid_utf8_data().into_parts();
1838+
1839+
// manually try to build invalid array data with the Utf8View data type
1840+
let mut builder = ArrayDataBuilder::new(DataType::Utf8View)
1841+
.add_buffer(views.into_inner())
1842+
.len(3);
1843+
for buffer in buffers.iter() {
1844+
builder = builder.add_buffer(buffer.clone())
1845+
}
1846+
builder = builder.nulls(nulls);
1847+
1848+
let data = builder.build().unwrap(); // should fail validation
1849+
let _arr = StringViewArray::from(data);
1850+
}
1851+
1852+
/// Returns a BinaryViewArray with one invalid UTF-8 value
1853+
fn binary_view_array_with_invalid_utf8_data() -> BinaryViewArray {
1854+
let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1855+
b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1856+
&[
1857+
0xf0, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1858+
0x00, 0x00,
1859+
],
1860+
b"good",
1861+
]);
1862+
assert!(from_utf8(array.value(0)).is_ok());
1863+
assert!(from_utf8(array.value(1)).is_err()); // value 1 is invalid utf8
1864+
assert!(from_utf8(array.value(2)).is_ok());
1865+
array
1866+
}
18171867
}

arrow-array/src/array/dictionary_array.rs

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use crate::{
2525
};
2626
use arrow_buffer::bit_util::set_bit;
2727
use arrow_buffer::buffer::NullBuffer;
28-
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
28+
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder, ScalarBuffer};
2929
use arrow_data::ArrayData;
3030
use arrow_schema::{ArrowError, DataType};
3131
use std::any::Any;
@@ -580,21 +580,25 @@ impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
580580
}
581581
}
582582

583-
/// Constructs a `DictionaryArray` from an array data reference.
583+
/// Constructs a `DictionaryArray` from an `ArrayData`
584584
impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
585585
fn from(data: ArrayData) -> Self {
586+
let (data_type, len, nulls, offset, mut buffers, mut child_data) = data.into_parts();
587+
586588
assert_eq!(
587-
data.buffers().len(),
589+
buffers.len(),
588590
1,
589591
"DictionaryArray data should contain a single buffer only (keys)."
590592
);
593+
let buffer = buffers.pop().expect("checked above");
591594
assert_eq!(
592-
data.child_data().len(),
595+
child_data.len(),
593596
1,
594597
"DictionaryArray should contain a single child array (values)."
595598
);
599+
let cd = child_data.pop().expect("checked above");
596600

597-
if let DataType::Dictionary(key_data_type, _) = data.data_type() {
601+
if let DataType::Dictionary(key_data_type, _) = &data_type {
598602
assert_eq!(
599603
&T::DATA_TYPE,
600604
key_data_type.as_ref(),
@@ -603,19 +607,10 @@ impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
603607
key_data_type
604608
);
605609

606-
let values = make_array(data.child_data()[0].clone());
607-
let data_type = data.data_type().clone();
610+
let values = make_array(cd);
608611

609612
// create a zero-copy of the keys' data
610-
// SAFETY:
611-
// ArrayData is valid and verified type above
612-
613-
let keys = PrimitiveArray::<T>::from(unsafe {
614-
data.into_builder()
615-
.data_type(T::DATA_TYPE)
616-
.child_data(vec![])
617-
.build_unchecked()
618-
});
613+
let keys = PrimitiveArray::<T>::new(ScalarBuffer::new(buffer, offset, len), nulls);
619614

620615
Self {
621616
data_type,

arrow-array/src/array/mod.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
mod binary_array;
2121

2222
use crate::types::*;
23-
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer};
23+
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer};
2424
use arrow_data::ArrayData;
2525
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
2626
use std::any::Any;
@@ -939,6 +939,27 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> {
939939
}
940940
}
941941

942+
/// Helper function that creates an [`OffsetBuffer`] from a buffer and array offset/length
943+
///
944+
/// # Safety
945+
///
946+
/// - buffer must contain valid arrow offsets ([`OffsetBuffer`]) for the
947+
/// given length and offset.
948+
unsafe fn get_offsets_from_buffer<O: ArrowNativeType>(
949+
buffer: Buffer,
950+
offset: usize,
951+
len: usize,
952+
) -> OffsetBuffer<O> {
953+
if len == 0 && buffer.is_empty() {
954+
return OffsetBuffer::new_empty();
955+
}
956+
957+
let scalar_buffer = ScalarBuffer::new(buffer, offset, len + 1);
958+
// Safety:
959+
// Arguments were valid
960+
unsafe { OffsetBuffer::new_unchecked(scalar_buffer) }
961+
}
962+
942963
/// Helper function for printing potentially long arrays.
943964
fn print_long_array<A, F>(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result
944965
where

arrow-array/src/array/primitive_array.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1597,18 +1597,21 @@ impl<T: ArrowTimestampType> PrimitiveArray<T> {
15971597
/// Constructs a `PrimitiveArray` from an `ArrayData`
15981598
impl<T: ArrowPrimitiveType> From<ArrayData> for PrimitiveArray<T> {
15991599
fn from(data: ArrayData) -> Self {
1600-
Self::assert_compatible(data.data_type());
1600+
let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts();
1601+
1602+
Self::assert_compatible(&data_type);
16011603
assert_eq!(
1602-
data.buffers().len(),
1604+
buffers.len(),
16031605
1,
16041606
"PrimitiveArray data should contain a single buffer only (values buffer)"
16051607
);
1608+
let buffer = buffers.pop().expect("checked above");
16061609

1607-
let values = ScalarBuffer::new(data.buffers()[0].clone(), data.offset(), data.len());
1610+
let values = ScalarBuffer::new(buffer, offset, len);
16081611
Self {
1609-
data_type: data.data_type().clone(),
1612+
data_type,
16101613
values,
1611-
nulls: data.nulls().cloned(),
1614+
nulls,
16121615
}
16131616
}
16141617
}

0 commit comments

Comments
 (0)