Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 134 additions & 1 deletion arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
})
}

/// Returns an iterator yielding the byte length of every element in this array.
///
/// Validity is not consulted: a null slot yields whatever length is stored in
/// its underlying view, which is NOT guaranteed to be 0.
///
/// # Examples
///
/// Reporting 0 for null values instead:
/// ```rust
/// # use arrow_array::StringViewArray;
/// # use arrow_array::Array;
///
/// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
///     view.lengths()
///         .enumerate()
///         .map(|(index, length)| if view.is_null(index) { 0 } else { length })
/// }
/// ```
pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
    // The truncating cast is intentional: only the low 32 bits of each view
    // are taken as the element length.
    self.views().iter().copied().map(|view| view as u32)
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
Expand Down Expand Up @@ -1183,7 +1203,7 @@ mod tests {
use crate::{
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
};
use arrow_buffer::{Buffer, ScalarBuffer};
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
Expand Down Expand Up @@ -1680,4 +1700,117 @@ mod tests {
);
}
}

#[test]
fn empty_array_should_return_empty_lengths_iterator() {
    // No values in, so no lengths are expected out.
    let no_values = Vec::<&[u8]>::new();
    let array = GenericByteViewArray::<BinaryViewType>::from(no_values);

    let mut lengths = array.lengths();

    // Both the reported size and the first `next()` must agree on emptiness.
    assert_eq!(lengths.len(), 0);
    assert_eq!(lengths.next(), None);
}

#[test]
fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
    let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
        // Longer than 12 bytes, so not inlined
        b"Supercalifragilisticexpialidocious" as &[u8],
        // Shorter than 12 bytes, so inlined
        b"Hello",
        // Zero-length value
        b"",
        // Exactly 12 bytes
        b"abcdefghijkl",
    ]);

    let mut lengths = cases.lengths();

    // The iterator must report one length per element up front.
    assert_eq!(lengths.len(), cases.len());

    // Walk the values and the lengths in lock-step; every value must have a
    // matching length equal to its byte count.
    cases.iter().for_each(|case| {
        let expected = case.unwrap().len();
        let actual = lengths.next().expect("Should have a length");

        assert_eq!(expected, actual as usize);
    });

    assert_eq!(lengths.next(), None, "Should not have more lengths");
}

#[test]
fn array_lengths_should_return_the_underlying_length_for_null_values() {
    let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
        // Longer than 12 bytes, so not inlined
        b"Supercalifragilisticexpialidocious" as &[u8],
        // Shorter than 12 bytes, so inlined
        b"Hello",
        // Zero-length value
        b"",
        // Exactly 12 bytes
        b"abcdefghijkl",
    ]);

    // Rebuild the array from the very same views and buffers, but with a
    // validity mask that marks every slot as null.
    let (views, buffer, _) = cases.clone().into_parts();
    let every_slot_null = NullBuffer::new_null(cases.len());
    let cases_with_all_nulls =
        GenericByteViewArray::<BinaryViewType>::new(views, buffer, Some(every_slot_null));

    let expected_lengths = cases.lengths();
    let mut actual_lengths = cases_with_all_nulls.lengths();

    assert_eq!(expected_lengths.len(), actual_lengths.len());

    // Nulling the slots must not change any of the reported lengths.
    expected_lengths.for_each(|expected| {
        let actual = actual_lengths.next().expect("Should have a length");

        assert_eq!(expected, actual);
    });

    assert_eq!(
        actual_lengths.next(),
        None,
        "Should not have more lengths"
    );
}

#[test]
fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
    let array = GenericByteViewArray::<BinaryViewType>::from(vec![
        b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
        b"Hello",
        b"something great",
        b"is",
        b"coming soon!",
        b"when you find what it is",
        b"let me know",
        b"cause",
        b"I",
        b"have no idea",
        b"what it",
        b"is",
    ]);

    // Skip the first two elements and stop one short of the end.
    let offset = 2;
    let window_len = array.len() - 3;
    let sliced_array = array.slice(offset, window_len);

    let mut lengths = sliced_array.lengths();

    // Only the elements inside the slice may be reported.
    assert_eq!(lengths.len(), sliced_array.len());

    sliced_array.iter().for_each(|value| {
        let expected = value.unwrap().len();
        let actual = lengths.next().expect("Should have a length");

        assert_eq!(expected, actual as usize);
    });

    assert_eq!(lengths.next(), None, "Should not have more lengths");
}
}
42 changes: 31 additions & 11 deletions arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
use std::sync::Arc;

use arrow_array::cast::*;
use arrow_array::types::ArrowDictionaryKeyType;
use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType};
use arrow_array::*;
use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
Expand Down Expand Up @@ -1549,11 +1549,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
.iter()
.map(|slice| variable::encoded_len(slice))
),
DataType::BinaryView => tracker.push_variable(
array.as_binary_view()
.iter()
.map(|slice| variable::encoded_len(slice))
),
DataType::BinaryView => push_byte_view_array_lengths(&mut tracker, array.as_binary_view()),
DataType::Utf8 => tracker.push_variable(
array.as_string::<i32>()
.iter()
Expand All @@ -1564,11 +1560,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
.iter()
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
),
DataType::Utf8View => tracker.push_variable(
array.as_string_view()
.iter()
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
),
DataType::Utf8View => push_byte_view_array_lengths(&mut tracker, array.as_string_view()),
DataType::FixedSizeBinary(len) => {
let len = len.to_usize().unwrap();
tracker.push_fixed(1 + len)
Expand Down Expand Up @@ -1658,6 +1650,34 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
tracker
}

/// Pushes onto the [`LengthTracker`] one variable length per element of the
/// given [`GenericByteViewArray`], each computed by `variable::padded_length`.
///
/// Lengths are read via [`GenericByteViewArray::lengths`]. A slot marked
/// invalid in the null mask is passed to `variable::padded_length` as `None`;
/// every other slot is passed as `Some(length)`.
fn push_byte_view_array_lengths<T: ByteViewType>(
    tracker: &mut LengthTracker,
    array: &GenericByteViewArray<T>,
) {
    // Only pair lengths with validity bits when at least one slot is null;
    // otherwise every element is treated as present.
    let null_mask = array.nulls().filter(|mask| mask.null_count() > 0);

    match null_mask {
        Some(mask) => tracker.push_variable(
            array
                .lengths()
                .zip(mask.iter())
                .map(|(len, valid)| variable::padded_length(valid.then_some(len as usize))),
        ),
        None => tracker.push_variable(
            array
                .lengths()
                .map(|len| variable::padded_length(Some(len as usize))),
        ),
    }
}

/// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses
fn encode_column(
data: &mut [u8],
Expand Down
Loading