diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index 09f0f56ba3ac..9badbede06a4 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -438,6 +438,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         })
     }
 
+    /// Return an iterator over the length of each array element, including null values.
+    ///
+    /// Null values length would equal to the underlying bytes length and NOT 0
+    ///
+    /// Example of getting 0 for null values
+    /// ```rust
+    /// # use arrow_array::StringViewArray;
+    /// # use arrow_array::Array;
+    /// use arrow_data::ByteView;
+    ///
+    /// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
+    ///     view.lengths()
+    ///         .enumerate()
+    ///         .map(|(index, length)| if view.is_null(index) { 0 } else { length })
+    /// }
+    /// ```
+    pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
+        self.views().iter().map(|v| *v as u32)
+    }
+
     /// Returns a zero-copy slice of this array with the indicated offset and length.
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
@@ -1183,7 +1203,7 @@ mod tests {
     use crate::{
         Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
     };
-    use arrow_buffer::{Buffer, ScalarBuffer};
+    use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
     use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
     use rand::prelude::StdRng;
     use rand::{Rng, SeedableRng};
@@ -1680,4 +1700,117 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn empty_array_should_return_empty_lengths_iterator() {
+        let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
+
+        let mut lengths_iter = empty.lengths();
+        assert_eq!(lengths_iter.len(), 0);
+        assert_eq!(lengths_iter.next(), None);
+    }
+
+    #[test]
+    fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
+        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
+            // Not inlined as longer than 12 bytes
+            b"Supercalifragilisticexpialidocious" as &[u8],
+            // Inlined as shorter than 12 bytes
+            b"Hello",
+            // Empty value
+            b"",
+            // Exactly 12 bytes
+            b"abcdefghijkl",
+        ]);
+
+        let mut lengths_iter = cases.lengths();
+
+        assert_eq!(lengths_iter.len(), cases.len());
+
+        let cases_iter = cases.iter();
+
+        for case in cases_iter {
+            let case_value = case.unwrap();
+            let length = lengths_iter.next().expect("Should have a length");
+
+            assert_eq!(case_value.len(), length as usize);
+        }
+
+        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
+    }
+
+    #[test]
+    fn array_lengths_should_return_the_underlying_length_for_null_values() {
+        let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
+            // Not inlined as longer than 12 bytes
+            b"Supercalifragilisticexpialidocious" as &[u8],
+            // Inlined as shorter than 12 bytes
+            b"Hello",
+            // Empty value
+            b"",
+            // Exactly 12 bytes
+            b"abcdefghijkl",
+        ]);
+
+        let (views, buffer, _) = cases.clone().into_parts();
+
+        // Keeping the values but just adding nulls on top
+        let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
+            views,
+            buffer,
+            Some(NullBuffer::new_null(cases.len())),
+        );
+
+        let lengths_iter = cases.lengths();
+        let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
+
+        assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
+
+        for expected_length in lengths_iter {
+            let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
+
+            assert_eq!(expected_length, actual_length);
+        }
+
+        assert_eq!(
+            all_nulls_lengths_iter.next(),
+            None,
+            "Should not have more lengths"
+        );
+    }
+
+    #[test]
+    fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
+        let array = GenericByteViewArray::<BinaryViewType>::from(vec![
+            b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
+            b"Hello",
+            b"something great",
+            b"is",
+            b"coming soon!",
+            b"when you find what it is",
+            b"let me know",
+            b"cause",
+            b"I",
+            b"have no idea",
+            b"what it",
+            b"is",
+        ]);
+
+        let sliced_array = array.slice(2, array.len() - 3);
+
+        let mut lengths_iter = sliced_array.lengths();
+
+        assert_eq!(lengths_iter.len(), sliced_array.len());
+
+        let values_iter = sliced_array.iter();
+
+        for value in values_iter {
+            let value = value.unwrap();
+            let length = lengths_iter.next().expect("Should have a length");
+
+            assert_eq!(value.len(), length as usize);
+        }
+
+        assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
+    }
 }
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index 4cafbc2748ee..30e714217752 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
 use std::sync::Arc;
 
 use arrow_array::cast::*;
-use arrow_array::types::ArrowDictionaryKeyType;
+use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType};
 use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -1549,11 +1549,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
                             .iter()
                             .map(|slice| variable::encoded_len(slice))
                     ),
-                    DataType::BinaryView => tracker.push_variable(
-                        array.as_binary_view()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice))
-                    ),
+                    DataType::BinaryView => push_byte_view_array_lengths(&mut tracker, array.as_binary_view()),
                     DataType::Utf8 => tracker.push_variable(
                         array.as_string::<i32>()
                             .iter()
@@ -1564,11 +1560,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
                             .iter()
                             .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
                     ),
-                    DataType::Utf8View => tracker.push_variable(
-                        array.as_string_view()
-                            .iter()
-                            .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
-                    ),
+                    DataType::Utf8View => push_byte_view_array_lengths(&mut tracker, array.as_string_view()),
                     DataType::FixedSizeBinary(len) => {
                         let len = len.to_usize().unwrap();
                         tracker.push_fixed(1 + len)
@@ -1658,6 +1650,34 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
     tracker
 }
 
+/// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteViewArray`]
+fn push_byte_view_array_lengths<T: ByteViewType>(
+    tracker: &mut LengthTracker,
+    array: &GenericByteViewArray<T>,
+) {
+    if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) {
+        tracker.push_variable(
+            array
+                .lengths()
+                .zip(nulls.iter())
+                .map(|(length, is_valid)| {
+                    if is_valid {
+                        Some(length as usize)
+                    } else {
+                        None
+                    }
+                })
+                .map(variable::padded_length),
+        )
+    } else {
+        tracker.push_variable(
+            array
+                .lengths()
+                .map(|len| variable::padded_length(Some(len as usize))),
+        )
+    }
+}
+
 /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses
 fn encode_column(
     data: &mut [u8],