Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 79 additions & 37 deletions arrow-array/src/builder/primitive_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ use crate::builder::ArrayBuilder;
use crate::types::*;
use crate::{Array, ArrayRef, PrimitiveArray};
use arrow_buffer::{Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
Expand Down Expand Up @@ -100,7 +99,8 @@ pub type Decimal256Builder = PrimitiveBuilder<Decimal256Type>;
pub struct PrimitiveBuilder<T: ArrowPrimitiveType> {
values_builder: Vec<T::Native>,
null_buffer_builder: NullBufferBuilder,
data_type: DataType,
/// Optional data type override (e.g. to add timezone or precision/scale)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This time I tried switching to use Option so we only pay the extra type check when there is actually a datatype override

data_type: Option<DataType>,
}

impl<T: ArrowPrimitiveType> ArrayBuilder for PrimitiveBuilder<T> {
Expand Down Expand Up @@ -152,7 +152,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
Self {
values_builder: Vec::with_capacity(capacity),
null_buffer_builder: NullBufferBuilder::new(capacity),
data_type: T::DATA_TYPE,
data_type: None,
}
}

Expand All @@ -170,7 +170,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
Self {
values_builder,
null_buffer_builder,
data_type: T::DATA_TYPE,
data_type: None,
}
}

Expand All @@ -183,15 +183,14 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
///
/// # Panics
///
/// This method panics if `data_type` is not [PrimitiveArray::is_compatible]
/// This method will not panic, but [`Self::build`], [`Self::finish`] will panics if `data_type` is
/// not [PrimitiveArray::is_compatible] with the builder's primitive type
/// `T`.
pub fn with_data_type(self, data_type: DataType) -> Self {
assert!(
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this assert is done as part of the PrimitiveArray::with_type later on

I also updated the tests to show that

PrimitiveArray::<T>::is_compatible(&data_type),
"incompatible data type for builder, expected {} got {}",
T::DATA_TYPE,
data_type
);
Self { data_type, ..self }
Self {
data_type: Some(data_type),
..self
}
}

/// Returns the capacity of this builder measured in slots of type `T`
Expand Down Expand Up @@ -268,11 +267,11 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
/// Panics if `array` and `self` data types are different
#[inline]
pub fn append_array(&mut self, array: &PrimitiveArray<T>) {
assert_eq!(
&self.data_type,
array.data_type(),
"array data type mismatch"
);
if let Some(data_type) = &self.data_type {
assert_eq!(data_type, array.data_type(), "array data type mismatch");
} else {
assert_eq!(&T::DATA_TYPE, array.data_type(), "array data type mismatch")
}

self.values_builder.extend_from_slice(array.values());
if let Some(null_buffer) = array.nulls() {
Expand Down Expand Up @@ -300,30 +299,59 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
}

/// Builds the [`PrimitiveArray`] and reset this builder.
///
/// See [`Self::build`] to consume the builder, instead of resetting it.
pub fn finish(&mut self) -> PrimitiveArray<T> {
let len = self.len();
let values_buffer = Buffer::from(std::mem::take(&mut self.values_builder));
let nulls = self.null_buffer_builder.finish();
let builder = ArrayData::builder(self.data_type.clone())
.len(len)
.add_buffer(std::mem::take(&mut self.values_builder).into())
.nulls(nulls);

let array_data = unsafe { builder.build_unchecked() };
PrimitiveArray::<T>::from(array_data)
let mut new_array = PrimitiveArray::<T>::new(ScalarBuffer::from(values_buffer), nulls);
if let Some(data_type) = &self.data_type {
new_array = new_array.with_data_type(data_type.clone())
}
new_array
}

/// Builds the [`PrimitiveArray`] without resetting the builder.
pub fn finish_cloned(&self) -> PrimitiveArray<T> {
let len = self.len();
let nulls = self.null_buffer_builder.finish_cloned();
let values_buffer = Buffer::from_slice_ref(self.values_builder.as_slice());
let builder = ArrayData::builder(self.data_type.clone())
.len(len)
.add_buffer(values_buffer)
.nulls(nulls);
let mut new_array = PrimitiveArray::<T>::new(ScalarBuffer::from(values_buffer), nulls);
if let Some(data_type) = &self.data_type {
new_array = new_array.with_data_type(data_type.clone())
}
new_array
}

let array_data = unsafe { builder.build_unchecked() };
PrimitiveArray::<T>::from(array_data)
/// Builds the [`PrimitiveArray`] , consuming this builder.
///
/// Use [`Self::finish`] to reuse the builder
///
/// # Example:
///
/// ```
/// # use arrow_array::builder::UInt8Builder;
/// use arrow_array::UInt8Array;
/// let mut builder = UInt8Builder::with_capacity(5);
/// builder.append_slice(&[42, 44, 46]);
/// builder.append_nulls(2);
/// let array = builder.build();
/// assert_eq!(UInt8Array::from(vec![Some(42), Some(44), Some(46), None, None]), array);
/// ```
#[inline]
pub fn build(self) -> PrimitiveArray<T> {
let Self {
values_builder,
null_buffer_builder,
data_type,
} = self;

let nulls = null_buffer_builder.build();
let values_buffer = ScalarBuffer::from(Buffer::from(values_builder));
let mut new_array = PrimitiveArray::<T>::new(values_buffer, nulls);
if let Some(data_type) = data_type {
new_array = new_array.with_data_type(data_type)
}
new_array
}

/// Returns the current values buffer as a slice
Expand Down Expand Up @@ -360,7 +388,7 @@ impl<P: DecimalType> PrimitiveBuilder<P> {
pub fn with_precision_and_scale(self, precision: u8, scale: i8) -> Result<Self, ArrowError> {
validate_decimal_precision_and_scale::<P>(precision, scale)?;
Ok(Self {
data_type: P::TYPE_CONSTRUCTOR(precision, scale),
data_type: Some(P::TYPE_CONSTRUCTOR(precision, scale)),
..self
})
}
Expand All @@ -375,7 +403,7 @@ impl<P: ArrowTimestampType> PrimitiveBuilder<P> {
/// Sets an optional timezone
pub fn with_timezone_opt<S: Into<Arc<str>>>(self, timezone: Option<S>) -> Self {
Self {
data_type: DataType::Timestamp(P::UNIT, timezone.map(Into::into)),
data_type: Some(DataType::Timestamp(P::UNIT, timezone.map(Into::into))),
..self
}
}
Expand Down Expand Up @@ -630,9 +658,23 @@ mod tests {
}

#[test]
#[should_panic(expected = "incompatible data type for builder, expected Int32 got Int64")]
fn test_invalid_with_data_type() {
Int32Builder::new().with_data_type(DataType::Int64);
#[should_panic(expected = "PrimitiveArray expected data type Int32 got Int64")]
fn test_invalid_with_data_type_finish() {
Int32Builder::new().with_data_type(DataType::Int64).finish();
}

#[test]
#[should_panic(expected = "PrimitiveArray expected data type Int32 got Int64")]
fn test_invalid_with_data_type_finish_cloned() {
Int32Builder::new()
.with_data_type(DataType::Int64)
.finish_cloned();
}

#[test]
#[should_panic(expected = "PrimitiveArray expected data type Int32 got Int64")]
fn test_invalid_with_data_type_build() {
Int32Builder::new().with_data_type(DataType::Int64).build();
}

#[test]
Expand Down
Loading