Skip to content

Commit 9bbc886

Browse files
committed
fix: New try_extend method that doesn't panic
1 parent adf9308 commit 9bbc886

File tree

26 files changed

+499
-232
lines changed

26 files changed

+499
-232
lines changed

arrow-array/src/array/null_array.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ mod tests {
230230

231231
// Simulate a NULL value in the parent array, for instance, if array being queried by
232232
// invalid index
233-
mutable.extend_nulls(1);
233+
mutable.try_extend_nulls(1).unwrap();
234234
let data = mutable.freeze();
235235

236236
let struct_array = Arc::new(StructArray::from(data.clone()));

arrow-array/src/ffi.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1729,7 +1729,7 @@ mod tests_from_ffi {
17291729
let data = array.to_data();
17301730

17311731
let mut mutable = MutableArrayData::new(vec![&data], false, len);
1732-
mutable.extend(0, 0, len);
1732+
mutable.try_extend(0, 0, len).unwrap();
17331733
make_array(mutable.freeze())
17341734
}
17351735

arrow-cast/src/cast/list.rs

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,10 +190,14 @@ where
190190
if cast_options.safe || array.is_null(idx) {
191191
if last_pos != start_pos {
192192
// Extend with valid slices
193-
mutable.extend(0, last_pos, start_pos);
193+
mutable
194+
.try_extend(0, last_pos, start_pos)
195+
.map_err(|e| ArrowError::CastError(e.to_string()))?;
194196
}
195197
// Pad this slice with nulls
196-
mutable.extend_nulls(size as _);
198+
mutable
199+
.try_extend_nulls(size as _)
200+
.map_err(|e| ArrowError::CastError(e.to_string()))?;
197201
null_builder.set_bit(idx, false);
198202
// Set last_pos to the end of this slice's values
199203
last_pos = end_pos
@@ -211,7 +215,9 @@ where
211215
if mutable.len() != cap {
212216
// Remaining slices were all correct length
213217
let remaining = cap - mutable.len();
214-
mutable.extend(0, last_pos, last_pos + remaining)
218+
mutable
219+
.try_extend(0, last_pos, last_pos + remaining)
220+
.map_err(|e| ArrowError::CastError(e.to_string()))?;
215221
}
216222
make_array(mutable.freeze())
217223
}
@@ -252,15 +258,19 @@ pub(crate) fn cast_list_view_to_fixed_size_list<O: OffsetSizeTrait>(
252258
if len != size as usize {
253259
// Nulls in FixedSizeListArray take up space and so we must pad the values
254260
if cast_options.safe || array.is_null(idx) {
255-
mutable.extend_nulls(size as _);
261+
mutable
262+
.try_extend_nulls(size as _)
263+
.map_err(|e| ArrowError::CastError(e.to_string()))?;
256264
null_builder.set_bit(idx, false);
257265
} else {
258266
return Err(ArrowError::CastError(format!(
259267
"Cannot cast to FixedSizeList({size}): value at index {idx} has length {len}",
260268
)));
261269
}
262270
} else {
263-
mutable.extend(0, offset, offset + len);
271+
mutable
272+
.try_extend(0, offset, offset + len)
273+
.map_err(|e| ArrowError::CastError(e.to_string()))?;
264274
}
265275
}
266276

arrow-data/src/transform/boolean.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,16 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> {
3232
array.offset() + start,
3333
len,
3434
);
35+
Ok(())
3536
},
3637
)
3738
}
3839

39-
pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) {
40+
pub(super) fn extend_nulls(
41+
mutable: &mut _MutableArrayData,
42+
len: usize,
43+
) -> Result<(), arrow_schema::ArrowError> {
4044
let buffer = &mut mutable.buffer1;
4145
resize_for_bits(buffer, mutable.len + len);
46+
Ok(())
4247
}

arrow-data/src/transform/fixed_binary.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,21 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> {
3030
move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
3131
let buffer = &mut mutable.buffer1;
3232
buffer.extend_from_slice(&values[start * size..(start + len) * size]);
33+
Ok(())
3334
},
3435
)
3536
}
3637

37-
pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) {
38+
pub(super) fn extend_nulls(
39+
mutable: &mut _MutableArrayData,
40+
len: usize,
41+
) -> Result<(), arrow_schema::ArrowError> {
3842
let size = match mutable.data_type {
3943
DataType::FixedSizeBinary(i) => i as usize,
4044
_ => unreachable!(),
4145
};
4246

4347
let values_buffer = &mut mutable.buffer1;
4448
values_buffer.extend_zeros(len * size);
49+
Ok(())
4550
}

arrow-data/src/transform/fixed_size_list.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use crate::ArrayData;
19-
use arrow_schema::DataType;
19+
use arrow_schema::{ArrowError, DataType};
2020

2121
use super::{_MutableArrayData, Extend};
2222

@@ -28,22 +28,25 @@ pub(super) fn build_extend(array: &ArrayData) -> Extend<'_> {
2828

2929
Box::new(
3030
move |mutable: &mut _MutableArrayData, index: usize, start: usize, len: usize| {
31-
mutable
32-
.child_data
33-
.iter_mut()
34-
.for_each(|child| child.extend(index, start * size, (start + len) * size))
31+
for child in mutable.child_data.iter_mut() {
32+
child.try_extend(index, start * size, (start + len) * size)?;
33+
}
34+
Ok(())
3535
},
3636
)
3737
}
3838

39-
pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) {
39+
pub(super) fn extend_nulls(
40+
mutable: &mut _MutableArrayData,
41+
len: usize,
42+
) -> Result<(), ArrowError> {
4043
let size = match mutable.data_type {
4144
DataType::FixedSizeList(_, i) => i as usize,
4245
_ => unreachable!(),
4346
};
4447

45-
mutable
46-
.child_data
47-
.iter_mut()
48-
.for_each(|child| child.extend_nulls(len * size))
48+
for child in mutable.child_data.iter_mut() {
49+
child.try_extend_nulls(len * size)?;
50+
}
51+
Ok(())
4952
}

arrow-data/src/transform/list.rs

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@
1717

1818
use super::{
1919
_MutableArrayData, Extend,
20-
utils::{extend_offsets, get_last_offset},
20+
utils::{get_last_offset, try_extend_offsets},
2121
};
2222
use crate::ArrayData;
2323
use arrow_buffer::ArrowNativeType;
24+
use arrow_schema::ArrowError;
2425
use num_integer::Integer;
2526
use num_traits::CheckedAdd;
2627

@@ -36,9 +37,13 @@ pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
3637
let last_offset: T = unsafe { get_last_offset(offset_buffer) };
3738

3839
// offsets
39-
extend_offsets::<T>(offset_buffer, last_offset, &offsets[start..start + len + 1]);
40+
try_extend_offsets::<T>(
41+
offset_buffer,
42+
last_offset,
43+
&offsets[start..start + len + 1],
44+
)?;
4045

41-
mutable.child_data[0].extend(
46+
mutable.child_data[0].try_extend(
4247
index,
4348
offsets[start].as_usize(),
4449
offsets[start + len].as_usize(),
@@ -47,11 +52,15 @@ pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
4752
)
4853
}
4954

50-
pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
55+
pub(super) fn extend_nulls<T: ArrowNativeType>(
56+
mutable: &mut _MutableArrayData,
57+
len: usize,
58+
) -> Result<(), ArrowError> {
5159
let offset_buffer = &mut mutable.buffer1;
5260

5361
// this is safe due to how offset is built. See details on `get_last_offset`
5462
let last_offset: T = unsafe { get_last_offset(offset_buffer) };
5563

56-
(0..len).for_each(|_| offset_buffer.push(last_offset))
64+
(0..len).for_each(|_| offset_buffer.push(last_offset));
65+
Ok(())
5766
}

arrow-data/src/transform/list_view.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use crate::ArrayData;
1919
use crate::transform::_MutableArrayData;
2020
use arrow_buffer::ArrowNativeType;
21+
use arrow_schema::ArrowError;
2122
use num_integer::Integer;
2223
use num_traits::CheckedAdd;
2324

@@ -33,23 +34,35 @@ pub(super) fn build_extend<T: ArrowNativeType + Integer + CheckedAdd>(
3334
for i in start..start + len {
3435
mutable.buffer1.push(new_offset);
3536
mutable.buffer2.push(sizes[i]);
36-
new_offset = new_offset.checked_add(&sizes[i]).expect("offset overflow");
37+
new_offset = new_offset.checked_add(&sizes[i]).ok_or_else(|| {
38+
ArrowError::InvalidArgumentError(
39+
"offset overflow: data exceeds the capacity of the offset type. \
40+
Try splitting into smaller batches or using a larger type \
41+
(e.g. LargeListView instead of ListView)"
42+
.to_string(),
43+
)
44+
})?;
3745

3846
let size = sizes[i].as_usize();
3947
if size > 0 {
4048
let child_start = offsets[i].as_usize();
41-
mutable.child_data[0].extend(index, child_start, child_start + size);
49+
mutable.child_data[0].try_extend(index, child_start, child_start + size)?;
4250
}
4351
}
52+
Ok(())
4453
},
4554
)
4655
}
4756

48-
pub(super) fn extend_nulls<T: ArrowNativeType>(mutable: &mut _MutableArrayData, len: usize) {
57+
pub(super) fn extend_nulls<T: ArrowNativeType>(
58+
mutable: &mut _MutableArrayData,
59+
len: usize,
60+
) -> Result<(), ArrowError> {
4961
let offset_buffer = &mut mutable.buffer1;
5062
let sizes_buffer = &mut mutable.buffer2;
5163

5264
// We push 0 as a placeholder for NULL values in both the offsets and sizes
5365
(0..len).for_each(|_| offset_buffer.push(T::default()));
5466
(0..len).for_each(|_| sizes_buffer.push(T::default()));
67+
Ok(())
5568
}

arrow-data/src/transform/mod.rs

Lines changed: 83 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,10 @@ mod variable_size;
4545
type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>;
4646
// function that extends `[start..start+len]` to the mutable array.
4747
// this is dynamic because different data_types influence how buffers and children are extended.
48-
type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>;
48+
type Extend<'a> =
49+
Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) -> Result<(), ArrowError> + 'a>;
4950

50-
type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>;
51+
type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize) -> Result<(), ArrowError>>;
5152

5253
/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData].
5354
/// This is just a data container.
@@ -230,7 +231,8 @@ fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend<'_> {
230231
let mut view = ByteView::from(*v);
231232
view.buffer_index += buffer_offset;
232233
view.into()
233-
}))
234+
}));
235+
Ok(())
234236
},
235237
)
236238
}
@@ -628,7 +630,10 @@ impl<'a> MutableArrayData<'a> {
628630
let mut mutable = MutableArrayData::new(dictionaries, false, capacity);
629631

630632
for (i, len) in lengths.iter().enumerate() {
631-
mutable.extend(i, 0, *len)
633+
mutable.try_extend(i, 0, *len).expect(
634+
"extend failed while building dictionary; \
635+
this is a bug in MutableArrayData",
636+
)
632637
}
633638

634639
(Some(mutable.freeze()), true)
@@ -716,36 +721,103 @@ impl<'a> MutableArrayData<'a> {
716721
}
717722
}
718723

719-
/// Extends the in progress array with a region of the input arrays
724+
/// Extends the in progress array with a region of the input arrays, returning an error on
725+
/// overflow.
726+
///
727+
/// Prefer this over [`extend`](Self::extend) to handle cases where the data exceeds the
728+
/// capacity of the offset type (e.g. more than 2 GiB in a `StringArray`). The error message
729+
/// will indicate which array type overflowed and suggest a larger type.
720730
///
721731
/// # Arguments
722-
/// * `index` - the index of array that you what to copy values from
732+
/// * `index` - the index of array that you want to copy values from
723733
/// * `start` - the start index of the chunk (inclusive)
724734
/// * `end` - the end index of the chunk (exclusive)
725735
///
736+
/// # Errors
737+
/// Returns an error if offset arithmetic overflows the underlying integer type.
738+
///
726739
/// # Panic
727740
/// This function panics if there is an invalid index,
728741
/// i.e. `index` >= the number of source arrays
729742
/// or `end` > the length of the `index`th array
730-
pub fn extend(&mut self, index: usize, start: usize, end: usize) {
743+
pub fn try_extend(
744+
&mut self,
745+
index: usize,
746+
start: usize,
747+
end: usize,
748+
) -> Result<(), ArrowError> {
731749
let len = end - start;
732750
(self.extend_null_bits[index])(&mut self.data, start, len);
733-
(self.extend_values[index])(&mut self.data, index, start, len);
751+
// Snapshot buffer lengths before attempting the extend so we can roll
752+
// back to a consistent state if it fails.
753+
let buf1_len = self.data.buffer1.len();
754+
let buf2_len = self.data.buffer2.len();
755+
if let Err(e) = (self.extend_values[index])(&mut self.data, index, start, len) {
756+
// Restore buffers to their pre-call lengths so the array remains
757+
// in a valid state for the caller to inspect or retry.
758+
self.data.buffer1.truncate(buf1_len);
759+
self.data.buffer2.truncate(buf2_len);
760+
return Err(e);
761+
}
734762
self.data.len += len;
763+
Ok(())
735764
}
736765

737-
/// Extends the in progress array with null elements, ignoring the input arrays.
766+
767+
/// Extends the in progress array with a region of the input arrays.
768+
///
769+
/// # Deprecated
770+
/// Use [`try_extend`](Self::try_extend) instead, which returns an [`ArrowError`] on overflow
771+
/// rather than panicking.
772+
///
773+
/// # Panic
774+
/// This function panics if there is an invalid index,
775+
/// i.e. `index` >= the number of source arrays,
776+
/// `end` > the length of the `index`th array,
777+
/// or the offset type overflows (e.g. more than 2 GiB in a `StringArray`).
778+
#[deprecated(
779+
note = "Use `try_extend` which returns an error on overflow instead of panicking"
780+
)]
781+
pub fn extend(&mut self, index: usize, start: usize, end: usize) {
782+
self.try_extend(index, start, end)
783+
.expect("extend failed due to offset overflow")
784+
}
785+
786+
/// Extends the in progress array with null elements, ignoring the input arrays, returning an
787+
/// error on overflow.
788+
///
789+
/// Prefer this over [`extend_nulls`](Self::extend_nulls) to handle cases where the run-end
790+
/// counter overflows (relevant for `RunEndEncoded` arrays).
738791
///
739792
/// # Panics
740793
///
741794
/// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays
742-
pub fn extend_nulls(&mut self, len: usize) {
795+
pub fn try_extend_nulls(&mut self, len: usize) -> Result<(), ArrowError> {
743796
self.data.len += len;
744797
let bit_len = bit_util::ceil(self.data.len, 8);
745798
let nulls = self.data.null_buffer();
746799
nulls.resize(bit_len, 0);
747800
self.data.null_count += len;
748-
(self.extend_nulls)(&mut self.data, len);
801+
(self.extend_nulls)(&mut self.data, len)?;
802+
Ok(())
803+
}
804+
805+
/// Extends the in progress array with null elements, ignoring the input arrays.
806+
///
807+
/// # Deprecated
808+
/// Use [`try_extend_nulls`](Self::try_extend_nulls) instead, which returns an [`ArrowError`]
809+
/// on overflow rather than panicking.
810+
///
811+
/// # Panics
812+
///
813+
/// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays,
814+
/// or if the run-end counter overflows for `RunEndEncoded` arrays.
815+
#[deprecated(
816+
note = "Use `try_extend_nulls` which returns an error on overflow instead of panicking"
817+
)]
818+
pub fn extend_nulls(&mut self, len: usize) {
819+
self.try_extend_nulls(len)
820+
.expect("extend_nulls failed due to overflow")
749821
}
750822

751823
/// Returns the current length

0 commit comments

Comments
 (0)