Skip to content

Commit bfbe554

Browse files
committed
fix:[9018]Fixed RunArray slice offsets
1 parent 6aae76b commit bfbe554

File tree

8 files changed

+137
-35
lines changed

8 files changed

+137
-35
lines changed

arrow-array/src/array/run_array.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,16 @@ impl<R: RunEndIndexType> RunArray<R> {
136136
&self.values
137137
}
138138

139+
/// Similar to [`values`] but accounts for logical slicing, returning only the values
140+
/// that are part of the logical slice of this array.
141+
///
142+
/// [`values`]: Self::values
143+
pub fn values_slice(&self) -> ArrayRef {
144+
let start = self.get_start_physical_index();
145+
let end = self.get_end_physical_index();
146+
self.values.slice(start, end - start + 1)
147+
}
148+
139149
/// Returns the physical index at which the array slice starts.
140150
///
141151
/// See [`RunEndBuffer::get_start_physical_index`].
@@ -1132,4 +1142,35 @@ mod tests {
11321142

11331143
assert_eq!(array_i16_1, array_i16_2);
11341144
}
1145+
1146+
#[test]
1147+
fn test_run_array_values_slice() {
1148+
// 0, 0, 1, 1, 1, 2...2 (15 2s)
1149+
let run_ends: PrimitiveArray<Int32Type> = vec![2, 5, 20].into();
1150+
let values: PrimitiveArray<Int32Type> = vec![0, 1, 2].into();
1151+
let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1152+
1153+
let slice = array.slice(1, 4); // 0 | 1, 1, 1 |
1154+
// logical indices: 1, 2, 3, 4
1155+
// physical indices: 0, 1, 1, 1
1156+
// values at 0 is 0
1157+
// values at 1 is 1
1158+
// values slice should be [0, 1]
1159+
assert_eq!(slice.get_start_physical_index(), 0);
1160+
assert_eq!(slice.get_end_physical_index(), 1);
1161+
1162+
let values_slice = slice.values_slice();
1163+
let values_slice = values_slice.as_primitive::<Int32Type>();
1164+
assert_eq!(values_slice.values(), &[0, 1]);
1165+
1166+
let slice2 = array.slice(2, 3); // 1, 1, 1
1167+
// logical indices: 2, 3, 4
1168+
// physical indices: 1, 1, 1
1169+
assert_eq!(slice2.get_start_physical_index(), 1);
1170+
assert_eq!(slice2.get_end_physical_index(), 1);
1171+
1172+
let values_slice2 = slice2.values_slice();
1173+
let values_slice2 = values_slice2.as_primitive::<Int32Type>();
1174+
assert_eq!(values_slice2.values(), &[1]);
1175+
}
11351176
}

arrow-buffer/src/buffer/run.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,24 @@ where
189189
&self.run_ends
190190
}
191191

192+
/// Returns an iterator yielding run ends adjusted for the logical slice.
193+
///
194+
/// Each yielded value is subtracted by the [`logical_offset`] and capped
195+
/// at the [`logical_length`].
196+
///
197+
/// [`logical_offset`]: Self::offset
198+
/// [`logical_length`]: Self::len
199+
pub fn sliced_values(&self) -> impl Iterator<Item = E> + '_ {
200+
let offset = self.logical_offset;
201+
let len = self.logical_length;
202+
let start = self.get_start_physical_index();
203+
let end = self.get_end_physical_index();
204+
self.run_ends[start..=end].iter().map(move |&val| {
205+
let val = val.as_usize().saturating_sub(offset).min(len);
206+
E::from_usize(val).unwrap()
207+
})
208+
}
209+
192210
/// Returns the maximum run-end encoded in the underlying buffer; that is, the
193211
/// last physical run of the buffer. This does not take into account any logical
194212
/// slicing that may have occurred.
@@ -368,4 +386,26 @@ mod tests {
368386
assert_eq!(buffer.get_start_physical_index(), 0);
369387
assert_eq!(buffer.get_end_physical_index(), 0);
370388
}
389+
390+
#[test]
391+
fn test_sliced_values() {
392+
// [0, 0, 1, 2, 2, 2]
393+
let buffer = RunEndBuffer::new(vec![2i32, 3, 6].into(), 0, 6);
394+
395+
// Slice: [0, 1, 2, 2] start: 1, len: 4
396+
// Logical indices: 1, 2, 3, 4
397+
// Original run ends: [2, 3, 6]
398+
// Adjusted: [2-1, 3-1, 6-1] capped at 4 -> [1, 2, 4]
399+
let sliced = buffer.slice(1, 4);
400+
let sliced_values: Vec<i32> = sliced.sliced_values().collect();
401+
assert_eq!(sliced_values, &[1, 2, 4]);
402+
403+
// Slice: [2, 2] start: 4, len: 2
404+
// Original run ends: [2, 3, 6]
405+
// Slicing at 4 means we only have the last run (physical index 2, which ends at 6)
406+
// Adjusted: [6-4] capped at 2 -> [2]
407+
let sliced = buffer.slice(4, 2);
408+
let sliced_values: Vec<i32> = sliced.sliced_values().collect();
409+
assert_eq!(sliced_values, &[2]);
410+
}
371411
}

arrow-cast/src/cast/mod.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11746,9 +11746,6 @@ mod tests {
1174611746
}
1174711747

1174811748
#[test]
11749-
#[should_panic = "assertion `left == right` failed\n left: ScalarBuffer([1, 1, 2])\n right: [2, 2, 3]"]
11750-
// TODO: fix cast of RunArrays to account for sliced RunArray's
11751-
// https://github.com/apache/arrow-rs/issues/9018
1175211749
fn test_sliced_run_end_encoded_to_primitive() {
1175311750
let run_ends = Int32Array::from(vec![2, 5, 6]);
1175411751
let values = Int32Array::from(vec![1, 2, 3]);

arrow-cast/src/cast/run_array.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,17 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
7272

7373
// Expand to logical form
7474
_ => {
75-
let run_ends = run_array.run_ends().values().to_vec();
76-
let mut indices = Vec::with_capacity(run_array.run_ends().len());
77-
let mut physical_idx: usize = 0;
78-
for logical_idx in 0..run_array.run_ends().len() {
79-
// If the logical index is equal to the (next) run end, increment the physical index,
80-
// since we are at the end of a run.
75+
let len = run_array.len();
76+
let offset = run_array.offset();
77+
let run_ends = run_array.run_ends().values();
78+
79+
let mut indices = Vec::with_capacity(len);
80+
let mut physical_idx = run_array.get_start_physical_index();
81+
82+
for logical_idx in offset..offset + len {
8183
if logical_idx == run_ends[physical_idx].as_usize() {
84+
// If the logical index is equal to the (next) run end, increment the physical index,
85+
// since we are at the end of a run.
8286
physical_idx += 1;
8387
}
8488
indices.push(physical_idx as i32);

arrow-data/src/transform/run.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> {
206206
let (run_ends_bytes, values_range) = build_extend_arrays::<$run_end_type>(
207207
source_buffer,
208208
source_run_ends.len(),
209-
start,
209+
start + array.offset(),
210210
len,
211211
dest_last_run_end,
212212
);

arrow-select/src/concat.rs

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ where
364364
run_arrays
365365
.iter()
366366
.scan(R::default_value(), |acc, run_array| {
367-
*acc = *acc + *run_array.run_ends().values().last().unwrap();
367+
*acc = *acc + R::Native::from_usize(run_array.len()).unwrap();
368368
Some(*acc)
369369
}),
370370
)
@@ -379,18 +379,17 @@ where
379379
let adjustment = needed_run_end_adjustments[i];
380380
run_array
381381
.run_ends()
382-
.values()
383-
.iter()
384-
.map(move |run_end| *run_end + adjustment)
382+
.sliced_values()
383+
.map(move |run_end| run_end + adjustment)
385384
},
386385
));
387386

388-
let all_values = concat(
389-
&run_arrays
390-
.iter()
391-
.map(|x| x.values().as_ref())
392-
.collect::<Vec<_>>(),
393-
)?;
387+
let values_slices: Vec<ArrayRef> = run_arrays
388+
.iter()
389+
.map(|run_array| run_array.values_slice())
390+
.collect();
391+
392+
let all_values = concat(&values_slices.iter().map(|x| x.as_ref()).collect::<Vec<_>>())?;
394393

395394
let builder = ArrayDataBuilder::new(run_arrays[0].data_type().clone())
396395
.len(total_len)
@@ -1716,9 +1715,6 @@ mod tests {
17161715
}
17171716

17181717
#[test]
1719-
#[should_panic = "assertion `left == right` failed\n left: [20, 20, 40, 40, 40]\n right: [10, 10, 20, 20, 30, 40, 40, 40]"]
1720-
// TODO: fix concat of RunArrays to account for sliced RunArray's
1721-
// https://github.com/apache/arrow-rs/issues/9018
17221718
fn test_concat_sliced_run_array() {
17231719
// Slicing away first run in both arrays
17241720
let run_ends1 = Int32Array::from(vec![2, 4]);
@@ -1879,4 +1875,29 @@ mod tests {
18791875
assert_eq!(values.len(), 6);
18801876
assert_eq!(&[10, 20, 30, 40, 50, 60], values.values());
18811877
}
1878+
1879+
#[test]
1880+
fn test_concat_run_array_with_truncated_run() {
1881+
// Create a run array with run ends [2, 5] and values [10, 20]
1882+
// Logical: [10, 10, 20, 20, 20]
1883+
let run_ends1 = Int32Array::from(vec![2, 5]);
1884+
let values1 = Int32Array::from(vec![10, 20]);
1885+
let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
1886+
let array1_sliced = array1.slice(0, 3);
1887+
1888+
let run_ends2 = Int32Array::from(vec![2]);
1889+
let values2 = Int32Array::from(vec![30]);
1890+
let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
1891+
1892+
let result = concat(&[&array1_sliced, &array2]).unwrap();
1893+
let result_run_array = result.as_run::<Int32Type>();
1894+
1895+
// Result should be [10, 10, 20, 30, 30]
1896+
// Run ends should be [2, 3, 5]
1897+
assert_eq!(result_run_array.len(), 5);
1898+
let run_ends = result_run_array.run_ends().values();
1899+
let values = result_run_array.values().as_primitive::<Int32Type>();
1900+
assert_eq!(values.values(), &[10, 20, 30]);
1901+
assert_eq!(&[2, 3, 5], run_ends);
1902+
}
18821903
}

arrow-select/src/filter.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -487,17 +487,22 @@ where
487487
R::Native: AddAssign,
488488
{
489489
let run_ends: &RunEndBuffer<R::Native> = array.run_ends();
490-
let mut new_run_ends = vec![R::default_value(); run_ends.len()];
490+
let start_physical = run_ends.get_start_physical_index();
491+
let end_physical = run_ends.get_end_physical_index();
492+
let physical_len = end_physical - start_physical + 1;
493+
494+
let mut new_run_ends = vec![R::default_value(); physical_len];
495+
let offset = run_ends.offset() as u64;
491496

492497
let mut start = 0u64;
493498
let mut j = 0;
494499
let mut count = R::default_value();
495500
let filter_values = predicate.filter.values();
496501
let run_ends = run_ends.inner();
497502

498-
let pred: BooleanArray = BooleanBuffer::collect_bool(run_ends.len(), |i| {
503+
let pred: BooleanArray = BooleanBuffer::collect_bool(physical_len, |i| {
499504
let mut keep = false;
500-
let mut end = run_ends[i].into() as u64;
505+
let mut end = (run_ends[i + start_physical].into() as u64).saturating_sub(offset);
501506
let difference = end.saturating_sub(filter_values.len() as u64);
502507
end -= difference;
503508

@@ -517,8 +522,8 @@ where
517522

518523
new_run_ends.truncate(j);
519524

520-
let values = array.values();
521-
let values = filter(&values, &pred)?;
525+
let values = array.values_slice();
526+
let values = filter(values.as_ref(), &pred)?;
522527

523528
let run_ends = PrimitiveArray::<R>::try_new(new_run_ends.into(), None)?;
524529
RunArray::try_new(&run_ends, &values)
@@ -1355,9 +1360,6 @@ mod tests {
13551360
}
13561361

13571362
#[test]
1358-
#[should_panic = "assertion `left == right` failed\n left: [-2, 9]\n right: [7, -2]"]
1359-
// TODO: fix filter of RunArrays to account for sliced RunArray's
1360-
// https://github.com/apache/arrow-rs/issues/9018
13611363
fn test_filter_run_end_encoding_array_sliced() {
13621364
let run_ends = Int64Array::from(vec![2, 3, 8]);
13631365
let values = Int64Array::from(vec![7, -2, 9]);

arrow-select/src/interleave.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,9 +1172,6 @@ mod tests {
11721172
}
11731173

11741174
#[test]
1175-
#[should_panic = "assertion `left == right` failed\n left: [1, 4, 2, 5, 6]\n right: [2, 5, 2, 5, 6]"]
1176-
// TODO: fix interleave of RunArrays to account for sliced RunArray's
1177-
// https://github.com/apache/arrow-rs/issues/9018
11781175
fn test_interleave_run_end_encoded_sliced() {
11791176
let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
11801177
builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some));
@@ -1186,7 +1183,7 @@ mod tests {
11861183
let b = builder.finish();
11871184
let b = b.slice(1, 3); // [5, 5, 6]
11881185

1189-
let indices = &[(0, 1), (1, 0), (0, 3), (1, 2), (1, 3)];
1186+
let indices = &[(0, 1), (1, 0), (0, 2), (1, 1), (1, 2)];
11901187
let result = interleave(&[&a, &b], indices).unwrap();
11911188

11921189
let result = result.as_run::<Int32Type>();

0 commit comments

Comments
 (0)