Skip to content

Commit b904318

Browse files
authored
fix:[9018]Fixed RunArray slice offsets (#9036)
# Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. --> - Part of #9018 . # Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> To consider offset in slicing of RunArray. # What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> 1. Considered offset in slicing of RunArray. 2. Enhanced RunArray slice API. # Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 3. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> yes # Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out. --> Yes, extended API to access RunArray slices directly than getting it from index.
1 parent 3f5eebd commit b904318

File tree

8 files changed

+137
-35
lines changed

8 files changed

+137
-35
lines changed

arrow-array/src/array/run_array.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,16 @@ impl<R: RunEndIndexType> RunArray<R> {
136136
&self.values
137137
}
138138

139+
/// Similar to [`values`] but accounts for logical slicing, returning only the values
140+
/// that are part of the logical slice of this array.
141+
///
142+
/// [`values`]: Self::values
143+
pub fn values_slice(&self) -> ArrayRef {
144+
let start = self.get_start_physical_index();
145+
let end = self.get_end_physical_index();
146+
self.values.slice(start, end - start + 1)
147+
}
148+
139149
/// Returns the physical index at which the array slice starts.
140150
///
141151
/// See [`RunEndBuffer::get_start_physical_index`].
@@ -1132,4 +1142,35 @@ mod tests {
11321142

11331143
assert_eq!(array_i16_1, array_i16_2);
11341144
}
1145+
1146+
#[test]
1147+
fn test_run_array_values_slice() {
1148+
// 0, 0, 1, 1, 1, 2...2 (15 2s)
1149+
let run_ends: PrimitiveArray<Int32Type> = vec![2, 5, 20].into();
1150+
let values: PrimitiveArray<Int32Type> = vec![0, 1, 2].into();
1151+
let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1152+
1153+
let slice = array.slice(1, 4); // 0 | 1, 1, 1 |
1154+
// logical indices: 1, 2, 3, 4
1155+
// physical indices: 0, 1, 1, 1
1156+
// values at 0 is 0
1157+
// values at 1 is 1
1158+
// values slice should be [0, 1]
1159+
assert_eq!(slice.get_start_physical_index(), 0);
1160+
assert_eq!(slice.get_end_physical_index(), 1);
1161+
1162+
let values_slice = slice.values_slice();
1163+
let values_slice = values_slice.as_primitive::<Int32Type>();
1164+
assert_eq!(values_slice.values(), &[0, 1]);
1165+
1166+
let slice2 = array.slice(2, 3); // 1, 1, 1
1167+
// logical indices: 2, 3, 4
1168+
// physical indices: 1, 1, 1
1169+
assert_eq!(slice2.get_start_physical_index(), 1);
1170+
assert_eq!(slice2.get_end_physical_index(), 1);
1171+
1172+
let values_slice2 = slice2.values_slice();
1173+
let values_slice2 = values_slice2.as_primitive::<Int32Type>();
1174+
assert_eq!(values_slice2.values(), &[1]);
1175+
}
11351176
}

arrow-buffer/src/buffer/run.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,24 @@ where
189189
&self.run_ends
190190
}
191191

192+
/// Returns an iterator yielding run ends adjusted for the logical slice.
193+
///
194+
/// Each yielded value is subtracted by the [`logical_offset`] and capped
195+
/// at the [`logical_length`].
196+
///
197+
/// [`logical_offset`]: Self::offset
198+
/// [`logical_length`]: Self::len
199+
pub fn sliced_values(&self) -> impl Iterator<Item = E> + '_ {
200+
let offset = self.logical_offset;
201+
let len = self.logical_length;
202+
let start = self.get_start_physical_index();
203+
let end = self.get_end_physical_index();
204+
self.run_ends[start..=end].iter().map(move |&val| {
205+
let val = val.as_usize().saturating_sub(offset).min(len);
206+
E::from_usize(val).unwrap()
207+
})
208+
}
209+
192210
/// Returns the maximum run-end encoded in the underlying buffer; that is, the
193211
/// last physical run of the buffer. This does not take into account any logical
194212
/// slicing that may have occurred.
@@ -368,4 +386,26 @@ mod tests {
368386
assert_eq!(buffer.get_start_physical_index(), 0);
369387
assert_eq!(buffer.get_end_physical_index(), 0);
370388
}
389+
390+
#[test]
391+
fn test_sliced_values() {
392+
// [0, 0, 1, 2, 2, 2]
393+
let buffer = RunEndBuffer::new(vec![2i32, 3, 6].into(), 0, 6);
394+
395+
// Slice: [0, 1, 2, 2] start: 1, len: 4
396+
// Logical indices: 1, 2, 3, 4
397+
// Original run ends: [2, 3, 6]
398+
// Adjusted: [2-1, 3-1, 6-1] capped at 4 -> [1, 2, 4]
399+
let sliced = buffer.slice(1, 4);
400+
let sliced_values: Vec<i32> = sliced.sliced_values().collect();
401+
assert_eq!(sliced_values, &[1, 2, 4]);
402+
403+
// Slice: [2, 2] start: 4, len: 2
404+
// Original run ends: [2, 3, 6]
405+
// Slicing at 4 means we only have the last run (physical index 2, which ends at 6)
406+
// Adjusted: [6-4] capped at 2 -> [2]
407+
let sliced = buffer.slice(4, 2);
408+
let sliced_values: Vec<i32> = sliced.sliced_values().collect();
409+
assert_eq!(sliced_values, &[2]);
410+
}
371411
}

arrow-cast/src/cast/mod.rs

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11705,9 +11705,6 @@ mod tests {
1170511705
}
1170611706

1170711707
#[test]
11708-
#[should_panic = "assertion `left == right` failed\n left: ScalarBuffer([1, 1, 2])\n right: [2, 2, 3]"]
11709-
// TODO: fix cast of RunArrays to account for sliced RunArray's
11710-
// https://github.com/apache/arrow-rs/issues/9018
1171111708
fn test_sliced_run_end_encoded_to_primitive() {
1171211709
let run_ends = Int32Array::from(vec![2, 5, 6]);
1171311710
let values = Int32Array::from(vec![1, 2, 3]);

arrow-cast/src/cast/run_array.rs

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,17 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
7272

7373
// Expand to logical form
7474
_ => {
75-
let run_ends = run_array.run_ends().values().to_vec();
76-
let mut indices = Vec::with_capacity(run_array.run_ends().len());
77-
let mut physical_idx: usize = 0;
78-
for logical_idx in 0..run_array.run_ends().len() {
79-
// If the logical index is equal to the (next) run end, increment the physical index,
80-
// since we are at the end of a run.
75+
let len = run_array.len();
76+
let offset = run_array.offset();
77+
let run_ends = run_array.run_ends().values();
78+
79+
let mut indices = Vec::with_capacity(len);
80+
let mut physical_idx = run_array.get_start_physical_index();
81+
82+
for logical_idx in offset..offset + len {
8183
if logical_idx == run_ends[physical_idx].as_usize() {
84+
// If the logical index is equal to the (next) run end, increment the physical index,
85+
// since we are at the end of a run.
8286
physical_idx += 1;
8387
}
8488
indices.push(physical_idx as i32);

arrow-data/src/transform/run.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> {
206206
let (run_ends_bytes, values_range) = build_extend_arrays::<$run_end_type>(
207207
source_buffer,
208208
source_run_ends.len(),
209-
start,
209+
start + array.offset(),
210210
len,
211211
dest_last_run_end,
212212
);

arrow-select/src/concat.rs

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ where
364364
run_arrays
365365
.iter()
366366
.scan(R::default_value(), |acc, run_array| {
367-
*acc = *acc + *run_array.run_ends().values().last().unwrap();
367+
*acc = *acc + R::Native::from_usize(run_array.len()).unwrap();
368368
Some(*acc)
369369
}),
370370
)
@@ -379,18 +379,17 @@ where
379379
let adjustment = needed_run_end_adjustments[i];
380380
run_array
381381
.run_ends()
382-
.values()
383-
.iter()
384-
.map(move |run_end| *run_end + adjustment)
382+
.sliced_values()
383+
.map(move |run_end| run_end + adjustment)
385384
},
386385
));
387386

388-
let all_values = concat(
389-
&run_arrays
390-
.iter()
391-
.map(|x| x.values().as_ref())
392-
.collect::<Vec<_>>(),
393-
)?;
387+
let values_slices: Vec<ArrayRef> = run_arrays
388+
.iter()
389+
.map(|run_array| run_array.values_slice())
390+
.collect();
391+
392+
let all_values = concat(&values_slices.iter().map(|x| x.as_ref()).collect::<Vec<_>>())?;
394393

395394
let builder = ArrayDataBuilder::new(run_arrays[0].data_type().clone())
396395
.len(total_len)
@@ -1716,9 +1715,6 @@ mod tests {
17161715
}
17171716

17181717
#[test]
1719-
#[should_panic = "assertion `left == right` failed\n left: [20, 20, 40, 40, 40]\n right: [10, 10, 20, 20, 30, 40, 40, 40]"]
1720-
// TODO: fix concat of RunArrays to account for sliced RunArray's
1721-
// https://github.com/apache/arrow-rs/issues/9018
17221718
fn test_concat_sliced_run_array() {
17231719
// Slicing away first run in both arrays
17241720
let run_ends1 = Int32Array::from(vec![2, 4]);
@@ -1879,4 +1875,29 @@ mod tests {
18791875
assert_eq!(values.len(), 6);
18801876
assert_eq!(&[10, 20, 30, 40, 50, 60], values.values());
18811877
}
1878+
1879+
#[test]
1880+
fn test_concat_run_array_with_truncated_run() {
1881+
// Create a run array with run ends [2, 5] and values [10, 20]
1882+
// Logical: [10, 10, 20, 20, 20]
1883+
let run_ends1 = Int32Array::from(vec![2, 5]);
1884+
let values1 = Int32Array::from(vec![10, 20]);
1885+
let array1 = RunArray::try_new(&run_ends1, &values1).unwrap();
1886+
let array1_sliced = array1.slice(0, 3);
1887+
1888+
let run_ends2 = Int32Array::from(vec![2]);
1889+
let values2 = Int32Array::from(vec![30]);
1890+
let array2 = RunArray::try_new(&run_ends2, &values2).unwrap();
1891+
1892+
let result = concat(&[&array1_sliced, &array2]).unwrap();
1893+
let result_run_array = result.as_run::<Int32Type>();
1894+
1895+
// Result should be [10, 10, 20, 30, 30]
1896+
// Run ends should be [2, 3, 5]
1897+
assert_eq!(result_run_array.len(), 5);
1898+
let run_ends = result_run_array.run_ends().values();
1899+
let values = result_run_array.values().as_primitive::<Int32Type>();
1900+
assert_eq!(values.values(), &[10, 20, 30]);
1901+
assert_eq!(&[2, 3, 5], run_ends);
1902+
}
18821903
}

arrow-select/src/filter.rs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -487,17 +487,22 @@ where
487487
R::Native: AddAssign,
488488
{
489489
let run_ends: &RunEndBuffer<R::Native> = array.run_ends();
490-
let mut new_run_ends = vec![R::default_value(); run_ends.len()];
490+
let start_physical = run_ends.get_start_physical_index();
491+
let end_physical = run_ends.get_end_physical_index();
492+
let physical_len = end_physical - start_physical + 1;
493+
494+
let mut new_run_ends = vec![R::default_value(); physical_len];
495+
let offset = run_ends.offset() as u64;
491496

492497
let mut start = 0u64;
493498
let mut j = 0;
494499
let mut count = R::default_value();
495500
let filter_values = predicate.filter.values();
496501
let run_ends = run_ends.inner();
497502

498-
let pred: BooleanArray = BooleanBuffer::collect_bool(run_ends.len(), |i| {
503+
let pred: BooleanArray = BooleanBuffer::collect_bool(physical_len, |i| {
499504
let mut keep = false;
500-
let mut end = run_ends[i].into() as u64;
505+
let mut end = (run_ends[i + start_physical].into() as u64).saturating_sub(offset);
501506
let difference = end.saturating_sub(filter_values.len() as u64);
502507
end -= difference;
503508

@@ -517,8 +522,8 @@ where
517522

518523
new_run_ends.truncate(j);
519524

520-
let values = array.values();
521-
let values = filter(&values, &pred)?;
525+
let values = array.values_slice();
526+
let values = filter(values.as_ref(), &pred)?;
522527

523528
let run_ends = PrimitiveArray::<R>::try_new(new_run_ends.into(), None)?;
524529
RunArray::try_new(&run_ends, &values)
@@ -1355,9 +1360,6 @@ mod tests {
13551360
}
13561361

13571362
#[test]
1358-
#[should_panic = "assertion `left == right` failed\n left: [-2, 9]\n right: [7, -2]"]
1359-
// TODO: fix filter of RunArrays to account for sliced RunArray's
1360-
// https://github.com/apache/arrow-rs/issues/9018
13611363
fn test_filter_run_end_encoding_array_sliced() {
13621364
let run_ends = Int64Array::from(vec![2, 3, 8]);
13631365
let values = Int64Array::from(vec![7, -2, 9]);

arrow-select/src/interleave.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,9 +1172,6 @@ mod tests {
11721172
}
11731173

11741174
#[test]
1175-
#[should_panic = "assertion `left == right` failed\n left: [1, 4, 2, 5, 6]\n right: [2, 5, 2, 5, 6]"]
1176-
// TODO: fix interleave of RunArrays to account for sliced RunArray's
1177-
// https://github.com/apache/arrow-rs/issues/9018
11781175
fn test_interleave_run_end_encoded_sliced() {
11791176
let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
11801177
builder.extend([1, 1, 2, 2, 2, 3].into_iter().map(Some));
@@ -1186,7 +1183,7 @@ mod tests {
11861183
let b = builder.finish();
11871184
let b = b.slice(1, 3); // [5, 5, 6]
11881185

1189-
let indices = &[(0, 1), (1, 0), (0, 3), (1, 2), (1, 3)];
1186+
let indices = &[(0, 1), (1, 0), (0, 2), (1, 1), (1, 2)];
11901187
let result = interleave(&[&a, &b], indices).unwrap();
11911188

11921189
let result = result.as_run::<Int32Type>();

0 commit comments

Comments
 (0)