|
16 | 16 | // under the License. |
17 | 17 |
|
18 | 18 | use crate::data::ArrayData; |
| 19 | +use arrow_buffer::ArrowNativeType; |
| 20 | +use arrow_schema::DataType; |
| 21 | +use num_traits::ToPrimitive; |
19 | 22 |
|
20 | 23 | use super::equal_range; |
21 | 24 |
|
22 | | -/// The current implementation of comparison of run array support physical comparison. |
23 | | -/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will |
24 | | -/// be time consuming as converting from logical index to physical index cannot be done |
25 | | -/// in constant time. The current comparison compares the underlying physical arrays. |
26 | 25 | pub(super) fn run_equal( |
27 | 26 | lhs: &ArrayData, |
28 | 27 | rhs: &ArrayData, |
29 | 28 | lhs_start: usize, |
30 | 29 | rhs_start: usize, |
31 | 30 | len: usize, |
32 | 31 | ) -> bool { |
33 | | - if lhs_start != 0 |
34 | | - || rhs_start != 0 |
35 | | - || (lhs.len() != len && rhs.len() != len) |
36 | | - || lhs.offset() > 0 |
37 | | - || rhs.offset() > 0 |
38 | | - { |
39 | | - unimplemented!("Logical comparison for run array not supported.") |
| 32 | + let lhs_index_type = match lhs.data_type() { |
| 33 | + DataType::RunEndEncoded(f, _) => f.data_type(), |
| 34 | + _ => unreachable!(), |
| 35 | + }; |
| 36 | + |
| 37 | + match lhs_index_type { |
| 38 | + DataType::Int16 => run_equal_inner::<i16>(lhs, rhs, lhs_start, rhs_start, len), |
| 39 | + DataType::Int32 => run_equal_inner::<i32>(lhs, rhs, lhs_start, rhs_start, len), |
| 40 | + DataType::Int64 => run_equal_inner::<i64>(lhs, rhs, lhs_start, rhs_start, len), |
| 41 | + _ => unreachable!(), |
40 | 42 | } |
| 43 | +} |
41 | 44 |
|
42 | | - if lhs.len() != rhs.len() { |
43 | | - return false; |
| 45 | +fn run_equal_inner<T: ArrowNativeType + ToPrimitive>( |
| 46 | + lhs: &ArrayData, |
| 47 | + rhs: &ArrayData, |
| 48 | + lhs_start: usize, |
| 49 | + rhs_start: usize, |
| 50 | + len: usize, |
| 51 | +) -> bool { |
| 52 | + if len == 0 { |
| 53 | + return true; |
44 | 54 | } |
| 55 | + // RunEndEncoded arrays are guaranteed to have at least 2 children [run_ends, values] |
| 56 | + let lhs_run_ends_data = &lhs.child_data()[0]; |
| 57 | + let rhs_run_ends_data = &rhs.child_data()[0]; |
| 58 | + let lhs_values = &lhs.child_data()[1]; |
| 59 | + let rhs_values = &rhs.child_data()[1]; |
45 | 60 |
|
46 | | - let lhs_child_data = lhs.child_data(); |
47 | | - let lhs_run_ends_array = &lhs_child_data[0]; |
48 | | - let lhs_values_array = &lhs_child_data[1]; |
| 61 | + let lhs_run_ends = &lhs_run_ends_data.buffers()[0].typed_data::<T>() |
| 62 | + [lhs_run_ends_data.offset()..lhs_run_ends_data.offset() + lhs_run_ends_data.len()]; |
| 63 | + let rhs_run_ends = &rhs_run_ends_data.buffers()[0].typed_data::<T>() |
| 64 | + [rhs_run_ends_data.offset()..rhs_run_ends_data.offset() + rhs_run_ends_data.len()]; |
49 | 65 |
|
50 | | - let rhs_child_data = rhs.child_data(); |
51 | | - let rhs_run_ends_array = &rhs_child_data[0]; |
52 | | - let rhs_values_array = &rhs_child_data[1]; |
| 66 | + let lhs_abs_start = lhs.offset() + lhs_start; |
| 67 | + let rhs_abs_start = rhs.offset() + rhs_start; |
| 68 | + let lhs_abs_end = lhs_abs_start + len; |
| 69 | + let rhs_abs_end = rhs_abs_start + len; |
53 | 70 |
|
54 | | - if lhs_run_ends_array.len() != rhs_run_ends_array.len() { |
55 | | - return false; |
56 | | - } |
| 71 | + let l_start_phys = find_physical_index(lhs_run_ends, lhs_abs_start); |
| 72 | + let r_start_phys = find_physical_index(rhs_run_ends, rhs_abs_start); |
| 73 | + |
| 74 | + let l_end_phys = find_physical_index(lhs_run_ends, lhs_abs_end - 1); |
| 75 | + let r_end_phys = find_physical_index(rhs_run_ends, rhs_abs_end - 1); |
| 76 | + |
| 77 | + let mut l_phys = l_start_phys; |
| 78 | + let mut r_phys = r_start_phys; |
57 | 79 |
|
58 | | - if lhs_values_array.len() != rhs_values_array.len() { |
59 | | - return false; |
| 80 | + let l_runs = l_end_phys - l_start_phys + 1; |
| 81 | + let r_runs = r_end_phys - r_start_phys + 1; |
| 82 | + |
| 83 | + if l_runs == r_runs { |
| 84 | + let physical_match = lhs_run_ends[l_start_phys..l_end_phys] |
| 85 | + .iter() |
| 86 | + .zip(&rhs_run_ends[r_start_phys..r_end_phys]) |
| 87 | + .all(|(l, r)| l.as_usize() - lhs_abs_start == r.as_usize() - rhs_abs_start); |
| 88 | + |
| 89 | + if physical_match { |
| 90 | + return equal_range(lhs_values, rhs_values, l_start_phys, r_start_phys, l_runs); |
| 91 | + } |
60 | 92 | } |
61 | 93 |
|
62 | | - // check run ends array are equal. The length of the physical array |
63 | | - // is used to validate the child arrays. |
64 | | - let run_ends_equal = equal_range( |
65 | | - lhs_run_ends_array, |
66 | | - rhs_run_ends_array, |
67 | | - lhs_start, |
68 | | - rhs_start, |
69 | | - lhs_run_ends_array.len(), |
70 | | - ); |
71 | | - |
72 | | - // if run ends array are not the same return early without validating |
73 | | - // values array. |
74 | | - if !run_ends_equal { |
75 | | - return false; |
| 94 | + let mut processed = 0; |
| 95 | + while processed < len { |
| 96 | + if !equal_range(lhs_values, rhs_values, l_phys, r_phys, 1) { |
| 97 | + return false; |
| 98 | + } |
| 99 | + |
| 100 | + let l_run_end = lhs_run_ends[l_phys].as_usize(); |
| 101 | + let r_run_end = rhs_run_ends[r_phys].as_usize(); |
| 102 | + |
| 103 | + let l_remaining = l_run_end - (lhs_abs_start + processed); |
| 104 | + let r_remaining = r_run_end - (rhs_abs_start + processed); |
| 105 | + |
| 106 | + let step = l_remaining.min(r_remaining).min(len - processed); |
| 107 | + processed += step; |
| 108 | + |
| 109 | + if processed < len { |
| 110 | + if lhs_abs_start + processed == l_run_end { |
| 111 | + l_phys += 1; |
| 112 | + } |
| 113 | + if rhs_abs_start + processed == r_run_end { |
| 114 | + r_phys += 1; |
| 115 | + } |
| 116 | + } |
76 | 117 | } |
77 | 118 |
|
78 | | - // check values array are equal |
79 | | - equal_range( |
80 | | - lhs_values_array, |
81 | | - rhs_values_array, |
82 | | - lhs_start, |
83 | | - rhs_start, |
84 | | - rhs_values_array.len(), |
85 | | - ) |
| 119 | + true |
| 120 | +} |
| 121 | + |
| 122 | +fn find_physical_index<T: ArrowNativeType + ToPrimitive>( |
| 123 | + run_ends: &[T], |
| 124 | + logical_index: usize, |
| 125 | +) -> usize { |
| 126 | + if logical_index == 0 { |
| 127 | + return 0; |
| 128 | + } |
| 129 | + match run_ends.binary_search_by(|val| val.as_usize().cmp(&logical_index)) { |
| 130 | + Ok(idx) => idx + 1, |
| 131 | + Err(idx) => idx, |
| 132 | + } |
86 | 133 | } |
0 commit comments