Skip to content

Commit 86cab0c

Browse files
authored
fix(parquet): fix CDC panic on nested ListArrays with null entries (#9644)
The CDC chunker's value_offset diverged from actual leaf array positions when null list entries had non-empty child offset ranges (valid per the Arrow columnar format spec). This caused slice_for_chunk to produce incorrect non_null_indices, leading to an out-of-bounds panic in write_mini_batch. Track non-null value counts (nni) separately from leaf slot counts in the chunker, and use them in slice_for_chunk to correctly index into non_null_indices regardless of gaps in the leaf array. # Which issue does this PR close? Closes #9637 # Rationale for this change The Arrow spec allows null list entries to own non-empty child segments. When such arrays were written with CDC enabled, `CdcChunk::value_offset` diverged from the actual index into `non_null_indices`, causing an out-of-bounds panic. # What changes are included in this PR? - Redefine `CdcChunk::value_offset`/`num_values` as index/count into `non_null_indices` instead of leaf array positions. - Introduce `leaf_offset` in the nested branch to track leaf array position for hashing separately from `value_offset`. - Rewrite `slice_for_chunk` to directly index `non_null_indices`. # Are these changes tested? Yes. Unit tests for `slice_for_chunk` (nested nulls, all-null chunks) and end-to-end roundtrip tests for lists with non-empty null segments. # Are there any user-facing changes? No.
1 parent 43d984e commit 86cab0c

File tree

3 files changed

+318
-125
lines changed

3 files changed

+318
-125
lines changed

parquet/src/arrow/arrow_writer/levels.rs

Lines changed: 105 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -805,37 +805,29 @@ impl ArrayLevels {
805805

806806
/// Create a sliced view of this `ArrayLevels` for a CDC chunk.
807807
///
808-
/// Note: `def_levels`, `rep_levels`, and `non_null_indices` are copied (not zero-copy),
809-
/// while `array` is sliced without copying.
808+
/// The chunk's `value_offset`/`num_values` select the relevant slice of
809+
/// `non_null_indices`. The array is sliced to the range covered by
810+
/// those indices, and they are shifted to be relative to the slice.
810811
pub(crate) fn slice_for_chunk(&self, chunk: &CdcChunk) -> Self {
811-
let level_offset = chunk.level_offset;
812-
let num_levels = chunk.num_levels;
813-
let value_offset = chunk.value_offset;
814-
let num_values = chunk.num_values;
815-
let def_levels = self
816-
.def_levels
817-
.as_ref()
818-
.map(|levels| levels[level_offset..level_offset + num_levels].to_vec());
819-
let rep_levels = self
820-
.rep_levels
821-
.as_ref()
822-
.map(|levels| levels[level_offset..level_offset + num_levels].to_vec());
823-
824-
// Filter non_null_indices to [value_offset, value_offset + num_values)
825-
// and shift by -value_offset. Use binary search since the slice is sorted.
826-
let value_end = value_offset + num_values;
827-
let start = self
828-
.non_null_indices
829-
.partition_point(|&idx| idx < value_offset);
830-
let end = self
831-
.non_null_indices
832-
.partition_point(|&idx| idx < value_end);
833-
let non_null_indices: Vec<usize> = self.non_null_indices[start..end]
834-
.iter()
835-
.map(|&idx| idx - value_offset)
836-
.collect();
812+
let def_levels = self.def_levels.as_ref().map(|levels| {
813+
levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec()
814+
});
815+
let rep_levels = self.rep_levels.as_ref().map(|levels| {
816+
levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec()
817+
});
837818

838-
let array = self.array.slice(value_offset, num_values);
819+
// Select the non-null indices for this chunk.
820+
let nni = &self.non_null_indices[chunk.value_offset..chunk.value_offset + chunk.num_values];
821+
// Compute the array range spanned by the non-null indices.
822+
// When nni is empty (all-null chunk), start=0, end=0 → zero-length
823+
// array slice; write_batch_internal will process only the def/rep
824+
// levels and write no values.
825+
let start = nni.first().copied().unwrap_or(0);
826+
let end = nni.last().map_or(0, |&i| i + 1);
827+
// Shift indices to be relative to the sliced array.
828+
let non_null_indices = nni.iter().map(|&idx| idx - start).collect();
829+
// Slice the array to the computed range.
830+
let array = self.array.slice(start, end - start);
839831
let logical_nulls = array.logical_nulls();
840832

841833
Self {
@@ -2149,9 +2141,8 @@ mod tests {
21492141
fn test_slice_for_chunk_flat() {
21502142
// Case 1: required field (max_def_level=0, no def/rep levels stored).
21512143
// Array has 6 values; all are non-null so non_null_indices covers every position.
2152-
// The chunk selects value_offset=2, num_values=3 → the sub-array [3, 4, 5].
2153-
// Since there are no levels, num_levels=0 and level_offset are irrelevant.
2154-
// non_null_indices [0,1,2,3,4,5] filtered to [2,4) and shifted by -2 → [0,1,2].
2144+
// value_offset=2, num_values=3 → non_null_indices[2..5] = [2,3,4].
2145+
// Array is sliced (no def_levels → write_batch_internal uses values.len()).
21552146
let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6]));
21562147
let logical_nulls = array.logical_nulls();
21572148
let levels = ArrayLevels {
@@ -2176,14 +2167,9 @@ mod tests {
21762167

21772168
// Case 2: optional field (max_def_level=1, def levels present, no rep levels).
21782169
// Array: [Some(1), None, Some(3), None, Some(5), Some(6)]
2179-
// def_levels: [1, 0, 1, 0, 1, 1] (1=non-null, 0=null)
2180-
// non_null_indices: [0, 2, 4, 5] (array positions of the four non-null values)
2181-
//
2182-
// The chunk selects level_offset=1, num_levels=3, value_offset=1, num_values=3:
2183-
// - def_levels[1..4] = [0, 1, 0] → null, non-null, null
2184-
// - sub-array slice(1, 3) = [None, Some(3), None]
2185-
// - non_null_indices filtered to [value_offset=1, value_end=4): only index 2 qualifies,
2186-
// shifted by -1 → [1] (position of Some(3) within the sliced sub-array)
2170+
// non_null_indices: [0, 2, 4, 5]
2171+
// value_offset=1, num_values=1 → non_null_indices[1..2] = [2].
2172+
// Array is not sliced (def_levels present → num_levels from def_levels.len()).
21872173
let array: ArrayRef = Arc::new(Int32Array::from(vec![
21882174
Some(1),
21892175
None,
@@ -2206,90 +2192,111 @@ mod tests {
22062192
level_offset: 1,
22072193
num_levels: 3,
22082194
value_offset: 1,
2209-
num_values: 3,
2195+
num_values: 1,
22102196
});
22112197
assert_eq!(sliced.def_levels, Some(vec![0, 1, 0]));
22122198
assert!(sliced.rep_levels.is_none());
2213-
assert_eq!(sliced.non_null_indices, vec![1]);
2214-
assert_eq!(sliced.array.len(), 3);
2199+
assert_eq!(sliced.non_null_indices, vec![0]); // [2] shifted by -2 (nni[0])
2200+
assert_eq!(sliced.array.len(), 1);
22152201
}
22162202

22172203
#[test]
2218-
fn test_slice_for_chunk_nested() {
2219-
// [[1,2],[3],[4,5]]: def=[2,2,2,2,2], rep=[0,1,0,0,1]
2220-
// Slice levels 2..5 (def=[2,2,2], rep=[0,0,1]), values 2..5
2221-
let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
2204+
fn test_slice_for_chunk_nested_with_nulls() {
2205+
// Regression test for https://github.com/apache/arrow-rs/issues/9637
2206+
//
2207+
// Simulates a List<Int32?> where null list entries have non-zero child
2208+
// ranges (valid per Arrow spec: "a null value may correspond to a
2209+
// non-empty segment in the child array"). This creates gaps in the
2210+
// leaf array that don't correspond to any levels.
2211+
//
2212+
// 5 rows with 2 null list entries owning non-empty child ranges:
2213+
// row 0: [1] → leaf[0]
2214+
// row 1: null list → owns leaf[1..3] (gap of 2)
2215+
// row 2: [2, null] → leaf[3], leaf[4]=null element
2216+
// row 3: null list → owns leaf[5..8] (gap of 3)
2217+
// row 4: [4, 5] → leaf[8], leaf[9]
2218+
//
2219+
// def_levels: [3, 0, 3, 2, 0, 3, 3]
2220+
// rep_levels: [0, 0, 0, 1, 0, 0, 1]
2221+
// non_null_indices: [0, 3, 8, 9]
2222+
// gaps in array: 0→3 (skip 1,2), 3→8 (skip 5,6,7)
2223+
let array: ArrayRef = Arc::new(Int32Array::from(vec![
2224+
Some(1), // 0: row 0
2225+
None, // 1: gap (null list row 1)
2226+
None, // 2: gap (null list row 1)
2227+
Some(2), // 3: row 2
2228+
None, // 4: row 2, null element
2229+
None, // 5: gap (null list row 3)
2230+
None, // 6: gap (null list row 3)
2231+
None, // 7: gap (null list row 3)
2232+
Some(4), // 8: row 4
2233+
Some(5), // 9: row 4
2234+
]));
22222235
let logical_nulls = array.logical_nulls();
22232236
let levels = ArrayLevels {
2224-
def_levels: Some(vec![2, 2, 2, 2, 2]),
2225-
rep_levels: Some(vec![0, 1, 0, 0, 1]),
2226-
non_null_indices: vec![0, 1, 2, 3, 4],
2227-
max_def_level: 2,
2237+
def_levels: Some(vec![3, 0, 3, 2, 0, 3, 3]),
2238+
rep_levels: Some(vec![0, 0, 0, 1, 0, 0, 1]),
2239+
non_null_indices: vec![0, 3, 8, 9],
2240+
max_def_level: 3,
22282241
max_rep_level: 1,
22292242
array,
22302243
logical_nulls,
22312244
};
2232-
let sliced = levels.slice_for_chunk(&CdcChunk {
2245+
2246+
// Chunk 0: rows 0-1, nni=[0] → array sliced to [0..1]
2247+
let chunk0 = levels.slice_for_chunk(&CdcChunk {
2248+
level_offset: 0,
2249+
num_levels: 2,
2250+
value_offset: 0,
2251+
num_values: 1,
2252+
});
2253+
assert_eq!(chunk0.non_null_indices, vec![0]);
2254+
assert_eq!(chunk0.array.len(), 1);
2255+
2256+
// Chunk 1: rows 2-3, nni=[3] → array sliced to [3..4]
2257+
let chunk1 = levels.slice_for_chunk(&CdcChunk {
22332258
level_offset: 2,
22342259
num_levels: 3,
2260+
value_offset: 1,
2261+
num_values: 1,
2262+
});
2263+
assert_eq!(chunk1.non_null_indices, vec![0]);
2264+
assert_eq!(chunk1.array.len(), 1);
2265+
2266+
// Chunk 2: row 4, nni=[8, 9] → array sliced to [8..10]
2267+
let chunk2 = levels.slice_for_chunk(&CdcChunk {
2268+
level_offset: 5,
2269+
num_levels: 2,
22352270
value_offset: 2,
2236-
num_values: 3,
2271+
num_values: 2,
22372272
});
2238-
assert_eq!(sliced.def_levels, Some(vec![2, 2, 2]));
2239-
assert_eq!(sliced.rep_levels, Some(vec![0, 0, 1]));
2240-
// [0,1,2,3,4] filtered to [2,5) → [2,3,4] → shifted -2 → [0,1,2]
2241-
assert_eq!(sliced.non_null_indices, vec![0, 1, 2]);
2242-
assert_eq!(sliced.array.len(), 3);
2273+
assert_eq!(chunk2.non_null_indices, vec![0, 1]);
2274+
assert_eq!(chunk2.array.len(), 2);
22432275
}
22442276

22452277
#[test]
2246-
fn test_slice_for_chunk_non_null_indices_boundary() {
2247-
// [1, null, 3]: non_null_indices=[0, 2]; test inclusive lower / exclusive upper bounds
2248-
let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
2278+
fn test_slice_for_chunk_all_null() {
2279+
// All-null chunk: num_values=0 → empty nni slice → zero-length array.
2280+
let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, None, Some(4)]));
22492281
let logical_nulls = array.logical_nulls();
22502282
let levels = ArrayLevels {
2251-
def_levels: Some(vec![1, 0, 1]),
2283+
def_levels: Some(vec![1, 0, 0, 1]),
22522284
rep_levels: None,
2253-
non_null_indices: vec![0, 2],
2285+
non_null_indices: vec![0, 3],
22542286
max_def_level: 1,
22552287
max_rep_level: 0,
22562288
array,
22572289
logical_nulls,
22582290
};
2259-
assert_eq!(
2260-
levels
2261-
.slice_for_chunk(&CdcChunk {
2262-
level_offset: 0,
2263-
num_levels: 1,
2264-
value_offset: 0,
2265-
num_values: 1
2266-
})
2267-
.non_null_indices,
2268-
vec![0]
2269-
);
2270-
// idx 2 in range [1,3), shifted -1 → 1
2271-
assert_eq!(
2272-
levels
2273-
.slice_for_chunk(&CdcChunk {
2274-
level_offset: 1,
2275-
num_levels: 2,
2276-
value_offset: 1,
2277-
num_values: 2
2278-
})
2279-
.non_null_indices,
2280-
vec![1]
2281-
);
2282-
// idx 2 excluded from [1,2)
2283-
assert_eq!(
2284-
levels
2285-
.slice_for_chunk(&CdcChunk {
2286-
level_offset: 1,
2287-
num_levels: 1,
2288-
value_offset: 1,
2289-
num_values: 1
2290-
})
2291-
.non_null_indices,
2292-
Vec::<usize>::new()
2293-
);
2291+
// Chunk covering only the two null rows (levels 1..3), zero non-null values.
2292+
let sliced = levels.slice_for_chunk(&CdcChunk {
2293+
level_offset: 1,
2294+
num_levels: 2,
2295+
value_offset: 1,
2296+
num_values: 0,
2297+
});
2298+
assert_eq!(sliced.def_levels, Some(vec![0, 0]));
2299+
assert_eq!(sliced.non_null_indices, Vec::<usize>::new());
2300+
assert_eq!(sliced.array.len(), 0);
22942301
}
22952302
}

0 commit comments

Comments
 (0)