Skip to content

Commit 2b179b8

Browse files
authored
feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099)
# Which issue does this PR close?

- Closes #9098.

# Rationale for this change

Don't require strict equality for nested fields (including inner field name/metadata); just require that nested data types are logically equivalent.

# What changes are included in this PR?

Use `a.equals_datatype(b)` instead of `a == b` at the start of `LevelInfoBuilder::types_compatible`.

# Are these changes tested?

Yes.

# Are there any user-facing changes?
1 parent 10a976f commit 2b179b8

File tree

2 files changed

+51
-4
lines changed

2 files changed

+51
-4
lines changed

parquet/src/arrow/arrow_writer/levels.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -550,8 +550,8 @@ impl LevelInfoBuilder {
550550
/// and the other is a native array, the dictionary values must have the same type as the
551551
/// native array
552552
fn types_compatible(a: &DataType, b: &DataType) -> bool {
553-
// if the Arrow data types are the same, the types are clearly compatible
554-
if a == b {
553+
// if the Arrow data types are equal, the types are deemed compatible
554+
if a.equals_datatype(b) {
555555
return true;
556556
}
557557

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1522,11 +1522,12 @@ fn get_fsb_array_slice(
15221522
#[cfg(test)]
15231523
mod tests {
15241524
use super::*;
1525+
use std::collections::HashMap;
15251526

15261527
use std::fs::File;
15271528

1528-
use crate::arrow::ARROW_SCHEMA_META_KEY;
15291529
use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
1530+
use crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY};
15301531
use crate::column::page::{Page, PageReader};
15311532
use crate::file::metadata::thrift::PageHeader;
15321533
use crate::file::page_index::column_index::ColumnIndexMetaData;
@@ -1539,7 +1540,7 @@ mod tests {
15391540
use arrow::util::data_gen::create_random_array;
15401541
use arrow::util::pretty::pretty_format_batches;
15411542
use arrow::{array::*, buffer::Buffer};
1542-
use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, i256};
1543+
use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, OffsetBuffer, i256};
15431544
use arrow_schema::Fields;
15441545
use half::f16;
15451546
use num_traits::{FromPrimitive, ToPrimitive};
@@ -3323,6 +3324,52 @@ mod tests {
33233324
BinaryViewArray::from_iter_values(vec![b"barquet"]),
33243325
LargeBinaryArray::from_iter_values(vec![b"parquet", b"barquet"]),
33253326
);
3327+
3328+
// check compatibility for list types
3329+
3330+
let list_field_metadata = HashMap::from_iter(vec![(
3331+
PARQUET_FIELD_ID_META_KEY.to_string(),
3332+
"1".to_string(),
3333+
)]);
3334+
let list_field = Field::new_list_field(DataType::Int32, false);
3335+
3336+
let values1 = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4]));
3337+
let offsets1 = OffsetBuffer::new(vec![0, 2, 5].into());
3338+
3339+
let values2 = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9]));
3340+
let offsets2 = OffsetBuffer::new(vec![0, 3, 5].into());
3341+
3342+
let values_expected = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]));
3343+
let offsets_expected = OffsetBuffer::new(vec![0, 2, 5, 8, 10].into());
3344+
3345+
ensure_compatible_write(
3346+
// when the initial schema has the metadata ...
3347+
ListArray::try_new(
3348+
Arc::new(
3349+
list_field
3350+
.clone()
3351+
.with_metadata(list_field_metadata.clone()),
3352+
),
3353+
offsets1,
3354+
values1,
3355+
None,
3356+
)
3357+
.unwrap(),
3358+
// ... and some intermediate schema doesn't have the metadata
3359+
ListArray::try_new(Arc::new(list_field.clone()), offsets2, values2, None).unwrap(),
3360+
// ... the write will still go through, and the resulting schema will inherit the initial metadata
3361+
ListArray::try_new(
3362+
Arc::new(
3363+
list_field
3364+
.clone()
3365+
.with_metadata(list_field_metadata.clone()),
3366+
),
3367+
offsets_expected,
3368+
values_expected,
3369+
None,
3370+
)
3371+
.unwrap(),
3372+
);
33263373
}
33273374

33283375
#[test]

0 commit comments

Comments
 (0)