From 124af9aa7b1c150f97ec1a7c0192a7ad177a0e72 Mon Sep 17 00:00:00 2001 From: Dylan Chen Date: Thu, 11 Dec 2025 17:07:27 +0800 Subject: [PATCH 1/4] fix record to struct deserialization --- crates/iceberg/src/spec/values/serde.rs | 73 +++++++++++++++++++------ crates/iceberg/src/spec/values/tests.rs | 22 ++++++++ 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/crates/iceberg/src/spec/values/serde.rs b/crates/iceberg/src/spec/values/serde.rs index 053acca8b0..9ade43d1de 100644 --- a/crates/iceberg/src/spec/values/serde.rs +++ b/crates/iceberg/src/spec/values/serde.rs @@ -18,6 +18,8 @@ //\! Serialization and deserialization support for Iceberg values pub(crate) mod _serde { + use std::collections::HashMap; + use serde::de::Visitor; use serde::ser::{SerializeMap, SerializeSeq, SerializeStruct}; use serde::{Deserialize, Serialize}; @@ -668,26 +670,61 @@ pub(crate) mod _serde { } _ => Err(invalid_err("list")), }, - RawLiteralEnum::Record(Record { - required, - optional: _, - }) => match ty { + RawLiteralEnum::Record(Record { required, optional }) => match ty { Type::Struct(struct_ty) => { - let iters: Vec> = required - .into_iter() - .map(|(field_name, value)| { - let field = struct_ty - .field_by_name(field_name.as_str()) - .ok_or_else(|| { - invalid_err_with_reason( + let mut value_by_name = HashMap::new(); + + for (field_name, value) in required.into_iter() { + let field = struct_ty + .field_by_name(field_name.as_str()) + .ok_or_else(|| { + invalid_err_with_reason( + "record", + &format!("field {} is not exist", &field_name), + ) + })?; + let value = value.try_into(&field.field_type)?; + if value.is_none() && field.required { + return Err(invalid_err_with_reason( + "record", + &format!("required field {} is null", &field_name), + )); + } + value_by_name.insert(field.name.clone(), value); + } + + for (field_name, value) in optional.into_iter() { + let field = struct_ty + .field_by_name(field_name.as_str()) + .ok_or_else(|| { + invalid_err_with_reason( + "record", + &format!("field {} is not exist", &field_name), + ) + })?; + let value = match value { + Some(v) => v.try_into(&field.field_type)?, + None => None, + }; + value_by_name.insert(field.name.clone(), value); + } + + let mut iters = Vec::with_capacity(struct_ty.fields().len()); + for field in struct_ty.fields() { + match value_by_name.remove(&field.name) { + Some(value) => iters.push(value), + None => { + if field.required { + return Err(invalid_err_with_reason( "record", - &format!("field {} is not exist", &field_name), - ) - })?; - let value = value.try_into(&field.field_type)?; - Ok(value) - }) - .collect::>()?; + &format!("required field {} is missing", field.name), + )); + } + iters.push(None); + } + } + } + Ok(Some(Literal::Struct(Struct::from_iter(iters)))) } Type::Map(map_ty) => { diff --git a/crates/iceberg/src/spec/values/tests.rs b/crates/iceberg/src/spec/values/tests.rs index 73343a9a1a..902fde8faf 100644 --- a/crates/iceberg/src/spec/values/tests.rs +++ b/crates/iceberg/src/spec/values/tests.rs @@ -272,6 +272,28 @@ fn json_struct() { ); } +#[test] +fn json_struct_preserves_schema_order() { + // struct fields are deliberately ordered as b, then a to detect ordering drift + let struct_type = StructType::new(vec![ + NestedField::required(2, "b", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(1, "a", Type::Primitive(PrimitiveType::Long)).into(), + ]); + let literal = Literal::Struct(Struct::from_iter(vec![ + Some(Literal::Primitive(PrimitiveLiteral::Int(5))), + Some(Literal::Primitive(PrimitiveLiteral::Long(10))), + ])); + + let raw = RawLiteral::try_from(literal.clone(), &Type::Struct(struct_type.clone())).unwrap(); + let json = serde_json::to_string(&raw).unwrap(); + + // serde_json maps use BTreeMap ordering; this verifies we still recover schema order. + let deser: RawLiteral = serde_json::from_str(&json).unwrap(); + let roundtrip = deser.try_into(&Type::Struct(struct_type)).unwrap().unwrap(); + + assert_eq!(roundtrip, literal); +} + #[test] fn json_list() { let record = r#"[1, 2, 3, null]"#; From 1b6ce6d4c8c4031fa3cae0c492709cdc89e8a1d2 Mon Sep 17 00:00:00 2001 From: Dylan Chen Date: Thu, 11 Dec 2025 17:14:32 +0800 Subject: [PATCH 2/4] refine the test case --- crates/iceberg/src/spec/values/tests.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/spec/values/tests.rs b/crates/iceberg/src/spec/values/tests.rs index 902fde8faf..9a154379ec 100644 --- a/crates/iceberg/src/spec/values/tests.rs +++ b/crates/iceberg/src/spec/values/tests.rs @@ -285,10 +285,9 @@ fn json_struct_preserves_schema_order() { ])); let raw = RawLiteral::try_from(literal.clone(), &Type::Struct(struct_type.clone())).unwrap(); - let json = serde_json::to_string(&raw).unwrap(); - - // serde_json maps use BTreeMap ordering; this verifies we still recover schema order. - let deser: RawLiteral = serde_json::from_str(&json).unwrap(); + // serde_json::Value uses BTreeMap (sorted keys), which mimics the RW metadata path. + let value = serde_json::to_value(&raw).unwrap(); + let deser: RawLiteral = serde_json::from_value(value).unwrap(); let roundtrip = deser.try_into(&Type::Struct(struct_type)).unwrap().unwrap(); assert_eq!(roundtrip, literal); From 86adb13eee33f3f6ca15c8abf2845f3c6bc26d09 Mon Sep 17 00:00:00 2001 From: Dylan Chen Date: Thu, 11 Dec 2025 17:54:57 +0800 Subject: [PATCH 3/4] fmt --- crates/iceberg/src/spec/values/serde.rs | 34 +++++++++++++------------ 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/crates/iceberg/src/spec/values/serde.rs b/crates/iceberg/src/spec/values/serde.rs index 9ade43d1de..a9f1d77f41 100644 --- a/crates/iceberg/src/spec/values/serde.rs +++ b/crates/iceberg/src/spec/values/serde.rs @@ -675,14 +675,15 @@ pub(crate) mod _serde { let mut value_by_name = HashMap::new(); for (field_name, value) in required.into_iter() { - let field = struct_ty - .field_by_name(field_name.as_str()) - .ok_or_else(|| { - invalid_err_with_reason( - "record", - &format!("field {} is not exist", &field_name), - ) - })?; + let field = + struct_ty + .field_by_name(field_name.as_str()) + .ok_or_else(|| { + invalid_err_with_reason( + "record", + &format!("field {} is not exist", &field_name), + ) + })?; let value = value.try_into(&field.field_type)?; if value.is_none() && field.required { return Err(invalid_err_with_reason( @@ -694,14 +695,15 @@ pub(crate) mod _serde { } for (field_name, value) in optional.into_iter() { - let field = struct_ty - .field_by_name(field_name.as_str()) - .ok_or_else(|| { - invalid_err_with_reason( - "record", - &format!("field {} is not exist", &field_name), - ) - })?; + let field = + struct_ty + .field_by_name(field_name.as_str()) + .ok_or_else(|| { + invalid_err_with_reason( + "record", + &format!("field {} is not exist", &field_name), + ) + })?; let value = match value { Some(v) => v.try_into(&field.field_type)?, None => None, From 8e3c586e757685b9482734a359e9ab17e47c2bfa Mon Sep 17 00:00:00 2001 From: Dylan Chen Date: Thu, 11 Dec 2025 18:09:22 +0800 Subject: [PATCH 4/4] fix tests --- crates/iceberg/src/spec/manifest/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index b126396e3c..59e688c70f 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -1127,7 +1127,7 @@ mod tests { .file_size_in_bytes(1024) .record_count(100) .partition_spec_id(1) - .partition(Struct::empty()) + .partition(Struct::from_iter([None::])) .column_sizes(HashMap::from([(1, 512), (2, 1024)])) .value_counts(HashMap::from([(1, 100), (2, 500)])) .null_value_counts(HashMap::from([(1, 0), (2, 1)])) @@ -1140,7 +1140,7 @@ mod tests { .file_size_in_bytes(2048) .record_count(200) .partition_spec_id(1) - .partition(Struct::empty()) + .partition(Struct::from_iter([None::])) .column_sizes(HashMap::from([(1, 1024), (2, 2048)])) .value_counts(HashMap::from([(1, 200), (2, 600)])) .null_value_counts(HashMap::from([(1, 10), (2, 999)])) @@ -1163,7 +1163,7 @@ mod tests { "content": 0, "file_path": "path/to/file1.parquet", "file_format": "PARQUET", - "partition": {}, + "partition": { "id_partition": null }, "record_count": 100, "file_size_in_bytes": 1024, "column_sizes": [ @@ -1194,7 +1194,7 @@ mod tests { "content": 0, "file_path": "path/to/file2.parquet", "file_format": "PARQUET", - "partition": {}, + "partition": { "id_partition": null }, "record_count": 200, "file_size_in_bytes": 2048, "column_sizes": [