|
22 | 22 | use bytes::Bytes; |
23 | 23 | use core::f32; |
24 | 24 | use half::f16; |
25 | | -use std::{fs, path::PathBuf, sync::Arc}; |
| 25 | +use std::{path::PathBuf, sync::Arc}; |
26 | 26 |
|
27 | 27 | use arrow::util::test_util::parquet_test_data; |
28 | | -use arrow_array::{Array, Float16Array, Float32Array, Float64Array, UInt64Array}; |
29 | | -use arrow_schema::Schema; |
| 28 | +use arrow_array::{Array, Float16Array, Float32Array, Float64Array, RecordBatch, UInt64Array}; |
| 29 | +use arrow_schema::{DataType, Field, Schema}; |
30 | 30 | use parquet::{ |
31 | 31 | arrow::{ |
32 | 32 | ArrowWriter, |
@@ -266,7 +266,173 @@ fn test_ieee754_interop() { |
266 | 266 | .expect("validate written metadata"); |
267 | 267 | } |
268 | 268 |
|
269 | | - fs::write("output.pq", outbuf.clone()).unwrap(); |
| 269 | + //fs::write("output.pq", outbuf.clone()).unwrap(); |
| 270 | + |
| 271 | + // now re-validate the data we've just written by reading it back |
| 272 | + let options = ArrowReaderOptions::new() |
| 273 | + .with_page_index_policy(parquet::file::metadata::PageIndexPolicy::Required); |
| 274 | + let builder = ArrowReaderBuilder::try_new_with_options(Bytes::from(outbuf), options).unwrap(); |
| 275 | + let file_metadata = builder.metadata().clone(); |
| 276 | + let schema = builder.schema().clone(); |
| 277 | + let parquet_schema = builder.parquet_schema().clone(); |
| 278 | + |
| 279 | + println!("validate from rust output"); |
| 280 | + validate_metadata(file_metadata.as_ref(), schema.as_ref(), &parquet_schema) |
| 281 | + .expect("validate re-read metadata"); |
| 282 | +} |
| 283 | + |
| 284 | +// This test replicates the data produced by the parquet-java code that generated |
| 285 | +// parquet-testing/data/floating_orders_nan_count.parquet |
| 286 | +#[test] |
| 287 | +fn test_ieee754_interop2() { |
| 288 | + // define schema |
| 289 | + let schema = Schema::new(vec![ |
| 290 | + Field::new("float_ieee754", DataType::Float32, false), |
| 291 | + Field::new("double_ieee754", DataType::Float64, false), |
| 292 | + Field::new("float16_ieee754", DataType::Float16, false), |
| 293 | + ]); |
| 294 | + let schema = Arc::new(schema); |
| 295 | + |
| 296 | + let mut outbuf = Vec::new(); |
| 297 | + { |
| 298 | + let writer_options = WriterProperties::builder() |
| 299 | + .set_max_row_group_row_count(Some(10)) |
| 300 | + .build(); |
| 301 | + let mut writer = ArrowWriter::try_new(&mut outbuf, schema.clone(), Some(writer_options)) |
| 302 | + .expect("create arrow writer"); |
| 303 | + |
| 304 | + // this only works for non-NaN cases |
| 305 | + let make_batch = |data: &[f32]| -> RecordBatch { |
| 306 | + let arr1 = Float32Array::from(data.to_vec()); |
| 307 | + let arr2 = Float64Array::from(data.iter().map(|v| *v as f64).collect::<Vec<_>>()); |
| 308 | + let arr3 = |
| 309 | + Float16Array::from(data.iter().map(|v| f16::from_f32(*v)).collect::<Vec<_>>()); |
| 310 | + |
| 311 | + RecordBatch::try_new( |
| 312 | + schema.clone(), |
| 313 | + vec![Arc::new(arr1), Arc::new(arr2), Arc::new(arr3)], |
| 314 | + ) |
| 315 | + .unwrap() |
| 316 | + }; |
| 317 | + |
| 318 | + // batch 1: no NaNs |
| 319 | + let batch = make_batch(&[-2.0f32, -1.0, -0.0, 0.0, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0]); |
| 320 | + writer.write(&batch).expect("writing batch1"); |
| 321 | + |
| 322 | + // batch 2: mixed |
| 323 | + let float_data = vec![ |
| 324 | + FLOAT_NEG_NAN_SMALL, |
| 325 | + -2.0, |
| 326 | + FLOAT_NEG_NAN_LARGE, |
| 327 | + -1.0, |
| 328 | + -0.0, |
| 329 | + 0.0, |
| 330 | + 1.0, |
| 331 | + FLOAT_NAN_SMALL, |
| 332 | + 3.0, |
| 333 | + FLOAT_NAN_LARGE, |
| 334 | + ]; |
| 335 | + let double_data = vec![ |
| 336 | + DOUBLE_NEG_NAN_SMALL, |
| 337 | + -2.0, |
| 338 | + DOUBLE_NEG_NAN_LARGE, |
| 339 | + -1.0, |
| 340 | + -0.0, |
| 341 | + 0.0, |
| 342 | + 1.0, |
| 343 | + DOUBLE_NAN_SMALL, |
| 344 | + 3.0, |
| 345 | + DOUBLE_NAN_LARGE, |
| 346 | + ]; |
| 347 | + let float16_data = vec![ |
| 348 | + FLOAT16_NEG_NAN_SMALL, |
| 349 | + f16::from_f32(-2.0), |
| 350 | + FLOAT16_NEG_NAN_LARGE, |
| 351 | + f16::from_f32(-1.0), |
| 352 | + f16::from_f32(-0.0), |
| 353 | + f16::from_f32(0.0), |
| 354 | + f16::from_f32(1.0), |
| 355 | + FLOAT16_NAN_SMALL, |
| 356 | + f16::from_f32(3.0), |
| 357 | + FLOAT16_NAN_LARGE, |
| 358 | + ]; |
| 359 | + let batch = RecordBatch::try_new( |
| 360 | + schema.clone(), |
| 361 | + vec![ |
| 362 | + Arc::new(Float32Array::from(float_data)), |
| 363 | + Arc::new(Float64Array::from(double_data)), |
| 364 | + Arc::new(Float16Array::from(float16_data)), |
| 365 | + ], |
| 366 | + ) |
| 367 | + .unwrap(); |
| 368 | + writer.write(&batch).expect("writing batch2"); |
| 369 | + |
| 370 | + // batch 3: all NaN |
| 371 | + let float_data = vec![ |
| 372 | + FLOAT_NEG_NAN_SMALL, |
| 373 | + FLOAT_NEG_NAN_LARGE, |
| 374 | + FLOAT_NAN_SMALL, |
| 375 | + FLOAT_NAN_LARGE, |
| 376 | + FLOAT_NEG_NAN_SMALL, |
| 377 | + FLOAT_NEG_NAN_LARGE, |
| 378 | + FLOAT_NAN_SMALL, |
| 379 | + FLOAT_NAN_LARGE, |
| 380 | + FLOAT_NEG_NAN_SMALL, |
| 381 | + FLOAT_NAN_LARGE, |
| 382 | + ]; |
| 383 | + let double_data = vec![ |
| 384 | + DOUBLE_NEG_NAN_SMALL, |
| 385 | + DOUBLE_NEG_NAN_LARGE, |
| 386 | + DOUBLE_NAN_SMALL, |
| 387 | + DOUBLE_NAN_LARGE, |
| 388 | + DOUBLE_NEG_NAN_SMALL, |
| 389 | + DOUBLE_NEG_NAN_LARGE, |
| 390 | + DOUBLE_NAN_SMALL, |
| 391 | + DOUBLE_NAN_LARGE, |
| 392 | + DOUBLE_NEG_NAN_SMALL, |
| 393 | + DOUBLE_NAN_LARGE, |
| 394 | + ]; |
| 395 | + let float16_data = vec![ |
| 396 | + FLOAT16_NEG_NAN_SMALL, |
| 397 | + FLOAT16_NEG_NAN_LARGE, |
| 398 | + FLOAT16_NAN_SMALL, |
| 399 | + FLOAT16_NAN_LARGE, |
| 400 | + FLOAT16_NEG_NAN_SMALL, |
| 401 | + FLOAT16_NEG_NAN_LARGE, |
| 402 | + FLOAT16_NAN_SMALL, |
| 403 | + FLOAT16_NAN_LARGE, |
| 404 | + FLOAT16_NEG_NAN_SMALL, |
| 405 | + FLOAT16_NAN_LARGE, |
| 406 | + ]; |
| 407 | + let batch = RecordBatch::try_new( |
| 408 | + schema.clone(), |
| 409 | + vec![ |
| 410 | + Arc::new(Float32Array::from(float_data)), |
| 411 | + Arc::new(Float64Array::from(double_data)), |
| 412 | + Arc::new(Float16Array::from(float16_data)), |
| 413 | + ], |
| 414 | + ) |
| 415 | + .unwrap(); |
| 416 | + writer.write(&batch).expect("writing batch3"); |
| 417 | + |
| 418 | + // batch 4: statistics min is +0.0 |
| 419 | + let batch = make_batch(&[0.0f32, 0.0, 0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0]); |
| 420 | + writer.write(&batch).expect("writing batch4"); |
| 421 | + |
| 422 | + // batch 5: statistics max is -0.0 |
| 423 | + let batch = make_batch(&[ |
| 424 | + -5.0f32, -4.0, -3.0, -2.0, -1.5, -1.0, -0.5, -0.0, -0.0, -0.0, |
| 425 | + ]); |
| 426 | + writer.write(&batch).expect("writing batch5"); |
| 427 | + |
| 428 | + let write_meta = writer.close().expect("closing file"); |
| 429 | + let parquet_schema = write_meta.file_metadata().schema_descr(); |
| 430 | + println!("validate writer output"); |
| 431 | + validate_metadata(&write_meta, schema.as_ref(), parquet_schema) |
| 432 | + .expect("validate written metadata"); |
| 433 | + } |
| 434 | + |
| 435 | + //fs::write("output2.pq", outbuf.clone()).unwrap(); |
270 | 436 |
|
271 | 437 | // now re-validate the bit we've written |
272 | 438 | let options = ArrowReaderOptions::new() |
|
0 commit comments