From 3e65e08fb15c098cd5c0ec4711d079e2fd74b2fd Mon Sep 17 00:00:00 2001 From: Gabriel Groover Date: Fri, 8 Nov 2024 16:33:14 -0500 Subject: [PATCH] apply fix for byte buffer serialization --- README.md | 51 ++++++++++++++++++- .../parquet/avro/AvroRecordConverter.java | 10 ++++ pom.xml | 6 +-- 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9c68f53742..fddc61d790 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,56 @@ Parquet Java (formerly Parquet MR) [![Build Status](https://github.com/apache/parquet-java/workflows/Test/badge.svg)](https://github.com/apache/parquet-java/actions) ====== -This repository contains a Java implementation of [Apache Parquet](https://parquet.apache.org/) +This repository contains a **modified** Java implementation of [Apache Parquet](https://parquet.apache.org/). The changes +in this version allow for the serialization of Java generic supertypes in a collection, without the correct type +being lost on read. + +See below for an example of this fix: +```java +class AbstractRecord { TreeSet recordSet; } + +/** + * The template type will be lost on write-out and deserialization will fail without this change + * @param the concrete template type stored in {@link AbstractRecord#recordSet} + */ +class OutputRecord extends AbstractRecord {} +``` + +### Releasing new versions + +- Update main with the latest Parquet-java changes and rebase the forked changes +- Ensure you have the upstream parquet fork as a git remote and fetch tags + ```shell + git remote add fork-source https://github.com/apache/parquet-java + git fetch --tags fork-source + ``` +- Check out a new release branch from the relevant avro release tag + `git checkout -b release/1.0.0-1.15.0 apache-parquet-1.15.0` +- Apply the most recent fork change to that branch + `git cherry-pick ` +- Set the new project version. 
**If** adjusting the fork itself bump the base version (1.0.0)
  `mvn versions:set -DnewVersion=1.0.0-1.15.0`
- Deploy the final jars from `lang/java/avro`
  `mvn deploy -DskipTests -DaltDeploymentRepository=repository-id::repository-url`
- Push the release branch to remote

---


NOTICE

+ +

This work was produced for the U.S. Government under Contract 693KA8-22-C-00001 and is subject to Federal Aviation Administration Acquisition Management System Clause 3.5-13, Rights In Data-General (Oct. 2014), Alt. III and Alt. IV (Oct. 2009).

+ +

The contents of this document reflect the views of the author and The MITRE Corporation and do not necessarily reflect the views of the Federal Aviation Administration (FAA) or the Department of Transportation (DOT). Neither the FAA nor the DOT makes any warranty or guarantee, expressed or implied, concerning the content or accuracy of these views.

+ +

For further information, please contact The MITRE Corporation, Contracts Management Office, 7515 Colshire Drive, McLean, VA 22102-7539, (703) 983-6000.

+ +

© 2024 The MITRE Corporation. All Rights Reserved.

+ +--- + +

Approved for Public Release; Distribution Unlimited. Public Release Case Number 24-3517

+ +--- Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides high diff --git a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroRecordConverter.java b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroRecordConverter.java index 441428bfa7..6ba2535c2c 100644 --- a/parquet-avro/src/main/java/org/apache/parquet/avro/AvroRecordConverter.java +++ b/parquet-avro/src/main/java/org/apache/parquet/avro/AvroRecordConverter.java @@ -35,6 +35,7 @@ import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.lang.reflect.Modifier; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -141,6 +142,15 @@ public void add(Object value) { }; Class fieldClass = fields.get(avroField.name()); + if ((null != fieldClass) + && + /* Explicitly exclude ByteBuffers as parquet directly encodes them as byte[]s in the output data model - but the field class + * for ByteBuffer is abstract - so if we don't exclude them here all ByteBuffer fields are reflectively populated with byte[]s */ + ((Modifier.isAbstract(fieldClass.getModifiers()) && !fieldClass.isAssignableFrom(ByteBuffer.class)) + || Modifier.isInterface(fieldClass.getModifiers()) + || fieldClass.equals(Object.class))) { + fieldClass = null; + } converters[parquetFieldIndex] = newConverter(nonNullSchema, parquetField, this.model, fieldClass, container); diff --git a/pom.xml b/pom.xml index fa738370f5..e143dac377 100644 --- a/pom.xml +++ b/pom.xml @@ -18,9 +18,9 @@ Parquet is a columnar storage format that supports nested data. This provides the java implementation. 
- scm:git:git@github.com:apache/parquet-mr.git - scm:git:git@github.com:apache/parquet-mr.git - scm:git:git@github.com:apache/parquet-mr.git + scm:git:https://github.com/mitre-public/parquet-java + scm:git:https://github.com/mitre-public/parquet-java + scm:git:https://github.com/mitre-public/parquet-java HEAD