diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java index be1a3324ae43..0dffe9fb6499 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java @@ -34,6 +34,7 @@ import org.apache.parquet.column.values.RequiresPreviousReader; import org.apache.parquet.column.values.ValuesReader; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.PrimitiveType; public class VectorizedPageIterator extends BasePageIterator { private final boolean setArrowValidityVector; @@ -100,6 +101,14 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i case DELTA_BINARY_PACKED: valuesReader = new VectorizedDeltaEncodedValuesReader(); break; + case RLE: + if (desc.getPrimitiveType().getPrimitiveTypeName() + == PrimitiveType.PrimitiveTypeName.BOOLEAN) { + valuesReader = + new VectorizedRunLengthEncodedParquetValuesReader(setArrowValidityVector); + break; + } + // fall through default: throw new UnsupportedOperationException( "Cannot support vectorized reads for column " diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java new file mode 100644 index 000000000000..f30701c1db0d --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized.parquet; + +import org.apache.arrow.vector.FieldVector; +import org.apache.parquet.io.api.Binary; + +/** + * A {@link VectorizedValuesReader} implementation for the encoding type Run Length Encoding / RLE. 
+ * + * @see <a + * href="https://parquet.apache.org/docs/file-format/data-pages/encodings/#run-length-encoding--bit-packing-hybrid-rle--3">Parquet + * format encodings: RLE</a> */ +public class VectorizedRunLengthEncodedParquetValuesReader extends BaseVectorizedParquetValuesReader + implements VectorizedValuesReader { + + // Since we can only read booleans, bit-width is always 1 + private static final int BOOLEAN_BIT_WIDTH = 1; + // Since this can only be used in the context of a data page, the definition level can be set to + // anything, and it doesn't really matter + private static final int IRRELEVANT_MAX_DEFINITION_LEVEL = 1; + // For boolean values in data page v1 & v2, length is always prepended to the encoded data + // See + // https://parquet.apache.org/docs/file-format/data-pages/encodings/#run-length-encoding--bit-packing-hybrid-rle--3 + private static final boolean ALWAYS_READ_LENGTH = true; + + public VectorizedRunLengthEncodedParquetValuesReader(boolean setArrowValidityVector) { + super( + BOOLEAN_BIT_WIDTH, + IRRELEVANT_MAX_DEFINITION_LEVEL, + ALWAYS_READ_LENGTH, + setArrowValidityVector); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public byte readByte() { + throw new UnsupportedOperationException("readByte is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public short readShort() { + throw new UnsupportedOperationException("readShort is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public long readLong() { + throw new UnsupportedOperationException("readLong is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public float readFloat() { + throw new UnsupportedOperationException("readFloat is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public double readDouble() { + throw new UnsupportedOperationException("readDouble is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public Binary readBinary(int len) { + throw new UnsupportedOperationException("readBinary is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public void readIntegers(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readIntegers is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public void readLongs(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readLongs is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public void readFloats(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readFloats is not supported"); + } + + /** RLE only supports BOOLEAN as a data page encoding */ + @Override + public void readDoubles(int total, FieldVector vec, int rowId) { + throw new UnsupportedOperationException("readDoubles is not supported"); + } +}
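Note for reviewers: the constants above (bit width 1, prepended length) map directly onto the RLE/bit-packed hybrid layout described by the spec link in the class javadoc. A minimal standalone decoding sketch follows; the byte values are a worked example (a single RLE run of ten true values) assumed for illustration, not bytes taken from this change or the golden files:

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class RleBooleanRunSketch {
  public static void main(String[] args) {
    // 4-byte little-endian length prefix (2 payload bytes here), then one run:
    // run header = runLength << 1 = 10 << 1 = 0x14 (low bit 0 marks an RLE run),
    // followed by the repeated value padded to ceil(bitWidth / 8) = 1 byte.
    byte[] page = {0x02, 0x00, 0x00, 0x00, 0x14, 0x01};
    ByteBuffer buf = ByteBuffer.wrap(page).order(ByteOrder.LITTLE_ENDIAN);
    int payloadLength = buf.getInt(); // 2
    int header = buf.get() & 0xFF; // ULEB128 varint; a single byte in this example
    boolean isRleRun = (header & 1) == 0; // true
    int runLength = header >>> 1; // 10
    boolean value = (buf.get() & 1) == 1; // true
    System.out.printf(
        "payload=%d bytes, rleRun=%b, run of %d x %b%n",
        payloadLength, isRleRun, runLength, value);
  }
}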
diff --git a/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet b/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet new file mode 100644 index 000000000000..4678e31da179 Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet differ diff --git a/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet b/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet new file mode 100644 index 000000000000..bbc11933919f Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet differ diff --git a/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet b/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet new file mode 100644 index 000000000000..6eed9b602c68 Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet differ diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java index 98c25268cb45..00f334e8f9fa 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java @@ -55,6 +55,13 @@ public void testVectorizedReadsWithNewContainers() throws IOException { // Disabled since this code path is already tested in TestParquetVectorizedReads } + @Test + @Override + public void testUnsupportedReadsForParquetV2() throws Exception { + // Disabled since vectorized reads are supported for dictionary-encoded files + // written with Parquet V2 + } + @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java index 6bff9010eb4f..aaf6c6e15f8b 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java @@ -20,17 +20,20 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.parquet.schema.Types.primitive; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Iterator; import java.util.function.Consumer; import org.apache.arrow.memory.BufferAllocator; import org.apache.avro.generic.GenericData; import org.apache.iceberg.Schema; import org.apache.iceberg.arrow.ArrowAllocation; +import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator; import org.apache.iceberg.inmemory.InMemoryOutputFile; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileAppender; @@ -48,9 +51,13 @@ import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import
org.apache.spark.sql.vectorized.ColumnarBatch; import org.junit.jupiter.api.Test; @@ -295,12 +302,16 @@ public void testReadsForTypePromotedColumns() throws Exception { public void testSupportedReadsForParquetV2() throws Exception { // Float and double column types are written using plain encoding with Parquet V2, // also Parquet V2 will dictionary encode decimals that use fixed length binary - // (i.e. decimals > 8 bytes) + // (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED. + // Boolean types use RLE. Schema schema = new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get()), - optional(104, "decimal_data", Types.DecimalType.of(25, 5))); + optional(104, "decimal_data", Types.DecimalType.of(25, 5)), + optional(105, "int_data", Types.IntegerType.get()), + optional(106, "long_data", Types.LongType.get()), + optional(107, "boolean_data", Types.BooleanType.get())); OutputFile outputFile = new InMemoryOutputFile(); Iterable<GenericData.Record> data = @@ -331,6 +342,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception { .hasMessageEndingWith("Disable vectorized reads to read this table/file"); } + @Test + public void testRLEEncodingOnlySupportsBooleanDataPage() { + MessageType schema = + new MessageType( + "test", + primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col")); + ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"}); + ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0)); + + String expectedMessage = + "Cannot support vectorized reads for column " + + intColumnDesc + + " with encoding " + + Encoding.RLE + + ". Disable vectorized reads to read this table/file"; + + assertThatThrownBy( + () -> + new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) { + { + initDataReader(Encoding.RLE, stream, 0); + } + }) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage(expectedMessage); + } + @Test public void testUuidReads() throws Exception { // Just one row to maintain dictionary encoding
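Note for reviewers: the per-type encoding claims in testSupportedReadsForParquetV2 above can be confirmed by dumping the column chunk metadata of a file written with PARQUET_2_0. A hedged sketch with plain parquet-mr follows; the file path is an illustrative assumption, not a fixture from this change:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class EncodingInspectorSketch {
  public static void main(String[] args) throws Exception {
    try (ParquetFileReader reader =
        ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path("/tmp/v2_written.parquet"), new Configuration()))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData chunk : block.getColumns()) {
          // The printed set may also contain level encodings; the point is the data
          // encoding, e.g. boolean_data -> RLE, int_data/long_data -> DELTA_BINARY_PACKED,
          // float_data/double_data -> PLAIN, decimal_data (> 8 bytes) -> dictionary
          System.out.println(chunk.getPath() + " -> " + chunk.getEncodings());
        }
      }
    }
  }
}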
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..bb0eff70d096 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -94,6 +94,12 @@ Iterable<GenericData.Record> generateData( @Disabled // Ignored since this code path is already tested in TestParquetVectorizedReads public void testVectorizedReadsWithNewContainers() throws IOException {} + @Test + @Override + @Disabled // Ignored since vectorized reads are supported for dictionary-encoded files + // written with Parquet V2 + public void testUnsupportedReadsForParquetV2() throws IOException {} + @Test public void testMixedDictionaryNonDictionaryReads() throws IOException { Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields()); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 6e823a8bfc05..e9fe199abd3b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -21,6 +21,7 @@ import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.parquet.schema.Types.primitive; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; @@ -30,6 +31,7 @@ import java.io.InputStream; import java.net.URISyntaxException; import java.net.URL; +import java.nio.ByteBuffer; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Iterator; @@ -41,6 +43,7 @@ import org.apache.iceberg.Files; import org.apache.iceberg.Schema; import org.apache.iceberg.arrow.ArrowAllocation; +import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator; import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetReaders; @@ -64,9 +67,13 @@ import org.apache.iceberg.types.Type.PrimitiveType; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnarBatch; @@ -81,7 +88,7 @@ public class TestParquetVectorizedReads extends AvroDataTestBase { private static final String PLAIN = "PLAIN"; private static final List<String> GOLDEN_FILE_ENCODINGS = - ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED"); + ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED", "RLE"); private static final Map<String, PrimitiveType> GOLDEN_FILE_TYPES = ImmutableMap.of( "string", Types.StringType.get(), @@ -404,13 +411,15 @@ public void testSupportedReadsForParquetV2() throws Exception { // Float and double column types are written using plain encoding with Parquet V2, // also Parquet V2 will dictionary encode decimals that use fixed length binary // (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED. + // Boolean types use RLE.
Schema schema = new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get()), optional(104, "decimal_data", Types.DecimalType.of(25, 5)), optional(105, "int_data", Types.IntegerType.get()), - optional(106, "long_data", Types.LongType.get())); + optional(106, "long_data", Types.LongType.get()), + optional(107, "boolean_data", Types.BooleanType.get())); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); @@ -439,6 +448,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception { .hasMessageEndingWith("Disable vectorized reads to read this table/file"); } + @Test + public void testRLEEncodingOnlySupportsBooleanDataPage() { + MessageType schema = + new MessageType( + "test", + primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col")); + ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"}); + ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0)); + + String expectedMessage = + "Cannot support vectorized reads for column " + + intColumnDesc + + " with encoding " + + Encoding.RLE + + ". Disable vectorized reads to read this table/file"; + + assertThatThrownBy( + () -> + new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) { + { + initDataReader(Encoding.RLE, stream, 0); + } + }) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage(expectedMessage); + } + @Test public void testUuidReads() throws Exception { // Just one row to maintain dictionary encoding @@ -504,10 +540,16 @@ static Stream<Arguments> goldenFilesAndEncodings() { .flatMap( e -> Stream.of(true, false) - .map( + .flatMap( vectorized -> - Arguments.of( - encoding, e.getKey(), e.getValue(), vectorized)))); + Stream.of( + Arguments.of( + encoding, e.getKey(), e.getValue(), vectorized), + Arguments.of( + encoding, + e.getKey() + "_with_nulls", + e.getValue(), + vectorized))))); } private File resourceUrlToLocalFile(URL url) throws IOException, URISyntaxException { diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..bf965fda667e 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -196,4 +196,10 @@ public void testDecimalNotAllPagesDictionaryEncoded() throws Exception { } }); } + + @Test + @Override + @Disabled // Ignored since vectorized reads are supported for dictionary-encoded files + // written with Parquet V2 + public void testUnsupportedReadsForParquetV2() throws IOException {} } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java index 46a6a302e1c4..1a9587b452e9 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java @@ -21,6 +21,7 @@ import static
java.nio.file.StandardCopyOption.REPLACE_EXISTING; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.parquet.schema.Types.primitive; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; @@ -30,6 +31,7 @@ import java.io.InputStream; import java.net.URISyntaxException; import java.net.URL; +import java.nio.ByteBuffer; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Iterator; @@ -41,6 +43,7 @@ import org.apache.iceberg.Files; import org.apache.iceberg.Schema; import org.apache.iceberg.arrow.ArrowAllocation; +import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator; import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetReaders; @@ -64,9 +67,13 @@ import org.apache.iceberg.types.Type.PrimitiveType; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.apache.parquet.bytes.ByteBufferInputStream; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Encoding; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnarBatch; @@ -81,7 +88,7 @@ public class TestParquetVectorizedReads extends AvroDataTestBase { private static final String PLAIN = "PLAIN"; private static final List<String> GOLDEN_FILE_ENCODINGS = - ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED"); + ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED", "RLE"); private static final Map<String, PrimitiveType> GOLDEN_FILE_TYPES = ImmutableMap.of( "string", Types.StringType.get(), @@ -404,13 +411,15 @@ public void testSupportedReadsForParquetV2() throws Exception { // Float and double column types are written using plain encoding with Parquet V2, // also Parquet V2 will dictionary encode decimals that use fixed length binary // (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED. + // Boolean types use RLE.
Schema schema = new Schema( optional(102, "float_data", Types.FloatType.get()), optional(103, "double_data", Types.DoubleType.get()), optional(104, "decimal_data", Types.DecimalType.of(25, 5)), optional(105, "int_data", Types.IntegerType.get()), - optional(106, "long_data", Types.LongType.get())); + optional(106, "long_data", Types.LongType.get()), + optional(107, "boolean_data", Types.BooleanType.get())); File dataFile = File.createTempFile("junit", null, temp.toFile()); assertThat(dataFile.delete()).as("Delete should succeed").isTrue(); @@ -439,6 +448,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception { .hasMessageEndingWith("Disable vectorized reads to read this table/file"); } + @Test + public void testRLEEncodingOnlySupportsBooleanDataPage() { + MessageType schema = + new MessageType( + "test", + primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col")); + ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"}); + ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0)); + + String expectedMessage = + "Cannot support vectorized reads for column " + + intColumnDesc + + " with encoding " + + Encoding.RLE + + ". Disable vectorized reads to read this table/file"; + + assertThatThrownBy( + () -> + new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) { + { + initDataReader(Encoding.RLE, stream, 0); + } + }) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage(expectedMessage); + } + @Test public void testUuidReads() throws Exception { // Just one row to maintain dictionary encoding @@ -490,10 +526,16 @@ static Stream<Arguments> goldenFilesAndEncodings() { .flatMap( e -> Stream.of(true, false) - .map( + .flatMap( vectorized -> - Arguments.of( - encoding, e.getKey(), e.getValue(), vectorized)))); + Stream.of( + Arguments.of( + encoding, e.getKey(), e.getValue(), vectorized), + Arguments.of( + encoding, + e.getKey() + "_with_nulls", + e.getValue(), + vectorized))))); } private File resourceUrlToLocalFile(URL url) throws IOException, URISyntaxException {
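Note for reviewers: goldenFilesAndEncodings now doubles the parameter matrix so each encoding/type pair also runs against its "_with_nulls" golden file. For anyone regenerating fixtures such as encodings/RLE/boolean.parquet and boolean_with_nulls.parquet, a sketch with plain parquet-mr follows; the output path, row count, and null pattern are illustrative assumptions, not the exact recipe used for the checked-in files:

import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class RleGoldenFileSketch {
  public static void main(String[] args) throws Exception {
    MessageType schema =
        MessageTypeParser.parseMessageType("message test { optional boolean boolean_col; }");
    SimpleGroupFactory groups = new SimpleGroupFactory(schema);
    try (ParquetWriter<Group> writer =
        ExampleParquetWriter.builder(new Path("/tmp/boolean_with_nulls.parquet"))
            .withType(schema)
            // V2 data pages are what select RLE as the boolean value encoding
            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_2_0)
            .build()) {
      for (int i = 0; i < 1000; i++) {
        Group row = groups.newGroup();
        if (i % 5 != 0) { // leave every fifth row null for the _with_nulls variant
          row.add("boolean_col", i % 3 == 0);
        }
        writer.write(row);
      }
    }
  }
}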