diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
index be1a3324ae43..0dffe9fb6499 100644
--- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedPageIterator.java
@@ -34,6 +34,7 @@
import org.apache.parquet.column.values.RequiresPreviousReader;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.io.ParquetDecodingException;
+import org.apache.parquet.schema.PrimitiveType;
public class VectorizedPageIterator extends BasePageIterator {
private final boolean setArrowValidityVector;
@@ -100,6 +101,14 @@ protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, i
case DELTA_BINARY_PACKED:
valuesReader = new VectorizedDeltaEncodedValuesReader();
break;
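+          // Per the Parquet spec, RLE as a data page encoding applies only to BOOLEAN
+          // columns (bit width 1); any other primitive type falls through to the
+          // unsupported-encoding error below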
+ case RLE:
+ if (desc.getPrimitiveType().getPrimitiveTypeName()
+ == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
+ valuesReader =
+ new VectorizedRunLengthEncodedParquetValuesReader(setArrowValidityVector);
+ break;
+ }
+ // fall through
default:
throw new UnsupportedOperationException(
"Cannot support vectorized reads for column "
diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java
new file mode 100644
index 000000000000..f30701c1db0d
--- /dev/null
+++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedRunLengthEncodedParquetValuesReader.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.arrow.vectorized.parquet;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.parquet.io.api.Binary;
+
+/**
+ * A {@link VectorizedValuesReader} implementation for the encoding type Run Length Encoding / RLE.
+ *
+ * @see <a
+ * href="https://parquet.apache.org/docs/file-format/data-pages/encodings/#run-length-encoding--bit-packing-hybrid-rle--3">
+ * Parquet format encodings: RLE</a>
+ */
+public class VectorizedRunLengthEncodedParquetValuesReader extends BaseVectorizedParquetValuesReader
+ implements VectorizedValuesReader {
+
+ // Since we can only read booleans, bit-width is always 1
+ private static final int BOOLEAN_BIT_WIDTH = 1;
+ // Since this reader is only used in the context of a data page, the max definition level
+ // is unused, so any value works here
+ private static final int IRRELEVANT_MAX_DEFINITION_LEVEL = 1;
+ // For boolean values in data page v1 & v2, the length is always prepended to the encoded data
+ // See
+ // https://parquet.apache.org/docs/file-format/data-pages/encodings/#run-length-encoding--bit-packing-hybrid-rle--3
+ private static final boolean ALWAYS_READ_LENGTH = true;
+
+ public VectorizedRunLengthEncodedParquetValuesReader(boolean setArrowValidityVector) {
+ super(
+ BOOLEAN_BIT_WIDTH,
+ IRRELEVANT_MAX_DEFINITION_LEVEL,
+ ALWAYS_READ_LENGTH,
+ setArrowValidityVector);
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public byte readByte() {
+ throw new UnsupportedOperationException("readByte is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public short readShort() {
+ throw new UnsupportedOperationException("readShort is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public long readLong() {
+ throw new UnsupportedOperationException("readLong is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public float readFloat() {
+ throw new UnsupportedOperationException("readFloat is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public double readDouble() {
+ throw new UnsupportedOperationException("readDouble is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public Binary readBinary(int len) {
+ throw new UnsupportedOperationException("readBinary is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public void readIntegers(int total, FieldVector vec, int rowId) {
+ throw new UnsupportedOperationException("readIntegers is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public void readLongs(int total, FieldVector vec, int rowId) {
+ throw new UnsupportedOperationException("readLongs is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public void readFloats(int total, FieldVector vec, int rowId) {
+ throw new UnsupportedOperationException("readFloats is not supported");
+ }
+
+ /** RLE only supports BOOLEAN as a data page encoding */
+ @Override
+ public void readDoubles(int total, FieldVector vec, int rowId) {
+ throw new UnsupportedOperationException("readDoubles is not supported");
+ }
+}
diff --git a/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet b/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet
new file mode 100644
index 000000000000..4678e31da179
Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/PLAIN/boolean_with_nulls.parquet differ
diff --git a/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet b/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet
new file mode 100644
index 000000000000..bbc11933919f
Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/RLE/boolean.parquet differ
diff --git a/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet b/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet
new file mode 100644
index 000000000000..6eed9b602c68
Binary files /dev/null and b/parquet/src/testFixtures/resources/encodings/RLE/boolean_with_nulls.parquet differ
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
index 98c25268cb45..00f334e8f9fa 100644
--- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
+++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetDictionaryEncodedVectorizedReads.java
@@ -55,6 +55,13 @@ public void testVectorizedReadsWithNewContainers() throws IOException {
// Disabled since this code path is already tested in TestParquetVectorizedReads
}
+ @Test
+ @Override
+ public void testUnsupportedReadsForParquetV2() throws Exception {
+ // Disabled since vectorized reads are supported for Parquet V2 files, including
+ // dictionary-encoded files
+ }
+
@Test
public void testMixedDictionaryNonDictionaryReads() throws IOException {
Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields());
diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
index 6bff9010eb4f..aaf6c6e15f8b 100644
--- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
+++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/parquet/vectorized/TestParquetVectorizedReads.java
@@ -20,17 +20,20 @@
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
+import static org.apache.parquet.schema.Types.primitive;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.assertj.core.api.Assumptions.assumeThat;
import java.io.IOException;
+import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.function.Consumer;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.avro.generic.GenericData;
import org.apache.iceberg.Schema;
import org.apache.iceberg.arrow.ArrowAllocation;
+import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator;
import org.apache.iceberg.inmemory.InMemoryOutputFile;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
@@ -48,9 +51,13 @@
import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.spark.sql.vectorized.ColumnarBatch;
import org.junit.jupiter.api.Test;
@@ -295,12 +302,16 @@ public void testReadsForTypePromotedColumns() throws Exception {
public void testSupportedReadsForParquetV2() throws Exception {
// Float and double column types are written using plain encoding with Parquet V2,
// also Parquet V2 will dictionary encode decimals that use fixed length binary
- // (i.e. decimals > 8 bytes)
+ // (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED.
+ // Boolean types use RLE.
Schema schema =
new Schema(
optional(102, "float_data", Types.FloatType.get()),
optional(103, "double_data", Types.DoubleType.get()),
- optional(104, "decimal_data", Types.DecimalType.of(25, 5)));
+ optional(104, "decimal_data", Types.DecimalType.of(25, 5)),
+ optional(105, "int_data", Types.IntegerType.get()),
+ optional(106, "long_data", Types.LongType.get()),
+ optional(107, "boolean_data", Types.BooleanType.get()));
OutputFile outputFile = new InMemoryOutputFile();
Iterable<GenericData.Record> data =
@@ -331,6 +342,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception {
.hasMessageEndingWith("Disable vectorized reads to read this table/file");
}
+ @Test
+ public void testRLEEncodingOnlySupportsBooleanDataPage() {
+ MessageType schema =
+ new MessageType(
+ "test",
+ primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col"));
+ ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"});
+ ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0));
+
+ String expectedMessage =
+ "Cannot support vectorized reads for column "
+ + intColumnDesc
+ + " with encoding "
+ + Encoding.RLE
+ + ". Disable vectorized reads to read this table/file";
+
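+ // The anonymous subclass lets the instance initializer reach the protected
+ // initDataReader; the empty stream is never read because the encoding check
+ // rejects RLE for a non-boolean column first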
+ assertThatThrownBy(
+ () ->
+ new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) {
+ {
+ initDataReader(Encoding.RLE, stream, 0);
+ }
+ })
+ .isInstanceOf(UnsupportedOperationException.class)
+ .hasMessage(expectedMessage);
+ }
+
@Test
public void testUuidReads() throws Exception {
// Just one row to maintain dictionary encoding
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
index 284fa0b0552f..bb0eff70d096 100644
--- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
+++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
@@ -94,6 +94,12 @@ Iterable<GenericData.Record> generateData(
@Disabled // Ignored since this code path is already tested in TestParquetVectorizedReads
public void testVectorizedReadsWithNewContainers() throws IOException {}
+ @Test
+ @Override
+ @Disabled // Ignored since vectorized reads are supported for Parquet V2 files, including
+ // dictionary-encoded files
+ public void testUnsupportedReadsForParquetV2() throws IOException {}
+
@Test
public void testMixedDictionaryNonDictionaryReads() throws IOException {
Schema schema = new Schema(SUPPORTED_PRIMITIVES.fields());
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
index 6e823a8bfc05..e9fe199abd3b 100644
--- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
+++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
@@ -21,6 +21,7 @@
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
+import static org.apache.parquet.schema.Types.primitive;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.assertj.core.api.Assumptions.assumeThat;
@@ -30,6 +31,7 @@
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
@@ -41,6 +43,7 @@
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.arrow.ArrowAllocation;
+import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator;
import org.apache.iceberg.data.RandomGenericData;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
@@ -64,9 +67,13 @@
import org.apache.iceberg.types.Type.PrimitiveType;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;
@@ -81,7 +88,7 @@ public class TestParquetVectorizedReads extends AvroDataTestBase {
private static final String PLAIN = "PLAIN";
private static final List<String> GOLDEN_FILE_ENCODINGS =
- ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED");
+ ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED", "RLE");
private static final Map<String, PrimitiveType> GOLDEN_FILE_TYPES =
ImmutableMap.of(
"string", Types.StringType.get(),
@@ -404,13 +411,15 @@ public void testSupportedReadsForParquetV2() throws Exception {
// Float and double column types are written using plain encoding with Parquet V2,
// also Parquet V2 will dictionary encode decimals that use fixed length binary
// (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED.
+ // Boolean types use RLE.
Schema schema =
new Schema(
optional(102, "float_data", Types.FloatType.get()),
optional(103, "double_data", Types.DoubleType.get()),
optional(104, "decimal_data", Types.DecimalType.of(25, 5)),
optional(105, "int_data", Types.IntegerType.get()),
- optional(106, "long_data", Types.LongType.get()));
+ optional(106, "long_data", Types.LongType.get()),
+ optional(107, "boolean_data", Types.BooleanType.get()));
File dataFile = File.createTempFile("junit", null, temp.toFile());
assertThat(dataFile.delete()).as("Delete should succeed").isTrue();
@@ -439,6 +448,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception {
.hasMessageEndingWith("Disable vectorized reads to read this table/file");
}
+ @Test
+ public void testRLEEncodingOnlySupportsBooleanDataPage() {
+ MessageType schema =
+ new MessageType(
+ "test",
+ primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col"));
+ ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"});
+ ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0));
+
+ String expectedMessage =
+ "Cannot support vectorized reads for column "
+ + intColumnDesc
+ + " with encoding "
+ + Encoding.RLE
+ + ". Disable vectorized reads to read this table/file";
+
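+ // The anonymous subclass lets the instance initializer reach the protected
+ // initDataReader; the empty stream is never read because the encoding check
+ // rejects RLE for a non-boolean column first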
+ assertThatThrownBy(
+ () ->
+ new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) {
+ {
+ initDataReader(Encoding.RLE, stream, 0);
+ }
+ })
+ .isInstanceOf(UnsupportedOperationException.class)
+ .hasMessage(expectedMessage);
+ }
+
@Test
public void testUuidReads() throws Exception {
// Just one row to maintain dictionary encoding
@@ -504,10 +540,16 @@ static Stream<Arguments> goldenFilesAndEncodings() {
.flatMap(
e ->
Stream.of(true, false)
- .map(
+ .flatMap(
vectorized ->
- Arguments.of(
- encoding, e.getKey(), e.getValue(), vectorized))));
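+                                        // Emit two cases per combination: the base golden
+                                        // file and its "_with_nulls" variant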
+ Stream.of(
+ Arguments.of(
+ encoding, e.getKey(), e.getValue(), vectorized),
+ Arguments.of(
+ encoding,
+ e.getKey() + "_with_nulls",
+ e.getValue(),
+ vectorized)))));
}
private File resourceUrlToLocalFile(URL url) throws IOException, URISyntaxException {
diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
index 284fa0b0552f..bf965fda667e 100644
--- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
+++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java
@@ -196,4 +196,10 @@ public void testDecimalNotAllPagesDictionaryEncoded() throws Exception {
}
});
}
+
+ @Test
+ @Override
+ @Disabled // Ignored since vectorized reads are supported for Parquet V2 files, including
+ // dictionary-encoded files
+ public void testUnsupportedReadsForParquetV2() throws IOException {}
}
diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
index 46a6a302e1c4..1a9587b452e9 100644
--- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
+++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetVectorizedReads.java
@@ -21,6 +21,7 @@
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;
+import static org.apache.parquet.schema.Types.primitive;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.assertj.core.api.Assumptions.assumeThat;
@@ -30,6 +31,7 @@
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.ByteBuffer;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
@@ -41,6 +43,7 @@
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.arrow.ArrowAllocation;
+import org.apache.iceberg.arrow.vectorized.parquet.VectorizedPageIterator;
import org.apache.iceberg.data.RandomGenericData;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
@@ -64,9 +67,13 @@
import org.apache.iceberg.types.Type.PrimitiveType;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
+import org.apache.parquet.bytes.ByteBufferInputStream;
+import org.apache.parquet.column.ColumnDescriptor;
+import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;
@@ -81,7 +88,7 @@ public class TestParquetVectorizedReads extends AvroDataTestBase {
private static final String PLAIN = "PLAIN";
private static final List<String> GOLDEN_FILE_ENCODINGS =
- ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED");
+ ImmutableList.of("PLAIN_DICTIONARY", "RLE_DICTIONARY", "DELTA_BINARY_PACKED", "RLE");
private static final Map<String, PrimitiveType> GOLDEN_FILE_TYPES =
ImmutableMap.of(
"string", Types.StringType.get(),
@@ -404,13 +411,15 @@ public void testSupportedReadsForParquetV2() throws Exception {
// Float and double column types are written using plain encoding with Parquet V2,
// also Parquet V2 will dictionary encode decimals that use fixed length binary
// (i.e. decimals > 8 bytes). Int and long types use DELTA_BINARY_PACKED.
+ // Boolean types use RLE.
Schema schema =
new Schema(
optional(102, "float_data", Types.FloatType.get()),
optional(103, "double_data", Types.DoubleType.get()),
optional(104, "decimal_data", Types.DecimalType.of(25, 5)),
optional(105, "int_data", Types.IntegerType.get()),
- optional(106, "long_data", Types.LongType.get()));
+ optional(106, "long_data", Types.LongType.get()),
+ optional(107, "boolean_data", Types.BooleanType.get()));
File dataFile = File.createTempFile("junit", null, temp.toFile());
assertThat(dataFile.delete()).as("Delete should succeed").isTrue();
@@ -439,6 +448,33 @@ public void testUnsupportedReadsForParquetV2() throws Exception {
.hasMessageEndingWith("Disable vectorized reads to read this table/file");
}
+ @Test
+ public void testRLEEncodingOnlySupportsBooleanDataPage() {
+ MessageType schema =
+ new MessageType(
+ "test",
+ primitive(PrimitiveTypeName.INT32, Type.Repetition.OPTIONAL).id(1).named("int_col"));
+ ColumnDescriptor intColumnDesc = schema.getColumnDescription(new String[] {"int_col"});
+ ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.allocate(0));
+
+ String expectedMessage =
+ "Cannot support vectorized reads for column "
+ + intColumnDesc
+ + " with encoding "
+ + Encoding.RLE
+ + ". Disable vectorized reads to read this table/file";
+
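+ // The anonymous subclass lets the instance initializer reach the protected
+ // initDataReader; the empty stream is never read because the encoding check
+ // rejects RLE for a non-boolean column first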
+ assertThatThrownBy(
+ () ->
+ new VectorizedPageIterator(intColumnDesc, "parquet-mr", false) {
+ {
+ initDataReader(Encoding.RLE, stream, 0);
+ }
+ })
+ .isInstanceOf(UnsupportedOperationException.class)
+ .hasMessage(expectedMessage);
+ }
+
@Test
public void testUuidReads() throws Exception {
// Just one row to maintain dictionary encoding
@@ -490,10 +526,16 @@ static Stream<Arguments> goldenFilesAndEncodings() {
.flatMap(
e ->
Stream.of(true, false)
- .map(
+ .flatMap(
vectorized ->
- Arguments.of(
- encoding, e.getKey(), e.getValue(), vectorized))));
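+                                        // Emit two cases per combination: the base golden
+                                        // file and its "_with_nulls" variant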
+ Stream.of(
+ Arguments.of(
+ encoding, e.getKey(), e.getValue(), vectorized),
+ Arguments.of(
+ encoding,
+ e.getKey() + "_with_nulls",
+ e.getValue(),
+ vectorized)))));
}
private File resourceUrlToLocalFile(URL url) throws IOException, URISyntaxException {