diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala index 2ab3af7ceaad..ab70c3db52da 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala @@ -113,6 +113,16 @@ object VeloxBackendSettings extends BackendSettingsApi { hadoopConf: Configuration, partitionFileFormats: Set[ReadFileFormat]): ValidationResult = { + // When parquet vectorized reader is disabled, fallback to Spark's vanilla reader + // (parquet-mr) to preserve its behavior (e.g., allowing decimal precision narrowing). + if ( + format == ReadFileFormat.ParquetReadFormat && + !SQLConf.get.getConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED) + ) { + return ValidationResult.failed( + "Fallback to vanilla reader when parquet vectorized reader is disabled.") + } + def validateScheme(): Option[String] = { val filteredRootPaths = distinctRootPaths(rootPaths) if ( diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc index 834127e20cc1..f754f4ae0bbf 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc @@ -1495,6 +1495,31 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: // The columns present in the table, if not available default to the baseSchema. auto tableSchema = splitInfo->tableSchema ? splitInfo->tableSchema : baseSchema; + // Build dataColumns from tableSchema, excluding partition columns. + // HiveTableHandle::dataColumns() is used as fileSchema for the reader. + // Partition columns should not be validated against the file's physical types + // (their values come from the partition path, not from the file). 
+  std::unordered_set<std::string> partitionColNames;
+  for (int idx = 0; idx < colNameList.size(); idx++) {
+    if (columnTypes[idx] == ColumnType::kPartitionKey) {
+      partitionColNames.insert(colNameList[idx]);
+    }
+  }
+  RowTypePtr dataColumns;
+  if (partitionColNames.empty()) {
+    dataColumns = tableSchema;
+  } else {
+    std::vector<std::string> dataColNames;
+    std::vector<TypePtr> dataColTypes;
+    for (int idx = 0; idx < tableSchema->size(); idx++) {
+      if (partitionColNames.find(tableSchema->nameOf(idx)) == partitionColNames.end()) {
+        dataColNames.push_back(tableSchema->nameOf(idx));
+        dataColTypes.push_back(tableSchema->childAt(idx));
+      }
+    }
+    dataColumns = ROW(std::move(dataColNames), std::move(dataColTypes));
+  }
+
   connector::ConnectorTableHandlePtr tableHandle;
   auto remainingFilter = readRel.has_filter() ? exprConverter_->toVeloxExpr(readRel.filter(), baseSchema) : nullptr;
   auto connectorId = kHiveConnectorId;
@@ -1506,7 +1531,7 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
   }
   common::SubfieldFilters subfieldFilters;
   tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
-      connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, tableSchema);
+      connectorId, "hive_table", std::move(subfieldFilters), remainingFilter, dataColumns);
   // Get assignments and out names.
std::vector<std::string> outNames; diff --git a/ep/build-velox/src/get-velox.sh b/ep/build-velox/src/get-velox.sh index 558f988e5c93..4ef66dcfc9ea 100755 --- a/ep/build-velox/src/get-velox.sh +++ b/ep/build-velox/src/get-velox.sh @@ -17,8 +17,8 @@ set -exu CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) -VELOX_REPO=https://github.com/IBM/velox.git -VELOX_BRANCH=dft-2026_03_08-iceberg +VELOX_REPO=https://github.com/baibaichen/velox.git +VELOX_BRANCH=pr3/parquet-type-widening VELOX_ENHANCED_BRANCH=ibm-2026_03_08 VELOX_HOME="" RUN_SETUP_SCRIPT=ON diff --git a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 4f7c67daaad6..e61ed402490c 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -323,69 +323,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenParquetCommitterSuite] enableSuite[GlutenParquetFieldIdSchemaSuite] enableSuite[GlutenParquetTypeWideningSuite] + // Velox does not support DELTA_BYTE_ARRAY encoding for FIXED_LEN_BYTE_ARRAY decimals. 
.exclude("parquet decimal precision change Decimal(20, 2) -> Decimal(22, 2)") .exclude("parquet decimal precision and scale change Decimal(20, 7) -> Decimal(22, 5)") .exclude("parquet decimal precision and scale change Decimal(20, 5) -> Decimal(22, 8)") .exclude("parquet decimal precision and scale change Decimal(20, 2) -> Decimal(22, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 4) -> Decimal(12, 7)") - .exclude("parquet decimal precision and scale change Decimal(10, 6) -> Decimal(12, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 7) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(12, 4) -> Decimal(10, 2)") - .exclude("parquet decimal precision and scale change Decimal(12, 4) -> Decimal(10, 6)") - .exclude("parquet decimal precision and scale change Decimal(20, 17) -> Decimal(10, 2)") - .exclude("parquet decimal precision and scale change Decimal(20, 17) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(22, 4) -> Decimal(20, 2)") - .exclude("parquet decimal precision and scale change Decimal(22, 5) -> Decimal(20, 7)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(6, 4)") - .exclude("parquet decimal precision and scale change Decimal(7, 4) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(10, 2) -> Decimal(12, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 2) -> Decimal(20, 12)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(10, 7)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(20, 17)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(7, 4)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(12, 2) -> Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(20, 2) -> 
Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(20, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(22, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(7, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(12, 2)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(7, 2)") - .exclude("parquet decimal type change Decimal(5, 2) -> Decimal(3, 2) overflows with parquet-mr") - .exclude("unsupported parquet conversion ByteType -> DecimalType(1,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(2,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(3,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(3,1)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(4,1)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)") - .exclude("unsupported parquet conversion LongType -> DateType") - .exclude("unsupported parquet conversion LongType -> DecimalType(10,0)") - .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)") - .exclude("unsupported parquet conversion LongType -> DecimalType(20,1)") - .exclude("unsupported parquet conversion LongType -> IntegerType") - .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)") - .exclude("unsupported parquet conversion ShortType -> 
DecimalType(5,1)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(6,1)") - .exclude("parquet widening conversion ByteType -> DecimalType(11,1)") - .exclude("parquet widening conversion ByteType -> DecimalType(20,0)") - .exclude("parquet widening conversion IntegerType -> DecimalType(11,1)") - .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)") - .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)") - .exclude("parquet widening conversion IntegerType -> DoubleType") - .exclude("parquet widening conversion LongType -> DecimalType(20,0)") - .exclude("parquet widening conversion LongType -> DecimalType(21,1)") - .exclude("parquet widening conversion LongType -> DecimalType(38,0)") - .exclude("parquet widening conversion ShortType -> DecimalType(11,1)") - .exclude("parquet widening conversion ShortType -> DecimalType(20,0)") - .exclude("parquet widening conversion ShortType -> DecimalType(38,0)") - .exclude("parquet widening conversion ShortType -> DoubleType") enableSuite[GlutenParquetVariantShreddingSuite] // Generated suites for org.apache.spark.sql.execution.datasources.text // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure diff --git a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala index 2090b70f7727..91d658aafea9 100644 --- a/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala +++ b/gluten-ut/spark40/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala @@ -16,6 +16,17 @@ */ package org.apache.spark.sql.execution.datasources.parquet +import org.apache.gluten.config.GlutenConfig + +import org.apache.spark.SparkConf import org.apache.spark.sql.GlutenSQLTestsTrait -class 
GlutenParquetTypeWideningSuite extends ParquetTypeWideningSuite with GlutenSQLTestsTrait {} +class GlutenParquetTypeWideningSuite extends ParquetTypeWideningSuite with GlutenSQLTestsTrait { + + // Disable native writer so that writeParquetFiles() uses Spark's Parquet writer. + // This suite tests the READ path. The native writer doesn't produce + // DELTA_BINARY_PACKED/DELTA_BYTE_ARRAY encodings that the parent test's + // V2 encoding assertions expect. + override def sparkConf: SparkConf = + super.sparkConf.set(GlutenConfig.NATIVE_WRITER_ENABLED.key, "false") +} diff --git a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 0dadfa1d0bd8..68382180bcc2 100644 --- a/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark41/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -334,69 +334,11 @@ class VeloxTestSettings extends BackendTestSettings { enableSuite[GlutenParquetCommitterSuite] enableSuite[GlutenParquetFieldIdSchemaSuite] enableSuite[GlutenParquetTypeWideningSuite] + // Velox does not support DELTA_BYTE_ARRAY encoding for FIXED_LEN_BYTE_ARRAY decimals. 
.exclude("parquet decimal precision change Decimal(20, 2) -> Decimal(22, 2)") .exclude("parquet decimal precision and scale change Decimal(20, 7) -> Decimal(22, 5)") .exclude("parquet decimal precision and scale change Decimal(20, 5) -> Decimal(22, 8)") .exclude("parquet decimal precision and scale change Decimal(20, 2) -> Decimal(22, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 4) -> Decimal(12, 7)") - .exclude("parquet decimal precision and scale change Decimal(10, 6) -> Decimal(12, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 7) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(12, 4) -> Decimal(10, 2)") - .exclude("parquet decimal precision and scale change Decimal(12, 4) -> Decimal(10, 6)") - .exclude("parquet decimal precision and scale change Decimal(20, 17) -> Decimal(10, 2)") - .exclude("parquet decimal precision and scale change Decimal(20, 17) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(22, 4) -> Decimal(20, 2)") - .exclude("parquet decimal precision and scale change Decimal(22, 5) -> Decimal(20, 7)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(6, 4)") - .exclude("parquet decimal precision and scale change Decimal(7, 4) -> Decimal(5, 2)") - .exclude("parquet decimal precision and scale change Decimal(10, 2) -> Decimal(12, 4)") - .exclude("parquet decimal precision and scale change Decimal(10, 2) -> Decimal(20, 12)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(10, 7)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(20, 17)") - .exclude("parquet decimal precision and scale change Decimal(5, 2) -> Decimal(7, 4)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(12, 2) -> Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(20, 2) -> 
Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(20, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(22, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(7, 2) -> Decimal(5, 2)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(12, 2)") - .exclude("parquet decimal precision change Decimal(10, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(10, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(20, 2)") - .exclude("parquet decimal precision change Decimal(5, 2) -> Decimal(7, 2)") - .exclude("parquet decimal type change Decimal(5, 2) -> Decimal(3, 2) overflows with parquet-mr") - .exclude("unsupported parquet conversion ByteType -> DecimalType(1,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(2,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(3,0)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(3,1)") - .exclude("unsupported parquet conversion ByteType -> DecimalType(4,1)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(10,1)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(5,0)") - .exclude("unsupported parquet conversion IntegerType -> DecimalType(9,0)") - .exclude("unsupported parquet conversion LongType -> DateType") - .exclude("unsupported parquet conversion LongType -> DecimalType(10,0)") - .exclude("unsupported parquet conversion LongType -> DecimalType(19,0)") - .exclude("unsupported parquet conversion LongType -> DecimalType(20,1)") - .exclude("unsupported parquet conversion LongType -> IntegerType") - .exclude("unsupported parquet conversion ShortType -> DecimalType(3,0)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(4,0)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(5,0)") - .exclude("unsupported parquet conversion ShortType -> 
DecimalType(5,1)") - .exclude("unsupported parquet conversion ShortType -> DecimalType(6,1)") - .exclude("parquet widening conversion ByteType -> DecimalType(11,1)") - .exclude("parquet widening conversion ByteType -> DecimalType(20,0)") - .exclude("parquet widening conversion IntegerType -> DecimalType(11,1)") - .exclude("parquet widening conversion IntegerType -> DecimalType(20,0)") - .exclude("parquet widening conversion IntegerType -> DecimalType(38,0)") - .exclude("parquet widening conversion IntegerType -> DoubleType") - .exclude("parquet widening conversion LongType -> DecimalType(20,0)") - .exclude("parquet widening conversion LongType -> DecimalType(21,1)") - .exclude("parquet widening conversion LongType -> DecimalType(38,0)") - .exclude("parquet widening conversion ShortType -> DecimalType(11,1)") - .exclude("parquet widening conversion ShortType -> DecimalType(20,0)") - .exclude("parquet widening conversion ShortType -> DecimalType(38,0)") - .exclude("parquet widening conversion ShortType -> DoubleType") // TODO: 4.x enableSuite[GlutenParquetVariantShreddingSuite] // 1 failure // Generated suites for org.apache.spark.sql.execution.datasources.text // TODO: 4.x enableSuite[GlutenWholeTextFileV1Suite] // 1 failure diff --git a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala index 2090b70f7727..91d658aafea9 100644 --- a/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala +++ b/gluten-ut/spark41/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/GlutenParquetTypeWideningSuite.scala @@ -16,6 +16,17 @@ */ package org.apache.spark.sql.execution.datasources.parquet +import org.apache.gluten.config.GlutenConfig + +import org.apache.spark.SparkConf import 
org.apache.spark.sql.GlutenSQLTestsTrait -class GlutenParquetTypeWideningSuite extends ParquetTypeWideningSuite with GlutenSQLTestsTrait {} +class GlutenParquetTypeWideningSuite extends ParquetTypeWideningSuite with GlutenSQLTestsTrait { + + // Disable native writer so that writeParquetFiles() uses Spark's Parquet writer. + // This suite tests the READ path. The native writer doesn't produce + // DELTA_BINARY_PACKED/DELTA_BYTE_ARRAY encodings that the parent test's + // V2 encoding assertions expect. + override def sparkConf: SparkConf = + super.sparkConf.set(GlutenConfig.NATIVE_WRITER_ENABLED.key, "false") +}