Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil {
partitionSchema: StructType,
fileFormat: ReadFileFormat,
metadataColumnNames: Seq[String],
properties: Map[String, String]): SplitInfo = {
properties: Map[String, String],
dataSchema: StructType): SplitInfo = {
partition match {
case p: GlutenMergeTreePartition =>
ExtensionTableBuilder
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ class VeloxIteratorApi extends IteratorApi with Logging {
partitionSchema: StructType,
fileFormat: ReadFileFormat,
metadataColumnNames: Seq[String],
properties: Map[String, String]): SplitInfo = {
properties: Map[String, String],
dataSchema: StructType): SplitInfo = {
partition match {
case f: FilePartition =>
val (
Expand All @@ -69,7 +70,7 @@ class VeloxIteratorApi extends IteratorApi with Logging {
constructSplitInfo(partitionSchema, f.files, metadataColumnNames)
val preferredLocations =
SoftAffinity.getFilePartitionLocations(f)
LocalFilesBuilder.makeLocalFiles(
val localFile = LocalFilesBuilder.makeLocalFiles(
f.index,
paths,
starts,
Expand All @@ -82,6 +83,8 @@ class VeloxIteratorApi extends IteratorApi with Logging {
preferredLocations.toList.asJava,
mapAsJavaMap(properties)
)
localFile.setFileSchema(dataSchema)
localFile
case _ =>
throw new UnsupportedOperationException(s"Unsupported input partition.")
}
Expand Down Expand Up @@ -168,26 +171,28 @@ class VeloxIteratorApi extends IteratorApi with Logging {
SparkShimLoader.getSparkShims.generateMetadataColumns(file, metadataColumnNames)
metadataColumns.add(metadataColumn)
val partitionColumn = new JHashMap[String, String]()
for (i <- 0 until file.partitionValues.numFields) {
val partitionColumnValue = if (file.partitionValues.isNullAt(i)) {
ExternalCatalogUtils.DEFAULT_PARTITION_NAME
} else {
val pn = file.partitionValues.get(i, schema.fields(i).dataType)
schema.fields(i).dataType match {
case _: BinaryType =>
new String(pn.asInstanceOf[Array[Byte]], StandardCharsets.UTF_8)
case _: DateType =>
DateFormatter.apply().format(pn.asInstanceOf[Integer])
case _: DecimalType =>
pn.asInstanceOf[Decimal].toJavaBigInteger.toString
case _: TimestampType =>
TimestampFormatter
.getFractionFormatter(ZoneOffset.UTC)
.format(pn.asInstanceOf[java.lang.Long])
case _ => pn.toString
if (file.partitionValues != null) {
for (i <- 0 until file.partitionValues.numFields) {
val partitionColumnValue = if (file.partitionValues.isNullAt(i)) {
ExternalCatalogUtils.DEFAULT_PARTITION_NAME
} else {
val pn = file.partitionValues.get(i, schema.fields(i).dataType)
schema.fields(i).dataType match {
case _: BinaryType =>
new String(pn.asInstanceOf[Array[Byte]], StandardCharsets.UTF_8)
case _: DateType =>
DateFormatter.apply().format(pn.asInstanceOf[Integer])
case _: DecimalType =>
pn.asInstanceOf[Decimal].toJavaBigInteger.toString
case _: TimestampType =>
TimestampFormatter
.getFractionFormatter(ZoneOffset.UTC)
.format(pn.asInstanceOf[java.lang.Long])
case _ => pn.toString
}
}
partitionColumn.put(schema.names(i), partitionColumnValue)
}
partitionColumn.put(schema.names(i), partitionColumnValue)
}
partitionColumns.add(partitionColumn)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.execution

import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

/**
 * Exercises ORC schema evolution under Velox with `orc.force.positional.evolution`
 * enabled: table columns are matched to file columns by position rather than by name.
 * Each scenario writes ORC data with one schema, reads it back with a differently
 * named (or pruned) schema, and compares Gluten's result against vanilla Spark.
 */
class VeloxOrcForcePositionalSuite extends VeloxWholeStageTransformerSuite {
  override protected val resourcePath: String = "/tpch-data-parquet"
  override protected val fileFormat: String = "parquet"

  override protected def sparkConf: SparkConf =
    super.sparkConf
      .set("spark.gluten.sql.columnar.backend.velox.glogSeverityLevel", "0")
      .set("spark.gluten.sql.columnar.backend.velox.glogVerboseLevel", "1")
      .set("spark.executorEnv.GLOG_v", "1")
      .set("spark.gluten.sql.debug", "true")
      .set("orc.force.positional.evolution", "true")

  import testImplicits._

  /** Loads the ORC data at `location` using `readSchema` and registers it as temp view "test". */
  private def registerOrcView(location: String, readSchema: StructType): Unit =
    spark.read
      .schema(readSchema)
      .format("orc")
      .load(location)
      .createOrReplaceTempView("test")

  test("rename root columns") {
    withTempPath { dir =>
      val base = dir.getCanonicalPath
      // Two partitions whose second column has different names (col2 vs col3).
      val left = Seq((1, 2), (4, 5), (8, 9)).toDF("col1", "col2")
      val right = Seq((10, 12), (15, 19), (22, 40)).toDF("col1", "col3")

      left.write.format("orc").save(s"file://$base/part=one")
      right.write.format("orc").save(s"file://$base/part=two")

      // Read the whole directory with the second frame's schema.
      registerOrcView(s"file://$base", right.schema)

      runQueryAndCompare("select * from test") { _ => }
    }
  }

  test("rename nested columns") {
    withTempPath { dir =>
      val base = dir.getCanonicalPath
      // Same top-level shape, but the inner struct field is named "b" in one
      // partition and "c" in the other.
      val schemaWithB = StructType(
        Seq(
          StructField("col1", IntegerType),
          StructField(
            "col2",
            StructType(Seq(
              StructField("a", StringType),
              StructField("b", IntegerType))))
        ))
      val schemaWithC = StructType(
        Seq(
          StructField("col1", IntegerType),
          StructField(
            "col2",
            StructType(Seq(
              StructField("a", StringType),
              StructField("c", IntegerType))))
        ))

      val left = spark.createDataFrame(
        spark.sparkContext.parallelize(Seq(Row(1, Row("abc", 2)))),
        schemaWithB)
      val right = spark.createDataFrame(
        spark.sparkContext.parallelize(Seq(Row(20, Row("EFG", 10)))),
        schemaWithC)

      left.write.mode("overwrite").format("orc").save(s"file://$base/part=one")
      right.write.mode("overwrite").format("orc").save(s"file://$base/part=two")

      // Read everything back with the schema that declares the "c" field.
      registerOrcView(s"file://$base", right.schema)

      runQueryAndCompare("select * from test") { _ => }
    }
  }

  test("prune nested schema") {
    withTempPath { dir =>
      val base = dir.getCanonicalPath

      // Write with a full nested schema (col3.a, col3.b)...
      val fullSchema = StructType(
        Seq(
          StructField("col1", IntegerType),
          StructField("col2", IntegerType),
          StructField(
            "col3",
            StructType(Seq(
              StructField("a", StringType),
              StructField("b", IntegerType))))
        ))
      val df = spark.createDataFrame(
        spark.sparkContext.parallelize(Seq(Row(1, 5, Row("abc", 2)))),
        fullSchema)
      df.write.format("orc").save(s"file://$base")

      // ...then read it back with the nested struct pruned down to col3.b only.
      val prunedSchema = StructType(
        Seq(
          StructField("col1", IntegerType),
          StructField("col2", IntegerType),
          StructField(
            "col3",
            StructType(Seq(
              StructField("b", IntegerType))))
        ))
      registerOrcView(s"file://$base", prunedSchema)

      runQueryAndCompare("select * from test") { _ => }
    }
  }
}

/**
 * Runs every scenario from [[VeloxOrcForcePositionalSuite]] with
 * `orc.force.positional.evolution` turned off, so the same reads resolve
 * columns by name instead of by position.
 */
class VeloxOrcForcePositionalOffSuite extends VeloxOrcForcePositionalSuite {
  override protected def sparkConf: SparkConf =
    super.sparkConf
      // Overrides the "true" set by the parent suite's sparkConf.
      .set("orc.force.positional.evolution", "false")
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Verifies name-agnostic schema mapping for ORC files whose columns all use the
 * synthetic `_colN` naming (as produced by some Hive writers): reading them back
 * with user-provided names must still bind by position and return the data.
 */
class VeloxOrcSchemaEvolutionSuite extends VeloxWholeStageTransformerSuite {
  override protected val resourcePath: String = "/tpch-data-parquet"
  override protected val fileFormat: String = "parquet"

  import testImplicits._

  test("read ORC with column names all starting with '_col'") {
    withTempPath { dir =>
      val target = s"file://${dir.getCanonicalPath}"
      // Write three rows whose columns carry the synthetic _colN names.
      Seq((1, 2, 3), (4, 5, 6), (7, 8, 9))
        .toDF("_col0", "_col1", "_col2")
        .write
        .format("orc")
        .save(target)

      // Read the same files back under entirely different column names.
      val renamed = StructType(
        Seq(
          StructField("a", IntegerType),
          StructField("b", IntegerType),
          StructField("c", IntegerType)))

      withTempView("test") {
        spark.read
          .format("orc")
          .schema(renamed)
          .load(target)
          .createOrReplaceTempView("test")

        runQueryAndCompare("select a, b, c from test") { df =>
          checkAnswer(df, Seq(Row(1, 2, 3), Row(4, 5, 6), Row(7, 8, 9)))
        }
      }
    }
  }
}
5 changes: 5 additions & 0 deletions cpp/velox/compute/VeloxBackend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,11 @@ void VeloxBackend::initConnector() {
connectorConfMap[velox::connector::hive::HiveConfig::kFilePreloadThreshold] =
backendConf_->get<std::string>(kFilePreloadThreshold, "1048576"); // 1M

// Map table schema to file schema using name
connectorConfMap[velox::connector::hive::HiveConfig::kParquetUseColumnNames] = "true";
connectorConfMap[velox::connector::hive::HiveConfig::kOrcUseColumnNames] = "true";
connectorConfMap[velox::connector::hive::HiveConfig::kOrcUseNestedColumnNames] = "true";

// read as UTC
connectorConfMap[velox::connector::hive::HiveConfig::kReadTimestampPartitionValueAsLocalTime] = "false";

Expand Down
14 changes: 14 additions & 0 deletions cpp/velox/compute/VeloxPlanConverter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ std::shared_ptr<SplitInfo> parseScanSplitInfo(
splitInfo->partitionColumns.reserve(fileList.size());
splitInfo->properties.reserve(fileList.size());
splitInfo->metadataColumns.reserve(fileList.size());

std::vector<std::string> colNames;
std::vector<TypePtr> veloxTypes;

for (const auto& file : fileList) {
// Expect all Partitions share the same index.
splitInfo->partitionIndex = file.partition_index();
Expand All @@ -71,6 +75,16 @@ std::shared_ptr<SplitInfo> parseScanSplitInfo(
splitInfo->starts.emplace_back(file.start());
splitInfo->lengths.emplace_back(file.length());

if (colNames.empty() && file.has_schema()) {
const auto& tableSchema = file.schema();
colNames.reserve(tableSchema.names().size());
for (const auto& name : tableSchema.names()) {
colNames.emplace_back(name);
}
veloxTypes = SubstraitParser::parseNamedStruct(tableSchema);
}
splitInfo->fileSchema = ROW(std::move(colNames), std::move(veloxTypes));

facebook::velox::FileProperties fileProps;
if (file.has_properties()) {
fileProps.fileSize = file.properties().filesize();
Expand Down
6 changes: 6 additions & 0 deletions cpp/velox/compute/WholeStageResultIterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,12 @@ std::shared_ptr<velox::config::ConfigBase> WholeStageResultIterator::createConne
std::to_string(veloxCfg_->get<int32_t>(kMaxPartitions, 10000));
configs[velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] =
std::to_string(veloxCfg_->get<bool>(kIgnoreMissingFiles, false));
configs[velox::connector::hive::HiveConfig::kParquetUseColumnNamesSession] = "true";
if (veloxCfg_->get<bool>(kOrcForcePositionalEvolution, false)) {
configs[velox::connector::hive::HiveConfig::kOrcUseNestedColumnNamesSession] = "true";
} else {
configs[velox::connector::hive::HiveConfig::kOrcUseColumnNamesSession] = "true";
}
return std::make_shared<velox::config::ConfigBase>(std::move(configs));
}

Expand Down
1 change: 1 addition & 0 deletions cpp/velox/config/VeloxConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ const std::string kLoadQuantum = "spark.gluten.sql.columnar.backend.velox.loadQu
const std::string kMaxCoalescedDistance = "spark.gluten.sql.columnar.backend.velox.maxCoalescedDistance";
const std::string kMaxCoalescedBytes = "spark.gluten.sql.columnar.backend.velox.maxCoalescedBytes";
const std::string kCachePrefetchMinPct = "spark.gluten.sql.columnar.backend.velox.cachePrefetchMinPct";
const std::string kOrcForcePositionalEvolution = "spark.hadoop.orc.force.positional.evolution";

// write fies
const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession";
Expand Down
14 changes: 12 additions & 2 deletions cpp/velox/substrait/SubstraitToVeloxPlan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1284,15 +1284,25 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait::
std::shared_ptr<connector::hive::HiveTableHandle> tableHandle;
if (!readRel.has_filter()) {
tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
kHiveConnectorId, "hive_table", filterPushdownEnabled, common::SubfieldFilters{}, nullptr);
kHiveConnectorId,
"hive_table",
filterPushdownEnabled,
common::SubfieldFilters{},
nullptr,
splitInfo->fileSchema);
} else {
common::SubfieldFilters subfieldFilters;
auto names = colNameList;
auto types = veloxTypeList;
auto remainingFilter = exprConverter_->toVeloxExpr(readRel.filter(), ROW(std::move(names), std::move(types)));

tableHandle = std::make_shared<connector::hive::HiveTableHandle>(
kHiveConnectorId, "hive_table", filterPushdownEnabled, std::move(subfieldFilters), remainingFilter);
kHiveConnectorId,
"hive_table",
filterPushdownEnabled,
std::move(subfieldFilters),
remainingFilter,
splitInfo->fileSchema);
}

// Get assignments and out names.
Expand Down
3 changes: 3 additions & 0 deletions cpp/velox/substrait/SubstraitToVeloxPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ struct SplitInfo {
/// The file sizes and modification times of the files to be scanned.
std::vector<std::optional<facebook::velox::FileProperties>> properties;

/// The file schema
RowTypePtr fileSchema;

/// Make SplitInfo polymorphic
virtual ~SplitInfo() = default;
};
Expand Down
Loading
Loading