From 1b8fe5d95c82345d6f4a86b381822d54496fff0c Mon Sep 17 00:00:00 2001 From: Yuan Date: Wed, 11 Feb 2026 17:30:58 +0000 Subject: [PATCH 1/2] [VL] Adding configurations on max write file size Signed-off-by: Yuan --- .../scala/org/apache/gluten/config/VeloxConfig.scala | 10 ++++++++++ cpp/velox/config/VeloxConfig.h | 1 + cpp/velox/utils/ConfigExtractor.cc | 2 ++ docs/velox-configuration.md | 1 + 4 files changed, 14 insertions(+) diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index ee0866391ce0..4448a03f678e 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -274,6 +274,16 @@ object VeloxConfig extends ConfigRegistry { .checkValue(_ > 0, "must be a positive number") .createWithDefault(10000) + val MAX_TARGET_FILE_SIZE_SESSION = + buildConf("spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession") + .doc( + "The target file size for each output file when writing data. 
" + + "0 means no limit on target file size, and the actual file size will be determined by " + + "other factors such as max partition number and shuffle batch size.") + .bytesConf(ByteUnit.BYTE) + .checkValue(_ >= 0, "must be a non-negative number") + .createWithDefault(0) + val COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT = buildConf("spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput") .doc( diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h index 566ce875aacc..eebac119ac7c 100644 --- a/cpp/velox/config/VeloxConfig.h +++ b/cpp/velox/config/VeloxConfig.h @@ -160,6 +160,7 @@ const std::string kParquetUseColumnNames = "spark.gluten.sql.columnar.backend.ve // write fies const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession"; +const std::string KMaxTargetFileSize = "spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession"; const std::string kGlogVerboseLevel = "spark.gluten.sql.columnar.backend.velox.glogVerboseLevel"; const uint32_t kGlogVerboseLevelDefault = 0; diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc index 52e42d606dce..533ac3344613 100644 --- a/cpp/velox/utils/ConfigExtractor.cc +++ b/cpp/velox/utils/ConfigExtractor.cc @@ -231,6 +231,8 @@ std::shared_ptr createHiveConnectorSessionC configs[facebook::velox::connector::hive::HiveConfig::kReadTimestampUnitSession] = std::string("6"); configs[facebook::velox::connector::hive::HiveConfig::kMaxPartitionsPerWritersSession] = conf->get(kMaxPartitions, "10000"); + configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSizeSession] = + conf->get(KMaxTargetFileSize, "0B"); // 0 means no limit on target file size configs[facebook::velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] = conf->get(kIgnoreMissingFiles, false) ? 
"true" : "false"; configs[facebook::velox::connector::hive::HiveConfig::kParquetUseColumnNamesSession] = diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md index f4a79c465211..379aa8ea52ce 100644 --- a/docs/velox-configuration.md +++ b/docs/velox-configuration.md @@ -48,6 +48,7 @@ nav_order: 16 | spark.gluten.sql.columnar.backend.velox.maxSpillFileSize | 1GB | The maximum size of a single spill file created | | spark.gluten.sql.columnar.backend.velox.maxSpillLevel | 4 | The max allowed spilling level with zero being the initial spilling level | | spark.gluten.sql.columnar.backend.velox.maxSpillRunRows | 3M | The maximum row size of a single spill run | +| spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession | 0b | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. | | spark.gluten.sql.columnar.backend.velox.memCacheSize | 1GB | The memory cache size | | spark.gluten.sql.columnar.backend.velox.memInitCapacity | 8MB | The initial memory capacity to reserve for a newly created Velox query memory pool. | | spark.gluten.sql.columnar.backend.velox.memoryPoolCapacityTransferAcrossTasks | true | Whether to allow memory capacity transfer between memory pools from different tasks. 
| From a84a31d3d146b6a28563ea6ae4bec16e3828b7f2 Mon Sep 17 00:00:00 2001 From: Yuan Date: Fri, 13 Mar 2026 12:16:44 +0000 Subject: [PATCH 2/2] [VL] Use HiveConfig::kMaxTargetFileSize and drop the Session suffix from the maxTargetFileSize key Signed-off-by: Yuan --- .../src/main/scala/org/apache/gluten/config/VeloxConfig.scala | 2 +- cpp/velox/config/VeloxConfig.h | 2 +- cpp/velox/utils/ConfigExtractor.cc | 4 ++-- docs/velox-configuration.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index 4448a03f678e..e8ec2db671e2 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -275,7 +275,7 @@ object VeloxConfig extends ConfigRegistry { .createWithDefault(10000) val MAX_TARGET_FILE_SIZE_SESSION = - buildConf("spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession") + buildConf("spark.gluten.sql.columnar.backend.velox.maxTargetFileSize") .doc( "The target file size for each output file when writing data. 
" + "0 means no limit on target file size, and the actual file size will be determined by " + diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h index eebac119ac7c..a8186550db2c 100644 --- a/cpp/velox/config/VeloxConfig.h +++ b/cpp/velox/config/VeloxConfig.h @@ -160,7 +160,7 @@ const std::string kParquetUseColumnNames = "spark.gluten.sql.columnar.backend.ve // write fies const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession"; -const std::string KMaxTargetFileSize = "spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession"; +const std::string kMaxTargetFileSize = "spark.gluten.sql.columnar.backend.velox.maxTargetFileSize"; const std::string kGlogVerboseLevel = "spark.gluten.sql.columnar.backend.velox.glogVerboseLevel"; const uint32_t kGlogVerboseLevelDefault = 0; diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc index 533ac3344613..61367956b193 100644 --- a/cpp/velox/utils/ConfigExtractor.cc +++ b/cpp/velox/utils/ConfigExtractor.cc @@ -231,8 +231,8 @@ std::shared_ptr createHiveConnectorSessionC configs[facebook::velox::connector::hive::HiveConfig::kReadTimestampUnitSession] = std::string("6"); configs[facebook::velox::connector::hive::HiveConfig::kMaxPartitionsPerWritersSession] = conf->get(kMaxPartitions, "10000"); - configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSizeSession] = - conf->get(KMaxTargetFileSize, "0B"); // 0 means no limit on target file size + configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSize] = + conf->get(kMaxTargetFileSize, "0B"); // 0 means no limit on target file size configs[facebook::velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] = conf->get(kIgnoreMissingFiles, false) ? 
"true" : "false"; configs[facebook::velox::connector::hive::HiveConfig::kParquetUseColumnNamesSession] = diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md index 379aa8ea52ce..1cbcfec9f476 100644 --- a/docs/velox-configuration.md +++ b/docs/velox-configuration.md @@ -48,7 +48,7 @@ nav_order: 16 | spark.gluten.sql.columnar.backend.velox.maxSpillFileSize | 1GB | The maximum size of a single spill file created | | spark.gluten.sql.columnar.backend.velox.maxSpillLevel | 4 | The max allowed spilling level with zero being the initial spilling level | | spark.gluten.sql.columnar.backend.velox.maxSpillRunRows | 3M | The maximum row size of a single spill run | -| spark.gluten.sql.columnar.backend.velox.maxTargetFileSizeSession | 0b | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. | +| spark.gluten.sql.columnar.backend.velox.maxTargetFileSize | 0b | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. | | spark.gluten.sql.columnar.backend.velox.memCacheSize | 1GB | The memory cache size | | spark.gluten.sql.columnar.backend.velox.memInitCapacity | 8MB | The initial memory capacity to reserve for a newly created Velox query memory pool. | | spark.gluten.sql.columnar.backend.velox.memoryPoolCapacityTransferAcrossTasks | true | Whether to allow memory capacity transfer between memory pools from different tasks. |