diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index ee0866391ce0..e8ec2db671e2 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -274,6 +274,16 @@ object VeloxConfig extends ConfigRegistry { .checkValue(_ > 0, "must be a positive number") .createWithDefault(10000) + val MAX_TARGET_FILE_SIZE_SESSION = + buildConf("spark.gluten.sql.columnar.backend.velox.maxTargetFileSize") + .doc( + "The target file size for each output file when writing data. " + + "0 means no limit on target file size, and the actual file size will be determined by " + + "other factors such as max partition number and shuffle batch size.") + .bytesConf(ByteUnit.BYTE) + .checkValue(_ >= 0, "must be a non-negative number") + .createWithDefault(0) + val COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT = buildConf("spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput") .doc( diff --git a/cpp/velox/config/VeloxConfig.h b/cpp/velox/config/VeloxConfig.h index 566ce875aacc..a8186550db2c 100644 --- a/cpp/velox/config/VeloxConfig.h +++ b/cpp/velox/config/VeloxConfig.h @@ -160,6 +160,7 @@ const std::string kParquetUseColumnNames = "spark.gluten.sql.columnar.backend.ve // write fies const std::string kMaxPartitions = "spark.gluten.sql.columnar.backend.velox.maxPartitionsPerWritersSession"; +const std::string kMaxTargetFileSize = "spark.gluten.sql.columnar.backend.velox.maxTargetFileSize"; const std::string kGlogVerboseLevel = "spark.gluten.sql.columnar.backend.velox.glogVerboseLevel"; const uint32_t kGlogVerboseLevelDefault = 0; diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc index 52e42d606dce..61367956b193 100644 --- a/cpp/velox/utils/ConfigExtractor.cc +++ b/cpp/velox/utils/ConfigExtractor.cc @@ -231,6 +231,8 @@ std::shared_ptr 
createHiveConnectorSessionConfig( configs[facebook::velox::connector::hive::HiveConfig::kReadTimestampUnitSession] = std::string("6"); configs[facebook::velox::connector::hive::HiveConfig::kMaxPartitionsPerWritersSession] = conf->get(kMaxPartitions, "10000"); + configs[facebook::velox::connector::hive::HiveConfig::kMaxTargetFileSize] = + conf->get(kMaxTargetFileSize, "0B"); // 0 means no limit on target file size configs[facebook::velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] = conf->get(kIgnoreMissingFiles, false) ? "true" : "false"; configs[facebook::velox::connector::hive::HiveConfig::kParquetUseColumnNamesSession] = diff --git a/docs/velox-configuration.md b/docs/velox-configuration.md index f4a79c465211..1cbcfec9f476 100644 --- a/docs/velox-configuration.md +++ b/docs/velox-configuration.md @@ -48,6 +48,7 @@ nav_order: 16 | spark.gluten.sql.columnar.backend.velox.maxSpillFileSize | 1GB | The maximum size of a single spill file created | | spark.gluten.sql.columnar.backend.velox.maxSpillLevel | 4 | The max allowed spilling level with zero being the initial spilling level | | spark.gluten.sql.columnar.backend.velox.maxSpillRunRows | 3M | The maximum row size of a single spill run | +| spark.gluten.sql.columnar.backend.velox.maxTargetFileSize | 0B | The target file size for each output file when writing data. 0 means no limit on target file size, and the actual file size will be determined by other factors such as max partition number and shuffle batch size. | | spark.gluten.sql.columnar.backend.velox.memCacheSize | 1GB | The memory cache size | | spark.gluten.sql.columnar.backend.velox.memInitCapacity | 8MB | The initial memory capacity to reserve for a newly created Velox query memory pool. | | spark.gluten.sql.columnar.backend.velox.memoryPoolCapacityTransferAcrossTasks | true | Whether to allow memory capacity transfer between memory pools from different tasks. |