From 7c641d5c88bbf2a2a5dba8668728ae68d0ffbdfc Mon Sep 17 00:00:00 2001 From: zouxxyy Date: Fri, 3 Oct 2025 08:29:32 +0800 Subject: [PATCH] rebase --- .github/labeler.yml | 6 +++--- .github/workflows/build_bundle_package.yml | 2 +- .github/workflows/docker_image.yml | 2 +- .github/workflows/scala_code_format.yml | 2 +- ...ark_resources.sh => install-spark-resources.sh} | 0 .../util/{setup_helper.sh => setup-helper.sh} | 0 .github/workflows/velox_backend_arm.yml | 2 +- .github/workflows/velox_backend_cache.yml | 4 ++-- .github/workflows/velox_backend_enhanced.yml | 4 ++-- .github/workflows/velox_backend_x86.yml | 2 +- .github/workflows/velox_nightly.yml | 2 +- .github/workflows/velox_weekly.yml | 2 +- .../org/apache/gluten/config/VeloxConfig.scala | 4 ++-- .../gluten/config/AllVeloxConfiguration.scala | 2 +- dev/{build_arrow.sh => build-arrow.sh} | 2 +- ...lper_functions.sh => build-helper-functions.sh} | 0 dev/{build_libhdfs3.sh => build-libhdfs3.sh} | 2 +- dev/builddeps-veloxbe.sh | 8 ++++---- dev/docker/Dockerfile.centos8-dynamic-build | 10 +++++----- dev/docker/Dockerfile.centos9-dynamic-build | 10 +++++----- dev/docker/cudf/Dockerfile | 2 +- ...n_all_config_docs.sh => gen-all-config-docs.sh} | 0 dev/{start_cudf.sh => start-cudf.sh} | 0 docs/developers/HowTo.md | 14 +++++++------- docs/developers/clickhouse-backend-debug.md | 2 +- docs/get-started/ClickHouse.md | 4 ++-- docs/get-started/Velox.md | 8 ++++---- docs/get-started/VeloxGPU.md | 2 +- docs/get-started/build-guide.md | 2 +- docs/velox-backend-support-progress.md | 8 ++++---- .../{build_clickhouse.sh => build-clickhouse.sh} | 0 .../src/{install_ubuntu.sh => install-ubuntu.sh} | 0 ep/build-clickhouse/src/package.sh | 2 +- .../resources/bin/{check_env.sh => check-env.sh} | 0 ep/build-clickhouse/src/resources/bin/gluten.sh | 2 +- .../src/{build_velox.sh => build-velox.sh} | 0 ep/build-velox/src/{get_velox.sh => get-velox.sh} | 2 +- .../apache/gluten/config/GlutenCoreConfig.scala | 2 +- 
.../org/apache/gluten/config/GlutenConfig.scala | 2 +- .../gluten/config/AllGlutenConfiguration.scala | 6 +++--- tools/workload/benchmark_velox/README.md | 2 +- .../{run_perf_analysis.sh => run-perf-analysis.sh} | 0 .../{build_gluten.sh => build-gluten.sh} | 0 tools/workload/benchmark_velox/initialize.ipynb | 2 +- .../benchmark_velox/native_sql_initialize.ipynb | 2 +- .../{run_tpc_workload.sh => run-tpc-workload.sh} | 0 tools/workload/tpcds-delta/README.md | 4 ++-- ...cds_datagen_delta.sh => tpcds-datagen-delta.sh} | 0 .../run_tpcds/{run_tpcds.sh => run-tpcds.sh} | 0 tools/workload/tpcds/README.md | 2 +- ...datagen_parquet.sh => tpcds-datagen-parquet.sh} | 0 .../tpcds/run_tpcds/{run_tpcds.sh => run-tpcds.sh} | 0 tools/workload/tpch/README.md | 2 +- ..._datagen_parquet.sh => tpch-datagen-parquet.sh} | 0 .../run_tpch/{tpch_parquet.sh => tpch-parquet.sh} | 0 55 files changed, 69 insertions(+), 69 deletions(-) rename .github/workflows/util/{install_spark_resources.sh => install-spark-resources.sh} (100%) rename .github/workflows/util/{setup_helper.sh => setup-helper.sh} (100%) rename dev/{build_arrow.sh => build-arrow.sh} (99%) rename dev/{build_helper_functions.sh => build-helper-functions.sh} (100%) rename dev/{build_libhdfs3.sh => build-libhdfs3.sh} (97%) rename dev/{gen_all_config_docs.sh => gen-all-config-docs.sh} (100%) rename dev/{start_cudf.sh => start-cudf.sh} (100%) rename ep/build-clickhouse/src/{build_clickhouse.sh => build-clickhouse.sh} (100%) rename ep/build-clickhouse/src/{install_ubuntu.sh => install-ubuntu.sh} (100%) rename ep/build-clickhouse/src/resources/bin/{check_env.sh => check-env.sh} (100%) rename ep/build-velox/src/{build_velox.sh => build-velox.sh} (100%) rename ep/build-velox/src/{get_velox.sh => get-velox.sh} (99%) rename tools/workload/benchmark_velox/analysis/{run_perf_analysis.sh => run-perf-analysis.sh} (100%) rename tools/workload/benchmark_velox/{build_gluten.sh => build-gluten.sh} (100%) rename 
tools/workload/benchmark_velox/{run_tpc_workload.sh => run-tpc-workload.sh} (100%) rename tools/workload/tpcds-delta/gen_data/{tpcds_datagen_delta.sh => tpcds-datagen-delta.sh} (100%) rename tools/workload/tpcds-delta/run_tpcds/{run_tpcds.sh => run-tpcds.sh} (100%) rename tools/workload/tpcds/gen_data/parquet_dataset/{tpcds_datagen_parquet.sh => tpcds-datagen-parquet.sh} (100%) rename tools/workload/tpcds/run_tpcds/{run_tpcds.sh => run-tpcds.sh} (100%) rename tools/workload/tpch/gen_data/parquet_dataset/{tpch_datagen_parquet.sh => tpch-datagen-parquet.sh} (100%) rename tools/workload/tpch/run_tpch/{tpch_parquet.sh => tpch-parquet.sh} (100%) diff --git a/.github/labeler.yml b/.github/labeler.yml index 565db9bfec85..186eeeb5ce9d 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -30,9 +30,9 @@ BUILD: - changed-files: - any-glob-to-any-file: [ 'dev/**/*', - 'ep/build-velox/src/get_velox.sh', - 'ep/build-velox/src/build_velox.sh', - 'ep/build-clickhouse/src/build_clickhouse.sh' + 'ep/build-velox/src/get-velox.sh', + 'ep/build-velox/src/build-velox.sh', + 'ep/build-clickhouse/src/build-clickhouse.sh' ] DOCS: diff --git a/.github/workflows/build_bundle_package.yml b/.github/workflows/build_bundle_package.yml index c4861ffea42f..b6947d0109a3 100644 --- a/.github/workflows/build_bundle_package.yml +++ b/.github/workflows/build_bundle_package.yml @@ -18,7 +18,7 @@ name: Build bundle package env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true CCACHE_DIR: "${{ github.workspace }}/.ccache" - SETUP: 'bash .github/workflows/util/setup_helper.sh' + SETUP: 'bash .github/workflows/util/setup-helper.sh' concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml index 40aefa6c18b6..5342cc212991 100644 --- a/.github/workflows/docker_image.yml +++ b/.github/workflows/docker_image.yml @@ -21,7 +21,7 @@ on: - main paths: - 
'.github/workflows/docker_image.yml' - - '.github/workflows/util/install_spark_resources.sh' + - '.github/workflows/util/install-spark-resources.sh' - 'dev/docker/Dockerfile.centos7-static-build' - 'dev/docker/Dockerfile.centos8-static-build' - 'dev/docker/Dockerfile.centos9-static-build' diff --git a/.github/workflows/scala_code_format.yml b/.github/workflows/scala_code_format.yml index 1d72be94fa01..9f1fc711bb05 100644 --- a/.github/workflows/scala_code_format.yml +++ b/.github/workflows/scala_code_format.yml @@ -38,7 +38,7 @@ concurrency: cancel-in-progress: true env: - SETUP: 'bash .github/workflows/util/setup_helper.sh' + SETUP: 'bash .github/workflows/util/setup-helper.sh' jobs: diff --git a/.github/workflows/util/install_spark_resources.sh b/.github/workflows/util/install-spark-resources.sh similarity index 100% rename from .github/workflows/util/install_spark_resources.sh rename to .github/workflows/util/install-spark-resources.sh diff --git a/.github/workflows/util/setup_helper.sh b/.github/workflows/util/setup-helper.sh similarity index 100% rename from .github/workflows/util/setup_helper.sh rename to .github/workflows/util/setup-helper.sh diff --git a/.github/workflows/velox_backend_arm.yml b/.github/workflows/velox_backend_arm.yml index 5608097f9792..16604847d8ac 100644 --- a/.github/workflows/velox_backend_arm.yml +++ b/.github/workflows/velox_backend_arm.yml @@ -43,7 +43,7 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true MVN_CMD: 'mvn -ntp' WGET_CMD: 'wget -nv' - SETUP: 'bash .github/workflows/util/setup_helper.sh' + SETUP: 'bash .github/workflows/util/setup-helper.sh' CCACHE_DIR: "${{ github.workspace }}/.ccache" concurrency: diff --git a/.github/workflows/velox_backend_cache.yml b/.github/workflows/velox_backend_cache.yml index fc739ad5c411..887f2cefce90 100644 --- a/.github/workflows/velox_backend_cache.yml +++ b/.github/workflows/velox_backend_cache.yml @@ -183,7 +183,7 @@ jobs: # run: | # rm -rf /opt/miniconda-for-velox/ # cd 
ep/build-velox/src && \ - # ./get_velox.sh + # ./get-velox.sh # cd ../build/velox_ep/ # make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON" @@ -221,7 +221,7 @@ jobs: # run: | # rm -rf /opt/miniconda-for-velox/ # cd ep/build-velox/src && \ -# ./get_velox.sh +# ./get-velox.sh # cd ../build/velox_ep/ # source /opt/rh/gcc-toolset-9/enable # make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON" diff --git a/.github/workflows/velox_backend_enhanced.yml b/.github/workflows/velox_backend_enhanced.yml index 554d13aaddd7..3988409c8557 100644 --- a/.github/workflows/velox_backend_enhanced.yml +++ b/.github/workflows/velox_backend_enhanced.yml @@ -42,7 +42,7 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true MVN_CMD: 'mvn -ntp' WGET_CMD: 'wget -nv' - SETUP: 'bash .github/workflows/util/setup_helper.sh' + SETUP: 'bash .github/workflows/util/setup-helper.sh' CCACHE_DIR: "${{ github.workspace }}/.ccache" concurrency: @@ -229,7 +229,7 @@ jobs: - name: Prepare Spark Resources for Spark 3.5.5 run: | rm -rf /opt/shims/spark35 - bash .github/workflows/util/install_spark_resources.sh 3.5 + bash .github/workflows/util/install-spark-resources.sh 3.5 - name: Build and Run unit test for Spark 3.5.5 (slow tests) run: | cd $GITHUB_WORKSPACE/ diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 18c5a5d90d08..2b1cacca183b 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -43,7 +43,7 @@ env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true MVN_CMD: 'mvn -ntp' WGET_CMD: 'wget -nv' - SETUP: 'source .github/workflows/util/setup_helper.sh' + SETUP: 'source .github/workflows/util/setup-helper.sh' CCACHE_DIR: "${{ github.workspace }}/.ccache" # spark.sql.ansi.enabled defaults to false. 
SPARK_ANSI_SQL_MODE: false diff --git a/.github/workflows/velox_nightly.yml b/.github/workflows/velox_nightly.yml index c8d1527b81ec..2d7ea7d95926 100644 --- a/.github/workflows/velox_nightly.yml +++ b/.github/workflows/velox_nightly.yml @@ -27,7 +27,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true CCACHE_DIR: "${{ github.workspace }}/.ccache" - SETUP: 'bash .github/workflows/util/setup_helper.sh' + SETUP: 'bash .github/workflows/util/setup-helper.sh' concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} diff --git a/.github/workflows/velox_weekly.yml b/.github/workflows/velox_weekly.yml index 2af2341fbf5c..c66786f59e7f 100644 --- a/.github/workflows/velox_weekly.yml +++ b/.github/workflows/velox_weekly.yml @@ -25,7 +25,7 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true MVN_CMD: 'mvn -ntp' - SETUP: 'source .github/workflows/util/setup_helper.sh' + SETUP: 'source .github/workflows/util/setup-helper.sh' TPCH_TEST: "env GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare --local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1" INSTALL_PREFIX: /usr/local diff --git a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala index b4f4556fe1fb..f6e7a3f10a71 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala @@ -23,8 +23,8 @@ import java.util.Locale import java.util.concurrent.TimeUnit /* - * Note: Gluten configiguration.md is automatically generated from this code. - * Make sure to run dev/gen_all_config_docs.sh after making changes to this file. + * Note: Gluten configuration.md is automatically generated from this code. + * Make sure to run dev/gen-all-config-docs.sh after making changes to this file. 
*/ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) { import VeloxConfig._ diff --git a/backends-velox/src/test/scala/org/apache/gluten/config/AllVeloxConfiguration.scala b/backends-velox/src/test/scala/org/apache/gluten/config/AllVeloxConfiguration.scala index a693437b8dc8..65059972b97f 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/config/AllVeloxConfiguration.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/config/AllVeloxConfiguration.scala @@ -79,6 +79,6 @@ class AllVeloxConfiguration extends AnyFunSuite { AllGlutenConfiguration.verifyOrRegenerateGoldenFile( markdown, builder.toMarkdown, - "dev/gen_all_config_docs.sh") + "dev/gen-all-config-docs.sh") } } diff --git a/dev/build_arrow.sh b/dev/build-arrow.sh similarity index 99% rename from dev/build_arrow.sh rename to dev/build-arrow.sh index 011fc9c10b82..0fa2e35c20da 100755 --- a/dev/build_arrow.sh +++ b/dev/build-arrow.sh @@ -19,7 +19,7 @@ set -exu CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) SUDO="${SUDO:-""}" -source ${CURRENT_DIR}/build_helper_functions.sh +source ${CURRENT_DIR}/build-helper-functions.sh VELOX_ARROW_BUILD_VERSION=15.0.0 ARROW_PREFIX=$CURRENT_DIR/../ep/_ep/arrow_ep BUILD_TYPE=Release diff --git a/dev/build_helper_functions.sh b/dev/build-helper-functions.sh similarity index 100% rename from dev/build_helper_functions.sh rename to dev/build-helper-functions.sh diff --git a/dev/build_libhdfs3.sh b/dev/build-libhdfs3.sh similarity index 97% rename from dev/build_libhdfs3.sh rename to dev/build-libhdfs3.sh index 645031327fec..66a73fb52d95 100755 --- a/dev/build_libhdfs3.sh +++ b/dev/build-libhdfs3.sh @@ -19,7 +19,7 @@ set -exu CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) export SUDO=sudo -source ${CURRENT_DIR}/build_helper_functions.sh +source ${CURRENT_DIR}/build-helper-functions.sh DEPENDENCY_DIR=${DEPENDENCY_DIR:-$CURRENT_DIR/../ep/_ep} function build_libhdfs3 { diff --git a/dev/builddeps-veloxbe.sh b/dev/builddeps-veloxbe.sh index 
8bdc72b1d325..9f7f0594ec27 100755 --- a/dev/builddeps-veloxbe.sh +++ b/dev/builddeps-veloxbe.sh @@ -210,14 +210,14 @@ function build_arrow { get_velox && setup_dependencies fi cd $GLUTEN_DIR/dev - source ./build_arrow.sh + source ./build-arrow.sh } function build_velox { echo "Start to build Velox" cd $GLUTEN_DIR/ep/build-velox/src # When BUILD_TESTS is on for gluten cpp, we need turn on VELOX_BUILD_TEST_UTILS via build_test_utils. - ./build_velox.sh --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \ + ./build-velox.sh --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \ --enable_abfs=$ENABLE_ABFS --enable_gpu=$ENABLE_GPU --build_test_utils=$BUILD_TESTS \ --build_tests=$BUILD_VELOX_TESTS --build_benchmarks=$BUILD_VELOX_BENCHMARKS --num_threads=$NUM_THREADS \ --velox_home=$VELOX_HOME @@ -263,14 +263,14 @@ function build_velox_backend { function get_velox { cd $GLUTEN_DIR/ep/build-velox/src - ./get_velox.sh $VELOX_PARAMETER + ./get-velox.sh $VELOX_PARAMETER } function setup_dependencies { DEPENDENCY_DIR=${DEPENDENCY_DIR:-$CURRENT_DIR/../ep/_ep} mkdir -p ${DEPENDENCY_DIR} - source $GLUTEN_DIR/dev/build_helper_functions.sh + source $GLUTEN_DIR/dev/build-helper-functions.sh source ${VELOX_HOME}/scripts/setup-common.sh echo "Start to install dependencies" diff --git a/dev/docker/Dockerfile.centos8-dynamic-build b/dev/docker/Dockerfile.centos8-dynamic-build index 29053fdc1af6..4f81ed557e97 100644 --- a/dev/docker/Dockerfile.centos8-dynamic-build +++ b/dev/docker/Dockerfile.centos8-dynamic-build @@ -47,11 +47,11 @@ RUN set -ex; \ wget -nv https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz -P /opt/; \ git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten; \ cd /opt/gluten/.github/workflows/util/; \ - ./install_spark_resources.sh 3.2; \ - ./install_spark_resources.sh 3.3; \ - ./install_spark_resources.sh 3.4; \ - 
./install_spark_resources.sh 3.5; \ - ./install_spark_resources.sh 3.5-scala2.13; \ + ./install-spark-resources.sh 3.2; \ + ./install-spark-resources.sh 3.3; \ + ./install-spark-resources.sh 3.4; \ + ./install-spark-resources.sh 3.5; \ + ./install-spark-resources.sh 3.5-scala2.13; \ if [ "$(uname -m)" = "aarch64" ]; then \ export CPU_TARGET="aarch64"; \ fi; \ diff --git a/dev/docker/Dockerfile.centos9-dynamic-build b/dev/docker/Dockerfile.centos9-dynamic-build index 7fffe6cd3db7..230947a6ff9a 100644 --- a/dev/docker/Dockerfile.centos9-dynamic-build +++ b/dev/docker/Dockerfile.centos9-dynamic-build @@ -45,11 +45,11 @@ RUN set -ex; \ wget -nv https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz -P /opt/; \ git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten; \ cd /opt/gluten/.github/workflows/util/; \ - ./install_spark_resources.sh 3.2; \ - ./install_spark_resources.sh 3.3; \ - ./install_spark_resources.sh 3.4; \ - ./install_spark_resources.sh 3.5; \ - ./install_spark_resources.sh 3.5-scala2.13; \ + ./install-spark-resources.sh 3.2; \ + ./install-spark-resources.sh 3.3; \ + ./install-spark-resources.sh 3.4; \ + ./install-spark-resources.sh 3.5; \ + ./install-spark-resources.sh 3.5-scala2.13; \ if [ "$(uname -m)" = "aarch64" ]; then \ export CPU_TARGET="aarch64"; \ fi; \ diff --git a/dev/docker/cudf/Dockerfile b/dev/docker/cudf/Dockerfile index f598f90bf80e..ed2f97dc8328 100644 --- a/dev/docker/cudf/Dockerfile +++ b/dev/docker/cudf/Dockerfile @@ -4,7 +4,7 @@ RUN yum install -y sudo patch maven perl && ln -sf /usr/local/bin/cmake /usr/bin RUN git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten # Install spark to folder /opt -RUN cd /opt/gluten/.github/workflows/util/ && ./install_spark_resources.sh 3.4 +RUN cd /opt/gluten/.github/workflows/util/ && ./install-spark-resources.sh 3.4 ENV SPARK_HOME=/opt/spark-3.4.4-bin-hadoop3 ENV PATH=$SPARK_HOME/bin:$PATH ENV CUDA_ARCHITECTURES=70 diff --git 
a/dev/gen_all_config_docs.sh b/dev/gen-all-config-docs.sh similarity index 100% rename from dev/gen_all_config_docs.sh rename to dev/gen-all-config-docs.sh diff --git a/dev/start_cudf.sh b/dev/start-cudf.sh similarity index 100% rename from dev/start_cudf.sh rename to dev/start-cudf.sh diff --git a/docs/developers/HowTo.md b/docs/developers/HowTo.md index 5386ae732165..0ddb04f058c1 100644 --- a/docs/developers/HowTo.md +++ b/docs/developers/HowTo.md @@ -128,16 +128,16 @@ mvn test -Pspark-3.5 -Pspark-ut -Pbackends-velox -DargLine="-Dspark.test.home=/p Please set `wildcardSuites` with a fully qualified class name. `spark.test.home` is optional to set. It is only required for some test suites to use Spark resources. If you are specifying the `spark.test.home` arg, it should be set to either: * The path a directory containing Spark source code, which has already been built -* Or use the `install_spark_resources.sh` script to get a directory with the necessary resource files: +* Or use the `install-spark-resources.sh` script to get a directory with the necessary resource files: ``` # Define a directory to use for the Spark files and the Spark version export spark_dir=/tmp/spark export spark_version=3.5 - # Run the install_spark_resources.sh script - .github/workflows/util/install_spark_resources.sh ${spark_version} ${spark_dir} + # Run the install-spark-resources.sh script + .github/workflows/util/install-spark-resources.sh ${spark_version} ${spark_dir} ``` - After running the `install_spark_resources.sh`, define the `spark.test.home` directory like: + After running the `install-spark-resources.sh`, define the `spark.test.home` directory like: `-DargLine="-Dspark.test.home=${spark_dir}/shims/spark35/spark_home"` when running unit tests. For most cases, please make sure Gluten native build is done before running a Scala/Java test. 
@@ -186,13 +186,13 @@ Here we will explain how to run TPC-H on Velox backend with the Parquet file for var gluten_root = "/home/gluten" ``` - - Modify `${GLUTEN_HOME}/tools/workload/tpch/run_tpch/tpch_parquet.sh`. + - Modify `${GLUTEN_HOME}/tools/workload/tpch/run_tpch/tpch-parquet.sh`. - Set `GLUTEN_JAR` correctly. Please refer to the section of [Build Gluten with Velox Backend](../get-started/Velox.md#build-gluten-with-velox-backend) - Set `SPARK_HOME` correctly. - Set the memory configurations appropriately. - - Execute `tpch_parquet.sh` using the below command. + - Execute `tpch-parquet.sh` using the below command. - `cd ${GLUTEN_HOME}/tools/workload/tpch/run_tpch/` - - `./tpch_parquet.sh` + - `./tpch-parquet.sh` # How to run TPC-DS diff --git a/docs/developers/clickhouse-backend-debug.md b/docs/developers/clickhouse-backend-debug.md index 000566028882..c2e273583d48 100644 --- a/docs/developers/clickhouse-backend-debug.md +++ b/docs/developers/clickhouse-backend-debug.md @@ -12,7 +12,7 @@ parent: /developer-overview/ 1. Build Gluten ClickHouse Native Lib. ``` - export CMAKE_BUILD_TYPE=Release && bash ep/build-clickhouse/src/build_clickhouse.sh + export CMAKE_BUILD_TYPE=Release && bash ep/build-clickhouse/src/build-clickhouse.sh ``` libch.so will be generated in `cpp-ch/build/utils/extern-local-engine/libch.so`. diff --git a/docs/get-started/ClickHouse.md b/docs/get-started/ClickHouse.md index 8b583af5213d..3573db40e45f 100644 --- a/docs/get-started/ClickHouse.md +++ b/docs/get-started/ClickHouse.md @@ -36,7 +36,7 @@ Following softwares are required, - cmake 3.20 or higher version - ninja-build 1.8.2 -You can run `sudo $gluten_root/ep/build-clickhouse/src/install_ubuntu.sh` to setup the requirements. We also provide a [docker file](../../cpp-ch/local-engine/docker/image/Dockerfile), you can build your own image +You can run `sudo $gluten_root/ep/build-clickhouse/src/install-ubuntu.sh` to setup the requirements. 
We also provide a [docker file](../../cpp-ch/local-engine/docker/image/Dockerfile), you can build your own image ```shell cd $gluten_root/cpp-ch/local-engine/docker/image/ docker build . -t libch_builder @@ -96,7 +96,7 @@ If you have setup all requirements, you can use following command to build it di ```bash cd $gluten_root -bash ./ep/build-clickhouse/src/build_clickhouse.sh +bash ./ep/build-clickhouse/src/build-clickhouse.sh ``` diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 64d35bba1c65..19597aca0168 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -129,7 +129,7 @@ To enable this functionality, you must set the JAVA_HOME and HADOOP_HOME environ ### Build libhdfs3 -If you want to run Gluten with libhdfs3.so, you need to manually compile libhdfs3 to obtain the libhdfs3.so file. We provide the script dev/build_libhdfs3.sh in Gluten to help you compile libhdfs3.so. +If you want to run Gluten with libhdfs3.so, you need to manually compile libhdfs3 to obtain the libhdfs3.so file. We provide the script dev/build-libhdfs3.sh in Gluten to help you compile libhdfs3.so. ### Build with HDFS support @@ -161,7 +161,7 @@ cp /path/to/hdfs-client.xml hdfs-client.xml One typical deployment on Spark/HDFS cluster is to enable [short-circuit reading](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html). Short-circuit reads provide a substantial performance boost to many applications. -By default libhdfs3 does not set the default hdfs domain socket path to support HDFS short-circuit read. If this feature is required in HDFS setup, users may need to setup the domain socket path correctly by patching the libhdfs3 source code or by setting the correct config environment. 
In Gluten the short-circuit domain socket path is set to "/var/lib/hadoop-hdfs/dn_socket" in [build_velox.sh](https://github.com/apache/incubator-gluten/blob/main/ep/build-velox/src/build_velox.sh) So we need to make sure the folder existed and user has write access as below script. +By default libhdfs3 does not set the default hdfs domain socket path to support HDFS short-circuit read. If this feature is required in HDFS setup, users may need to setup the domain socket path correctly by patching the libhdfs3 source code or by setting the correct config environment. In Gluten the short-circuit domain socket path is set to "/var/lib/hadoop-hdfs/dn_socket" in [build-velox.sh](https://github.com/apache/incubator-gluten/blob/main/ep/build-velox/src/build-velox.sh) So we need to make sure the folder existed and user has write access as below script. ``` sudo mkdir -p /var/lib/hadoop-hdfs/ @@ -461,8 +461,8 @@ All TPC-H and TPC-DS queries are supported in Gluten Velox backend. You may refe ## Data preparation -The data generation scripts are [TPC-H dategen script](../../tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.sh) and -[TPC-DS dategen script](../../tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.sh). +The data generation scripts are [TPC-H datagen script](../../tools/workload/tpch/gen_data/parquet_dataset/tpch-datagen-parquet.sh) and +[TPC-DS datagen script](../../tools/workload/tpcds/gen_data/parquet_dataset/tpcds-datagen-parquet.sh). The used TPC-H and TPC-DS queries are the original ones, and can be accessed from [TPC-DS queries](../../tools/gluten-it/common/src/main/resources/tpcds-queries) and [TPC-H queries](../../tools/gluten-it/common/src/main/resources/tpch-queries). 
diff --git a/docs/get-started/VeloxGPU.md b/docs/get-started/VeloxGPU.md index 96776490125d..25a8abb5d160 100644 --- a/docs/get-started/VeloxGPU.md +++ b/docs/get-started/VeloxGPU.md @@ -23,7 +23,7 @@ parent: Getting-Started - **NVIDIA Drivers**: Compatible with CUDA 12.8. - **Container Toolkit**: Install `nvidia-container-toolkit` ([guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)). - **System Reboot**: Required after driver installation. -- **Environment Setup**: Use [`start_cudf.sh`](https://github.com/apache/incubator-gluten/tree/main/dev/start_cudf.sh) for host configuration . +- **Environment Setup**: Use [`start-cudf.sh`](https://github.com/apache/incubator-gluten/tree/main/dev/start-cudf.sh) for host configuration . --- diff --git a/docs/get-started/build-guide.md b/docs/get-started/build-guide.md index ffd02ad7ab61..d306e445773e 100644 --- a/docs/get-started/build-guide.md +++ b/docs/get-started/build-guide.md @@ -30,7 +30,7 @@ Please set them via `--`, e.g. `--build_type=Release`. | build_arrow | Build arrow java/cpp and install the libs in local. Can turn it OFF after first build. | ON | | spark_version | Build for specified version of Spark(3.2, 3.3, 3.4, 3.5, ALL). `ALL` means build for all versions. | ALL | -### Velox build parameters for build_velox.sh +### Velox build parameters for build-velox.sh Please set them via `--`, e.g., `--velox_home=/YOUR/PATH`. 
| Parameters | Description | Default | diff --git a/docs/velox-backend-support-progress.md b/docs/velox-backend-support-progress.md index b454f613eb1e..15727b50f059 100644 --- a/docs/velox-backend-support-progress.md +++ b/docs/velox-backend-support-progress.md @@ -96,16 +96,16 @@ In Gluten, function support is automatically generated by a script and maintaine When running the script, the `--spark_home` arg should be set to either: * The directory containing the Spark source code for the latest supported Spark version in Gluten, and the Spark project must be built from source. -* Or use the `install_spark_resources.sh` script to get a directory with the necessary resource files: +* Or use the `install-spark-resources.sh` script to get a directory with the necessary resource files: ``` # Define a directory to use for the Spark files and the latest Spark version export spark_dir=/tmp/spark export spark_version=3.5 - # Run the install_spark_resources.sh script - .github/workflows/util/install_spark_resources.sh ${spark_version} ${spark_dir} + # Run the install-spark-resources.sh script + .github/workflows/util/install-spark-resources.sh ${spark_version} ${spark_dir} ``` - After running the `install_spark_resources.sh`, the `--spark_home` for the document generation script will be + After running the `install-spark-resources.sh`, the `--spark_home` for the document generation script will be something like: `--spark_home=${spark_dir}/shims/spark35/spark_home"` Use the following command to generate and update the support status: diff --git a/ep/build-clickhouse/src/build_clickhouse.sh b/ep/build-clickhouse/src/build-clickhouse.sh similarity index 100% rename from ep/build-clickhouse/src/build_clickhouse.sh rename to ep/build-clickhouse/src/build-clickhouse.sh diff --git a/ep/build-clickhouse/src/install_ubuntu.sh b/ep/build-clickhouse/src/install-ubuntu.sh similarity index 100% rename from ep/build-clickhouse/src/install_ubuntu.sh rename to 
ep/build-clickhouse/src/install-ubuntu.sh diff --git a/ep/build-clickhouse/src/package.sh b/ep/build-clickhouse/src/package.sh index 06ca63c5d4d9..9fc833dcc6df 100755 --- a/ep/build-clickhouse/src/package.sh +++ b/ep/build-clickhouse/src/package.sh @@ -108,7 +108,7 @@ do done # build libch.so -bash "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/build_clickhouse.sh +bash "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/build-clickhouse.sh cp "$GLUTEN_SOURCE"/cpp-ch/build/utils/extern-local-engine/libch.so "${PACKAGE_DIR_PATH}"/libs/libch.so # copy bin and conf diff --git a/ep/build-clickhouse/src/resources/bin/check_env.sh b/ep/build-clickhouse/src/resources/bin/check-env.sh similarity index 100% rename from ep/build-clickhouse/src/resources/bin/check_env.sh rename to ep/build-clickhouse/src/resources/bin/check-env.sh diff --git a/ep/build-clickhouse/src/resources/bin/gluten.sh b/ep/build-clickhouse/src/resources/bin/gluten.sh index f45d21b69306..22bd469f7e05 100755 --- a/ep/build-clickhouse/src/resources/bin/gluten.sh +++ b/ep/build-clickhouse/src/resources/bin/gluten.sh @@ -17,7 +17,7 @@ set -e export GLUTEN_HOME=$(cd -P -- "$(dirname -- "$0")/.." && pwd -P) -source ${GLUTEN_HOME}/bin/check_env.sh || exit 1 +source ${GLUTEN_HOME}/bin/check-env.sh || exit 1 [[ ! 
-d "${GLUTEN_HOME}"/logs ]] && mkdir -p "${GLUTEN_HOME}"/logs diff --git a/ep/build-velox/src/build_velox.sh b/ep/build-velox/src/build-velox.sh similarity index 100% rename from ep/build-velox/src/build_velox.sh rename to ep/build-velox/src/build-velox.sh diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get-velox.sh similarity index 99% rename from ep/build-velox/src/get_velox.sh rename to ep/build-velox/src/get-velox.sh index f7287db781f2..c000e377fe94 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get-velox.sh @@ -170,7 +170,7 @@ function setup_linux { elif [[ "$LINUX_DISTRIBUTION" == "openEuler" ]]; then # this is workaround for gcc-12.3.1 # https://github.com/facebookincubator/velox/blob/b263d9dd8b8910dc642d8fdb0c0adee4b2a1fb29/CMakeLists.txt#L433 - sed -i "s|no-unknown-warning-option|no-unknown-warning-option -Wno-restrict|g" ../../src/build_velox.sh + sed -i "s|no-unknown-warning-option|no-unknown-warning-option -Wno-restrict|g" ../../src/build-velox.sh case "$LINUX_VERSION_ID" in 24.03) ;; *) diff --git a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala index e76a9431d15a..d362060ded98 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala @@ -60,7 +60,7 @@ class GlutenCoreConfig(conf: SQLConf) extends Logging { /* * Note: Gluten configuration.md is automatically generated from this code. - * Make sure to run dev/gen_all_config_docs.sh after making changes to this file. + * Make sure to run dev/gen-all-config-docs.sh after making changes to this file. 
*/ object GlutenCoreConfig extends ConfigRegistry { override def get: GlutenCoreConfig = { diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala index 5cdaaeda1adb..9359a465a11b 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala @@ -47,7 +47,7 @@ case object RssSortShuffleWriterType extends ShuffleWriterType { /* * Note: Gluten configiguration.md is automatically generated from this code. - * Make sure to run dev/gen_all_config_docs.sh after making changes to this file. + * Make sure to run dev/gen-all-config-docs.sh after making changes to this file. */ class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) { import GlutenConfig._ diff --git a/gluten-substrait/src/test/scala/org/apache/gluten/config/AllGlutenConfiguration.scala b/gluten-substrait/src/test/scala/org/apache/gluten/config/AllGlutenConfiguration.scala index b7e98c85d924..2e6bd0cfcc82 100644 --- a/gluten-substrait/src/test/scala/org/apache/gluten/config/AllGlutenConfiguration.scala +++ b/gluten-substrait/src/test/scala/org/apache/gluten/config/AllGlutenConfiguration.scala @@ -36,12 +36,12 @@ import scala.io.Source * * To run the entire test suite: * {{{ - * GLUTEN_UPDATE=0 dev/gen_all_config_docs.sh + * GLUTEN_UPDATE=0 dev/gen-all-config-docs.sh * }}} * * To re-generate golden files for entire suite, run: * {{{ - * dev/gen_all_config_docs.sh + * dev/gen-all-config-docs.sh * }}} */ class AllGlutenConfiguration extends AnyFunSuite { @@ -142,7 +142,7 @@ class AllGlutenConfiguration extends AnyFunSuite { AllGlutenConfiguration.verifyOrRegenerateGoldenFile( markdown, builder.toMarkdown, - "dev/gen_all_config_docs.sh") + "dev/gen-all-config-docs.sh") } } diff --git a/tools/workload/benchmark_velox/README.md b/tools/workload/benchmark_velox/README.md index 
ee71cc03cff9..786e66a1c003 100644 --- a/tools/workload/benchmark_velox/README.md +++ b/tools/workload/benchmark_velox/README.md @@ -31,7 +31,7 @@ papermill tpc_workload.ipynb --inject-output-path -f params.yaml -p workoad tpcd Please refer to the Papermill documentation for additional usage details. -We also provide a script [run_tpc_workload.sh](./run_tpc_workload.sh). This script wraps the Papermill command, automatically renaming the output notebook with a timestamp and application ID to prevent overwriting existing output files. +We also provide a script [run-tpc-workload.sh](./run-tpc-workload.sh). This script wraps the Papermill command, automatically renaming the output notebook with a timestamp and application ID to prevent overwriting existing output files. ## Analyzing Performance Results diff --git a/tools/workload/benchmark_velox/analysis/run_perf_analysis.sh b/tools/workload/benchmark_velox/analysis/run-perf-analysis.sh similarity index 100% rename from tools/workload/benchmark_velox/analysis/run_perf_analysis.sh rename to tools/workload/benchmark_velox/analysis/run-perf-analysis.sh diff --git a/tools/workload/benchmark_velox/build_gluten.sh b/tools/workload/benchmark_velox/build-gluten.sh similarity index 100% rename from tools/workload/benchmark_velox/build_gluten.sh rename to tools/workload/benchmark_velox/build-gluten.sh diff --git a/tools/workload/benchmark_velox/initialize.ipynb b/tools/workload/benchmark_velox/initialize.ipynb index e2b6fab7ba45..2e80ac77c873 100644 --- a/tools/workload/benchmark_velox/initialize.ipynb +++ b/tools/workload/benchmark_velox/initialize.ipynb @@ -2263,7 +2263,7 @@ }, "outputs": [], "source": [ - "!bash build_gluten.sh" + "!bash build-gluten.sh" ] }, { diff --git a/tools/workload/benchmark_velox/native_sql_initialize.ipynb b/tools/workload/benchmark_velox/native_sql_initialize.ipynb index f34558dbea99..868667deaffe 100644 --- a/tools/workload/benchmark_velox/native_sql_initialize.ipynb +++ 
b/tools/workload/benchmark_velox/native_sql_initialize.ipynb @@ -899,7 +899,7 @@ " if not self.server:\n", " return\n", "\n", - " run_script=f'{server_gluten_home}/tools/workload/benchmark_velox/analysis/run_perf_analysis.sh'\n", + " run_script=f'{server_gluten_home}/tools/workload/benchmark_velox/analysis/run-perf-analysis.sh'\n", "\n", " disk=','.join(disk_dev)\n", " nic=','.join(nic_dev)\n", diff --git a/tools/workload/benchmark_velox/run_tpc_workload.sh b/tools/workload/benchmark_velox/run-tpc-workload.sh similarity index 100% rename from tools/workload/benchmark_velox/run_tpc_workload.sh rename to tools/workload/benchmark_velox/run-tpc-workload.sh diff --git a/tools/workload/tpcds-delta/README.md b/tools/workload/tpcds-delta/README.md index 8fa4509cba55..11e9ea01dcf6 100644 --- a/tools/workload/tpcds-delta/README.md +++ b/tools/workload/tpcds-delta/README.md @@ -4,7 +4,7 @@ This workload example is verified with JDK 8, Spark 3.4.4 and Delta 2.4.0. ## Test dataset -Use bash script `tpcds_datagen_delta.sh` to generate the data. The script relies on a already-built gluten-it +Use bash script `tpcds-datagen-delta.sh` to generate the data. The script relies on an already-built gluten-it executable.
To build it, following the steps: ```bash @@ -16,7 +16,7 @@ Then call the data generator script: ```bash cd ${GLUTEN_HOME}/tools/workload/tpcds-delta/gen_data -./tpcds_datagen_delta.sh +./tpcds-datagen-delta.sh ``` Meanings of the commands that are used in the script are explained as follows: diff --git a/tools/workload/tpcds-delta/gen_data/tpcds_datagen_delta.sh b/tools/workload/tpcds-delta/gen_data/tpcds-datagen-delta.sh similarity index 100% rename from tools/workload/tpcds-delta/gen_data/tpcds_datagen_delta.sh rename to tools/workload/tpcds-delta/gen_data/tpcds-datagen-delta.sh diff --git a/tools/workload/tpcds-delta/run_tpcds/run_tpcds.sh b/tools/workload/tpcds-delta/run_tpcds/run-tpcds.sh similarity index 100% rename from tools/workload/tpcds-delta/run_tpcds/run_tpcds.sh rename to tools/workload/tpcds-delta/run_tpcds/run-tpcds.sh diff --git a/tools/workload/tpcds/README.md b/tools/workload/tpcds/README.md index 8091054a0fb5..a64701089e5a 100644 --- a/tools/workload/tpcds/README.md +++ b/tools/workload/tpcds/README.md @@ -7,7 +7,7 @@ Parquet format is supported. Here are the steps to generate the testing datasets Please refer to the scripts in [parquet_dataset](./gen_data/parquet_dataset/) directory to generate parquet dataset. Note this script relies on the [spark-sql-perf](https://github.com/databricks/spark-sql-perf) and the [tpcds-kit](https://github.com/databricks/tpcds-kit) package from Databricks. -In tpcds_datagen_parquet.sh, several parameters should be configured according to the system. +In tpcds-datagen-parquet.sh, several parameters should be configured according to the system. ``` spark_sql_perf_jar=/PATH/TO/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar ...
diff --git a/tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.sh b/tools/workload/tpcds/gen_data/parquet_dataset/tpcds-datagen-parquet.sh similarity index 100% rename from tools/workload/tpcds/gen_data/parquet_dataset/tpcds_datagen_parquet.sh rename to tools/workload/tpcds/gen_data/parquet_dataset/tpcds-datagen-parquet.sh diff --git a/tools/workload/tpcds/run_tpcds/run_tpcds.sh b/tools/workload/tpcds/run_tpcds/run-tpcds.sh similarity index 100% rename from tools/workload/tpcds/run_tpcds/run_tpcds.sh rename to tools/workload/tpcds/run_tpcds/run-tpcds.sh diff --git a/tools/workload/tpch/README.md b/tools/workload/tpch/README.md index 10a8930583dc..5d288229f983 100644 --- a/tools/workload/tpch/README.md +++ b/tools/workload/tpch/README.md @@ -7,7 +7,7 @@ Parquet and DWRF (a fork of the ORC file format) format files are both supported Please refer to the scripts in [parquet_dataset](./gen_data/parquet_dataset/) directory to generate parquet dataset. Note this script relies on the [spark-sql-perf](https://github.com/databricks/spark-sql-perf) and [tpch-dbgen](https://github.com/databricks/tpch-dbgen) package from Databricks. Note in the tpch-dbgen kits, we need to do a slight modification to allow Spark to convert the csv based content to parquet, please make sure to use this commit: [0469309147b42abac8857fa61b4cf69a6d3128a8](https://github.com/databricks/tpch-dbgen/commit/0469309147b42abac8857fa61b4cf69a6d3128a8) -In tpch_datagen_parquet.sh, several parameters should be configured according to the system. +In tpch-datagen-parquet.sh, several parameters should be configured according to the system. ``` spark_sql_perf_jar=/PATH/TO/spark-sql-perf_2.12-0.5.1-SNAPSHOT.jar ...
diff --git a/tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.sh b/tools/workload/tpch/gen_data/parquet_dataset/tpch-datagen-parquet.sh similarity index 100% rename from tools/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.sh rename to tools/workload/tpch/gen_data/parquet_dataset/tpch-datagen-parquet.sh diff --git a/tools/workload/tpch/run_tpch/tpch_parquet.sh b/tools/workload/tpch/run_tpch/tpch-parquet.sh similarity index 100% rename from tools/workload/tpch/run_tpch/tpch_parquet.sh rename to tools/workload/tpch/run_tpch/tpch-parquet.sh