From 0be69c847c80e2859f055c86e0f2ecf2d6e68c58 Mon Sep 17 00:00:00 2001 From: Yuan Date: Wed, 10 Dec 2025 13:20:09 +0000 Subject: [PATCH 01/15] fix gpu build by bumping to cuda-13.1 Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 9 ++++++--- cpp/velox/CMakeLists.txt | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 5618aba857cb..78cb24062351 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1329,11 +1329,14 @@ jobs: rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later dnf autoremove -y df -a + dnf remove cuda-toolkit* && dnf install -y cuda-toolkit-13-1 source /opt/rh/gcc-toolset-12/enable + + export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 - # bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space - # rm -rf ep/build-velox/build/velox_ep - # mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space + rm -rf ep/build-velox/build/velox_ep + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests ccache -s spark-test-spark40: diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index e389113c4e04..20cf366d2ea7 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -439,6 +439,7 @@ if(ENABLE_GPU) ${VELOX_BUILD_PATH}/_deps/nvtx3-src/c/include ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/include ${VELOX_BUILD_PATH}/_deps/rapids_logger-src/include + /usr/local/cuda/include/cccl /usr/local/cuda/include) target_compile_definitions( From 948e9878f5872d025ab3448241a899a728fe14af Mon Sep 17 00:00:00 2001 From: Yuan Date: Thu, 11 Dec 2025 04:31:59 +0000 Subject: [PATCH 02/15] fix Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 78cb24062351..86bd8cb7f33d 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1327,13 +1327,14 @@ jobs: - name: Build Gluten native libraries run: | rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later - dnf autoremove -y df -a - dnf remove cuda-toolkit* && dnf install -y cuda-toolkit-13-1 + dnf remove cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 + dnf autoremove -y && dnf clean all source /opt/rh/gcc-toolset-12/enable export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 + bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests From 648e2f25c992249c908c585217c719c05b8650f0 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 02:50:43 +0000 Subject: [PATCH 03/15] fix Signed-off-by: Yuan --- cpp/velox/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index 20cf366d2ea7..c003a322bf9b 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -439,7 +439,9 @@ if(ENABLE_GPU) ${VELOX_BUILD_PATH}/_deps/nvtx3-src/c/include ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/include ${VELOX_BUILD_PATH}/_deps/rapids_logger-src/include - /usr/local/cuda/include/cccl + ${VELOX_DEPS_PATH}/cccl-src/thrust + ${VELOX_DEPS_PATH}/cccl-src/cub + ${VELOX_DEPS_PATH}/cccl-src/libcudacxx/include /usr/local/cuda/include) target_compile_definitions( From f851a891503a2b9a4dccb542b21c01c5a481f911 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 05:24:21 +0000 Subject: [PATCH 04/15] fix Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 86bd8cb7f33d..346117c06b3f 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1328,7 +1328,6 @@ jobs: run: | rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later df -a - dnf remove cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 dnf autoremove -y && dnf clean all source /opt/rh/gcc-toolset-12/enable From 384f7c15aa98eb39de88a74bd22087a843d8e279 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 07:55:27 +0000 Subject: [PATCH 05/15] fix Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 346117c06b3f..460622ffed3c 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1326,7 +1326,7 @@ jobs: ccache-centos9-release-shared-${{runner.arch}} - name: Build Gluten native libraries run: | - rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later + rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later df -a dnf autoremove -y && dnf clean all source /opt/rh/gcc-toolset-12/enable From e2a759d0f63da73f647b76b3df37cb4d84efb347 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 07:55:31 +0000 Subject: [PATCH 06/15] Revert "fix" This reverts commit 648e2f25c992249c908c585217c719c05b8650f0. --- cpp/velox/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index c003a322bf9b..20cf366d2ea7 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -439,9 +439,7 @@ if(ENABLE_GPU) ${VELOX_BUILD_PATH}/_deps/nvtx3-src/c/include ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/include ${VELOX_BUILD_PATH}/_deps/rapids_logger-src/include - ${VELOX_DEPS_PATH}/cccl-src/thrust - ${VELOX_DEPS_PATH}/cccl-src/cub - ${VELOX_DEPS_PATH}/cccl-src/libcudacxx/include + /usr/local/cuda/include/cccl /usr/local/cuda/include) target_compile_definitions( From 4a9dd17f74c95aba3a55057275b3bd98a3fa2384 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 08:11:03 +0000 Subject: [PATCH 07/15] fix cache Signed-off-by: Yuan --- .github/workflows/velox_backend_cache.yml | 75 +---------------------- .github/workflows/velox_backend_x86.yml | 4 +- 2 files changed, 5 insertions(+), 74 deletions(-) diff --git a/.github/workflows/velox_backend_cache.yml b/.github/workflows/velox_backend_cache.yml index eb060c3a90c1..51021ef4404f 100644 --- a/.github/workflows/velox_backend_cache.yml +++ b/.github/workflows/velox_backend_cache.yml @@ -160,8 +160,9 @@ jobs: export CCACHE_MAXSIZE=1G dnf autoremove -y df -a - rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later + rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later source /opt/rh/gcc-toolset-12/enable + export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep @@ -172,75 +173,3 @@ jobs: with: path: '${{ env.CCACHE_DIR }}' key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}} - - # ccache-native-lib-ubuntu-velox-ut: - # runs-on: ubuntu-22.04 - # env: - # CCACHE_DIR: "${{ github.workspace }}/.ccache" - # container: ghcr.io/facebookincubator/velox-dev:amd64-ubuntu-22.04-avx - # steps: - # - uses: actions/checkout@v2 - # - name: Get Ccache - # uses: actions/cache/restore@v3 - # with: - # path: '${{ env.CCACHE_DIR }}' - # key: ccache-ubuntu-release-default - # - name: Ensure Cache Dirs Exists - # working-directory: ${{ github.workspace }} - # run: | - # mkdir -p '${{ env.CCACHE_DIR }}' - # - name: Build Gluten native libraries - # run: | - # rm -rf /opt/miniconda-for-velox/ - # cd ep/build-velox/src && \ - # ./get-velox.sh - # cd ../build/velox_ep/ - # make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON" - - # - name: CCache after - # run: | - # ccache -vs - - # - uses: actions/cache/save@v3 - # with: - # path: '${{ env.CCACHE_DIR }}' - # key: ccache-ubuntu-release-default -# ccache-native-lib-centos-velox-ut: -# runs-on: ubuntu-22.04 -# env: -# CCACHE_DIR: "${{ github.workspace }}/.ccache" -# container: ghcr.io/facebookincubator/velox-dev:centos8 -# steps: -# - uses: actions/checkout@v2 -# - name: Setup java and maven -# run: | -# yum install sudo patch java-1.8.0-openjdk-devel wget -y && \ -# wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz -# tar -xvf apache-maven-3.8.8-bin.tar.gz -# mv apache-maven-3.8.8 /usr/lib/maven -# - name: Get Ccache -# uses: actions/cache/restore@v3 -# with: -# path: '${{ env.CCACHE_DIR }}' -# key: ccache-centos-release-default -# - name: Ensure Cache Dirs Exists -# working-directory: ${{ github.workspace }} -# run: | -# mkdir -p '${{ env.CCACHE_DIR }}' -# - name: Build Gluten native libraries -# run: | -# rm -rf /opt/miniconda-for-velox/ -# cd ep/build-velox/src && \ -# ./get-velox.sh -# cd ../build/velox_ep/ -# source /opt/rh/gcc-toolset-9/enable -# make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON" -# -# - name: CCache after -# run: | -# ccache -s -# -# - uses: actions/cache/save@v3 -# with: -# path: '${{ env.CCACHE_DIR }}' -# key: ccache-centos-release-default diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 460622ffed3c..6c8c66d459bb 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1329,12 +1329,14 @@ jobs: rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later df -a dnf autoremove -y && dnf clean all + dnf remove cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 + ls -l /usr/local/ source /opt/rh/gcc-toolset-12/enable export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 - bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space + bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests ccache -s From 731609b4fe8a5ee88227f06302aa4990d1396857 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 09:32:23 +0000 Subject: [PATCH 08/15] fix Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 2 +- ep/build-velox/src/build-velox.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 6c8c66d459bb..26318e53e6e1 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1329,7 +1329,7 @@ jobs: rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later df -a dnf autoremove -y && dnf clean all - dnf remove cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 + dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 ls -l /usr/local/ source /opt/rh/gcc-toolset-12/enable diff --git a/ep/build-velox/src/build-velox.sh b/ep/build-velox/src/build-velox.sh index 3e0f6be4fb25..c202eac119cd 100755 --- a/ep/build-velox/src/build-velox.sh +++ b/ep/build-velox/src/build-velox.sh @@ -135,7 +135,7 @@ function compile { # the cuda default options are for Centos9 image from Meta echo "enable GPU support." COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=70 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.8/bin/nvcc" + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc" fi if [ -n "${GLUTEN_VCPKG_ENABLED:-}" ]; then COMPILE_OPTION="$COMPILE_OPTION -DVELOX_GFLAGS_TYPE=static" From 85a206406357598054397ce9ff3a88c39d3c296b Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 10:39:27 +0000 Subject: [PATCH 09/15] test Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 26318e53e6e1..d33a52376f7c 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1314,7 +1314,7 @@ jobs: build-cudf-centos-9: runs-on: ubuntu-22.04 - container: apache/gluten:centos-9-jdk8-cudf + container: inteldpo/gluten-ci-images:centos-9-jdk8-cudf steps: - uses: actions/checkout@v2 - name: Get Ccache From 35addab74acf4d3d8a95a95193090b2749e1130b Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 11:37:35 +0000 Subject: [PATCH 10/15] Revert "test" This reverts commit 85a206406357598054397ce9ff3a88c39d3c296b. --- .github/workflows/velox_backend_x86.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index d33a52376f7c..26318e53e6e1 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1314,7 +1314,7 @@ jobs: build-cudf-centos-9: runs-on: ubuntu-22.04 - container: inteldpo/gluten-ci-images:centos-9-jdk8-cudf + container: apache/gluten:centos-9-jdk8-cudf steps: - uses: actions/checkout@v2 - name: Get Ccache From d60e779b8b6029b55a58ce900bd0500f26d9b99a Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 11:57:10 +0000 Subject: [PATCH 11/15] fix cuda path Signed-off-by: Yuan --- ep/build-velox/src/build-velox.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ep/build-velox/src/build-velox.sh b/ep/build-velox/src/build-velox.sh index c202eac119cd..2028a20fcff1 100755 --- a/ep/build-velox/src/build-velox.sh +++ b/ep/build-velox/src/build-velox.sh @@ -135,7 +135,7 @@ function compile { # the cuda default options are for Centos9 image from Meta echo "enable GPU support." COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=70 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc" + -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc" fi if [ -n "${GLUTEN_VCPKG_ENABLED:-}" ]; then COMPILE_OPTION="$COMPILE_OPTION -DVELOX_GFLAGS_TYPE=static" From faeb163d0f5a9e64ef6fade9814a5d53c66bdb63 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 12:54:48 +0000 Subject: [PATCH 12/15] fix Signed-off-by: Yuan --- .github/workflows/velox_backend_x86.yml | 17 ++++++++++++++--- cpp/velox/CMakeLists.txt | 2 +- ep/build-velox/src/build-velox.sh | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index 26318e53e6e1..c5092b94220e 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1314,9 +1314,15 @@ jobs: build-cudf-centos-9: runs-on: ubuntu-22.04 - container: apache/gluten:centos-9-jdk8-cudf steps: - - uses: actions/checkout@v2 + - name: "node-cleanup" # by default the free runner does not have enough disk space + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL + sudo docker image prune --all --force + sudo docker builder prune -a + - run: df -h | sort -k 5 -nr # check disk space for debug + + - uses: actions/checkout@v4 - name: Get Ccache uses: actions/cache/restore@v4 with: @@ -1326,6 +1332,7 @@ jobs: ccache-centos9-release-shared-${{runner.arch}} - name: Build Gluten native libraries run: | + docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:centos-9-jdk8-cudf bash -c " rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later df -a dnf autoremove -y && dnf clean all @@ -1335,11 +1342,15 @@ jobs: export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 - + export CCACHE_DIR=/work/.ccache + mkdir -p /work/.ccache + + cd /work bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests ccache -s + " spark-test-spark40: needs: build-native-lib-centos-7 diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index 20cf366d2ea7..a1ce8b347268 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -459,7 +459,7 @@ if(ENABLE_GPU) ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp.so ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp_cpu.so ${VELOX_BUILD_PATH}/_deps/rapids_logger-build/librapids_logger.so - /usr/local/cuda-12.8/lib64/libcudart.so.12) + /usr/local/cuda-13.1/lib64/libcudart.so.12) endif() add_custom_command( diff --git a/ep/build-velox/src/build-velox.sh b/ep/build-velox/src/build-velox.sh index 2028a20fcff1..e58b12b98f96 100755 --- a/ep/build-velox/src/build-velox.sh +++ b/ep/build-velox/src/build-velox.sh @@ -134,7 +134,7 @@ function compile { if [ $ENABLE_GPU == "ON" ]; then # the cuda default options are for Centos9 image from Meta echo "enable GPU support." - COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=70 \ + COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=75 \ -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc" fi if [ -n "${GLUTEN_VCPKG_ENABLED:-}" ]; then From 5b91e79f7208838a1358ce34b14ced5dec80d436 Mon Sep 17 00:00:00 2001 From: Yuan Date: Mon, 15 Dec 2025 16:28:38 +0000 Subject: [PATCH 13/15] fix cuda version in path Signed-off-by: Yuan --- cpp/velox/CMakeLists.txt | 2 +- ep/build-velox/src/build-velox.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index a1ce8b347268..9547cde0e254 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -459,7 +459,7 @@ if(ENABLE_GPU) ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp.so ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp_cpu.so ${VELOX_BUILD_PATH}/_deps/rapids_logger-build/librapids_logger.so - /usr/local/cuda-13.1/lib64/libcudart.so.12) + /usr/local/cuda/lib64/libcudart.so) endif() add_custom_command( diff --git a/ep/build-velox/src/build-velox.sh b/ep/build-velox/src/build-velox.sh index e58b12b98f96..ac62f8fc27f3 100755 --- a/ep/build-velox/src/build-velox.sh +++ b/ep/build-velox/src/build-velox.sh @@ -135,7 +135,7 @@ function compile { # the cuda default options are for Centos9 image from Meta echo "enable GPU support." COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON -DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=75 \ - -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc" + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc" fi if [ -n "${GLUTEN_VCPKG_ENABLED:-}" ]; then COMPILE_OPTION="$COMPILE_OPTION -DVELOX_GFLAGS_TYPE=static" From cd21863fe264c84d2c2a1de44616774747a645ec Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 16 Dec 2025 07:24:33 +0000 Subject: [PATCH 14/15] fix cache Signed-off-by: Yuan --- .github/workflows/velox_backend_cache.yml | 27 ++++++++++++++++++----- .github/workflows/velox_backend_x86.yml | 4 ++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/.github/workflows/velox_backend_cache.yml b/.github/workflows/velox_backend_cache.yml index 51021ef4404f..8d0a3da0c91e 100644 --- a/.github/workflows/velox_backend_cache.yml +++ b/.github/workflows/velox_backend_cache.yml @@ -145,9 +145,13 @@ jobs: strategy: matrix: os: [ ubuntu-22.04 ] - container: apache/gluten:centos-9-jdk8-cudf steps: - - uses: actions/checkout@v2 + - name: "node-cleanup" # by default the free runner does not have enough disk space + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL + sudo docker image prune --all --force + sudo docker builder prune -a + - uses: actions/checkout@v4 - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -157,15 +161,26 @@ jobs: ccache-centos9-release-shared-${{runner.arch}} - name: Build Gluten shared libraries run: | - export CCACHE_MAXSIZE=1G - dnf autoremove -y - df -a + docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:centos-9-jdk8-cudf bash -c " rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build script later + df -a + dnf autoremove -y && dnf clean all + dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1 + ls -l /usr/local/ source /opt/rh/gcc-toolset-12/enable + export CMAKE_BUILD_PARALLEL_LEVEL=4 export NUM_THREADS=4 - bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space + export CCACHE_MAXSIZE=1G + export CCACHE_DIR=/work/.ccache + mkdir -p /work/.ccache + + cd /work + bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=ON --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + ccache -s + " - name: Save Ccache if: always() uses: actions/cache/save@v3 diff --git a/.github/workflows/velox_backend_x86.yml b/.github/workflows/velox_backend_x86.yml index c5092b94220e..1bcf1f97efe8 100644 --- a/.github/workflows/velox_backend_x86.yml +++ b/.github/workflows/velox_backend_x86.yml @@ -1344,9 +1344,9 @@ jobs: export NUM_THREADS=4 export CCACHE_DIR=/work/.ccache mkdir -p /work/.ccache - + cd /work - bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF --enable_gpu=ON # TODO: re-enable tests with more disk space + bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=ON --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space rm -rf ep/build-velox/build/velox_ep mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests ccache -s From 612d6a997d8a891d1be9251d89370e4b1271557c Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 16 Dec 2025 07:27:58 +0000 Subject: [PATCH 15/15] fix gpu docker image Signed-off-by: Yuan --- dev/docker/cudf/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dev/docker/cudf/Dockerfile b/dev/docker/cudf/Dockerfile index 42258a69c94b..c900c632ea1f 100644 --- a/dev/docker/cudf/Dockerfile +++ b/dev/docker/cudf/Dockerfile @@ -28,7 +28,10 @@ ENV CUDA_ARCHITECTURES=70 WORKDIR /opt/gluten -RUN rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12; \ +RUN rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12; \ + dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1; \ + dnf autoremove -y && dnf clean all; \ + source /opt/rh/gcc-toolset-12/enable; \ bash ./dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=ON --spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON && rm -rf /opt/gluten # You can try the data in folder backends-velox/src/test/resources/tpch-data-parquet