Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/velox_backend_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,15 @@ jobs:
dnf autoremove -y && dnf clean all
dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1
ls -l /usr/local/
source /opt/rh/gcc-toolset-12/enable

source /opt/rh/gcc-toolset-14/enable
Copy link
Author

@bdice bdice Jan 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do need GCC 14, but we could remove the extra steps above from #11275 that change the CUDA version if you wish. This PR should make it work with the CUDA 12 version that already exists in the container. I know there were quite a few workarounds to reduce the disk space to make room for CUDA 13.1 -- we could revert that too.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you'd like me to help revert those changes and minimize the build scripts, I can do that. Let me know your thoughts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line enables GCC 14, though I don't know why we cannot use /opt/rh/gcc-toolset-14/enable directly. Did you check whether the Dockerfile works? https://github.com/apache/incubator-gluten/blob/main/dev/docker/cudf/Dockerfile — I ran into a curl version issue before.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's OK with me to use CUDA 13.1. I have resolved all the version mismatch issues, and have now hit a new issue with the newest Velox; I will try to fix it:

26/01/13 10:19:08 ERROR Executor: Exception in task 7.0 in stage 40.0 (TID 12116)
org.apache.gluten.exception.GlutenException: Exception: VeloxRuntimeError
Error Source: RUNTIME
Error Code: INVALID_STATE
Reason: (1 vs. 0) Leaf child memory pool cudf-expr-precompile already exists in __sys_root__
Retriable: False
Expression: children_.count(name) == 0
Function: addLeafChild
File: /opt/gluten/ep/build-velox/build/velox_ep/velox/common/memory/MemoryPool.cpp
Line: 331
Stack trace:


export CMAKE_BUILD_PARALLEL_LEVEL=4
export NUM_THREADS=4
export CCACHE_MAXSIZE=1G
export CCACHE_DIR=/work/.ccache
mkdir -p /work/.ccache

cd /work
bash dev/builddeps-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --build_tests=ON --build_benchmarks=ON --enable_gpu=ON # TODO: re-enable tests with more disk space
rm -rf ep/build-velox/build/velox_ep
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/velox_backend_x86.yml
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ jobs:
--extra-conf=spark.gluten.ras.enabled=true \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
--extra-conf=spark.gluten.ras.enabled=true
--extra-conf=spark.gluten.ras.enabled=true

tpc-test-centos7:
needs: build-native-lib-centos-7
Expand Down Expand Up @@ -356,7 +356,7 @@ jobs:
--extra-conf=spark.gluten.ras.enabled=true \
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
--extra-conf=spark.gluten.ras.enabled=true
--extra-conf=spark.gluten.ras.enabled=true
"

tpc-test-ubuntu-oom:
Expand Down Expand Up @@ -399,7 +399,7 @@ jobs:
echo "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" >> $GITHUB_ENV
- name: Build for Spark ${{ matrix.spark }}
run: |
cd $GITHUB_WORKSPACE/
cd $GITHUB_WORKSPACE/
$MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests
cd $GITHUB_WORKSPACE/tools/gluten-it
$GITHUB_WORKSPACE/build/mvn clean install -P${{ matrix.spark }}
Expand Down Expand Up @@ -514,7 +514,7 @@ jobs:
echo "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" >> $GITHUB_ENV
- name: Build for Spark ${{ matrix.spark }}
run: |
cd $GITHUB_WORKSPACE/
cd $GITHUB_WORKSPACE/
$MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests
cd $GITHUB_WORKSPACE/tools/gluten-it
$GITHUB_WORKSPACE/build/mvn clean install -P${{ matrix.spark }}
Expand Down Expand Up @@ -1376,7 +1376,7 @@ jobs:
dnf autoremove -y && dnf clean all
dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1
ls -l /usr/local/
source /opt/rh/gcc-toolset-12/enable
source /opt/rh/gcc-toolset-14/enable

export CMAKE_BUILD_PARALLEL_LEVEL=4
export NUM_THREADS=4
Expand Down
27 changes: 7 additions & 20 deletions cpp/velox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,10 @@ if(BUILD_EXAMPLES)
endif()

if(ENABLE_GPU)
# Find cuDF package which provides cudf::cudf target with all transitive dependencies
set(cudf_DIR "${VELOX_BUILD_PATH}/_deps/cudf-build")
find_package(cudf REQUIRED CONFIG)

import_library(
facebook::velox::velox_cudf_expression
${VELOX_BUILD_PATH}/velox/experimental/cudf/expression/libvelox_cudf_expression.a
Expand All @@ -431,16 +435,6 @@ if(ENABLE_GPU)
facebook::velox::velox_cudf_hive_connector
${VELOX_BUILD_PATH}/velox/experimental/cudf/connectors/hive/libvelox_cudf_hive_connector.a
)
target_include_directories(
velox
PRIVATE ${VELOX_BUILD_PATH}/_deps/cudf-src/cpp/include
${VELOX_BUILD_PATH}/_deps/rmm-src/cpp/include
${VELOX_BUILD_PATH}/_deps/kvikio-src/cpp/include
${VELOX_BUILD_PATH}/_deps/nvtx3-src/c/include
${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/include
${VELOX_BUILD_PATH}/_deps/rapids_logger-src/include
/usr/local/cuda/include/cccl
/usr/local/cuda/include)

target_compile_definitions(
velox PRIVATE LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
Expand All @@ -450,16 +444,9 @@ if(ENABLE_GPU)
PUBLIC facebook::velox::velox_cudf_exec facebook::velox::velox_cudf_vector
facebook::velox::velox_cudf_hive_connector
facebook::velox::velox_cudf_expression)
target_link_libraries(
velox
PRIVATE
${VELOX_BUILD_PATH}/_deps/cudf-build/libcudf.so
${VELOX_BUILD_PATH}/_deps/rmm-build/librmm.so
${VELOX_BUILD_PATH}/_deps/kvikio-build/libkvikio.so
${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp.so
${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp_cpu.so
${VELOX_BUILD_PATH}/_deps/rapids_logger-build/librapids_logger.so
/usr/local/cuda/lib64/libcudart.so)
# Link cuDF which transitively provides all dependencies
# (rmm, kvikio, nvcomp, rapids_logger, CCCL for CUDA headers)
target_link_libraries(velox PRIVATE cudf::cudf)
endif()

add_custom_command(
Expand Down
2 changes: 1 addition & 1 deletion dev/ci-velox-buildstatic-centos-9.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

set -e

source /opt/rh/gcc-toolset-12/enable
source /opt/rh/gcc-toolset-14/enable
if [ "$(uname -m)" = "aarch64" ]; then
export CPU_TARGET="aarch64";
export VCPKG_FORCE_SYSTEM_BINARIES=1;
Expand Down
2 changes: 1 addition & 1 deletion dev/docker/cudf/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ WORKDIR /opt/gluten
RUN rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12; \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you try to create a Docker image from this Dockerfile? I ran into a curl version issue before; please help verify whether this PR resolves it.

Copy link
Contributor

@jinchengchenghh jinchengchenghh Jan 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It cannot run successfully

692.8 -- [CURL] Enabled SSL backends: OpenSSL
692.8 -- Setting DuckDB source to AUTO
692.8 -- [DuckDB] Using SYSTEM DuckDB
692.8 -- Using ccache: /usr/bin/ccache
692.8 -- The CUDA compiler identification is unknown
692.8 -- Configuring incomplete, errors occurred!
692.8 make[1]: Leaving directory '/opt/gluten/ep/build-velox/build/velox_ep'
Dockerfile:31
--------------------
  30 |     WORKDIR /opt/gluten
  31 | >>> RUN rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12; \
  32 | >>>     dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1; \
  33 | >>>     dnf autoremove -y && dnf clean all; \
  34 | >>>     source /opt/rh/gcc-toolset-14/enable; \
  35 | >>>     bash ./dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=ON --spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON && rm -rf /opt/gluten
  36 |     
--------------------
ERROR: failed to solve: process "/bin/sh -c rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 /opt/rh/gcc-toolset-12;     dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1;     dnf autoremove -y && dnf clean all;     source /opt/rh/gcc-toolset-14/enable;     bash ./dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=ON --spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON && rm -rf /opt/gluten" did not complete successfully: exit code: 2

dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1; \
dnf autoremove -y && dnf clean all; \
source /opt/rh/gcc-toolset-12/enable; \
source /opt/rh/gcc-toolset-14/enable; \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it because we should not source gcc14?

CMake Error at CMakeLists.txt:476 (enable_language):
  The CMAKE_CUDA_COMPILER:

    /usr/local/cuda-12.8/bin/nvcc

  is not a full path to an existing compiler tool.

  Tell CMake where to find the compiler by setting either the environment
  variable "CUDACXX" or the CMake cache entry CMAKE_CUDA_COMPILER to the full
  path to the compiler, or to the compiler name if it is in the PATH.

bash ./dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=ON --spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON && rm -rf /opt/gluten

# You can try the data in folder backends-velox/src/test/resources/tpch-data-parquet
Expand Down
11 changes: 10 additions & 1 deletion ep/build-velox/src/build-velox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,16 @@ function compile {
CXX_FLAGS='-Wno-error=stringop-overflow -Wno-error=cpp -Wno-missing-field-initializers \
-Wno-error=uninitialized -Wno-unknown-warning-option'

COMPILE_OPTION="-DCMAKE_CXX_FLAGS=\"$CXX_FLAGS\" -DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=OFF \
# Explicitly set compilers for CMake if CC and CXX are set
COMPILER_OPTS=""
if [ -n "${CC:-}" ]; then
COMPILER_OPTS="$COMPILER_OPTS -DCMAKE_C_COMPILER=$CC"
fi
if [ -n "${CXX:-}" ]; then
COMPILER_OPTS="$COMPILER_OPTS -DCMAKE_CXX_COMPILER=$CXX"
fi

COMPILE_OPTION="$COMPILER_OPTS -DCMAKE_CXX_FLAGS=\"$CXX_FLAGS\" -DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=OFF \
-DVELOX_MONO_LIBRARY=ON -DVELOX_BUILD_RUNNER=OFF -DVELOX_SIMDJSON_SKIPUTF8VALIDATION=ON \
-DVELOX_ENABLE_GEO=ON"
if [ $BUILD_TEST_UTILS == "ON" ]; then
Expand Down
Loading