From 54c9d6bc2befec47ddb6d5e2469dc906ac857ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 18 Dec 2025 09:44:41 +0100 Subject: [PATCH 1/6] GH-48582: [CI][GPU][C++][Python] Add new cuda jobs using the new self-hosted runners --- .github/workflows/cpp_extra.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index b56b74c9a29..2dc2dda7cb3 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -106,6 +106,16 @@ jobs: -e ARROW_USE_MESON=ON runs-on: ubuntu-latest title: AMD64 Ubuntu Meson + - envs: + - CUDA=13.0.2 + image: ubuntu-cuda-cpp + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" + title: AMD64 Ubuntu 24 CUDA 13.0.2 + - envs: + - CUDA=11.7.1 + image: ubuntu-cuda-cpp + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu22-gpu-x64" + title: AMD64 Ubuntu 22 CUDA 11.7.1 # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved - continue-on-error: true envs: From c7731395db5bead73545bcfc9bddf2d4eb25404e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 18 Dec 2025 10:37:29 +0100 Subject: [PATCH 2/6] Set Ubuntu for archery via env --- .github/workflows/cpp_extra.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index 2dc2dda7cb3..c7c78dd3a8c 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -108,13 +108,15 @@ jobs: title: AMD64 Ubuntu Meson - envs: - CUDA=13.0.2 + - UBUNTU=24.04 image: ubuntu-cuda-cpp runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 24 CUDA 13.0.2 - envs: - CUDA=11.7.1 + - UBUNTU=22.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu22-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 22 CUDA 11.7.1 # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved - continue-on-error: true From ac768577911bbefe306e7072f749aedc9e9454a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 18 Dec 2025 18:27:46 +0100 Subject: [PATCH 3/6] Try using g6f.large instances --- .github/workflows/cpp_extra.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index c7c78dd3a8c..d2b9ba636cf 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -110,13 +110,13 @@ jobs: - CUDA=13.0.2 - UBUNTU=24.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g6f.large/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 24 CUDA 13.0.2 - envs: - CUDA=11.7.1 - UBUNTU=22.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g6f.large/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 22 CUDA 11.7.1 # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved - continue-on-error: true From 110a23e3c76b4cb786c3d77459d16cad1bd2bc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 18 Dec 2025 18:32:41 +0100 Subject: [PATCH 4/6] Back to g4dn.xlarge to test --- .github/workflows/cpp_extra.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index d2b9ba636cf..c7c78dd3a8c 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -110,13 +110,13 @@ jobs: - CUDA=13.0.2 - UBUNTU=24.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g6f.large/image=ubuntu24-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 24 CUDA 13.0.2 - envs: - CUDA=11.7.1 - UBUNTU=22.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g6f.large/image=ubuntu24-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 22 CUDA 11.7.1 # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved - continue-on-error: true From e7fa98b897d98e4c4144cf468961579dfd5485a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 18 Dec 2025 18:39:04 +0100 Subject: [PATCH 5/6] Try removing runs-on from matrix --- .github/workflows/cpp_extra.yml | 84 ++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index c7c78dd3a8c..b11c93ed0c5 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -106,27 +106,88 @@ jobs: -e ARROW_USE_MESON=ON runs-on: ubuntu-latest title: AMD64 Ubuntu Meson + # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved + - continue-on-error: true + envs: + - DEBIAN=13 + image: debian-cpp + run-options: >- + -e CMAKE_CXX_STANDARD=23 + runs-on: ubuntu-latest + title: AMD64 Debian C++23 + env: + ARCHERY_DEBUG: 1 + ARROW_ENABLE_TIMING_TESTS: OFF + DOCKER_VOLUME_PREFIX: ".docker/" + steps: + - name: Checkout Arrow + uses: actions/checkout@v6 + with: + fetch-depth: 0 + submodules: recursive + - name: Cache Docker Volumes + uses: actions/cache@v5 + with: + path: .docker + key: extra-${{ matrix.image }}-${{ hashFiles('cpp/**') }} + restore-keys: extra-${{ matrix.image }}- + - name: Setup Python + uses: actions/setup-python@v6 + with: + python-version: 3 + - name: Setup Archery + run: python3 -m pip install -e dev/archery[docker] + - name: Execute Docker Build + continue-on-error: ${{ matrix.continue-on-error || false }} + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + ENVS: ${{ toJSON(matrix.envs) }} + run: | + # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes + sudo sysctl -w vm.mmap_rnd_bits=28 + source ci/scripts/util_enable_core_dumps.sh + if [ "${ENVS}" != "null" ]; then + echo "${ENVS}" | jq -r '.[]' | while read env; do + echo "${env}" >> .env + done + fi + archery docker run ${{ matrix.run-options || '' }} ${{ matrix.image }} + - name: Docker Push + if: >- + success() && + github.event_name == 'push' && + github.repository == 'apache/arrow' && + github.ref_name == 'main' + env: + ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} + ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} + continue-on-error: true + run: archery docker push ${{ matrix.image }} + + cuda: + needs: check-labels + name: ${{ matrix.title }} + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" + if: >- + needs.check-labels.outputs.force == 'true' || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') || + contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra: C++') + timeout-minutes: 75 + strategy: + fail-fast: false + matrix: + include: - envs: - CUDA=13.0.2 - UBUNTU=24.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 24 CUDA 13.0.2 - envs: - CUDA=11.7.1 - UBUNTU=22.04 image: ubuntu-cuda-cpp - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" title: AMD64 Ubuntu 22 CUDA 11.7.1 - # TODO: We should remove this "continue-on-error: true" once GH-47207 is resolved - - continue-on-error: true - envs: - - DEBIAN=13 - image: debian-cpp - run-options: >- - -e CMAKE_CXX_STANDARD=23 - runs-on: ubuntu-latest - title: AMD64 Debian C++23 env: ARCHERY_DEBUG: 1 ARROW_ENABLE_TIMING_TESTS: OFF @@ -451,6 +512,7 @@ jobs: report-extra-cpp: if: github.event_name == 'schedule' && always() needs: + - cuda - docker - jni-linux - jni-macos From d6bb045e9efdcd9e4d6cb26148c34b77a53b0069 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 22 Dec 2025 21:08:04 +0100 Subject: [PATCH 6/6] Apply suggestion from @rok --- .github/workflows/cpp_extra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpp_extra.yml b/.github/workflows/cpp_extra.yml index b11c93ed0c5..c399fa33ee4 100644 --- a/.github/workflows/cpp_extra.yml +++ b/.github/workflows/cpp_extra.yml @@ -168,7 +168,7 @@ jobs: cuda: needs: check-labels name: ${{ matrix.title }} - runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64" + runs-on: "runs-on=${{ github.run_id }}/family=g4dn.xlarge/image=ubuntu24-gpu-x64/spot=capacity-optimized" if: >- needs.check-labels.outputs.force == 'true' || contains(fromJSON(needs.check-labels.outputs.ci-extra-labels || '[]'), 'CI: Extra') ||