From 80bad87c08c8992f5797967f68a200872cfd78f2 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 12 Mar 2026 00:26:09 -0500 Subject: [PATCH 1/5] batching, join-distribution=AUTOMATIC --- .../etc_worker/config_native.properties | 17 +++++++++++++++++ presto/scripts/generate_presto_config.sh | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/presto/docker/config/template/etc_worker/config_native.properties b/presto/docker/config/template/etc_worker/config_native.properties index c36788c7..11292ff3 100644 --- a/presto/docker/config/template/etc_worker/config_native.properties +++ b/presto/docker/config/template/etc_worker/config_native.properties @@ -41,3 +41,20 @@ cudf.exchange.server.port=0000 cudf.memory_resource=async async-data-cache-enabled=false + +# Disable JIT because it takes a few iterations to warm up the JIT cache in all workers +cudf.jit_expression_enabled=false +# Turn on to use intra-node exchange optimization. +# NOTE: In cudf exchange 20260212 branch, this is needed for UCX to use nvlink. +cudf.intra_node_exchange=true + +# Use 100M rows per chunk for cudf partitioned output. +# NOTE: This is not yet propagated to the worker properly because only a fixed set of query configs are supported. +# https://github.com/prestodb/presto/blob/a62672886152c8c6b61cf301d246f217d850e357/presto-native-execution/presto_cpp/main/PrestoToVeloxQueryConfig.cpp#L106-L224 +# As a result, this needs to be hardcoded right now. +cudf.partitioned_output_batch_rows=100000000 + +# Enable cudf rebatching before aggregations. 
+cudf.concat_optimization_enabled=true +cudf.batch_size_min_threshold=100000000 + diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index cc20b82a..23d0fbfb 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -53,8 +53,8 @@ function duplicate_worker_configs() { # make cudf.exchange=true if we are running multiple workers sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config} # make join-distribution-type=PARTITIONED if we are running multiple workers - # (ucx exchange does not currently support BROADCAST partition type) - sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} + # ucx exchange supports PARTITIONED and BROADCAST partition types + # sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} fi # Each worker node needs to have it's own http-server port. This isn't used, but From 284169030ad3435a3bd70e460100ce8ff5203dc4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 12 Mar 2026 00:27:30 -0500 Subject: [PATCH 2/5] Enable NUMA binding --- .../docker-compose.native-gpu.yml.jinja | 3 +++ presto/docker/launch_presto_servers.sh | 21 +++++++++++++++++-- presto/docker/native_build.dockerfile | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja index d5ad844d..46887431 100644 --- a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja +++ b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja @@ -44,7 +44,10 @@ x-presto-native-worker-gpu: &gpu_worker_base # - /dev/infiniband/uverbs9 depends_on: - presto-coordinator + privileged: true volumes: + # required for numa + - /sys/devices/system/node:/sys/devices/system/node - 
../../config/generated/gpu/etc_common:/opt/presto-server/etc - ../../config/generated/gpu/etc_worker/node.properties:/opt/presto-server/etc/node.properties - ../../config/generated/gpu/etc_worker/config_native.properties:/opt/presto-server/etc/config.properties diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index 16dfd352..f266659e 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -5,12 +5,29 @@ set -e echo ldconfig if [ $# -eq 0 ]; then - presto_server --etc-dir="/opt/presto-server/etc/" & + gpu_id=${CUDA_VISIBLE_DEVICES:-0} + echo "For GPU $gpu_id" + nvidia-smi topo -C -M -i $gpu_id + numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') + if [[ $numa_id =~ ^[0-9]+$ ]]; then + LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" + else + LAUNCHER="" + fi + $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc/" & else # Launch workers in parallel, each pinned to a different GPU # The GPU IDs are passed as command-line arguments for gpu_id in "$@"; do - CUDA_VISIBLE_DEVICES=$gpu_id presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & + echo "For GPU $gpu_id" + nvidia-smi topo -C -M -i $gpu_id + numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') + if [[ $numa_id =~ ^[0-9]+$ ]]; then + LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" + else + LAUNCHER="" + fi + CUDA_VISIBLE_DEVICES=$gpu_id $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & done fi diff --git a/presto/docker/native_build.dockerfile b/presto/docker/native_build.dockerfile index d4ab8a72..e14c68fe 100644 --- a/presto/docker/native_build.dockerfile +++ b/presto/docker/native_build.dockerfile @@ -3,7 +3,7 @@ FROM presto/prestissimo-dependency:centos9 RUN rpm --import 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \ dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/" && \ - dnf install -y nsight-systems-cli-2025.5.1 + dnf install -y nsight-systems-cli-2025.5.1 numactl ARG GPU=ON ARG BUILD_TYPE=release From f2be0cb590e7a6d00e5b2bd38abed1c3daa69e92 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 13 Mar 2026 04:14:47 +0000 Subject: [PATCH 3/5] fix build service for single container --- presto/scripts/start_presto_helper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/scripts/start_presto_helper.sh b/presto/scripts/start_presto_helper.sh index 089fcceb..280eda47 100755 --- a/presto/scripts/start_presto_helper.sh +++ b/presto/scripts/start_presto_helper.sh @@ -132,7 +132,7 @@ elif [[ "$VARIANT_TYPE" == "gpu" ]]; then if [[ -n $GPU_IDS ]]; then FIRST_GPU_ID=$(echo $GPU_IDS | cut -d',' -f1) fi - if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 ]]; then + if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 && "$SINGLE_CONTAINER" == "false" ]]; then GPU_WORKER_SERVICE="presto-native-worker-gpu-${FIRST_GPU_ID}" fi conditionally_add_build_target $GPU_WORKER_IMAGE $GPU_WORKER_SERVICE "worker|w" From 744f4ed101fbdeedf668eabdf1cb9d5c52854a08 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 13 Mar 2026 04:18:42 +0000 Subject: [PATCH 4/5] update numa memory bind --- presto/docker/launch_presto_servers.sh | 63 +++++++++++++++----------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index f266659e..5b1213e6 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -1,35 +1,44 @@ #!/bin/bash - set -e -# Run ldconfig once -echo ldconfig -if [ $# -eq 0 ]; then - 
gpu_id=${CUDA_VISIBLE_DEVICES:-0} - echo "For GPU $gpu_id" - nvidia-smi topo -C -M -i $gpu_id - numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') - if [[ $numa_id =~ ^[0-9]+$ ]]; then - LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" - else - LAUNCHER="" +ETC_BASE="/opt/presto-server/etc" + +# Resolve the NUMA node closest to a GPU and launch presto_server pinned to it. +# $1 — GPU ID +# $2 — etc-dir path for this instance +launch_worker() { + local gpu_id=$1 etc_dir=$2 + echo "Launching on GPU $gpu_id (config: $etc_dir)" + + local topo + topo=$(nvidia-smi topo -C -M -i "$gpu_id") + echo "$topo" + + local cpu_numa mem_numa + cpu_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest CPU/{ gsub(/ /,"",$2); print $2 }') + mem_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest memory/{ gsub(/ /,"",$2); print $2 }') + + local launcher=() + if [[ $cpu_numa =~ ^[0-9]+$ ]]; then + launcher=(numactl --cpunodebind="$cpu_numa") + if [[ $mem_numa =~ ^[0-9]+$ ]]; then + launcher+=(--membind="$mem_numa") + else + launcher+=(--membind="$cpu_numa") + fi fi - $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc/" & + + CUDA_VISIBLE_DEVICES="$gpu_id" "${launcher[@]}" presto_server --etc-dir="$etc_dir" & +} + +# No args → single worker using CUDA_VISIBLE_DEVICES (default 0), shared config dir. +# With args → one worker per GPU ID, each with its own config dir (etc). 
+if [ $# -eq 0 ]; then + launch_worker "${CUDA_VISIBLE_DEVICES:-0}" "${ETC_BASE}/" else -# Launch workers in parallel, each pinned to a different GPU -# The GPU IDs are passed as command-line arguments -for gpu_id in "$@"; do - echo "For GPU $gpu_id" - nvidia-smi topo -C -M -i $gpu_id - numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') - if [[ $numa_id =~ ^[0-9]+$ ]]; then - LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" - else - LAUNCHER="" - fi - CUDA_VISIBLE_DEVICES=$gpu_id $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & -done + for gpu_id in "$@"; do + launch_worker "$gpu_id" "${ETC_BASE}${gpu_id}" + done fi -# Wait for all background processes wait From 6ce40e4a0b9a39e03070c4e67de85879cd02496a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 13 Mar 2026 10:08:54 -0500 Subject: [PATCH 5/5] remove join-distribution-type editing in multi-workers config. --- presto/scripts/generate_presto_config.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index 23d0fbfb..ccd741f8 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -53,8 +53,6 @@ function duplicate_worker_configs() { # make cudf.exchange=true if we are running multiple workers sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config} # make join-distribution-type=PARTITIONED if we are running multiple workers - # ucx exchange supports PARTITIONED and BROADCAST partition types - # sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} fi # Each worker node needs to have it's own http-server port. This isn't used, but