Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,20 @@ cudf.exchange.server.port=0000
cudf.memory_resource=async

async-data-cache-enabled=false

# Disable JIT because it takes a few iterations to warm up the JIT cache in all workers
cudf.jit_expression_enabled=false
# Turn on to use intra-node exchange optimization.
# NOTE: In cudf exchange 20260212 branch, this is needed for UCX to use nvlink.
cudf.intra_node_exchange=true

# Use 100M rows per chunk for cudf partitioned output.
# NOTE: This is not yet propagated to the worker properly because only a fixed set of query configs are supported.
# https://github.com/prestodb/presto/blob/a62672886152c8c6b61cf301d246f217d850e357/presto-native-execution/presto_cpp/main/PrestoToVeloxQueryConfig.cpp#L106-L224
# As a result, this needs to be hardcoded right now.
cudf.partitioned_output_batch_rows=100000000

# Enable cudf rebatching before aggregations.
cudf.concat_optimization_enabled=true
cudf.batch_size_min_threshold=100000000

Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@ x-presto-native-worker-gpu: &gpu_worker_base
# - /dev/infiniband/uverbs9
depends_on:
- presto-coordinator
privileged: true
volumes:
# Required for NUMA topology discovery inside the container
- /sys/devices/system/node:/sys/devices/system/node
- ../../config/generated/gpu/etc_common:/opt/presto-server/etc
- ../../config/generated/gpu/etc_worker/node.properties:/opt/presto-server/etc/node.properties
- ../../config/generated/gpu/etc_worker/config_native.properties:/opt/presto-server/etc/config.properties
Expand Down
46 changes: 36 additions & 10 deletions presto/docker/launch_presto_servers.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,44 @@
#!/bin/bash

# Launch one or more presto_server workers, optionally pinned to the NUMA
# node closest to their GPU. With no arguments a single worker is started
# using CUDA_VISIBLE_DEVICES (default 0); otherwise one worker is started
# per GPU ID passed on the command line.

# Fail fast: abort on command failure, unset variables, and pipeline errors.
set -euo pipefail

# Run ldconfig once
# NOTE(review): this only prints the word "ldconfig"; if the intent is to
# refresh the dynamic linker cache, the actual 'ldconfig' invocation appears
# to be missing — confirm.
echo ldconfig

# Base directory for per-instance presto configuration (etc, etc0, etc1, ...).
readonly ETC_BASE="/opt/presto-server/etc"

# Resolve the NUMA node closest to a GPU and launch presto_server pinned to it.
# $1 — GPU ID
# $2 — etc-dir path for this instance
launch_worker() {
  local gpu_id=$1 etc_dir=$2
  echo "Launching on GPU $gpu_id (config: $etc_dir)"

  # Ask the driver for this GPU's CPU/memory affinity report.
  local topology
  topology=$(nvidia-smi topo -C -M -i "$gpu_id")
  echo "$topology"

  # Extract the closest CPU and memory NUMA node IDs from the report.
  local cpu_node mem_node
  cpu_node=$(awk -F: '/NUMA IDs of closest CPU/{ gsub(/ /,"",$2); print $2 }' <<<"$topology")
  mem_node=$(awk -F: '/NUMA IDs of closest memory/{ gsub(/ /,"",$2); print $2 }' <<<"$topology")

  # Build a numactl pinning prefix only when a single numeric CPU node was
  # reported; memory binding falls back to the CPU node when no numeric
  # memory node is available.
  local pin_prefix=()
  if [[ $cpu_node =~ ^[0-9]+$ ]]; then
    [[ $mem_node =~ ^[0-9]+$ ]] || mem_node=$cpu_node
    pin_prefix=(numactl --cpunodebind="$cpu_node" --membind="$mem_node")
  fi

  # Background the worker so multiple GPUs can be launched in parallel.
  CUDA_VISIBLE_DEVICES="$gpu_id" "${pin_prefix[@]}" presto_server --etc-dir="$etc_dir" &
}

# No args → single worker using CUDA_VISIBLE_DEVICES (default 0), shared config dir.
# With args → one worker per GPU ID, each with its own config dir (etc<gpu_id>).
# NOTE: the diff artifact previously left the pre-refactor launch lines in
# place alongside the launch_worker calls, which would have started every
# worker twice; only the launch_worker path is kept.
if [ $# -eq 0 ]; then
  launch_worker "${CUDA_VISIBLE_DEVICES:-0}" "${ETC_BASE}/"
else
  # Launch workers in parallel, each pinned to a different GPU.
  # The GPU IDs are passed as command-line arguments.
  for gpu_id in "$@"; do
    launch_worker "$gpu_id" "${ETC_BASE}${gpu_id}"
  done
fi

# Wait for all background worker processes before exiting.
wait
2 changes: 1 addition & 1 deletion presto/docker/native_build.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM presto/prestissimo-dependency:centos9

RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/" && \
dnf install -y nsight-systems-cli-2025.5.1
dnf install -y nsight-systems-cli-2025.5.1 numactl

ARG GPU=ON
ARG BUILD_TYPE=release
Expand Down
2 changes: 0 additions & 2 deletions presto/scripts/generate_presto_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ function duplicate_worker_configs() {
# make cudf.exchange=true if we are running multiple workers
sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config}
# make join-distribution-type=PARTITIONED if we are running multiple workers
# (ucx exchange does not currently support BROADCAST partition type)
sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config}
fi

# Each worker node needs to have its own http-server port. This isn't used, but
Expand Down
2 changes: 1 addition & 1 deletion presto/scripts/start_presto_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ elif [[ "$VARIANT_TYPE" == "gpu" ]]; then
if [[ -n $GPU_IDS ]]; then
FIRST_GPU_ID=$(echo $GPU_IDS | cut -d',' -f1)
fi
if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 ]]; then
if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 && "$SINGLE_CONTAINER" == "false" ]]; then
GPU_WORKER_SERVICE="presto-native-worker-gpu-${FIRST_GPU_ID}"
fi
conditionally_add_build_target $GPU_WORKER_IMAGE $GPU_WORKER_SERVICE "worker|w"
Expand Down
Loading