From 80bad87c08c8992f5797967f68a200872cfd78f2 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 12 Mar 2026 00:26:09 -0500 Subject: [PATCH 1/5] batching, join-distribution=AUTOMATIC --- .../etc_worker/config_native.properties | 17 +++++++++++++++++ presto/scripts/generate_presto_config.sh | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/presto/docker/config/template/etc_worker/config_native.properties b/presto/docker/config/template/etc_worker/config_native.properties index c36788c7..11292ff3 100644 --- a/presto/docker/config/template/etc_worker/config_native.properties +++ b/presto/docker/config/template/etc_worker/config_native.properties @@ -41,3 +41,20 @@ cudf.exchange.server.port=0000 cudf.memory_resource=async async-data-cache-enabled=false + +# Disable JIT because it takes a few iterations to warm up the JIT cache in all workers +cudf.jit_expression_enabled=false +# Turn on to use intra-node exchange optimization. +# NOTE: In cudf exchange 20260212 branch, this is needed for UCX to use nvlink. +cudf.intra_node_exchange=true + +# Use 100M rows per chunk for cudf partitioned output. +# NOTE: This is not yet propagated to the worker properly because only a fixed set of query configs are supported. +# https://github.com/prestodb/presto/blob/a62672886152c8c6b61cf301d246f217d850e357/presto-native-execution/presto_cpp/main/PrestoToVeloxQueryConfig.cpp#L106-L224 +# As a result, this needs to be hardcoded right now. +cudf.partitioned_output_batch_rows=100000000 + +# Enable cudf rebatching before aggregations. 
+cudf.concat_optimization_enabled=true +cudf.batch_size_min_threshold=100000000 + diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index cc20b82a..23d0fbfb 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -53,8 +53,8 @@ function duplicate_worker_configs() { # make cudf.exchange=true if we are running multiple workers sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config} # make join-distribution-type=PARTITIONED if we are running multiple workers - # (ucx exchange does not currently support BROADCAST partition type) - sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} + # ucx exchange supports PARTITIONED and BROADCAST partition types + # sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} fi # Each worker node needs to have it's own http-server port. This isn't used, but From 284169030ad3435a3bd70e460100ce8ff5203dc4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 12 Mar 2026 00:27:30 -0500 Subject: [PATCH 2/5] Enable NUMA binding --- .../docker-compose.native-gpu.yml.jinja | 3 +++ presto/docker/launch_presto_servers.sh | 21 +++++++++++++++++-- presto/docker/native_build.dockerfile | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja index d5ad844d..46887431 100644 --- a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja +++ b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja @@ -44,7 +44,10 @@ x-presto-native-worker-gpu: &gpu_worker_base # - /dev/infiniband/uverbs9 depends_on: - presto-coordinator + privileged: true volumes: + # required for numa + - /sys/devices/system/node:/sys/devices/system/node - 
../../config/generated/gpu/etc_common:/opt/presto-server/etc - ../../config/generated/gpu/etc_worker/node.properties:/opt/presto-server/etc/node.properties - ../../config/generated/gpu/etc_worker/config_native.properties:/opt/presto-server/etc/config.properties diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index 16dfd352..f266659e 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -5,12 +5,29 @@ set -e echo ldconfig if [ $# -eq 0 ]; then - presto_server --etc-dir="/opt/presto-server/etc/" & + gpu_id=${CUDA_VISIBLE_DEVICES:-0} + echo "For GPU $gpu_id" + nvidia-smi topo -C -M -i $gpu_id + numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') + if [[ $numa_id =~ ^[0-9]+$ ]]; then + LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" + else + LAUNCHER="" + fi + $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc/" & else # Launch workers in parallel, each pinned to a different GPU # The GPU IDs are passed as command-line arguments for gpu_id in "$@"; do - CUDA_VISIBLE_DEVICES=$gpu_id presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & + echo "For GPU $gpu_id" + nvidia-smi topo -C -M -i $gpu_id + numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') + if [[ $numa_id =~ ^[0-9]+$ ]]; then + LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" + else + LAUNCHER="" + fi + CUDA_VISIBLE_DEVICES=$gpu_id $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & done fi diff --git a/presto/docker/native_build.dockerfile b/presto/docker/native_build.dockerfile index d4ab8a72..e14c68fe 100644 --- a/presto/docker/native_build.dockerfile +++ b/presto/docker/native_build.dockerfile @@ -3,7 +3,7 @@ FROM presto/prestissimo-dependency:centos9 RUN rpm --import 
https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \ dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/" && \ - dnf install -y nsight-systems-cli-2025.5.1 + dnf install -y nsight-systems-cli-2025.5.1 numactl ARG GPU=ON ARG BUILD_TYPE=release From f2be0cb590e7a6d00e5b2bd38abed1c3daa69e92 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 13 Mar 2026 04:14:47 +0000 Subject: [PATCH 3/5] fix build service for single container --- presto/scripts/start_presto_helper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presto/scripts/start_presto_helper.sh b/presto/scripts/start_presto_helper.sh index 089fcceb..280eda47 100755 --- a/presto/scripts/start_presto_helper.sh +++ b/presto/scripts/start_presto_helper.sh @@ -132,7 +132,7 @@ elif [[ "$VARIANT_TYPE" == "gpu" ]]; then if [[ -n $GPU_IDS ]]; then FIRST_GPU_ID=$(echo $GPU_IDS | cut -d',' -f1) fi - if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 ]]; then + if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 && "$SINGLE_CONTAINER" == "false" ]]; then GPU_WORKER_SERVICE="presto-native-worker-gpu-${FIRST_GPU_ID}" fi conditionally_add_build_target $GPU_WORKER_IMAGE $GPU_WORKER_SERVICE "worker|w" From 744f4ed101fbdeedf668eabdf1cb9d5c52854a08 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 13 Mar 2026 04:18:42 +0000 Subject: [PATCH 4/5] update numa memory bind --- presto/docker/launch_presto_servers.sh | 63 +++++++++++++++----------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index f266659e..5b1213e6 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -1,35 +1,44 @@ #!/bin/bash - set -e -# Run ldconfig once -echo ldconfig -if [ $# -eq 0 ]; then - 
gpu_id=${CUDA_VISIBLE_DEVICES:-0} - echo "For GPU $gpu_id" - nvidia-smi topo -C -M -i $gpu_id - numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') - if [[ $numa_id =~ ^[0-9]+$ ]]; then - LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" - else - LAUNCHER="" +ETC_BASE="/opt/presto-server/etc" + +# Resolve the NUMA node closest to a GPU and launch presto_server pinned to it. +# $1 — GPU ID +# $2 — etc-dir path for this instance +launch_worker() { + local gpu_id=$1 etc_dir=$2 + echo "Launching on GPU $gpu_id (config: $etc_dir)" + + local topo + topo=$(nvidia-smi topo -C -M -i "$gpu_id") + echo "$topo" + + local cpu_numa mem_numa + cpu_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest CPU/{ gsub(/ /,"",$2); print $2 }') + mem_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest memory/{ gsub(/ /,"",$2); print $2 }') + + local launcher=() + if [[ $cpu_numa =~ ^[0-9]+$ ]]; then + launcher=(numactl --cpunodebind="$cpu_numa") + if [[ $mem_numa =~ ^[0-9]+$ ]]; then + launcher+=(--membind="$mem_numa") + else + launcher+=(--membind="$cpu_numa") + fi fi - $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc/" & + + CUDA_VISIBLE_DEVICES="$gpu_id" "${launcher[@]}" presto_server --etc-dir="$etc_dir" & +} + +# No args → single worker using CUDA_VISIBLE_DEVICES (default 0), shared config dir. +# With args → one worker per GPU ID, each with its own config dir (etc). 
+if [ $# -eq 0 ]; then + launch_worker "${CUDA_VISIBLE_DEVICES:-0}" "${ETC_BASE}/" else -# Launch workers in parallel, each pinned to a different GPU -# The GPU IDs are passed as command-line arguments -for gpu_id in "$@"; do - echo "For GPU $gpu_id" - nvidia-smi topo -C -M -i $gpu_id - numa_id=$(nvidia-smi topo -C -i $gpu_id | awk -F':' '/NUMA IDs of closest CPU/{gsub(/ /,"",$2); print $2}') - if [[ $numa_id =~ ^[0-9]+$ ]]; then - LAUNCHER="numactl --cpunodebind=$numa_id --membind=$numa_id" - else - LAUNCHER="" - fi - CUDA_VISIBLE_DEVICES=$gpu_id $LAUNCHER presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & -done + for gpu_id in "$@"; do + launch_worker "$gpu_id" "${ETC_BASE}${gpu_id}" + done fi -# Wait for all background processes wait From 6ce40e4a0b9a39e03070c4e67de85879cd02496a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 13 Mar 2026 10:08:54 -0500 Subject: [PATCH 5/5] remove join-distribution-type editing in multi-workers config. --- presto/scripts/generate_presto_config.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index 23d0fbfb..ccd741f8 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -53,8 +53,6 @@ function duplicate_worker_configs() { # make cudf.exchange=true if we are running multiple workers sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config} # make join-distribution-type=PARTITIONED if we are running multiple workers - # ucx exchange supports PARTITIONED and BROADCAST partition types - # sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} fi # Each worker node needs to have it's own http-server port. This isn't used, but