Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,20 @@ cudf.exchange.server.port=0000
cudf.memory_resource=async

async-data-cache-enabled=false

# Disable JIT because it takes a few iterations to warm up the JIT cache in all workers
cudf.jit_expression_enabled=false
# Turn on to use intra-node exchange optimization.
# NOTE: In cudf exchange 20260212 branch, this is needed for UCX to use nvlink.
cudf.intra_node_exchange=true

# Use 100M rows per chunk for cudf partitioned output.
# NOTE: This is not yet propagated to the worker properly because only a fixed set of query configs are supported.
# https://github.com/prestodb/presto/blob/a62672886152c8c6b61cf301d246f217d850e357/presto-native-execution/presto_cpp/main/PrestoToVeloxQueryConfig.cpp#L106-L224
# As a result, this needs to be hardcoded right now.
cudf.partitioned_output_batch_rows=100000000

# Enable cudf rebatching before aggregations.
cudf.concat_optimization_enabled=true
cudf.batch_size_min_threshold=100000000

Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@ x-presto-native-worker-gpu: &gpu_worker_base
# - /dev/infiniband/uverbs9
depends_on:
- presto-coordinator
privileged: true
volumes:
# Required for NUMA topology discovery inside the container
- /sys/devices/system/node:/sys/devices/system/node
- ../../config/generated/gpu/etc_common:/opt/presto-server/etc
- ../../config/generated/gpu/etc_worker/node.properties:/opt/presto-server/etc/node.properties
- ../../config/generated/gpu/etc_worker/config_native.properties:/opt/presto-server/etc/config.properties
Expand Down
46 changes: 36 additions & 10 deletions presto/docker/launch_presto_servers.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,44 @@
#!/bin/bash

# Launch one or more presto_server workers, optionally pinned to the NUMA
# node closest to their GPU. With no arguments a single worker is started
# using CUDA_VISIBLE_DEVICES (default 0); otherwise one worker is started
# per GPU ID passed on the command line.

# Fail fast: abort on command failure, unset variables, and pipeline errors.
set -euo pipefail

# Run ldconfig once
# NOTE(review): this only prints the word "ldconfig"; if the intent is to
# refresh the dynamic linker cache, the actual 'ldconfig' invocation appears
# to be missing — confirm.
echo ldconfig

# Base directory for per-instance presto configuration (etc, etc0, etc1, ...).
readonly ETC_BASE="/opt/presto-server/etc"

# Resolve the NUMA node closest to a GPU and launch presto_server pinned to it.
# $1 — GPU ID
# $2 — etc-dir path for this instance
launch_worker() {
  local gpu_id=$1 etc_dir=$2
  echo "Launching on GPU $gpu_id (config: $etc_dir)"

  # Ask the driver for this GPU's CPU/memory affinity report.
  local topology
  topology=$(nvidia-smi topo -C -M -i "$gpu_id")
  echo "$topology"

  # Extract the closest CPU and memory NUMA node IDs from the report.
  local cpu_node mem_node
  cpu_node=$(awk -F: '/NUMA IDs of closest CPU/{ gsub(/ /,"",$2); print $2 }' <<<"$topology")
  mem_node=$(awk -F: '/NUMA IDs of closest memory/{ gsub(/ /,"",$2); print $2 }' <<<"$topology")

  # Build a numactl pinning prefix only when a single numeric CPU node was
  # reported; memory binding falls back to the CPU node when no numeric
  # memory node is available.
  local pin_prefix=()
  if [[ $cpu_node =~ ^[0-9]+$ ]]; then
    [[ $mem_node =~ ^[0-9]+$ ]] || mem_node=$cpu_node
    pin_prefix=(numactl --cpunodebind="$cpu_node" --membind="$mem_node")
  fi

  # Background the worker so multiple GPUs can be launched in parallel.
  CUDA_VISIBLE_DEVICES="$gpu_id" "${pin_prefix[@]}" presto_server --etc-dir="$etc_dir" &
}

# No args → single worker using CUDA_VISIBLE_DEVICES (default 0), shared config dir.
# With args → one worker per GPU ID, each with its own config dir (etc<gpu_id>).
# NOTE: the diff artifact previously left the pre-refactor launch lines in
# place alongside the launch_worker calls, which would have started every
# worker twice; only the launch_worker path is kept.
if [ $# -eq 0 ]; then
  launch_worker "${CUDA_VISIBLE_DEVICES:-0}" "${ETC_BASE}/"
else
  # Launch workers in parallel, each pinned to a different GPU.
  # The GPU IDs are passed as command-line arguments.
  for gpu_id in "$@"; do
    launch_worker "$gpu_id" "${ETC_BASE}${gpu_id}"
  done
fi

# Wait for all background worker processes before exiting.
wait
2 changes: 1 addition & 1 deletion presto/docker/native_build.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM presto/prestissimo-dependency:centos9

RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \
dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed s/aarch/arm/)/" && \
dnf install -y nsight-systems-cli-2025.5.1
dnf install -y nsight-systems-cli-2025.5.1 numactl

ARG GPU=ON
ARG BUILD_TYPE=release
Expand Down
2 changes: 0 additions & 2 deletions presto/scripts/generate_presto_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ function duplicate_worker_configs() {
# make cudf.exchange=true if we are running multiple workers
sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config}
# make join-distribution-type=PARTITIONED if we are running multiple workers
# (ucx exchange does not currently support BROADCAST partition type)
sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config}
fi

# Each worker node needs to have its own http-server port. This isn't used, but
Expand Down
2 changes: 1 addition & 1 deletion presto/scripts/start_presto_helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ elif [[ "$VARIANT_TYPE" == "gpu" ]]; then
if [[ -n $GPU_IDS ]]; then
FIRST_GPU_ID=$(echo $GPU_IDS | cut -d',' -f1)
fi
if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 ]]; then
if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 && "$SINGLE_CONTAINER" == "false" ]]; then
GPU_WORKER_SERVICE="presto-native-worker-gpu-${FIRST_GPU_ID}"
fi
conditionally_add_build_target $GPU_WORKER_IMAGE $GPU_WORKER_SERVICE "worker|w"
Expand Down
Loading