diff --git a/presto/docker/config/template/etc_worker/config_native.properties b/presto/docker/config/template/etc_worker/config_native.properties index c36788c7..11292ff3 100644 --- a/presto/docker/config/template/etc_worker/config_native.properties +++ b/presto/docker/config/template/etc_worker/config_native.properties @@ -41,3 +41,20 @@ cudf.exchange.server.port=0000 cudf.memory_resource=async async-data-cache-enabled=false + +# Disable JIT because it takes a few iterations to warm up the JIT cache in all workers +cudf.jit_expression_enabled=false +# Turn on to use intra-node exchange optimization. +# NOTE: In cudf exchange 20260212 branch, this is needed for UCX to use nvlink. +cudf.intra_node_exchange=true + +# Use 100M rows per chunk for cudf partitioned output. +# NOTE: This is not yet propagated to the worker properly because only a fixed set of query configs are supported. +# https://github.com/prestodb/presto/blob/a62672886152c8c6b61cf301d246f217d850e357/presto-native-execution/presto_cpp/main/PrestoToVeloxQueryConfig.cpp#L106-L224 +# As a result, this needs to be hardcoded right now. +cudf.partitioned_output_batch_rows=100000000 + +# Enable cudf rebatching before aggregations. 
+cudf.concat_optimization_enabled=true +cudf.batch_size_min_threshold=100000000 + diff --git a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja index d5ad844d..46887431 100644 --- a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja +++ b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja @@ -44,7 +44,10 @@ x-presto-native-worker-gpu: &gpu_worker_base # - /dev/infiniband/uverbs9 depends_on: - presto-coordinator + privileged: true volumes: + # required for numa + - /sys/devices/system/node:/sys/devices/system/node - ../../config/generated/gpu/etc_common:/opt/presto-server/etc - ../../config/generated/gpu/etc_worker/node.properties:/opt/presto-server/etc/node.properties - ../../config/generated/gpu/etc_worker/config_native.properties:/opt/presto-server/etc/config.properties diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index 16dfd352..5b1213e6 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -1,18 +1,44 @@ #!/bin/bash - set -e -# Run ldconfig once -echo ldconfig +ETC_BASE="/opt/presto-server/etc" + +# Resolve the NUMA node closest to a GPU and launch presto_server pinned to it. 
+# $1 — GPU ID +# $2 — etc-dir path for this instance +launch_worker() { + local gpu_id=$1 etc_dir=$2 + echo "Launching on GPU $gpu_id (config: $etc_dir)" + + local topo + topo=$(nvidia-smi topo -C -M -i "$gpu_id") + echo "$topo" + + local cpu_numa mem_numa + cpu_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest CPU/{ gsub(/ /,"",$2); print $2 }') + mem_numa=$(echo "$topo" | awk -F: '/NUMA IDs of closest memory/{ gsub(/ /,"",$2); print $2 }') + + local launcher=() + if [[ $cpu_numa =~ ^[0-9]+$ ]]; then + launcher=(numactl --cpunodebind="$cpu_numa") + if [[ $mem_numa =~ ^[0-9]+$ ]]; then + launcher+=(--membind="$mem_numa") + else + launcher+=(--membind="$cpu_numa") + fi + fi + + CUDA_VISIBLE_DEVICES="$gpu_id" "${launcher[@]}" presto_server --etc-dir="$etc_dir" & +} + +# No args → single worker using CUDA_VISIBLE_DEVICES (default 0), shared config dir. +# With args → one worker per GPU ID, each with its own config dir (etc). if [ $# -eq 0 ]; then - presto_server --etc-dir="/opt/presto-server/etc/" & + launch_worker "${CUDA_VISIBLE_DEVICES:-0}" "${ETC_BASE}/" else -# Launch workers in parallel, each pinned to a different GPU -# The GPU IDs are passed as command-line arguments -for gpu_id in "$@"; do - CUDA_VISIBLE_DEVICES=$gpu_id presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & -done + for gpu_id in "$@"; do + launch_worker "$gpu_id" "${ETC_BASE}${gpu_id}" + done fi -# Wait for all background processes wait diff --git a/presto/docker/native_build.dockerfile b/presto/docker/native_build.dockerfile index d4ab8a72..e14c68fe 100644 --- a/presto/docker/native_build.dockerfile +++ b/presto/docker/native_build.dockerfile @@ -3,7 +3,7 @@ FROM presto/prestissimo-dependency:centos9 RUN rpm --import https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub && \ dnf config-manager --add-repo "https://developer.download.nvidia.com/devtools/repos/rhel$(source /etc/os-release; echo ${VERSION_ID%%.*})/$(rpm --eval '%{_arch}' | sed 
s/aarch/arm/)/" && \ - dnf install -y nsight-systems-cli-2025.5.1 + dnf install -y nsight-systems-cli-2025.5.1 numactl ARG GPU=ON ARG BUILD_TYPE=release diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index cc20b82a..ccd741f8 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -53,8 +53,6 @@ function duplicate_worker_configs() { # make cudf.exchange=true if we are running multiple workers sed -i "s+cudf.exchange=false+cudf.exchange=true+g" ${worker_native_config} # make join-distribution-type=PARTITIONED if we are running multiple workers - # (ucx exchange does not currently support BROADCAST partition type) - sed -i "s+join-distribution-type=.*+join-distribution-type=PARTITIONED+g" ${coord_native_config} fi # Each worker node needs to have it's own http-server port. This isn't used, but diff --git a/presto/scripts/start_presto_helper.sh b/presto/scripts/start_presto_helper.sh index 089fcceb..280eda47 100755 --- a/presto/scripts/start_presto_helper.sh +++ b/presto/scripts/start_presto_helper.sh @@ -132,7 +132,7 @@ elif [[ "$VARIANT_TYPE" == "gpu" ]]; then if [[ -n $GPU_IDS ]]; then FIRST_GPU_ID=$(echo $GPU_IDS | cut -d',' -f1) fi - if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 ]]; then + if [[ -n "$NUM_WORKERS" && "$NUM_WORKERS" -gt 1 && "$SINGLE_CONTAINER" == "false" ]]; then GPU_WORKER_SERVICE="presto-native-worker-gpu-${FIRST_GPU_ID}" fi conditionally_add_build_target $GPU_WORKER_IMAGE $GPU_WORKER_SERVICE "worker|w"