diff --git a/.gitignore b/.gitignore index 694624a3..efa46633 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,14 @@ presto/docker/config/generated*/ # Generated Presto Docker Compose files presto/docker/docker-compose/generated*/ +# Slurm logs and results +presto/slurm/presto-nvl72/logs/ +presto/slurm/presto-nvl72/*.err +presto/slurm/presto-nvl72/*.out +presto/slurm/presto-nvl72/result_dir/ +presto/slurm/presto-nvl72/kept_results/ +presto/slurm/presto-nvl72/worker_data/ +presto/slurm/presto-nvl72/profiles/ +presto/slurm/presto-nvl72/worker_info/ + devstate* diff --git a/benchmark_data_tools/duckdb_utils.py b/benchmark_data_tools/duckdb_utils.py index f7c85b74..0e16fc2a 100644 --- a/benchmark_data_tools/duckdb_utils.py +++ b/benchmark_data_tools/duckdb_utils.py @@ -6,6 +6,10 @@ import duckdb +def quote_ident(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + def init_benchmark_tables(benchmark_type, scale_factor): tables = duckdb.sql("SHOW TABLES").fetchall() assert len(tables) == 0 @@ -22,27 +26,27 @@ def init_benchmark_tables(benchmark_type, scale_factor): def drop_benchmark_tables(): tables = duckdb.sql("SHOW TABLES").fetchall() for (table,) in tables: - duckdb.sql(f"DROP TABLE {table}") + duckdb.sql(f"DROP TABLE {quote_ident(table)}") def create_table(table_name, data_path): - duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") - duckdb.sql(f"CREATE TABLE {table_name} AS SELECT * FROM '{data_path}/*.parquet';") + duckdb.sql(f"DROP TABLE IF EXISTS {quote_ident(table_name)}") + duckdb.sql(f"CREATE TABLE {quote_ident(table_name)} AS SELECT * FROM '{data_path}/*.parquet';") # Generates a sample table with a small limit. # This is mainly used to extract the schema from the parquet files. 
def create_not_null_table_from_sample(table_name, data_path): - duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") - duckdb.sql(f"CREATE TABLE {table_name} AS SELECT * FROM '{data_path}/*.parquet' LIMIT 10;") - ret = duckdb.sql(f"DESCRIBE TABLE {table_name}").fetchall() + duckdb.sql(f"DROP TABLE IF EXISTS {quote_ident(table_name)}") + duckdb.sql(f"CREATE TABLE {quote_ident(table_name)} AS SELECT * FROM '{data_path}/*.parquet' LIMIT 10;") + ret = duckdb.sql(f"DESCRIBE TABLE {quote_ident(table_name)}").fetchall() for row in ret: - duckdb.sql(f"ALTER TABLE {table_name} ALTER COLUMN {row[0]} SET NOT NULL;") + duckdb.sql(f"ALTER TABLE {quote_ident(table_name)} ALTER COLUMN {row[0]} SET NOT NULL;") def create_table_from_sample(table_name, data_path): - duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") - duckdb.sql(f"CREATE TABLE {table_name} AS SELECT * FROM '{data_path}/*.parquet' LIMIT 10;") + duckdb.sql(f"DROP TABLE IF EXISTS {quote_ident(table_name)}") + duckdb.sql(f"CREATE TABLE {quote_ident(table_name)} AS SELECT * FROM '{data_path}/*.parquet' LIMIT 10;") def is_decimal_column(column_type): diff --git a/presto/docker/config/template/etc_coordinator/config_native.properties b/presto/docker/config/template/etc_coordinator/config_native.properties index fc2362b7..3e6ad0dc 100644 --- a/presto/docker/config/template/etc_coordinator/config_native.properties +++ b/presto/docker/config/template/etc_coordinator/config_native.properties @@ -48,7 +48,7 @@ optimizer.generate-domain-filters=true # Upper limit for broadcasted table size to avoid memory blowups. # See: https://github.com/prestodb/presto/issues/22161#issuecomment-1994128619 join-max-broadcast-table-size={{ .JoinMaxBroadcastTableSizeMb }}MB -# Default is AUTOMATIC, ucx exchange does not support BROADCAST partition type. +# overwritten to "PARTITIONED" in multi-node context join-distribution-type=AUTOMATIC # Client request timeout to avoid hung queries. 
@@ -58,7 +58,7 @@ query.execution-policy=phased # Kill queries based on total reservation on blocked nodes to recover memory. query.low-memory-killer.policy=total-reservation-on-blocked-nodes # Upper limit on query wall time to keep tests bounded. -query.max-execution-time=30m +query.max-execution-time=10m # Keep metadata of up to 1000 queries for UI and debugging. query.max-history=1000 # Memory quotas per node and cluster to protect stability. diff --git a/presto/docker/config/template/etc_worker/config_native.properties b/presto/docker/config/template/etc_worker/config_native.properties index b1ee1082..c36788c7 100644 --- a/presto/docker/config/template/etc_worker/config_native.properties +++ b/presto/docker/config/template/etc_worker/config_native.properties @@ -29,13 +29,15 @@ system-mem-limit-gb={{ sub .ContainerMemoryGb .GeneratorParameters.MemoryPushBac system-mem-shrink-gb=20 # Optimize for single-node execution when the entire query can run locally. +# overwritten to "false" in multi-node settings. single-node-execution-enabled=true # Enable cuDF (CPU mode will ignore this setting) cudf.enabled=true +# overwritten to "true" in multi-node settings. cudf.exchange=false -# Port number currently must be exactly 3 more than server port (ignored if cudf.exchange is false) -cudf.exchange.server.port=8083 +# overwritten when cudf.exchange is enabled (ignored otherwise) +cudf.exchange.server.port=0000 cudf.memory_resource=async async-data-cache-enabled=false diff --git a/presto/scripts/common_functions.sh b/presto/scripts/common_functions.sh index 9c7cc4df..a8fdf4aa 100644 --- a/presto/scripts/common_functions.sh +++ b/presto/scripts/common_functions.sh @@ -1,25 +1,12 @@ #!/bin/bash -# Copyright (c) 2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 function wait_for_worker_node_registration() { trap "rm -rf node_response.json" RETURN echo "Waiting for a worker node to be registered..." - HOSTNAME=${1:-localhost} - PORT=${2:-8080} COORDINATOR_URL=http://${HOSTNAME}:${PORT} echo "Coordinator URL: $COORDINATOR_URL" local -r MAX_RETRIES=12 diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index 71735825..cc20b82a 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -30,19 +30,23 @@ if [ ! -x "${SCRIPT_DIR}/../pbench/pbench" ]; then echo_error "ERROR: generate_presto_config.sh script cannot find pbench at ${SCRIPT_DIR}/../pbench/pbench" fi +# This function duplicates the worker configs when we are running multiple workers. +# It also adds certain config options to the workers if those options apply only to multi-worker environments. function duplicate_worker_configs() { local worker_id=$1 echo "Duplicating worker configs for GPU ID $worker_id" local worker_config="${CONFIG_DIR}/etc_worker_${worker_id}" - local worker_native_config="${worker_config}/config_native.properties" local coord_config="${CONFIG_DIR}/etc_coordinator" + local worker_native_config="${worker_config}/config_native.properties" local coord_native_config="${coord_config}/config_native.properties" + # Need to stagger the port numbers because ucx exchange currently expects to be exactly + # 3 higher than the http port. 
local http_port="10$(printf "%02d\n" "$worker_id")0" local exch_port="10$(printf "%02d\n" "$worker_id")3" rm -rf ${worker_config} cp -r ${CONFIG_DIR}/etc_worker ${worker_config} - # Single node execution needs to be disabled if we are running multiple workers. + # Some configs should only be applied if we are in a multi-worker environment. if [[ ${NUM_WORKERS} -gt 1 ]]; then sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=false+g" ${coord_native_config} sed -i "s+single-node-execution-enabled.*+single-node-execution-enabled=false+g" ${worker_native_config} @@ -72,7 +76,7 @@ RAM_GB=$(lsmem -b | grep "Total online memory" | awk '{print int($4 / (1024*1024 if [[ -z ${VARIANT_TYPE} || ! ${VARIANT_TYPE} =~ ^(cpu|gpu|java)$ ]]; then echo_error "ERROR: VARIANT_TYPE must be set to a valid variant type (cpu, gpu, java)." fi -if [[ -z ${VCPU_PER_WORKER} ]]; then +if [[ -z ${VCPU_PER_WORKER:-} ]]; then if [[ "${VARIANT_TYPE}" == "gpu" ]]; then VCPU_PER_WORKER=2 else @@ -122,6 +126,7 @@ EOF fi COORD_CONFIG="${CONFIG_DIR}/etc_coordinator/config_native.properties" + WORKER_CONFIG="${CONFIG_DIR}/etc_worker/config_native.properties" # now perform other variant-specific modifications to the generated configs if [[ "${VARIANT_TYPE}" == "gpu" ]]; then # for GPU variant, uncomment these optimizer settings @@ -158,10 +163,13 @@ fi # We want to propagate any changes from the original worker config to the new worker configs even if # we did not re-generate the configs. 
-if [[ -n "$NUM_WORKERS" && -n "$GPU_IDS" && "$VARIANT_TYPE" == "gpu" ]]; then - # Count the number of GPU IDs provided - IFS=',' read -ra GPU_ID_ARRAY <<< "$GPU_IDS" - for i in "${GPU_ID_ARRAY[@]}"; do +if [[ -n "$NUM_WORKERS" && "$VARIANT_TYPE" == "gpu" ]]; then + if [[ -n ${GPU_IDS:-} ]]; then + WORKER_IDS=($(echo "$GPU_IDS" | tr ',' ' ')) + else + WORKER_IDS=($(seq 0 $((NUM_WORKERS - 1)))) + fi + for i in "${WORKER_IDS[@]}"; do duplicate_worker_configs $i done fi diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index d651d71f..ff5117ef 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -30,6 +30,7 @@ OPTIONS: stored inside a directory under the --output-dir path with a name matching the tag name. Tags must contain only alphanumeric and underscore characters. -p, --profile Enable profiling of benchmark queries. + --profile-script-path Path to profiler functions script (default: ./profiler_functions.sh). --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. 
@@ -147,6 +148,15 @@ parse_args() { PROFILE=true shift ;; + --profile-script-path) + if [[ -n $2 ]]; then + PROFILE_SCRIPT_PATH=$2 + shift 2 + else + echo "Error: --profile-script-path requires a value" + exit 1 + fi + ;; --skip-drop-cache) SKIP_DROP_CACHE=true shift @@ -218,7 +228,10 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + if [[ -z "${PROFILE_SCRIPT_PATH:-}" ]]; then + PROFILE_SCRIPT_PATH="$(readlink -f ./profiler_functions.sh)" + fi + PYTEST_ARGS+=("--profile --profile-script-path ${PROFILE_SCRIPT_PATH}") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh b/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh index 044c4fe6..bcf35d0a 100644 --- a/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh +++ b/presto/scripts/setup_benchmark_helper_check_instance_and_parse_args.sh @@ -31,6 +31,7 @@ OPTIONS: -s, --schema-name Name of the schema that will contain the created tables. -d, --data-dir-name Name of the directory inside the PRESTO_DATA_DIR path for the benchmark data. --skip-analyze-tables Skip analyzing tables after creating them. Default is to analyze tables. + --no-docker Skip the setup/teardown steps that require docker. 
$SCRIPT_EXTRA_OPTIONS_DESCRIPTION EXAMPLES: @@ -49,6 +50,7 @@ fi # Compute the directory where this script resides (if not already set by caller) SCRIPT_DIR="${SCRIPT_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}" +NO_DOCKER=false SKIP_ANALYZE_TABLES=false parse_args() { while [[ $# -gt 0 ]]; do @@ -88,6 +90,10 @@ parse_args() { SKIP_ANALYZE_TABLES=true shift ;; + --no-docker) + NO_DOCKER=true + shift + ;; *) SCRIPT_EXTRA_OPTIONS_UNKNOWN_ARG=true if [[ -n $SCRIPT_EXTRA_OPTIONS_PARSER ]]; then diff --git a/presto/scripts/setup_benchmark_tables.sh b/presto/scripts/setup_benchmark_tables.sh index d9662219..e69391a5 100755 --- a/presto/scripts/setup_benchmark_tables.sh +++ b/presto/scripts/setup_benchmark_tables.sh @@ -32,11 +32,14 @@ function cleanup() { trap cleanup EXIT -"${SCRIPT_DIR}/start_native_cpu_presto.sh" - -source "${SCRIPT_DIR}/common_functions.sh" +# These scripts are used in some non-docker environments, so provide the option to skip +# the docker setup/teardown. +if [[ "$NO_DOCKER" == "false" ]]; then + "${SCRIPT_DIR}/start_native_cpu_presto.sh" + source "${SCRIPT_DIR}/common_functions.sh" + wait_for_worker_node_registration +fi -wait_for_worker_node_registration "${SCRIPT_DIR}/../../scripts/run_py_script.sh" -p $SCHEMA_GEN_SCRIPT_PATH \ --benchmark-type $BENCHMARK_TYPE \ @@ -53,4 +56,6 @@ if [[ "$SKIP_ANALYZE_TABLES" == "false" ]]; then "${SCRIPT_DIR}/analyze_tables.sh" -s $SCHEMA_NAME fi -"${SCRIPT_DIR}/stop_presto.sh" +if [[ "$NO_DOCKER" == "false" ]]; then + "${SCRIPT_DIR}/stop_presto.sh" +fi diff --git a/presto/scripts/start_presto_helper.sh b/presto/scripts/start_presto_helper.sh index 465fac38..7714701a 100755 --- a/presto/scripts/start_presto_helper.sh +++ b/presto/scripts/start_presto_helper.sh @@ -19,7 +19,12 @@ fi SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # Get the root of the git repository -REPO_ROOT="$(git -C "${SCRIPT_DIR}" rev-parse --show-toplevel)" +if command -v git &> /dev/null; then + REPO_ROOT="$(git -C "${SCRIPT_DIR}" 
rev-parse --show-toplevel)" +else + REPO_ROOT="$SCRIPT_DIR/../.." +fi + # Validate sibling repos if [[ "$VARIANT_TYPE" == "java" ]]; then @@ -88,12 +93,6 @@ else echo "Internal error: unexpected VARIANT_TYPE value: $VARIANT_TYPE" fi -# Default GPU_IDS if NUM_WORKERS is set but GPU_IDS is not -if [[ -n $NUM_WORKERS && -z $GPU_IDS ]]; then - # Generate default GPU IDs: 0,1,2,...,N-1 - export GPU_IDS=$(seq -s, 0 $((NUM_WORKERS - 1))) -fi - "${SCRIPT_DIR}/stop_presto.sh" "${SCRIPT_DIR}/generate_presto_config.sh" diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md index baed39d0..b2fb24e3 100644 --- a/presto/slurm/presto-nvl72/README.md +++ b/presto/slurm/presto-nvl72/README.md @@ -17,43 +17,61 @@ presto-nvl72/ ## Quick Start -### Running the Benchmark +### Running the benchmark via launcher (recommended) ```bash -cd /mnt/data/bzaitlen/presto-nvl72 -./launch-run.sh +cd presto/slurm/presto-nvl72 +./launch-run.sh -n -s [-i ] [additional sbatch options] + +# examples +./launch-run.sh -n 8 -s 3000 +./launch-run.sh -n 4 -s 10000 -i 3 --partition gpu --account myacct ``` -Or submit directly: +The launcher: +- requires node count (-n/--nodes) and scale factor (-s/--scale-factor) +- accepts optional iterations (-i/--iterations, default 1) +- embeds nodes/SF/iterations in .out/.err filenames +- prints the first node’s hostname/IP when allocated and a ready-to-run SSH port-forward command to access the Presto Web UI on your machine (http://localhost:9200) + +### Submitting directly (advanced) ```bash -sbatch run-presto-benchmarks.slurm +export SCALE_FACTOR=3000 +export NUM_ITERATIONS=1 +sbatch --nodes 8 \ + --output "presto-tpch-run_n8_sf3000_i1_%j.out" \ + --error "presto-tpch-run_n8_sf3000_i1_%j.err" \ + --export "ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS}" \ + run-presto-benchmarks.slurm ``` ## Configuration -**To change settings, edit the values directly in `run-presto-benchmarks.slurm`** +Primary configuration is 
passed via the launcher flags and environment. The `.slurm` script validates that required variables are set. -All configuration is at the top of the file in the "User Configuration" section. +Key variables: -### Configuration Variables +- SCALE_FACTOR: required (provided via `-s/--scale-factor`) +- NUM_ITERATIONS: required by the job; launcher defaults to 1 (`-i/--iterations` to override) +- NUM_NODES: derived from Slurm allocation; provided via `-n/--nodes` to launcher +- REPO_ROOT: auto-detected from script location +- LOGS: `${SCRIPT_DIR}/logs` by default +- IMAGE_DIR, DATA, CONFIGS: see below or override via environment if needed -| Variable | Current Value | Description | -|----------|---------------|-------------| -| `SCALE_FACTOR` | 300 | TPC-H scale factor | -| `NUM_ITERATIONS` | 5 | Number of query iterations | -| `WORKER_IMAGE` | presto-native-worker-gpu | Worker container image | -| `NUM_NODES` | 4 | Number of nodes to allocate | -| `NUM_GPUS_PER_NODE` | 4 | GPUs per node | -| `DATA` | /mnt/data/tpch-rs/scale-300 | Data directory | -| `IMAGE_DIR` | /mnt/home/misiug/images | Container image directory | -| `LOGS` | /mnt/data/bzaitlen/presto-nvl72/logs | Log directory | +Other defaults: +- WORKER_IMAGE: `presto-native-worker-gpu` +- NUM_GPUS_PER_NODE: `4` +- DATA: `/mnt/data/tpch-rs` +- IMAGE_DIR: `/mnt/data/images/presto` +- CONFIGS: `${REPO_ROOT}/presto/docker/config/generated/gpu` ### SBATCH Directives - **Time limit**: 1 hour (adjust `--time` if needed) - **Node allocation**: Full node (144 CPUs, 4 GPUs, exclusive) - **Memory**: All available (`--mem=0`) +- `--nodes`, `--output`, and `--error` are passed by the launcher instead of being embedded in the `.slurm` file. ## Monitoring @@ -62,7 +80,7 @@ All configuration is at the top of the file in the "User Configuration" section. 
squeue -u $USER # Monitor job output -tail -f presto-tpch-run_.out +tail -f presto-tpch-run_n_sf_i_.out # Check logs during execution tail -f logs/coord.log @@ -70,12 +88,26 @@ tail -f logs/cli.log tail -f logs/worker_0.log ``` +## Coordinator IP and Web UI + +After submission, the launcher waits until nodes are allocated, then prints: +- the first node’s hostname/IP +- an SSH port-forward command you can run locally to access the Presto Web UI + +Example output snippet: + +```text +Run this command on a machine to get access to the webUI: + ssh -N -L 9200::9200 +The UI will be available at http://localhost:9200 +``` + ## Results Results are saved to: - **Logs**: `logs/` directory - **CSV Summary**: `result_dir/summary.csv` -- **Historical Results**: `${WORKSPACE}/benchmark-storage/YYYY/MM/DD/` +- **Historical Results**: `${REPO_ROOT}/benchmark-storage/YYYY/MM/DD/` ## Prerequisites @@ -85,7 +117,7 @@ Results are saved to: 2. **Data directory** must be accessible at `${DATA}` (will be mounted in containers) -3. **velox-testing repo** will be auto-cloned to `${WORKSPACE}/velox-testing` if not present +3. **velox-testing repo** will be auto-cloned to `${REPO_ROOT}/velox-testing` if not present ## Troubleshooting @@ -104,7 +136,7 @@ cat logs/worker_*.log ### Image not found Verify images exist: ```bash -ls -lh /mnt/home/misiug/images/*.sqsh +ls -lh /mnt/data/images/presto/*.sqsh ``` ### Data directory issues diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.sh b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh index 116192b2..4b219c29 100755 --- a/presto/slurm/presto-nvl72/create-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh @@ -1,4 +1,7 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + set -e set -x @@ -8,29 +11,15 @@ set -x # This script creates the Presto schema and tables for existing TPC-H data # Source helper functions -source /mnt/home/misiug/veloxtesting/presto-nvl72/echo_helpers.sh -source /mnt/home/misiug/veloxtesting/presto-nvl72/functions.sh +source ./echo_helpers.sh +source ./functions.sh # ============================================================================== # Setup and Validation # ============================================================================== echo "Setting up Presto environment for schema creation..." -export VARIANT_TYPE=cpu setup -worker_config="${CONFIGS}/etc_worker/config_native.properties" -sed -i "s/system-memory-gb.*/system-memory-gb=400/g" ${worker_config} -sed -i "s/query-memory-gb.*/query-memory-gb=400/g" ${worker_config} -sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=400GB/g" ${worker_config} - -coord_config="${CONFIGS}/etc_coordinator/config_native.properties" -sed -i "s/memory\.heap-headroom-per-node.*/memory\.heap-headroom-per-node=120GB/g" ${coord_config} -sed -i "s/query\.max-total-memory-per-node.*/query\.max-total-memory-per-node=300GB/g" ${coord_config} -sed -i "s/query\.max-total-memory.*/query\.max-total-memory=300GB/g" ${coord_config} -sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=250GB/g" ${coord_config} -sed -i "s/query\.max-memory.*/query\.max-memory=250GB/g" ${coord_config} -sed -i "s/cluster-tag.*//g" ${coord_config} - # ============================================================================== # Start Coordinator # ============================================================================== @@ -38,8 +27,6 @@ echo "Starting Presto coordinator on ${COORD}..." 
run_coordinator wait_until_coordinator_is_running - - # ============================================================================== # Start Workers (GPU workers for schema creation) # ============================================================================== diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm index adaac17d..1bea2354 100644 --- a/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm @@ -1,9 +1,6 @@ #!/bin/bash #SBATCH --job-name=presto-tpch-create -#SBATCH --output=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.out -#SBATCH --error=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.err #SBATCH --time=01:00:00 -#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=144 #SBATCH --mem=0 @@ -14,21 +11,31 @@ # User Configuration - Edit these values directly # ============================================================================== # TPC-H Configuration -export SCALE_FACTOR=10000 +if [ -z "${SCALE_FACTOR:-}" ]; then + echo "Error: SCALE_FACTOR is required. Set via launcher: -s|--scale-factor" >&2 + exit 1 +fi +export SCALE_FACTOR +if [ -z "${SCRIPT_DIR:-}" ]; then + echo "Error: SCRIPT_DIR is required." + exit 1 +fi +export SCRIPT_DIR # Directory Configuration -export WORKSPACE=/mnt/home/misiug +export VT_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." 
>/dev/null 2>&1 && pwd -P)" export DATA=/mnt/data/tpch-rs -export IMAGE_DIR=/mnt/home/misiug/images -export LOGS=/mnt/home/misiug/veloxtesting/presto-nvl72/logs -export CONFIGS=${WORKSPACE}/config/generated/cpu +export IMAGE_DIR=/mnt/data/images/presto +export LOGS=$SCRIPT_DIR/logs +export VARIANT_TYPE=cpu +export CONFIGS=$VT_ROOT/presto/docker/config/generated/$VARIANT_TYPE # Container Images # Coordinator: ${IMAGE_DIR}/presto-coordinator-test.sqsh # Worker: ${IMAGE_DIR}/${WORKER_IMAGE}.sqsh (CPU workers required for ANALYZE) -export WORKER_IMAGE=presto-native-worker-cpu -export NUM_NODES=1 -export NUM_GPUS_PER_NODE=1 +export WORKER_IMAGE=presto-native-worker-$VARIANT_TYPE +export NUM_NODES=$SLURM_JOB_NUM_NODES +export NUM_GPUS_PER_NODE=4 # Presto Configuration export PORT=9200 @@ -67,7 +74,7 @@ echo "========================================" # Create necessary directories mkdir -p ${LOGS} mkdir -p ${DATA} -mkdir -p ${WORKSPACE}/.hive_metastore +mkdir -p ${VT_ROOT}/.hive_metastore # Launch the job script -bash ${WORKSPACE}/veloxtesting/presto-nvl72/create-presto-benchmarks.sh +bash ${SCRIPT_DIR}/create-presto-benchmarks.sh diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 5f07d6dc..e337a667 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -1,18 +1,6 @@ #!/bin/bash - -# UCX Configuration -export UCX_TLS=^ib,ud:aux,sm -export UCX_MAX_RNDV_RAILS=1 -export UCX_RNDV_PIPELINE_ERROR_HANDLING=y -export UCX_TCP_KEEPINTVL=1ms -export UCX_KEEPALIVE_INTERVAL=1ms - - -# Image directory for presto container images (can be overridden via environment) -IMAGE_DIR="${IMAGE_DIR:-${WORKSPACE}/images}" - -# Logs directory for presto execution logs (can be overridden via environment) -LOGS="${LOGS:-/mnt/home/misiug/veloxtesting/presto-nvl72/logs}" +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 # Validates job preconditions and assigns default values for presto execution. function setup { @@ -20,46 +8,34 @@ function setup { [ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 - [ -z "$NUM_NODES" ] && echo "NUM_WORKERS must be set" && exit 1 + [ -z "$IMAGE_DIR" ] && echo "IMAGE_DIR must be set" && exit 1 + [ -z "$LOGS" ] && echo "LOGS must be set" && exit 1 + [ -z "$CONFIGS" ] && echo "CONFIGS must be set" && exit 1 + [ -z "$NUM_NODES" ] && echo "NUM_NODES must be set" && exit 1 [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 - [ ! -d "$WORKSPACE" ] && echo "WORKSPACE must be a valid directory" && exit 1 + [ ! -d "$VT_ROOT" ] && echo "VT_ROOT must be a valid directory" && exit 1 [ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 - NUM_WORKERS=$(( $NUM_NODES * $NUM_GPUS_PER_NODE )) - mkdir -p ${LOGS} - # Only set CONFIGS if not already set (allow override from environment) - #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/gpu}" - #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/cpu}" - CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/${VARIANT_TYPE}}" - COORD=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1) - PORT=9200 - CUDF_LIB=/usr/lib64/presto-native-libs - if [ "${NUM_WORKERS}" -eq "1" ]; then - SINGLE_NODE_EXECUTION=true + if [ ! -d ${VT_ROOT}/.hive_metastore ]; then + echo "Copying hive metastore from data source." + copy_hive_metastore else - SINGLE_NODE_EXECUTION=false + echo "Hive metastore already exists. Reusing." fi - if [ ! 
-d ${WORKSPACE}/velox-testing ]; then - git clone -b misiug/cluster https://github.com/rapidsai/velox-testing.git ${WORKSPACE}/velox-testing - #sed -i "s/python3 /python3.12 /g" ${WORKSPACE}/velox-testing/scripts/py_env_functions.sh - fi + [ ! -d ${VT_ROOT}/.hive_metastore/tpchsf${SCALE_FACTOR} ] && echo "Schema for SF ${SCALE_FACTOR} does not exist in hive metastore." && exit 1 - [ ! -d ${CONFIGS} ] && generate_configs + generate_configs validate_config_directory } function generate_configs { mkdir -p ${CONFIGS} - pushd ${WORKSPACE}/velox-testing/presto/scripts - #VARIANT_TYPE=cpu ./generate_presto_config.sh - #VARIANT_TYPE=gpu ./generate_presto_config.sh + pushd ${VT_ROOT}/presto/scripts OVERWRITE_CONFIG=true ./generate_presto_config.sh popd - mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/${VARIANT_TYPE}/* ${CONFIGS}/ - #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/gpu/* ${CONFIGS}/ - #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/cpu/* ${CONFIGS}/ + # These options are require to run in some cluster contexts. echo "--add-modules=java.management,jdk.management" >> ${CONFIGS}/etc_common/jvm.config echo "-Dcom.sun.management.jmxremote=false" >> ${CONFIGS}/etc_common/jvm.config echo "-XX:-UseContainerSupport" >> ${CONFIGS}/etc_common/jvm.config @@ -82,17 +58,15 @@ function validate_environment_preconditions { # Execute script through the coordinator image (used for coordinator and cli executables) function run_coord_image { [ $# -ne 2 ] && echo_error "$0 expected one argument for ' + + + + + + + + + + + + + + +
+
+ +
+
+
Loading...
+
+
+ +
+
+ Query Details +
+
Loading...
+
+ +
+ + + + + + + + diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh index 52841bb1..cdb7dc6f 100755 --- a/presto/slurm/presto-nvl72/launch-run.sh +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + + #!/bin/bash # ============================================================================== # Presto TPC-H Benchmark Launcher @@ -5,7 +9,10 @@ # Simple launcher script to submit the presto benchmark job to slurm # # Usage: -# ./launch-run.sh [additional sbatch options] +# ./launch-run.sh -n|--nodes -s|--scale-factor [-i|--iterations ] [-p|--profile] [additional sbatch options] +# +# Options: +# -p, --profile Enable profiling of benchmark queries (creates .nsys-rep files for each worker) # # To change configuration, edit run-presto-benchmarks.slurm directly # ============================================================================== @@ -15,23 +22,151 @@ set -e # Change to script directory cd "$(dirname "$0")" -# Clean up old output files -rm -f result_dir/* logs/* *.out *.err 2>/dev/null || true -mkdir -p result_dir logs +# Clean up old output files, worker info directory, and worker data directories +rm -f profiles/* result_dir/* logs/* *.out *.err 2>/dev/null || true +rm -rf worker_info worker_data 2>/dev/null || true +mkdir -p result_dir logs worker_info worker_data echo "Submitting Presto TPC-H benchmark job..." 
echo "Configuration is set in run-presto-benchmarks.slurm" echo "" -# Submit job -JOB_ID=$(sbatch "$@" run-presto-benchmarks.slurm | awk '{print $NF}') -#JOB_ID=$(sbatch "$@" create-presto-benchmarks.slurm | awk '{print $NF}') +# Parse required -n/--nodes and -s/--scale-factor, optional -i/--iterations, and collect extra sbatch args +NODES_COUNT="" +SCALE_FACTOR="" +NUM_ITERATIONS="1" +EXTRA_ARGS=() +NUM_GPUS_PER_NODE="4" +WORKER_IMAGE="presto-native-worker-gpu" +COORD_IMAGE="presto-coordinator" +ENABLE_PROFILING="false" +while [[ $# -gt 0 ]]; do + case "$1" in + -n|--nodes) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + NODES_COUNT="$2" + shift 2 + else + echo "Error: -n|--nodes requires a value." + echo "Usage: $0 -n|--nodes -s|--scale-factor [additional sbatch options]" + exit 1 + fi + ;; + -s|--scale-factor) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + SCALE_FACTOR="$2" + shift 2 + else + echo "Error: -s|--scale-factor requires a value." + echo "Usage: $0 -n|--nodes -s|--scale-factor [additional sbatch options]" + exit 1 + fi + ;; + -i|--iterations) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + NUM_ITERATIONS="$2" + shift 2 + else + echo "Error: -i|--iterations requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 + fi + ;; + -g|--num-gpus-per-node) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + NUM_GPUS_PER_NODE="$2" + shift 2 + else + echo "Error: -g|--num-gpus-per-node requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 + fi + ;; + -w|--worker-image) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + WORKER_IMAGE="$2" + shift 2 + else + echo "Error: -w|--worker-image requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 + fi + ;; + -c|--coord-image) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + COORD_IMAGE="$2" + shift 2 + 
else + echo "Error: -c|--coord-image requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 + fi + ;; + -p|--profile) + ENABLE_PROFILING="true" + shift + ;; + --) + shift + break + ;; + *) + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +if [[ -z "${NODES_COUNT}" ]]; then + echo "Error: -n|--nodes is required" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 +fi +if [[ -z "${SCALE_FACTOR}" ]]; then + echo "Error: -s|--scale-factor is required" + echo "Usage: $0 -n|--nodes -s|--scale-factor [-i|--iterations ] [additional sbatch options]" + exit 1 +fi + +# Submit job (include nodes/SF/iterations in file names) +OUT_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j.out" +ERR_FMT="presto-tpch-run_n${NODES_COUNT}_sf${SCALE_FACTOR}_i${NUM_ITERATIONS}_%j.err" +SCRIPT_DIR="$PWD" +JOB_ID=$(sbatch --nodes="${NODES_COUNT}" --export="ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS},SCRIPT_DIR=${SCRIPT_DIR},NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE},WORKER_IMAGE=${WORKER_IMAGE},COORD_IMAGE=${COORD_IMAGE},ENABLE_PROFILING=${ENABLE_PROFILING}" \ +--output="${OUT_FMT}" --error="${ERR_FMT}" "${EXTRA_ARGS[@]}" --gres="gpu:${NUM_GPUS_PER_NODE}" \ +run-presto-benchmarks.slurm | awk '{print $NF}') +OUT_FILE="${OUT_FMT//%j/${JOB_ID}}" +ERR_FILE="${ERR_FMT//%j/${JOB_ID}}" + +# Resolve and print first node IP once nodes are allocated +echo "Resolving first node IP..." 
+for i in {1..60}; do + STATE=$(squeue -j "$JOB_ID" -h -o "%T" 2>/dev/null || true) + NODELIST=$(squeue -j "$JOB_ID" -h -o "%N" 2>/dev/null || true) + if [[ -n "${NODELIST:-}" && "${NODELIST}" != "(null)" ]]; then + FIRST_NODE=$(scontrol show hostnames "$NODELIST" | head -n 1) + if [[ -n "${FIRST_NODE:-}" ]]; then + part=$(scontrol getaddrs "$FIRST_NODE" 2>/dev/null | awk 'NR==1{print $2}') + FIRST_IP="${part%%:*}" + echo "Run this command on a machine to get access to the webUI: + ssh -N -L 9200:$FIRST_IP:9200 sunk.pocf62-use13a.coreweave.app +The UI will be available at http://localhost:9200" + echo "" + break + fi + fi + sleep 5 +done echo "Job submitted with ID: $JOB_ID" echo "" echo "Monitor job with:" echo " squeue -j $JOB_ID" -echo " tail -f presto-tpch-run_${JOB_ID}.out" +echo " tail -f ${OUT_FILE}" +echo " tail -f ${ERR_FILE}" +echo " tail -f logs/coord.log" +echo " tail -f logs/worker_*.log" +echo " tail -f logs/cli.log" echo "" echo "Waiting for job to complete..." @@ -44,8 +179,10 @@ echo "" echo "Job completed!" echo "" echo "Output files:" -ls -lh presto-tpch-run_${JOB_ID}.{out,err} 2>/dev/null || echo "No output files found" +ls -lh "${OUT_FILE}" "${ERR_FILE}" 2>/dev/null || echo "No output files found" echo "" echo "Showing job output:" echo "========================================" -cat presto-tpch-run_${JOB_ID}.out 2>/dev/null || echo "No output available" +cat "${OUT_FILE}" 2>/dev/null || echo "No output available" +echo "Showing benchmark results:" +cat logs/cli.log 2>/dev/null || echo "No CLI output available" diff --git a/presto/slurm/presto-nvl72/profiler_functions_slurm.sh b/presto/slurm/presto-nvl72/profiler_functions_slurm.sh new file mode 100755 index 00000000..a3e13e03 --- /dev/null +++ b/presto/slurm/presto-nvl72/profiler_functions_slurm.sh @@ -0,0 +1,192 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +set -e + +# This script provides profiling functions for SLURM/Singularity-based Presto workers. +# It uses srun to execute commands in worker containers instead of docker exec. + +# Set default values if not provided (when called from within container) +# IMPORTANT: SCRIPT_DIR must point to the slurm directory where worker info files are stored +# If it's set to the scripts directory, fix it +if [[ -z "${SCRIPT_DIR:-}" ]] || [[ "${SCRIPT_DIR}" == *"/presto/scripts"* ]]; then + SCRIPT_DIR="/workspace/presto/slurm/presto-nvl72" +fi +VT_ROOT="${VT_ROOT:-/workspace}" +IMAGE_DIR="${IMAGE_DIR:-/mnt/data/images/presto}" + +# Get the worker node and image for a specific worker ID +function get_worker_info() { + local worker_id=$1 + local worker_info_file="${SCRIPT_DIR}/worker_info/worker_${worker_id}.info" + + if [ ! -f "$worker_info_file" ]; then + echo "Error: Worker info file not found for worker ${worker_id} at ${worker_info_file}" >&2 + echo "SCRIPT_DIR is: ${SCRIPT_DIR}" >&2 + echo "Looking for: ${worker_info_file}" >&2 + ls -la "${SCRIPT_DIR}/worker_info/"* 2>&1 || echo "No worker info files found" >&2 + return 1 + fi + + source "$worker_info_file" + if [ -z "${WORKER_NODE:-}" ] || [ -z "${WORKER_IMAGE:-}" ]; then + echo "Error: Worker info file incomplete for worker ${worker_id}" >&2 + return 1 + fi + echo "${WORKER_NODE}:${WORKER_IMAGE}" +} + +# Execute a command in a worker container using srun +function exec_in_worker() { + local worker_id=$1 + local command=$2 + + local worker_info + worker_info=$(get_worker_info "$worker_id") || return 1 + + local worker_node="${worker_info%%:*}" + local worker_image="${worker_info##*:}" + local worker_image_path="${IMAGE_DIR}/${worker_image}.sqsh" + + # Execute command in the worker container + # Note: This assumes we're running from within the SLURM job context where srun is available + srun -N1 -w "$worker_node" --ntasks=1 --overlap \ + --container-image="${worker_image_path}" \ + 
--export=ALL \ + --container-mounts="${VT_ROOT}:/workspace,${SCRIPT_DIR}/profiles:/presto_profiles,${SCRIPT_DIR}/worker_info:/worker_info" \ + -- bash -c "$command" +} + +# Check if profiling directory exists in worker container, create it if it doesn't +function check_profile_output_directory() { + local worker_id=$1 + + # Try to create the directory if it doesn't exist + exec_in_worker "$worker_id" "mkdir -p /presto_profiles" >/dev/null 2>&1 || true + + # Verify it exists now + if ! exec_in_worker "$worker_id" "[[ -d /presto_profiles ]]" 2>/dev/null; then + echo "Warning: Could not create /presto_profiles directory in worker ${worker_id} container" >&2 + return 1 + fi +} + +# Get the PID of presto_server process in a worker container +function get_presto_pid() { + local worker_id=$1 + local pid_file="/worker_info/worker_${worker_id}_pid.txt" + + # Try to read PID from file first + local pid=$(exec_in_worker "$worker_id" "cat ${pid_file} 2>/dev/null" 2>/dev/null | tr -d '\n\r ' || echo "") + + # If not found in file, try to find it by process name + if [ -z "$pid" ] || [ "$pid" = "0" ] || ! 
kill -0 "$pid" 2>/dev/null; then + pid=$(exec_in_worker "$worker_id" "pgrep -f 'presto_server.*--etc-dir' | head -1" 2>/dev/null | tr -d '\n\r ' || echo "") + fi + + if [ -z "$pid" ] || [ "$pid" = "0" ]; then + echo "Error: Could not find presto_server PID for worker ${worker_id}" >&2 + return 1 + fi + + echo "$pid" +} + +# Start profiling on a specific worker using nsys attach +function start_profiler_worker() { + local worker_id=$1 + local profile_output_file_path=$2 + + check_profile_output_directory "$worker_id" + + # Get the PID of the presto_server process + local pid + pid=$(get_presto_pid "$worker_id") || return 1 + + local profile_basename=$(basename "$profile_output_file_path") + local output_file="/presto_profiles/${profile_basename}.nsys-rep" + + # Use nsys attach to attach to the running process + # Note: This must be executed from the HOST, not from inside a container + # We'll write a command file that gets executed from the host + echo "Attaching nsys to presto_server (PID: $pid) in worker ${worker_id}" >&2 + + # For now, try to execute from container - this will fail but show the approach + # The real solution requires executing from host, which we'll implement via a command file + exec_in_worker "$worker_id" \ + "nsys attach --pid=$pid --gpu-metrics-devices=all -t nvtx,cuda,osrt,ucx --cuda-memory-usage=true --cuda-um-cpu-page-faults=true --cuda-um-gpu-page-faults=true --cudabacktrace=true -o ${output_file}" || { + echo "Warning: nsys attach failed. 
Trying alternative: writing command to file for host execution" >&2 + # Write command to a file that can be executed from the host + echo "nsys attach --pid=$pid -o ${output_file}" > "${SCRIPT_DIR}/profiles/.profiler_cmd_${worker_id}.sh" + return 1 + } +} + +# Stop profiling on a specific worker and ensure file is accessible +function stop_profiler_worker() { + local worker_id=$1 + local profile_output_file_path=$2 + + check_profile_output_directory "$worker_id" + + local profile_basename=$(basename "$profile_output_file_path") + local container_file_path="/presto_profiles/${profile_basename}.nsys-rep" + + # Stop profiling + exec_in_worker "$worker_id" "nsys stop" + + # Change ownership so file is accessible + exec_in_worker "$worker_id" "chown -R \$(id -u):\$(id -g) /presto_profiles" + + # The file should already be accessible via the mounted directory at ${SCRIPT_DIR}/profiles/ + # But we verify it exists + local host_file_path="${SCRIPT_DIR}/profiles/${profile_basename}.nsys-rep" + if [ ! -f "$host_file_path" ]; then + echo "Warning: Profile file not found at expected location: $host_file_path" >&2 + return 1 + fi + + echo "Profile saved to: $host_file_path" +} + +# Start profiling on all workers +function start_profiler() { + local profile_output_file_path=$1 + + if [ -z "${NUM_WORKERS:-}" ]; then + echo "Error: NUM_WORKERS not set" >&2 + return 1 + fi + + echo "Starting profiling on ${NUM_WORKERS} workers for profile: ${profile_output_file_path}" >&2 + for ((worker_id=0; worker_id<NUM_WORKERS; worker_id++)); do + local worker_profile_path="${profile_output_file_path}_worker_${worker_id}" + echo "Worker ${worker_id} -> ${worker_profile_path}" >&2 + if !
start_profiler_worker "$worker_id" "$worker_profile_path"; then + echo "Warning: Failed to start profiler on worker ${worker_id}" >&2 + fi + done +} + +# Stop profiling on all workers +function stop_profiler() { + local profile_output_file_path=$1 + + if [ -z "${NUM_WORKERS:-}" ]; then + echo "Error: NUM_WORKERS not set" >&2 + return 1 + fi + + echo "Stopping profiling on ${NUM_WORKERS} workers for profile: ${profile_output_file_path}" >&2 + for ((worker_id=0; worker_id<NUM_WORKERS; worker_id++)); do + local worker_profile_path="${profile_output_file_path}_worker_${worker_id}" + echo "Worker ${worker_id} -> ${worker_profile_path}" >&2 + if ! stop_profiler_worker "$worker_id" "$worker_profile_path"; then + echo "Warning: Failed to stop profiler on worker ${worker_id}" >&2 + fi + done +} + diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh index 3f444caf..4dc7b81d 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -1,4 +1,7 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + set -e set -x @@ -9,16 +12,14 @@ set -x # by the slurm launcher script. All configuration is passed via environment vars. # Source helper functions -source /mnt/home/misiug/veloxtesting/presto-nvl72/echo_helpers.sh -source /mnt/home/misiug/veloxtesting/presto-nvl72/functions.sh +source $SCRIPT_DIR/echo_helpers.sh +source $SCRIPT_DIR/functions.sh # ============================================================================== # Setup and Validation # ============================================================================== echo "Setting up Presto environment..." -export VARIANT_TYPE=gpu setup -echo "Environment setup" # ============================================================================== # Start Coordinator @@ -47,9 +48,7 @@ done echo "Waiting for ${NUM_WORKERS} workers to register with coordinator..."
wait_for_workers_to_register $NUM_WORKERS -# ============================================================================== -# Create Schema and Register Tables -# ============================================================================== +# Not currently needed because we are copying the hive metastore from the data source. #echo "Creating TPC-H schema and registering tables for scale factor ${SCALE_FACTOR}..." #setup_benchmark ${SCALE_FACTOR} @@ -59,16 +58,38 @@ wait_for_workers_to_register $NUM_WORKERS echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} +# ============================================================================== +# Stop Workers (if profiling, this ensures profile files are created) +# ============================================================================== +if [ "${ENABLE_PROFILING:-false}" == "true" ]; then + echo "Stopping workers to finalize profile files..." + stop_workers +fi + # ============================================================================== # Process Results # ============================================================================== echo "Processing results..." -mkdir -p /mnt/home/misiug/veloxtesting/presto-nvl72/result_dir -#tpch_summary_to_csv ${LOGS}/cli.log /mnt/home/misiug/veloxtesting/presto-nvl72/result_dir/summary.csv -#push_csv +mkdir -p ${SCRIPT_DIR}/result_dir +cp -r ${LOGS}/cli.log ${SCRIPT_DIR}/result_dir/summary.txt + +# Check for profile files if profiling was enabled +if [ "${ENABLE_PROFILING:-false}" == "true" ]; then + echo "Checking for profile files..." 
+ if [ -d "${SCRIPT_DIR}/profiles" ]; then + profile_count=$(find ${SCRIPT_DIR}/profiles -name "*.nsys-rep" 2>/dev/null | wc -l) + echo "Found ${profile_count} profile file(s) in ${SCRIPT_DIR}/profiles" + ls -lh ${SCRIPT_DIR}/profiles/*.nsys-rep 2>/dev/null || echo "No .nsys-rep files found (workers may still be running)" + else + echo "Profiles directory does not exist" + fi +fi echo "========================================" echo "Benchmark complete!" -echo "Results saved to: /mnt/home/misiug/veloxtesting/presto-nvl72/results_dir" +echo "Results saved to: ${SCRIPT_DIR}/result_dir" echo "Logs available at: ${LOGS}" +if [ "${ENABLE_PROFILING:-false}" == "true" ]; then + echo "Profiles directory: ${SCRIPT_DIR}/profiles" +fi echo "========================================" diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm b/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm index bfe8016c..0d785912 100755 --- a/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm @@ -1,44 +1,60 @@ #!/bin/bash #SBATCH --job-name=presto-tpch-run -#SBATCH --output=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.out -#SBATCH --error=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.err -#SBATCH --time=01:00:00 -#SBATCH --nodes=10 +#SBATCH --time=03:00:00 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=144 #SBATCH --mem=0 -#SBATCH --gres=gpu:4 #SBATCH --exclusive # ============================================================================== # User Configuration - Edit these values directly # ============================================================================== # TPC-H Configuration -export SCALE_FACTOR=10000 -export NUM_ITERATIONS=1 +if [ -z "${SCALE_FACTOR:-}" ]; then + echo "Error: SCALE_FACTOR is required. Set via launcher: -s|--scale-factor" >&2 + exit 1 +fi +export SCALE_FACTOR +if [ -z "${NUM_ITERATIONS:-}" ]; then + echo "Error: NUM_ITERATIONS is required.
Set via launcher: -i|--iterations" >&2 + exit 1 +fi +export NUM_ITERATIONS +if [ -z "${SCRIPT_DIR:-}" ]; then + echo "Error: SCRIPT_DIR is required." + exit 1 +fi +export SCRIPT_DIR +if [ -z "${WORKER_IMAGE:-}" ]; then + echo "Error: WORKER_IMAGE is required." + exit 1 +fi +export WORKER_IMAGE +if [ -z "${COORD_IMAGE:-}" ]; then + echo "Error: COORD_IMAGE is required." + exit 1 +fi +export COORD_IMAGE -# Directory Configuration -export WORKSPACE=/mnt/home/misiug +# Assumes the repo root is four steps up from the script directory. This should refer to velox-testing. +export VT_ROOT="$(cd -- "${SCRIPT_DIR}/../../.." >/dev/null 2>&1 && pwd -P)" export DATA=/mnt/data/tpch-rs -export IMAGE_DIR=/mnt/home/misiug/images -export LOGS=/mnt/home/misiug/veloxtesting/presto-nvl72/logs -export CONFIGS=/mnt/home/misiug/veloxtesting/config/generated/gpu -#export CONFIGS=/mnt/home/misiug/veloxtesting/config/generated/cpu +export IMAGE_DIR=/mnt/data/images/presto +export LOGS=$SCRIPT_DIR/logs +export VARIANT_TYPE=gpu +export CONFIGS=$VT_ROOT/presto/docker/config/generated/$VARIANT_TYPE # Container Images # Coordinator: ${IMAGE_DIR}/presto-coordinator-test.sqsh # Worker: ${IMAGE_DIR}/${WORKER_IMAGE}.sqsh -#export WORKER_IMAGE=presto-native-worker-cpu -export WORKER_IMAGE=presto-native-worker-gpu export NUM_NODES=$SLURM_JOB_NUM_NODES -export NUM_GPUS_PER_NODE=4 # Presto Configuration export PORT=9200 export CUDF_LIB=/usr/lib64/presto-native-libs # UCX Configuration -export UCX_TLS=^ib,ud:aux +export UCX_TLS=^ib,ud:aux,sm export UCX_MAX_RNDV_RAILS=1 export UCX_RNDV_PIPELINE_ERROR_HANDLING=y export UCX_TCP_KEEPINTVL=1ms @@ -67,6 +83,7 @@ echo "Nodes: $SLURM_JOB_NUM_NODES" echo "Node list: $SLURM_JOB_NODELIST" echo "Coordinator node: $COORD" echo "Worker image: $WORKER_IMAGE" +echo "Coord image: $COORD_IMAGE" echo "Scale factor: $SCALE_FACTOR" echo "Iterations: $NUM_ITERATIONS" echo "Data directory: $DATA" @@ -77,7 +94,6 @@ echo "========================================" # Create 
necessary directories mkdir -p ${LOGS} -mkdir -p ${DATA} # Launch the job script -bash /mnt/home/misiug/veloxtesting/presto-nvl72/run-presto-benchmarks.sh +bash $SCRIPT_DIR/run-presto-benchmarks.sh diff --git a/presto/slurm/presto-nvl72/run_multiple.sh b/presto/slurm/presto-nvl72/run_multiple.sh new file mode 100755 index 00000000..b3c9d9bd --- /dev/null +++ b/presto/slurm/presto-nvl72/run_multiple.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +NUM_ITERATIONS=2 +while [[ $# -gt 0 ]]; do + case "$1" in + -n|--nodes) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + NODES_COUNT="$2" + shift 2 + else + echo "Error: -n|--nodes requires a set of comma separated values. E.g. (2,4,8)" + echo "Usage: $0 -n|--nodes -s|--scale-factor -w -c [additional sbatch options]" + exit 1 + fi + ;; + -s|--scale-factor) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + SCALE_FACTOR="$2" + shift 2 + else + echo "Error: -s|--scale-factor requires a set of comma separated values. E.g. 
(1000,3000)" + echo "Usage: $0 -n|--nodes -s|--scale-factor -w -c [additional sbatch options]" + exit 1 + fi + ;; + -i|--iterations) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + NUM_ITERATIONS="$2" + shift 2 + else + echo "Error: -i|--iterations requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor -w -c [additional sbatch options]" + exit 1 + fi + ;; + -w|--worker-image) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + WORKER_IMAGE="$2" + shift 2 + else + echo "Error: -w|--worker-image requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor -w -c [additional sbatch options]" + exit 1 + fi + ;; + -c|--coord-image) + if [[ -n "${2:-}" && "${2:0:1}" != "-" ]]; then + COORD_IMAGE="$2" + shift 2 + else + echo "Error: -c|--coord-image requires a value" + echo "Usage: $0 -n|--nodes -s|--scale-factor -w -c [additional sbatch options]" + exit 1 + fi + ;; + *) + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +if [[ -z "${NODES_COUNT}" ]]; then + echo "Error: -n|--nodes is required" + exit 1 +fi +if [[ -z "${SCALE_FACTOR}" ]]; then + echo "Error: -s|--scale-factor is required" + exit 1 +fi +if [[ -z "${WORKER_IMAGE}" ]]; then + echo "Error: -w|--worker-image is required" + exit 1 +fi +if [[ -z "${COORD_IMAGE}" ]]; then + echo "Error: -c|--coord-image is required" + exit 1 +fi + + +mkdir -p kept_results + +IFS=',' read -ra NODES_ARRAY <<< "$NODES_COUNT" +IFS=',' read -ra SF_ARRAY <<< "$SCALE_FACTOR" +for s in "${SF_ARRAY[@]}"; do + for n in "${NODES_ARRAY[@]}"; do + ./launch-run.sh -s $s -n $n -i $NUM_ITERATIONS -w $WORKER_IMAGE -c $COORD_IMAGE + cp logs/cli.log kept_results/${n}N-${s}SF-summary.txt + done +done diff --git a/presto/testing/integration_tests/create_hive_tables.py b/presto/testing/integration_tests/create_hive_tables.py index ed2365a2..001b557d 100644 --- a/presto/testing/integration_tests/create_hive_tables.py +++ b/presto/testing/integration_tests/create_hive_tables.py @@ -55,7 +55,12 @@ def drop_schema(presto_cursor, 
schema_name): ) args = parser.parse_args() - conn = prestodb.dbapi.connect(host="localhost", port=8080, user="test_user", catalog="hive") + conn = prestodb.dbapi.connect( + host=os.environ.get("HOSTNAME", "localhost"), + port=int(os.environ.get("PORT", "8080")), + user="test_user", + catalog="hive", + ) cursor = conn.cursor() data_sub_directory = f"user_data/{args.data_dir_name}" create_tables(cursor, args.schema_name, args.schemas_dir_path, data_sub_directory) diff --git a/presto/testing/performance_benchmarks/common_fixtures.py b/presto/testing/performance_benchmarks/common_fixtures.py index 3198805b..fadbd58e 100644 --- a/presto/testing/performance_benchmarks/common_fixtures.py +++ b/presto/testing/performance_benchmarks/common_fixtures.py @@ -3,6 +3,7 @@ from pathlib import Path +import pandas as pd import prestodb import pytest @@ -82,8 +83,10 @@ def benchmark_query(request, presto_cursor, benchmark_queries, benchmark_result_ if profile: assert profile_script_path is not None + print(f"[Profiler] Profiling enabled with script: {profile_script_path}") profile_output_dir_path = Path(f"{bench_output_dir}/profiles/{benchmark_type}") profile_output_dir_path.mkdir(parents=True, exist_ok=True) + print(f"[Profiler] Profile output directory: {profile_output_dir_path}") benchmark_result_collector[benchmark_type] = { BenchmarkKeys.RAW_TIMES_KEY: {}, @@ -103,14 +106,28 @@ def benchmark_query_function(query_id): if profile: # Base path without .nsys-rep extension: {dir}/{query_id} profile_output_file_path = f"{profile_output_dir_path.absolute()}/{query_id}" + print(f"[Profiler] Starting profiler for query {query_id}, output: {profile_output_file_path}") start_profiler(profile_script_path, profile_output_file_path) result = [] - for _ in range(iterations): + for iteration_num in range(iterations): cursor = presto_cursor.execute( "--" + str(benchmark_type) + "_" + str(query_id) + "--" + "\n" + benchmark_queries[query_id] ) result.append(cursor.stats["elapsedTimeMillis"]) + # 
Save query results to Parquet (only on first iteration) + rows = cursor.fetchall() + columns = [desc[0] for desc in cursor.description] + df = pd.DataFrame(rows, columns=columns) + + # Save to Parquet format to match expected results + results_dir = Path(f"{bench_output_dir}/query_results") + results_dir.mkdir(parents=True, exist_ok=True) + parquet_path = results_dir / f"{query_id.lower()}.parquet" + df.to_parquet(parquet_path, index=False) + + print(f"Saved {query_id} results to {parquet_path}") + # Collect metrics after each query iteration if enabled if metrics: presto_query_id = cursor._query.query_id @@ -129,6 +146,7 @@ def benchmark_query_function(query_id): raise finally: if profile and profile_output_file_path is not None: + print(f"[Profiler] Stopping profiler for query {query_id}") stop_profiler(profile_script_path, profile_output_file_path) return benchmark_query_function diff --git a/presto/testing/performance_benchmarks/conftest.py b/presto/testing/performance_benchmarks/conftest.py index 0a15a36a..b10c13aa 100644 --- a/presto/testing/performance_benchmarks/conftest.py +++ b/presto/testing/performance_benchmarks/conftest.py @@ -137,8 +137,10 @@ def pytest_sessionfinish(session, exitstatus): ] else: AGG_KEYS = [BenchmarkKeys.LUKEWARM_KEY] + if not hasattr(session, "benchmark_results"): return + for benchmark_type, result in session.benchmark_results.items(): compute_aggregate_timings(result) json_result[benchmark_type] = { diff --git a/presto/testing/performance_benchmarks/profiler_utils.py b/presto/testing/performance_benchmarks/profiler_utils.py index 59430270..74f19558 100644 --- a/presto/testing/performance_benchmarks/profiler_utils.py +++ b/presto/testing/performance_benchmarks/profiler_utils.py @@ -26,11 +26,56 @@ def stop_profiler(profile_script_path, profile_output_file_path): def execute_profiler_function(profile_script_path, profile_output_file_path, profiler_function): + # Ensure SCRIPT_DIR is set correctly - it should point to the slurm 
directory + # where worker info files are stored, not the scripts directory + env = os.environ.copy() + # If SCRIPT_DIR is not set or points to scripts, fix it + script_dir = env.get("SCRIPT_DIR", "") + if not script_dir or "scripts" in script_dir: + # Try to derive from profile_script_path + if "presto-nvl72" in profile_script_path: + env["SCRIPT_DIR"] = "/workspace/presto/slurm/presto-nvl72" + else: + env["SCRIPT_DIR"] = script_dir if script_dir else "/workspace/presto/slurm/presto-nvl72" + + # IMPORTANT: We need to execute the profiler script from the HOST, not from inside the container + # because srun is only available on the host. We'll write a wrapper script that gets executed + # from the host via a mechanism that can escape the container. + # + # Since we're inside a container, we need to use a different approach: + # Option 1: Use nsys attach to attach to running processes (requires PID) + # Option 2: Write commands to a file that a host process reads + # Option 3: Use a mechanism to execute from host + + # For now, let's try to detect if we're in a container and provide a helpful error + print(f"[Profiler] Executing {profiler_function} with script: {profile_script_path}, output: {profile_output_file_path}") + print(f"[Profiler] SCRIPT_DIR={env.get('SCRIPT_DIR', 'NOT SET')}, VT_ROOT={env.get('VT_ROOT', 'NOT SET')}, IMAGE_DIR={env.get('IMAGE_DIR', 'NOT SET')}, NUM_WORKERS={env.get('NUM_WORKERS', 'NOT SET')}") + + # Check if we're in a container + in_container = os.path.exists("/.singularity.d/runscript") or "SINGULARITY" in env + + if in_container: + print(f"[Profiler] WARNING: Running inside container. 
Profiling via srun requires host execution.") + print(f"[Profiler] Attempting to use alternative method: nsys attach to running processes") + # We'll need to use nsys attach instead - this requires finding the PID of presto_server + # For now, let's try the original method and see if it fails gracefully + pass + profiler_command = ["bash", "-c", f"source {profile_script_path}; {profiler_function} {profile_output_file_path}"] - - result = subprocess.run(profiler_command, capture_output=True, text=True, env=os.environ) + result = subprocess.run(profiler_command, capture_output=True, text=True, env=env) + + # Always print output for debugging + if result.stdout: + print(f"[Profiler] stdout: {result.stdout}") + if result.stderr: + print(f"[Profiler] stderr: {result.stderr}") + if result.returncode != 0: - raise RuntimeError( + error_msg = ( f"{profiler_function} returned error code: {result.returncode}, " f"stdout: {result.stdout}, stderr: {result.stderr}" ) + print(f"[Profiler] ERROR: {error_msg}") + raise RuntimeError(error_msg) + else: + print(f"[Profiler] {profiler_function} completed successfully") diff --git a/presto/testing/requirements.txt b/presto/testing/requirements.txt index 4a2dfa8c..e9eeb1da 100644 --- a/presto/testing/requirements.txt +++ b/presto/testing/requirements.txt @@ -5,8 +5,10 @@ duckdb==1.3.2 idna==3.10 iniconfig==2.1.0 packaging==25.0 +pandas>=2.0.0 pluggy==1.6.0 presto-python-client==0.8.4 +pyarrow>=10.0.0 Pygments==2.19.2 pytest==8.4.1 requests==2.32.4 diff --git a/scripts/py_env_functions.sh b/scripts/py_env_functions.sh index 50018344..3a15e5b1 100755 --- a/scripts/py_env_functions.sh +++ b/scripts/py_env_functions.sh @@ -51,7 +51,7 @@ function init_python_virtual_env() { local venv_dir=${1:-".venv"} rm -rf $venv_dir - if python3 -m venv $venv_dir &>/dev/null; then + if python3.12 -m venv $venv_dir &>/dev/null; then echo "Created virtual environment using the venv module" else if [[ -z $MINIFORGE_HOME ]]; then