From 16b3b23060796ca9e9608c01cd365ff1a349ee2c Mon Sep 17 00:00:00 2001 From: misiugodfrey Date: Wed, 28 Jan 2026 08:15:26 -0800 Subject: [PATCH 01/22] Slurm scripts --- presto/slurm/presto-nvl72/README.md | 114 +++++ .../presto-nvl72/create-presto-benchmarks.sh | 73 +++ .../create-presto-benchmarks.slurm | 73 +++ presto/slurm/presto-nvl72/echo_helpers.sh | 19 + presto/slurm/presto-nvl72/functions.sh | 441 ++++++++++++++++++ presto/slurm/presto-nvl72/index.html | 63 +++ presto/slurm/presto-nvl72/launch-run.sh | 51 ++ .../presto-nvl72/run-presto-benchmarks.sh | 74 +++ .../presto-nvl72/run-presto-benchmarks.slurm | 83 ++++ 9 files changed, 991 insertions(+) create mode 100644 presto/slurm/presto-nvl72/README.md create mode 100755 presto/slurm/presto-nvl72/create-presto-benchmarks.sh create mode 100644 presto/slurm/presto-nvl72/create-presto-benchmarks.slurm create mode 100755 presto/slurm/presto-nvl72/echo_helpers.sh create mode 100755 presto/slurm/presto-nvl72/functions.sh create mode 100644 presto/slurm/presto-nvl72/index.html create mode 100755 presto/slurm/presto-nvl72/launch-run.sh create mode 100755 presto/slurm/presto-nvl72/run-presto-benchmarks.sh create mode 100755 presto/slurm/presto-nvl72/run-presto-benchmarks.slurm diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md new file mode 100644 index 00000000..baed39d0 --- /dev/null +++ b/presto/slurm/presto-nvl72/README.md @@ -0,0 +1,114 @@ +# Presto TPC-H Benchmark (NVL72) + +This directory contains scripts for running Presto TPC-H benchmarks on CoreWeave NVL72 nodes. 
+ +## Directory Structure + +``` +presto-nvl72/ +├── run-presto-benchmarks.slurm # Main slurm job script with configuration +├── run-presto-benchmarks.sh # Execution script +├── launch-run.sh # Convenience launcher +├── functions.sh # Presto helper functions +├── echo_helpers.sh # Logging helpers +├── logs/ # Execution logs +└── result_dir/ # Benchmark results +``` + +## Quick Start + +### Running the Benchmark + +```bash +cd /mnt/data/bzaitlen/presto-nvl72 +./launch-run.sh +``` + +Or submit directly: + +```bash +sbatch run-presto-benchmarks.slurm +``` + +## Configuration + +**To change settings, edit the values directly in `run-presto-benchmarks.slurm`** + +All configuration is at the top of the file in the "User Configuration" section. + +### Configuration Variables + +| Variable | Current Value | Description | +|----------|---------------|-------------| +| `SCALE_FACTOR` | 300 | TPC-H scale factor | +| `NUM_ITERATIONS` | 5 | Number of query iterations | +| `WORKER_IMAGE` | presto-native-worker-gpu | Worker container image | +| `NUM_NODES` | 4 | Number of nodes to allocate | +| `NUM_GPUS_PER_NODE` | 4 | GPUs per node | +| `DATA` | /mnt/data/tpch-rs/scale-300 | Data directory | +| `IMAGE_DIR` | /mnt/home/misiug/images | Container image directory | +| `LOGS` | /mnt/data/bzaitlen/presto-nvl72/logs | Log directory | + +### SBATCH Directives + +- **Time limit**: 1 hour (adjust `--time` if needed) +- **Node allocation**: Full node (144 CPUs, 4 GPUs, exclusive) +- **Memory**: All available (`--mem=0`) + +## Monitoring + +```bash +# Check job queue +squeue -u $USER + +# Monitor job output +tail -f presto-tpch-run_<jobid>.out + +# Check logs during execution +tail -f logs/coord.log +tail -f logs/cli.log +tail -f logs/worker_0.log +``` + +## Results + +Results are saved to: +- **Logs**: `logs/` directory +- **CSV Summary**: `result_dir/summary.csv` +- **Historical Results**: `${WORKSPACE}/benchmark-storage/YYYY/MM/DD/` + +## Prerequisites + +1. 
**Container images** must exist in `${IMAGE_DIR}`: + - `presto-coordinator.sqsh` + - `presto-native-worker-gpu.sqsh` or `presto-native-worker-cpu.sqsh` + +2. **Data directory** must be accessible at `${DATA}` (will be mounted in containers) + +3. **velox-testing repo** will be auto-cloned to `${WORKSPACE}/velox-testing` if not present + +## Troubleshooting + +### Coordinator fails to start +Check coordinator logs: +```bash +cat logs/coord.log +``` + +### Workers not registering +Check worker logs: +```bash +cat logs/worker_*.log +``` + +### Image not found +Verify images exist: +```bash +ls -lh /mnt/home/misiug/images/*.sqsh +``` + +### Data directory issues +Verify data path is accessible: +```bash +ls -la /mnt/data/tpch-presto +``` diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.sh b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh new file mode 100755 index 00000000..116192b2 --- /dev/null +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh @@ -0,0 +1,73 @@ +#!/bin/bash +set -e +set -x + +# ============================================================================== +# Presto TPC-H Schema Creation Script +# ============================================================================== +# This script creates the Presto schema and tables for existing TPC-H data + +# Source helper functions +source /mnt/home/misiug/veloxtesting/presto-nvl72/echo_helpers.sh +source /mnt/home/misiug/veloxtesting/presto-nvl72/functions.sh + +# ============================================================================== +# Setup and Validation +# ============================================================================== +echo "Setting up Presto environment for schema creation..." 
+export VARIANT_TYPE=cpu +setup + +worker_config="${CONFIGS}/etc_worker/config_native.properties" +sed -i "s/system-memory-gb.*/system-memory-gb=400/g" ${worker_config} +sed -i "s/query-memory-gb.*/query-memory-gb=400/g" ${worker_config} +sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=400GB/g" ${worker_config} + +coord_config="${CONFIGS}/etc_coordinator/config_native.properties" +sed -i "s/memory\.heap-headroom-per-node.*/memory\.heap-headroom-per-node=120GB/g" ${coord_config} +sed -i "s/query\.max-total-memory-per-node.*/query\.max-total-memory-per-node=300GB/g" ${coord_config} +sed -i "s/query\.max-total-memory.*/query\.max-total-memory=300GB/g" ${coord_config} +sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=250GB/g" ${coord_config} +sed -i "s/query\.max-memory.*/query\.max-memory=250GB/g" ${coord_config} +sed -i "s/cluster-tag.*//g" ${coord_config} + +# ============================================================================== +# Start Coordinator +# ============================================================================== +echo "Starting Presto coordinator on ${COORD}..." +run_coordinator +wait_until_coordinator_is_running + + + +# ============================================================================== +# Start Workers (GPU workers for schema creation) +# ============================================================================== +echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." 
+ +worker_id=0 +for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do + for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do + echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" + run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + worker_id=$((worker_id + 1)) + done +done + +# ============================================================================== +# Wait for Workers to Register +# ============================================================================== +echo "Waiting for ${NUM_WORKERS} workers to register with coordinator..." +wait_for_workers_to_register $NUM_WORKERS + +# ============================================================================== +# Create Schema and Tables +# ============================================================================== +echo "Creating TPC-H schema and tables for scale factor ${SCALE_FACTOR}..." +setup_benchmark ${SCALE_FACTOR} + +echo "========================================" +echo "Schema creation complete!" 
+echo "Schema: tpchsf${SCALE_FACTOR}" +echo "Logs available at: ${LOGS}" +echo "========================================" diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm new file mode 100644 index 00000000..adaac17d --- /dev/null +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm @@ -0,0 +1,73 @@ +#!/bin/bash +#SBATCH --job-name=presto-tpch-create +#SBATCH --output=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.out +#SBATCH --error=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.err +#SBATCH --time=01:00:00 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=144 +#SBATCH --mem=0 +#SBATCH --gres=gpu:1 +#SBATCH --exclusive + +# ============================================================================== +# User Configuration - Edit these values directly +# ============================================================================== +# TPC-H Configuration +export SCALE_FACTOR=10000 + +# Directory Configuration +export WORKSPACE=/mnt/home/misiug +export DATA=/mnt/data/tpch-rs +export IMAGE_DIR=/mnt/home/misiug/images +export LOGS=/mnt/home/misiug/veloxtesting/presto-nvl72/logs +export CONFIGS=${WORKSPACE}/config/generated/cpu + +# Container Images +# Coordinator: ${IMAGE_DIR}/presto-coordinator-test.sqsh +# Worker: ${IMAGE_DIR}/${WORKER_IMAGE}.sqsh (CPU workers required for ANALYZE) +export WORKER_IMAGE=presto-native-worker-cpu +export NUM_NODES=1 +export NUM_GPUS_PER_NODE=1 + +# Presto Configuration +export PORT=9200 +export CUDF_LIB=/usr/lib64/presto-native-libs + +# ============================================================================== +# Computed Values +# ============================================================================== +# Get the head node (coordinator node) - export for use inside container +export COORD=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export NUM_WORKERS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +# 
Single node execution mode +if [ "${NUM_WORKERS}" -eq "1" ]; then + export SINGLE_NODE_EXECUTION=true +else + export SINGLE_NODE_EXECUTION=false +fi + +# ============================================================================== +# Pre-flight Info +# ============================================================================== +echo "========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "Node list: $SLURM_JOB_NODELIST" +echo "Coordinator node: $COORD" +echo "Worker image: $WORKER_IMAGE" +echo "Scale factor: $SCALE_FACTOR" +echo "Data directory: $DATA" +echo "Logs directory: $LOGS" +echo "Total workers: $NUM_WORKERS (${NUM_NODES} nodes × ${NUM_GPUS_PER_NODE} GPUs)" +echo "Single node execution: $SINGLE_NODE_EXECUTION" +echo "========================================" + +# Create necessary directories +mkdir -p ${LOGS} +mkdir -p ${DATA} +mkdir -p ${WORKSPACE}/.hive_metastore + +# Launch the job script +bash ${WORKSPACE}/veloxtesting/presto-nvl72/create-presto-benchmarks.sh diff --git a/presto/slurm/presto-nvl72/echo_helpers.sh b/presto/slurm/presto-nvl72/echo_helpers.sh new file mode 100755 index 00000000..1c7652dc --- /dev/null +++ b/presto/slurm/presto-nvl72/echo_helpers.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +RED='\033[0;31m' +YELLOW='\033[1;33m' +GREEN='\033[0;32m' +NC='\033[0m' # No Color + +function echo_error { + echo -e "${RED}$1${NC}" + exit 1 +} + +function echo_warning { + echo -e "${YELLOW}$1${NC}" +} + +function echo_success { + echo -e "${GREEN}$1${NC}" +} diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh new file mode 100755 index 00000000..5f07d6dc --- /dev/null +++ b/presto/slurm/presto-nvl72/functions.sh @@ -0,0 +1,441 @@ +#!/bin/bash + +# UCX Configuration +export UCX_TLS=^ib,ud:aux,sm +export UCX_MAX_RNDV_RAILS=1 +export UCX_RNDV_PIPELINE_ERROR_HANDLING=y +export UCX_TCP_KEEPINTVL=1ms +export UCX_KEEPALIVE_INTERVAL=1ms + + +# Image directory for 
presto container images (can be overridden via environment) +IMAGE_DIR="${IMAGE_DIR:-${WORKSPACE}/images}" + +# Logs directory for presto execution logs (can be overridden via environment) +LOGS="${LOGS:-/mnt/home/misiug/veloxtesting/presto-nvl72/logs}" + +# Validates job preconditions and assigns default values for presto execution. +function setup { + [ -z "$SLURM_JOB_NAME" ] && echo "required argument '--job-name' not specified" && exit 1 + [ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 + [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 + [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 + [ -z "$NUM_NODES" ] && echo "NUM_WORKERS must be set" && exit 1 + [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 + [ ! -d "$WORKSPACE" ] && echo "WORKSPACE must be a valid directory" && exit 1 + [ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 + + NUM_WORKERS=$(( $NUM_NODES * $NUM_GPUS_PER_NODE )) + mkdir -p ${LOGS} + # Only set CONFIGS if not already set (allow override from environment) + #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/gpu}" + #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/cpu}" + CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/${VARIANT_TYPE}}" + COORD=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1) + PORT=9200 + CUDF_LIB=/usr/lib64/presto-native-libs + if [ "${NUM_WORKERS}" -eq "1" ]; then + SINGLE_NODE_EXECUTION=true + else + SINGLE_NODE_EXECUTION=false + fi + + if [ ! -d ${WORKSPACE}/velox-testing ]; then + git clone -b misiug/cluster https://github.com/rapidsai/velox-testing.git ${WORKSPACE}/velox-testing + #sed -i "s/python3 /python3.12 /g" ${WORKSPACE}/velox-testing/scripts/py_env_functions.sh + fi + + [ ! 
-d ${CONFIGS} ] && generate_configs + + validate_config_directory +} + +function generate_configs { + mkdir -p ${CONFIGS} + pushd ${WORKSPACE}/velox-testing/presto/scripts + #VARIANT_TYPE=cpu ./generate_presto_config.sh + #VARIANT_TYPE=gpu ./generate_presto_config.sh + OVERWRITE_CONFIG=true ./generate_presto_config.sh + popd + mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/${VARIANT_TYPE}/* ${CONFIGS}/ + #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/gpu/* ${CONFIGS}/ + #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/cpu/* ${CONFIGS}/ + echo "--add-modules=java.management,jdk.management" >> ${CONFIGS}/etc_common/jvm.config + echo "-Dcom.sun.management.jmxremote=false" >> ${CONFIGS}/etc_common/jvm.config + echo "-XX:-UseContainerSupport" >> ${CONFIGS}/etc_common/jvm.config +} + +# Takes a list of environment variables. Checks that each one is set and of non-zero length. +function validate_environment_preconditions { + local missing=() + for var in "$@"; do + # -z "${!var+x}" => unset; -z "${!var}" => empty + if [[ -z "${!var+x}" || -z "${!var}" ]]; then + missing+=("$var") + fi + done + if ((${#missing[@]})); then + echo_error "required env var ${missing[*]} not set" + fi +} + +# Execute script through the coordinator image (used for coordinator and cli executables) +function run_coord_image { + [ $# -ne 2 ] && echo_error "$0 expected one argument for ' + + + + + + + + + + + + + + +
+
+ +
+
+
Loading...
+
+
+ +
+
+ Query Details +
+
Loading...
+
+ +
+ + + + + + + + diff --git a/presto/slurm/presto-nvl72/launch-run.sh b/presto/slurm/presto-nvl72/launch-run.sh new file mode 100755 index 00000000..52841bb1 --- /dev/null +++ b/presto/slurm/presto-nvl72/launch-run.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# ============================================================================== +# Presto TPC-H Benchmark Launcher +# ============================================================================== +# Simple launcher script to submit the presto benchmark job to slurm +# +# Usage: +# ./launch-run.sh [additional sbatch options] +# +# To change configuration, edit run-presto-benchmarks.slurm directly +# ============================================================================== + +set -e + +# Change to script directory +cd "$(dirname "$0")" + +# Clean up old output files +rm -f result_dir/* logs/* *.out *.err 2>/dev/null || true +mkdir -p result_dir logs + +echo "Submitting Presto TPC-H benchmark job..." +echo "Configuration is set in run-presto-benchmarks.slurm" +echo "" + +# Submit job +JOB_ID=$(sbatch "$@" run-presto-benchmarks.slurm | awk '{print $NF}') +#JOB_ID=$(sbatch "$@" create-presto-benchmarks.slurm | awk '{print $NF}') + +echo "Job submitted with ID: $JOB_ID" +echo "" +echo "Monitor job with:" +echo " squeue -j $JOB_ID" +echo " tail -f presto-tpch-run_${JOB_ID}.out" +echo "" +echo "Waiting for job to complete..." + +# Wait for job to finish +while squeue -j $JOB_ID 2>/dev/null | grep -q $JOB_ID; do + sleep 5 +done + +echo "" +echo "Job completed!" 
+echo "" +echo "Output files:" +ls -lh presto-tpch-run_${JOB_ID}.{out,err} 2>/dev/null || echo "No output files found" +echo "" +echo "Showing job output:" +echo "========================================" +cat presto-tpch-run_${JOB_ID}.out 2>/dev/null || echo "No output available" diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.sh b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh new file mode 100755 index 00000000..3f444caf --- /dev/null +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.sh @@ -0,0 +1,74 @@ +#!/bin/bash +set -e +set -x + +# ============================================================================== +# Presto TPC-H Benchmark Execution Script +# ============================================================================== +# This script runs the actual benchmark execution after environment is configured +# by the slurm launcher script. All configuration is passed via environment vars. + +# Source helper functions +source /mnt/home/misiug/veloxtesting/presto-nvl72/echo_helpers.sh +source /mnt/home/misiug/veloxtesting/presto-nvl72/functions.sh + +# ============================================================================== +# Setup and Validation +# ============================================================================== +echo "Setting up Presto environment..." +export VARIANT_TYPE=gpu +setup +echo "Environment setup" + +# ============================================================================== +# Start Coordinator +# ============================================================================== +echo "Starting Presto coordinator on ${COORD}..." +run_coordinator +wait_until_coordinator_is_running + +# ============================================================================== +# Start Workers +# ============================================================================== +echo "Starting ${NUM_WORKERS} Presto workers across ${NUM_NODES} nodes..." 
+ +worker_id=0 +for node in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do + for gpu_id in $(seq 0 $((NUM_GPUS_PER_NODE - 1))); do + echo " Starting worker ${worker_id} on node ${node} GPU ${gpu_id}" + run_worker "${gpu_id}" "$WORKER_IMAGE" "${node}" "$worker_id" + worker_id=$((worker_id + 1)) + done +done + +# ============================================================================== +# Wait for Workers to Register +# ============================================================================== +echo "Waiting for ${NUM_WORKERS} workers to register with coordinator..." +wait_for_workers_to_register $NUM_WORKERS + +# ============================================================================== +# Create Schema and Register Tables +# ============================================================================== +#echo "Creating TPC-H schema and registering tables for scale factor ${SCALE_FACTOR}..." +#setup_benchmark ${SCALE_FACTOR} + +# ============================================================================== +# Run Queries +# ============================================================================== +echo "Running TPC-H queries (${NUM_ITERATIONS} iterations, scale factor ${SCALE_FACTOR})..." +run_queries ${NUM_ITERATIONS} ${SCALE_FACTOR} + +# ============================================================================== +# Process Results +# ============================================================================== +echo "Processing results..." +mkdir -p /mnt/home/misiug/veloxtesting/presto-nvl72/result_dir +#tpch_summary_to_csv ${LOGS}/cli.log /mnt/home/misiug/veloxtesting/presto-nvl72/result_dir/summary.csv +#push_csv + +echo "========================================" +echo "Benchmark complete!" 
+echo "Results saved to: /mnt/home/misiug/veloxtesting/presto-nvl72/results_dir" +echo "Logs available at: ${LOGS}" +echo "========================================" diff --git a/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm b/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm new file mode 100755 index 00000000..bfe8016c --- /dev/null +++ b/presto/slurm/presto-nvl72/run-presto-benchmarks.slurm @@ -0,0 +1,83 @@ +#!/bin/bash +#SBATCH --job-name=presto-tpch-run +#SBATCH --output=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.out +#SBATCH --error=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.err +#SBATCH --time=01:00:00 +#SBATCH --nodes=10 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=144 +#SBATCH --mem=0 +#SBATCH --gres=gpu:4 +#SBATCH --exclusive + +# ============================================================================== +# User Configuration - Edit these values directly +# ============================================================================== +# TPC-H Configuration +export SCALE_FACTOR=10000 +export NUM_ITERATIONS=1 + +# Directory Configuration +export WORKSPACE=/mnt/home/misiug +export DATA=/mnt/data/tpch-rs +export IMAGE_DIR=/mnt/home/misiug/images +export LOGS=/mnt/home/misiug/veloxtesting/presto-nvl72/logs +export CONFIGS=/mnt/home/misiug/veloxtesting/config/generated/gpu +#export CONFIGS=/mnt/home/misiug/veloxtesting/config/generated/cpu + +# Container Images +# Coordinator: ${IMAGE_DIR}/presto-coordinator-test.sqsh +# Worker: ${IMAGE_DIR}/${WORKER_IMAGE}.sqsh +#export WORKER_IMAGE=presto-native-worker-cpu +export WORKER_IMAGE=presto-native-worker-gpu +export NUM_NODES=$SLURM_JOB_NUM_NODES +export NUM_GPUS_PER_NODE=4 + +# Presto Configuration +export PORT=9200 +export CUDF_LIB=/usr/lib64/presto-native-libs + +# UCX Configuration +export UCX_TLS=^ib,ud:aux +export UCX_MAX_RNDV_RAILS=1 +export UCX_RNDV_PIPELINE_ERROR_HANDLING=y +export UCX_TCP_KEEPINTVL=1ms +export UCX_KEEPALIVE_INTERVAL=1ms + +# 
============================================================================== +# Computed Values +# ============================================================================== +# Get the head node (coordinator node) - export for use inside container +export COORD=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export NUM_WORKERS=$((NUM_NODES * NUM_GPUS_PER_NODE)) + +# Single node execution mode +if [ "${NUM_WORKERS}" -eq "1" ]; then + export SINGLE_NODE_EXECUTION=true +else + export SINGLE_NODE_EXECUTION=false +fi + +# ============================================================================== +# Pre-flight Info +# ============================================================================== +echo "========================================" +echo "Job ID: $SLURM_JOB_ID" +echo "Nodes: $SLURM_JOB_NUM_NODES" +echo "Node list: $SLURM_JOB_NODELIST" +echo "Coordinator node: $COORD" +echo "Worker image: $WORKER_IMAGE" +echo "Scale factor: $SCALE_FACTOR" +echo "Iterations: $NUM_ITERATIONS" +echo "Data directory: $DATA" +echo "Logs directory: $LOGS" +echo "Total workers: $NUM_WORKERS (${NUM_NODES} nodes × ${NUM_GPUS_PER_NODE} GPUs)" +echo "Single node execution: $SINGLE_NODE_EXECUTION" +echo "========================================" + +# Create necessary directories +mkdir -p ${LOGS} +mkdir -p ${DATA} + +# Launch the job script +bash /mnt/home/misiug/veloxtesting/presto-nvl72/run-presto-benchmarks.sh From ba182110e9328811a358abc9719112847a17e9f8 Mon Sep 17 00:00:00 2001 From: misiugodfrey Date: Thu, 29 Jan 2026 16:19:35 -0800 Subject: [PATCH 02/22] untested refactor --- presto/slurm/presto-nvl72/README.md | 76 ++++++++++----- .../presto-nvl72/create-presto-benchmarks.sh | 16 ++-- .../create-presto-benchmarks.slurm | 17 ++-- presto/slurm/presto-nvl72/functions.sh | 94 +++++++----------- presto/slurm/presto-nvl72/get_one_col.sh | 10 ++ presto/slurm/presto-nvl72/launch-run.sh | 96 +++++++++++++++++-- .../presto-nvl72/run-presto-benchmarks.sh | 13 +-- 
.../presto-nvl72/run-presto-benchmarks.slurm | 35 ++++--- .../integration_tests/create_hive_tables.py | 7 +- scripts/py_env_functions.sh | 2 +- 10 files changed, 237 insertions(+), 129 deletions(-) create mode 100755 presto/slurm/presto-nvl72/get_one_col.sh diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md index baed39d0..b2fb24e3 100644 --- a/presto/slurm/presto-nvl72/README.md +++ b/presto/slurm/presto-nvl72/README.md @@ -17,43 +17,61 @@ presto-nvl72/ ## Quick Start -### Running the Benchmark +### Running the benchmark via launcher (recommended) ```bash -cd /mnt/data/bzaitlen/presto-nvl72 -./launch-run.sh +cd presto/slurm/presto-nvl72 +./launch-run.sh -n <nodes> -s <scale-factor> [-i <iterations>] [additional sbatch options] + +# examples +./launch-run.sh -n 8 -s 3000 +./launch-run.sh -n 4 -s 10000 -i 3 --partition gpu --account myacct ``` -Or submit directly: +The launcher: +- requires node count (-n/--nodes) and scale factor (-s/--scale-factor) +- accepts optional iterations (-i/--iterations, default 1) +- embeds nodes/SF/iterations in .out/.err filenames +- prints the first node’s hostname/IP when allocated and a ready-to-run SSH port-forward command to access the Presto Web UI on your machine (http://localhost:9200) + +### Submitting directly (advanced) ```bash -sbatch run-presto-benchmarks.slurm +export SCALE_FACTOR=3000 +export NUM_ITERATIONS=1 +sbatch --nodes 8 \ + --output "presto-tpch-run_n8_sf3000_i1_%j.out" \ + --error "presto-tpch-run_n8_sf3000_i1_%j.err" \ + --export "ALL,SCALE_FACTOR=${SCALE_FACTOR},NUM_ITERATIONS=${NUM_ITERATIONS}" \ + run-presto-benchmarks.slurm ``` ## Configuration -**To change settings, edit the values directly in `run-presto-benchmarks.slurm`** +Primary configuration is passed via the launcher flags and environment. The `.slurm` script validates that required variables are set. -All configuration is at the top of the file in the "User Configuration" section. 
+Key variables: -### Configuration Variables +- SCALE_FACTOR: required (provided via `-s/--scale-factor`) +- NUM_ITERATIONS: required by the job; launcher defaults to 1 (`-i/--iterations` to override) +- NUM_NODES: derived from Slurm allocation; provided via `-n/--nodes` to launcher +- REPO_ROOT: auto-detected from script location +- LOGS: `${SCRIPT_DIR}/logs` by default +- IMAGE_DIR, DATA, CONFIGS: see below or override via environment if needed -| Variable | Current Value | Description | -|----------|---------------|-------------| -| `SCALE_FACTOR` | 300 | TPC-H scale factor | -| `NUM_ITERATIONS` | 5 | Number of query iterations | -| `WORKER_IMAGE` | presto-native-worker-gpu | Worker container image | -| `NUM_NODES` | 4 | Number of nodes to allocate | -| `NUM_GPUS_PER_NODE` | 4 | GPUs per node | -| `DATA` | /mnt/data/tpch-rs/scale-300 | Data directory | -| `IMAGE_DIR` | /mnt/home/misiug/images | Container image directory | -| `LOGS` | /mnt/data/bzaitlen/presto-nvl72/logs | Log directory | +Other defaults: +- WORKER_IMAGE: `presto-native-worker-gpu` +- NUM_GPUS_PER_NODE: `4` +- DATA: `/mnt/data/tpch-rs` +- IMAGE_DIR: `/mnt/data/images/presto` +- CONFIGS: `${REPO_ROOT}/presto/docker/config/generated/gpu` ### SBATCH Directives - **Time limit**: 1 hour (adjust `--time` if needed) - **Node allocation**: Full node (144 CPUs, 4 GPUs, exclusive) - **Memory**: All available (`--mem=0`) +- `--nodes`, `--output`, and `--error` are passed by the launcher instead of being embedded in the `.slurm` file. ## Monitoring @@ -62,7 +80,7 @@ All configuration is at the top of the file in the "User Configuration" section. 
squeue -u $USER # Monitor job output -tail -f presto-tpch-run_<jobid>.out +tail -f presto-tpch-run_n<nodes>_sf<scale-factor>_i<iterations>_<jobid>.out # Check logs during execution tail -f logs/coord.log @@ -70,12 +88,26 @@ tail -f logs/cli.log tail -f logs/worker_0.log ``` +## Coordinator IP and Web UI + +After submission, the launcher waits until nodes are allocated, then prints: +- the first node’s hostname/IP +- an SSH port-forward command you can run locally to access the Presto Web UI + +Example output snippet: + +```text +Run this command on a machine to get access to the webUI: + ssh -N -L 9200:<coordinator-ip>:9200 <login-node> +The UI will be available at http://localhost:9200 +``` + ## Results Results are saved to: - **Logs**: `logs/` directory - **CSV Summary**: `result_dir/summary.csv` -- **Historical Results**: `${WORKSPACE}/benchmark-storage/YYYY/MM/DD/` +- **Historical Results**: `${REPO_ROOT}/benchmark-storage/YYYY/MM/DD/` ## Prerequisites @@ -85,7 +117,7 @@ Results are saved to: 2. **Data directory** must be accessible at `${DATA}` (will be mounted in containers) -3. **velox-testing repo** will be auto-cloned to `${WORKSPACE}/velox-testing` if not present +3. 
**velox-testing repo** will be auto-cloned to `${REPO_ROOT}/velox-testing` if not present ## Troubleshooting @@ -104,7 +136,7 @@ cat logs/worker_*.log ### Image not found Verify images exist: ```bash -ls -lh /mnt/home/misiug/images/*.sqsh +ls -lh /mnt/data/images/presto/*.sqsh ``` ### Data directory issues diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.sh b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh index 116192b2..d1805d7f 100755 --- a/presto/slurm/presto-nvl72/create-presto-benchmarks.sh +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.sh @@ -19,16 +19,16 @@ export VARIANT_TYPE=cpu setup worker_config="${CONFIGS}/etc_worker/config_native.properties" -sed -i "s/system-memory-gb.*/system-memory-gb=400/g" ${worker_config} -sed -i "s/query-memory-gb.*/query-memory-gb=400/g" ${worker_config} -sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=400GB/g" ${worker_config} +#sed -i "s/system-memory-gb.*/system-memory-gb=400/g" ${worker_config} +#sed -i "s/query-memory-gb.*/query-memory-gb=400/g" ${worker_config} +#sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=400GB/g" ${worker_config} coord_config="${CONFIGS}/etc_coordinator/config_native.properties" -sed -i "s/memory\.heap-headroom-per-node.*/memory\.heap-headroom-per-node=120GB/g" ${coord_config} -sed -i "s/query\.max-total-memory-per-node.*/query\.max-total-memory-per-node=300GB/g" ${coord_config} -sed -i "s/query\.max-total-memory.*/query\.max-total-memory=300GB/g" ${coord_config} -sed -i "s/query\.max-memory-per-node.*/query\.max-memory-per-node=250GB/g" ${coord_config} -sed -i "s/query\.max-memory.*/query\.max-memory=250GB/g" ${coord_config} +#sed -i "s/memory\.heap-headroom-per-node.*/memory\.heap-headroom-per-node=120GB/g" ${coord_config} +#sed -i "s/query\.max-total-memory-per-node.*/query\.max-total-memory-per-node=300GB/g" ${coord_config} +#sed -i "s/query\.max-total-memory.*/query\.max-total-memory=300GB/g" ${coord_config} +#sed -i 
"s/query\.max-memory-per-node.*/query\.max-memory-per-node=250GB/g" ${coord_config} +#sed -i "s/query\.max-memory.*/query\.max-memory=250GB/g" ${coord_config} sed -i "s/cluster-tag.*//g" ${coord_config} # ============================================================================== diff --git a/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm index adaac17d..a8941a0c 100644 --- a/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm +++ b/presto/slurm/presto-nvl72/create-presto-benchmarks.slurm @@ -1,9 +1,6 @@ #!/bin/bash #SBATCH --job-name=presto-tpch-create -#SBATCH --output=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.out -#SBATCH --error=/mnt/home/misiug/veloxtesting/presto-nvl72/%x_%j.err #SBATCH --time=01:00:00 -#SBATCH --nodes=1 #SBATCH --ntasks-per-node=1 #SBATCH --cpus-per-task=144 #SBATCH --mem=0 @@ -14,14 +11,18 @@ # User Configuration - Edit these values directly # ============================================================================== # TPC-H Configuration -export SCALE_FACTOR=10000 +if [ -z "${SCALE_FACTOR:-}" ]; then + echo "Error: SCALE_FACTOR is required. 
Set via launcher: -s|--scale-factor" >&2 + exit 1 +fi +export SCALE_FACTOR # Directory Configuration -export WORKSPACE=/mnt/home/misiug +export REPO_ROOT=/mnt/home/misiug export DATA=/mnt/data/tpch-rs export IMAGE_DIR=/mnt/home/misiug/images export LOGS=/mnt/home/misiug/veloxtesting/presto-nvl72/logs -export CONFIGS=${WORKSPACE}/config/generated/cpu +export CONFIGS=${REPO_ROOT}/veloxtesting/config/generated/cpu # Container Images # Coordinator: ${IMAGE_DIR}/presto-coordinator-test.sqsh @@ -67,7 +68,7 @@ echo "========================================" # Create necessary directories mkdir -p ${LOGS} mkdir -p ${DATA} -mkdir -p ${WORKSPACE}/.hive_metastore +mkdir -p ${REPO_ROOT}/.hive_metastore # Launch the job script -bash ${WORKSPACE}/veloxtesting/presto-nvl72/create-presto-benchmarks.sh +bash ${REPO_ROOT}/veloxtesting/presto-nvl72/create-presto-benchmarks.sh diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 5f07d6dc..19e5d4b6 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -1,65 +1,29 @@ #!/bin/bash -# UCX Configuration -export UCX_TLS=^ib,ud:aux,sm -export UCX_MAX_RNDV_RAILS=1 -export UCX_RNDV_PIPELINE_ERROR_HANDLING=y -export UCX_TCP_KEEPINTVL=1ms -export UCX_KEEPALIVE_INTERVAL=1ms - - -# Image directory for presto container images (can be overridden via environment) -IMAGE_DIR="${IMAGE_DIR:-${WORKSPACE}/images}" - -# Logs directory for presto execution logs (can be overridden via environment) -LOGS="${LOGS:-/mnt/home/misiug/veloxtesting/presto-nvl72/logs}" - # Validates job preconditions and assigns default values for presto execution. 
function setup { [ -z "$SLURM_JOB_NAME" ] && echo "required argument '--job-name' not specified" && exit 1 [ -z "$SLURM_JOB_ACCOUNT" ] && echo "required argument '--account' not specified" && exit 1 [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 + [ -z "$IMAGE_DIR" ] && echo "IMAGE_DIR must be set" && exit 1 + [ -z "$LOGS" ] && echo "LOGS must be set" && exit 1 + [ -z "$CONFIGS" ] && echo "CONFIGS must be set" && exit 1 [ -z "$NUM_NODES" ] && echo "NUM_WORKERS must be set" && exit 1 [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 - [ ! -d "$WORKSPACE" ] && echo "WORKSPACE must be a valid directory" && exit 1 + [ ! -d "$REPO_ROOT" ] && echo "REPO_ROOT must be a valid directory" && exit 1 [ ! -d "$DATA" ] && echo "DATA must be a valid directory" && exit 1 - - NUM_WORKERS=$(( $NUM_NODES * $NUM_GPUS_PER_NODE )) - mkdir -p ${LOGS} - # Only set CONFIGS if not already set (allow override from environment) - #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/gpu}" - #CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/cpu}" - CONFIGS="${CONFIGS:-${WORKSPACE}/config/generated/${VARIANT_TYPE}}" - COORD=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -1) - PORT=9200 - CUDF_LIB=/usr/lib64/presto-native-libs - if [ "${NUM_WORKERS}" -eq "1" ]; then - SINGLE_NODE_EXECUTION=true - else - SINGLE_NODE_EXECUTION=false - fi - - if [ ! -d ${WORKSPACE}/velox-testing ]; then - git clone -b misiug/cluster https://github.com/rapidsai/velox-testing.git ${WORKSPACE}/velox-testing - #sed -i "s/python3 /python3.12 /g" ${WORKSPACE}/velox-testing/scripts/py_env_functions.sh - fi - [ ! 
-d ${CONFIGS} ] && generate_configs validate_config_directory } function generate_configs { + echo "GENERATING NEW CONFIGS" mkdir -p ${CONFIGS} - pushd ${WORKSPACE}/velox-testing/presto/scripts - #VARIANT_TYPE=cpu ./generate_presto_config.sh - #VARIANT_TYPE=gpu ./generate_presto_config.sh + pushd ${REPO_ROOT}/velox-testing/presto/scripts OVERWRITE_CONFIG=true ./generate_presto_config.sh popd - mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/${VARIANT_TYPE}/* ${CONFIGS}/ - #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/gpu/* ${CONFIGS}/ - #mv ${WORKSPACE}/velox-testing/presto/docker/config/generated/cpu/* ${CONFIGS}/ echo "--add-modules=java.management,jdk.management" >> ${CONFIGS}/etc_common/jvm.config echo "-Dcom.sun.management.jmxremote=false" >> ${CONFIGS}/etc_common/jvm.config echo "-XX:-UseContainerSupport" >> ${CONFIGS}/etc_common/jvm.config @@ -82,7 +46,7 @@ function validate_environment_preconditions { # Execute script through the coordinator image (used for coordinator and cli executables) function run_coord_image { [ $# -ne 2 ] && echo_error "$0 expected one argument for '