diff --git a/.gitignore b/.gitignore index ad8182ca..d86b7454 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ presto/docker/config/generated*/ # Generated Presto Docker Compose files presto/docker/docker-compose/generated*/ +presto/scripts/presto_logs/ # Slurm logs and results presto/slurm/presto-nvl72/logs/ diff --git a/benchmark_reporting_tools/post_results.py b/benchmark_reporting_tools/post_results.py index 964e7628..318466fc 100644 --- a/benchmark_reporting_tools/post_results.py +++ b/benchmark_reporting_tools/post_results.py @@ -11,33 +11,34 @@ """ CLI for posting Velox benchmark results to the API. -This script operates on the parsed output of the benchmark runner. The -expected directory structure is: - - ../benchmark-root/ - ├── benchmark.json # optional - ├── configs # optional - │ ├── coordinator.config - │ └── worker.config - ├── logs # optional - │ └── slurm-4575179.out - └── result_dir - └── benchmark_result.json +The script reads benchmark_result.json from the input directory. +Engine configs and worker logs are automatically loaded from the +velox-testing repo by detecting the engine variant from the +benchmark results context. Paths can be overridden with +--config-dir and --logs-dir. 
+ +Default locations (derived from the repo root and detected variant): + configs: presto/docker/config/generated/{variant}/ + logs: presto/scripts/presto_logs/ Usage: - python benchmark_reporting_tools/post_results.py /path/to/benchmark/dir \ + # Auto-detect configs/logs from the repo (default): + python benchmark_reporting_tools/post_results.py /path/to/benchmark_output \ --sku-name PDX-H100 \ --storage-configuration-name pdx-lustre-sf-100 \ + --benchmark-name tpch \ + --identifier-hash abc123 \ --cache-state warm - # With optional version info - python benchmark_reporting_tools/post_results.py /path/to/benchmark/dir \ + # Override with explicit paths: + python benchmark_reporting_tools/post_results.py /path/to/benchmark_output \ + --config-dir /custom/path/to/configs \ + --logs-dir /custom/path/to/logs \ --sku-name PDX-H100 \ --storage-configuration-name pdx-lustre-sf-100 \ - --cache-state warm \ + --benchmark-name tpch \ --identifier-hash abc123 \ - --version 1.0.0 \ - --commit-hash def456 + --cache-state warm Environment variables: BENCHMARK_API_URL: API URL @@ -50,6 +51,7 @@ import dataclasses import json import os +import re import sys from datetime import datetime from pathlib import Path @@ -57,29 +59,59 @@ import httpx +_ENGINE_TO_VARIANT = { + "presto-velox-gpu": "gpu", + "presto-velox-cpu": "cpu", + "presto-java": "java", +} + + +def _repo_root() -> Path: + """Return the velox-testing repo root (parent of benchmark_reporting_tools/).""" + return Path(__file__).resolve().parent.parent + + +def _default_config_dir(variant: str) -> Path | None: + """Derive the generated config directory for a given variant.""" + d = _repo_root() / "presto" / "docker" / "config" / "generated" / variant + return d if d.is_dir() else None + + +def _default_logs_dir() -> Path | None: + """Return the presto_logs directory.""" + link = _repo_root() / "presto" / "scripts" / "presto_logs" + if link.exists(): + return link.resolve() + return None + 
@dataclasses.dataclass(kw_only=True) class BenchmarkMetadata: - kind: str - benchmark: str + benchmark: list[str] timestamp: datetime - execution_number: int - n_workers: int - scale_factor: int - gpu_count: int - num_drivers: int - worker_image: str | None = None - gpu_name: str engine: str + kind: str | None = None + execution_number: int = 1 + worker_count: int | None = None + scale_factor: int | None = None + gpu_count: int | None = None + num_drivers: int | None = None + gpu_name: str | None = None @classmethod - def from_file(cls, file_path: Path) -> "BenchmarkMetadata": - data = json.loads(file_path.read_text()) - - # parse fields, like the timestamp + def from_parsed(cls, raw: dict) -> "BenchmarkMetadata": + """Extract metadata from the 'context' section of a parsed benchmark_result.json.""" + data = dict(raw["context"]) data["timestamp"] = datetime.fromisoformat(data["timestamp"].replace("Z", "+00:00")) - return cls(**data) + # Normalise legacy string values to a list. + if isinstance(data.get("benchmark"), str): + data["benchmark"] = [data["benchmark"]] + + known_fields = {f.name for f in dataclasses.fields(cls)} + filtered = {k: v for k, v in data.items() if k in known_fields} + + return cls(**filtered) def serialize(self) -> dict: out = dataclasses.asdict(self) @@ -94,11 +126,9 @@ class BenchmarkResults: failed_queries: dict[str, str] @classmethod - def from_file(cls, file_path: Path, benchmark_name: str) -> "BenchmarkResults": - data = json.loads(file_path.read_text()) - - if benchmark_name not in data.keys(): - raise KeyError(f"Expected '{benchmark_name}' key in {file_path}, got: {sorted(data.keys())}") + def from_parsed(cls, data: dict, benchmark_name: str) -> "BenchmarkResults": + if benchmark_name not in data: + raise KeyError(f"Expected '{benchmark_name}' key, got: {sorted(data.keys())}") raw_times_ms = data[benchmark_name]["raw_times_ms"] failed_queries = data[benchmark_name]["failed_queries"] @@ -110,7 +140,7 @@ def from_file(cls, file_path: 
Path, benchmark_name: str) -> "BenchmarkResults": ) -def parse_config_file(file_path: Path) -> dict[str, str]: +def _parse_config_file(file_path: Path) -> dict[str, str]: """Parse a key=value config file, ignoring comments and blank lines. Args: @@ -132,26 +162,60 @@ def parse_config_file(file_path: Path) -> dict[str, str]: return config +def _find_config_file(configs_dir: Path, subdir: str, variant: str | None = None) -> Path | None: + """Locate a config file under {configs_dir}/{subdir}/. + + When variant is known, the correct properties file is selected directly + (config_java for 'java', config_native for 'gpu'/'cpu'). + """ + sub = configs_dir / subdir + if not sub.is_dir(): + return None + if variant == "java": + candidate = sub / "config_java.properties" + elif variant in ("gpu", "cpu"): + candidate = sub / "config_native.properties" + else: + candidate = None + if candidate and candidate.is_file(): + return candidate + for fallback in sorted(sub.glob("config_*.properties")): + return fallback + return None + + @dataclasses.dataclass class EngineConfig: coordinator: dict[str, str] worker: dict[str, str] @classmethod - def from_dir(cls, configs_dir: Path) -> "EngineConfig": + def from_dir(cls, configs_dir: Path, variant: str | None = None) -> "EngineConfig": """Load engine configuration from a configs directory. - Expects coordinator.config and worker.config files. + Expects the generated layout: + etc_coordinator/config_*.properties + etc_worker/config_*.properties + + When variant is provided ('gpu', 'cpu', or 'java'), selects the + matching properties file (config_native vs config_java). 
""" - coordinator_config = parse_config_file(configs_dir / "coordinator.config") - worker_config = parse_config_file(configs_dir / "worker.config") + coord_file = _find_config_file(configs_dir, "etc_coordinator", variant) + worker_file = _find_config_file(configs_dir, "etc_worker", variant) + if coord_file is None or worker_file is None: + raise FileNotFoundError( + f"Could not find coordinator/worker config files in {configs_dir}. " + "Expected etc_coordinator/config_*.properties + etc_worker/config_*.properties." + ) + coordinator_config = _parse_config_file(coord_file) + worker_config = _parse_config_file(worker_file) return cls(coordinator=coordinator_config, worker=worker_config) def serialize(self) -> dict: return dataclasses.asdict(self) -def parse_args() -> argparse.Namespace: +def _parse_args() -> argparse.Namespace: """Parse command-line arguments.""" parser = argparse.ArgumentParser( description="Post Velox benchmark results to the API.", @@ -160,7 +224,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "input_path", type=str, - help="Path to benchmark directory containing benchmark.json and result_dir/", + help="Path to benchmark directory containing benchmark_result.json", ) parser.add_argument( "--api-url", @@ -191,7 +255,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--engine-name", default=None, - help="Query engine name (optionally derived from benchmark.json 'engine' field)", + help="Query engine name (overrides the 'engine' field from benchmark_result.json context)", ) parser.add_argument( "--identifier-hash", @@ -241,62 +305,24 @@ def parse_args() -> argparse.Namespace: type=int, default=1, ) - - # A bunch of optional arguments for when benchmark.json is not present. - parser.add_argument( - "--kind", - help="Run kind (e.g. 'single-node', 'multi-node')", - ) parser.add_argument( - "--benchmark", - help="Benchmark name (e.g. 
'tpch')", - default="tpch", - ) - parser.add_argument( - "--timestamp", - help="Timestamp of the benchmark run", - default=None, - ) - parser.add_argument("--execution-number", help="Execution number of the benchmark run", type=int, default=1) - parser.add_argument( - "--n-workers", - help="Number of workers in the benchmark run", - type=int, - default=None, - ) - parser.add_argument( - "--scale-factor", - help="Scale factor of the benchmark run", - type=int, - default=None, - ) - parser.add_argument( - "--gpu-count", - help="Number of GPUs in the benchmark run", - type=int, - default=None, - ) - parser.add_argument( - "--gpu-name", - help="GPU name (e.g. 'H100')", - default=None, - ) - parser.add_argument( - "--num-drivers", - help="Number of drivers in the benchmark run", - type=int, + "--config-dir", + type=str, default=None, + help="Override config directory. Default: auto-detected from the engine variant " + "in benchmark_result.json → presto/docker/config/generated/{variant}/.", ) parser.add_argument( - "--worker-image", - help="Worker image (e.g. 'velox/worker:latest')", + "--logs-dir", + type=str, default=None, + help="Override server logs directory. Default: presto/scripts/presto_logs/.", ) return parser.parse_args() -def normalize_api_url(url: str) -> str: +def _normalize_api_url(url: str) -> str: """Normalize a user-provided API URL to a base URL. Handles various formats: @@ -314,7 +340,7 @@ def normalize_api_url(url: str) -> str: return normalized.rstrip("/") -def build_submission_payload( +def _build_submission_payload( benchmark_metadata: BenchmarkMetadata, benchmark_results: BenchmarkResults, engine_config: EngineConfig | None, @@ -333,7 +359,7 @@ def build_submission_payload( """Build a BenchmarkSubmission payload from parsed dataclasses. 
Args: - benchmark_metadata: Parsed benchmark.json as BenchmarkMetadata + benchmark_metadata: Parsed from the 'context' section of benchmark_result.json benchmark_results: Parsed benchmark_result.json as BenchmarkResults engine_config: Parsed config files as EngineConfig, optional sku_name: Hardware SKU name @@ -361,8 +387,14 @@ def build_submission_payload( raw_times = benchmark_results.raw_times_ms failed_queries = benchmark_results.failed_queries - # Sort query names for consistent ordering (Q1, Q2, ..., Q22) - query_names = sorted(raw_times.keys(), key=lambda x: int(x[1:])) + def _query_sort_key(name: str): + stripped = name.lstrip("Qq") + match = re.match(r"(\d+)(.*)", stripped) + if match: + return (int(match.group(1)), match.group(2)) + return (float("inf"), name) + + query_names = sorted(raw_times.keys(), key=_query_sort_key) for query_name in query_names: times = raw_times[query_name] @@ -404,14 +436,17 @@ def build_submission_payload( ) execution_order += 1 - # Build extra info from metadata + # Build extra info from metadata, omitting None values extra_info = { - "kind": benchmark_metadata.kind, - "gpu_count": benchmark_metadata.gpu_count, - "gpu_name": benchmark_metadata.gpu_name, - "num_drivers": benchmark_metadata.num_drivers, - "worker_image": benchmark_metadata.worker_image, - "execution_number": benchmark_metadata.execution_number, + k: v + for k, v in { + "kind": benchmark_metadata.kind, + "gpu_count": benchmark_metadata.gpu_count, + "gpu_name": benchmark_metadata.gpu_name, + "num_drivers": benchmark_metadata.num_drivers, + "execution_number": benchmark_metadata.execution_number, + }.items() + if v is not None } return { @@ -426,7 +461,8 @@ def build_submission_payload( "commit_hash": commit_hash, }, "run_at": benchmark_metadata.timestamp.isoformat(), - "node_count": benchmark_metadata.n_workers, + "node_count": 1, + "gpu_count": benchmark_metadata.gpu_count or 0, "query_logs": query_logs, "concurrency_streams": concurrency_streams, 
"engine_config": engine_config.serialize() if engine_config else {}, @@ -436,8 +472,8 @@ def build_submission_payload( } -def build_http_client(api_url: str, api_key: str, timeout: float) -> httpx.AsyncClient: - base_url = normalize_api_url(api_url) +def _build_http_client(api_url: str, api_key: str, timeout: float) -> httpx.AsyncClient: + base_url = _normalize_api_url(api_url) transport = httpx.AsyncHTTPTransport(retries=3) return httpx.AsyncClient( base_url=base_url, @@ -447,7 +483,7 @@ def build_http_client(api_url: str, api_key: str, timeout: float) -> httpx.Async ) -async def upload_log_files( +async def _upload_log_files( benchmark_dir: Path, api_url: str, api_key: str, @@ -473,7 +509,7 @@ async def upload_log_files( print(f" Uploading {len(log_files)} log file(s) (max {max_concurrency} concurrent)...", file=sys.stderr) semaphore = asyncio.Semaphore(max_concurrency) - async with build_http_client(api_url, api_key, timeout) as client: + async with _build_http_client(api_url, api_key, timeout) as client: async def _upload_one(log_file: Path) -> int: async with semaphore: @@ -496,18 +532,18 @@ async def _upload_one(log_file: Path) -> int: return list(asset_ids) -async def post_submission(api_url: str, api_key: str, payload: dict, timeout: float) -> tuple[int, str]: +async def _post_submission(api_url: str, api_key: str, payload: dict, timeout: float) -> tuple[int, str]: """Post a benchmark submission to the API. 
Returns: Tuple of (status_code, response_text) """ - async with build_http_client(api_url, api_key, timeout) as client: + async with _build_http_client(api_url, api_key, timeout) as client: response = await client.post("/api/benchmark/", json=payload) return response.status_code, response.text -async def process_benchmark_dir( +async def _process_benchmark_dir( benchmark_dir: Path, *, sku_name: str, @@ -524,18 +560,9 @@ async def process_benchmark_dir( timeout: float, upload_logs: bool = True, benchmark_definition_name: str, - # all the optional arguments for when benchmark.json is not present. concurrency_streams: int = 1, - kind: str | None = None, - benchmark: str | None = None, - timestamp: str | None = None, - execution_number: int = 1, - n_workers: int | None = None, - scale_factor: int | None = None, - gpu_count: int | None = None, - gpu_name: str | None = None, - worker_image: str | None = None, - num_drivers: int | None = None, + config_dir: Path | None = None, + logs_dir: Path | None = None, ) -> int: """Process a benchmark directory and post results to API. @@ -544,142 +571,138 @@ async def process_benchmark_dir( """ print(f"\nProcessing: {benchmark_dir}", file=sys.stderr) - # Load metadata, results, and config - - # benchmark.json is only optionally written out. 
- # We give preference to getting this from the user CLI options, - # falling back to - - benchmark_json_path = benchmark_dir / "benchmark.json" - - if not benchmark_json_path.exists(): - missing_args = [] - if kind is None: - missing_args.append("kind") - if benchmark is None: - missing_args.append("benchmark") - if timestamp is None: - missing_args.append("timestamp") - if n_workers is None: - missing_args.append("n_workers") - if scale_factor is None: - missing_args.append("scale_factor") - if gpu_count is None: - missing_args.append("gpu_count") - if gpu_name is None: - missing_args.append("gpu_name") - if num_drivers is None: - missing_args.append("num_drivers") - if engine_name is None: - missing_args.append("engine_name") - - if missing_args: - print(" Error: must provide benchmark metadata when benchmark.json is not present", file=sys.stderr) - print(f" Error: missing arguments: {', '.join(missing_args)}", file=sys.stderr) - return 1 + # Load metadata and results from benchmark_result.json. + # The "context" section contains run metadata; benchmark data sits + # under a top-level key matching the benchmark name (e.g. "tpch"). - # mypy doesn't realize that kind, benchmark, etc. have been narrowed to not-None by the check above. 
- benchmark_metadata = BenchmarkMetadata( - kind=kind, # type: ignore[arg-type] - benchmark=benchmark, # type: ignore[arg-type] - timestamp=datetime.fromisoformat(timestamp.replace("Z", "+00:00")), # type: ignore[union-attr] - execution_number=execution_number, - n_workers=n_workers, # type: ignore[arg-type] - scale_factor=scale_factor, # type: ignore[arg-type] - gpu_count=gpu_count, # type: ignore[arg-type] - gpu_name=gpu_name, # type: ignore[arg-type] - num_drivers=num_drivers, # type: ignore[arg-type] - worker_image=worker_image, - engine=engine_name, # type: ignore[arg-type] - ) - else: - try: - benchmark_metadata = BenchmarkMetadata.from_file(benchmark_dir / "benchmark.json") - except (ValueError, json.JSONDecodeError, FileNotFoundError) as e: - print(f" Error loading metadata: {e}", file=sys.stderr) - return 1 + result_file = benchmark_dir / "benchmark_result.json" try: - results = BenchmarkResults.from_file( - benchmark_dir / "result_dir" / "benchmark_result.json", benchmark_name=benchmark_metadata.benchmark - ) - except (ValueError, json.JSONDecodeError, FileNotFoundError) as e: - print(f" Error loading results: {e}", file=sys.stderr) + raw = json.loads(result_file.read_text()) + except (json.JSONDecodeError, FileNotFoundError) as e: + print(f" Error reading {result_file}: {e}", file=sys.stderr) return 1 - if (benchmark_dir / "configs").exists(): - print(" Loading engine config...", file=sys.stderr) - engine_config = EngineConfig.from_dir(benchmark_dir / "configs") + try: + benchmark_metadata = BenchmarkMetadata.from_parsed(raw) + except (ValueError, KeyError) as e: + print(f" Error loading metadata: {e}", file=sys.stderr) + return 1 + + # Resolve config directory: explicit override → auto-detect from variant + effective_config_dir = config_dir + variant = _ENGINE_TO_VARIANT.get(benchmark_metadata.engine) + if effective_config_dir is None: + if variant: + effective_config_dir = _default_config_dir(variant) + if effective_config_dir: + print(f" Auto-detected 
variant '{variant}' → config dir: {effective_config_dir}", file=sys.stderr) + else: + print(f" Auto-detected variant '{variant}' but config dir does not exist.", file=sys.stderr) + else: + print(f" Could not map engine '{benchmark_metadata.engine}' to a variant.", file=sys.stderr) + + if effective_config_dir and effective_config_dir.exists(): + print(f" Loading engine config from {effective_config_dir}...", file=sys.stderr) + try: + engine_config = EngineConfig.from_dir(effective_config_dir, variant=variant) + except FileNotFoundError as e: + print(f" Warning: could not load engine config: {e}", file=sys.stderr) + engine_config = None else: - print(" No engine config found.", file=sys.stderr) + if effective_config_dir: + print(f" Warning: config directory does not exist: {effective_config_dir}", file=sys.stderr) + else: + print(" Warning: no config directory found. Use --config-dir to specify one.", file=sys.stderr) engine_config = None - # Upload log files as assets + # Resolve logs directory: explicit override → auto-detect from repo + effective_logs_dir = logs_dir + if effective_logs_dir is None: + effective_logs_dir = _default_logs_dir() + if effective_logs_dir: + print(f" Auto-detected logs dir: {effective_logs_dir}", file=sys.stderr) + asset_ids = None - if upload_logs: + if upload_logs and effective_logs_dir and effective_logs_dir.exists(): if dry_run: - log_files = sorted(benchmark_dir.glob("*.log")) + log_files = sorted(effective_logs_dir.glob("*.log")) print( - f" [DRY RUN] Would upload {len(log_files)} log file(s): {[f.name for f in log_files]}", file=sys.stderr + f" [DRY RUN] Would upload {len(log_files)} log file(s) from {effective_logs_dir}: " + f"{[f.name for f in log_files]}", + file=sys.stderr, ) else: try: - asset_ids = await upload_log_files(benchmark_dir, api_url, api_key, timeout) + asset_ids = await _upload_log_files(effective_logs_dir, api_url, api_key, timeout) except (RuntimeError, httpx.RequestError) as e: print(f" Error uploading logs: 
{e}", file=sys.stderr) return 1 + elif upload_logs: + print(" No logs directory found; skipping log upload.", file=sys.stderr) - # Build submission payload - try: - payload = build_submission_payload( - benchmark_metadata=benchmark_metadata, - benchmark_results=results, - engine_config=engine_config, - benchmark_definition_name=benchmark_definition_name, - sku_name=sku_name, - storage_configuration_name=storage_configuration_name, - cache_state=cache_state, - engine_name=engine_name, - identifier_hash=identifier_hash, - version=version, - commit_hash=commit_hash, - is_official=is_official, - asset_ids=asset_ids, - concurrency_streams=concurrency_streams, - ) - except Exception as e: - print(f" Error building payload: {e}", file=sys.stderr) - return 1 + # Process each benchmark type found in the result file. + overall_result = 0 + for bench_name in benchmark_metadata.benchmark: + print(f"\n Processing benchmark type: {bench_name}", file=sys.stderr) - # Print summary - print(f" Benchmark definition: {payload['benchmark_definition_name']}", file=sys.stderr) - print(f" Engine: {payload['query_engine']['engine_name']}", file=sys.stderr) - print(f" Identifier hash: {payload['query_engine']['identifier_hash']}", file=sys.stderr) - print(f" Node count: {payload['node_count']}", file=sys.stderr) - print(f" Query logs: {len(payload['query_logs'])}", file=sys.stderr) + try: + results = BenchmarkResults.from_parsed(raw, benchmark_name=bench_name) + except (ValueError, KeyError) as e: + print(f" Error loading results for '{bench_name}': {e}", file=sys.stderr) + overall_result = 1 + continue - if dry_run: - print("\n [DRY RUN] Payload:", file=sys.stderr) - print(json.dumps(payload, indent=2, default=str)) - return 0 + try: + payload = _build_submission_payload( + benchmark_metadata=benchmark_metadata, + benchmark_results=results, + engine_config=engine_config, + benchmark_definition_name=benchmark_definition_name, + sku_name=sku_name, + 
storage_configuration_name=storage_configuration_name, + cache_state=cache_state, + engine_name=engine_name, + identifier_hash=identifier_hash, + version=version, + commit_hash=commit_hash, + is_official=is_official, + asset_ids=asset_ids, + concurrency_streams=concurrency_streams, + ) + except Exception as e: + print(f" Error building payload for '{bench_name}': {e}", file=sys.stderr) + overall_result = 1 + continue - # Post to API - try: - status_code, response_text = await post_submission(api_url, api_key, payload, timeout) - print(f" Status: {status_code}", file=sys.stderr) - if status_code >= 400: - print(f" Response: {response_text}", file=sys.stderr) - return 1 - else: - print(f" Success: {response_text}", file=sys.stderr) - return 0 - except httpx.RequestError as e: - print(f" Error posting: {e}", file=sys.stderr) - return 1 + print(f" Benchmark definition: {payload['benchmark_definition_name']}", file=sys.stderr) + print(f" Engine: {payload['query_engine']['engine_name']}", file=sys.stderr) + print(f" Identifier hash: {payload['query_engine']['identifier_hash']}", file=sys.stderr) + print(f" Node count: {payload['node_count']}", file=sys.stderr) + print(f" Query logs: {len(payload['query_logs'])}", file=sys.stderr) + + if dry_run: + print("\n [DRY RUN] Payload:", file=sys.stderr) + print(json.dumps(payload, indent=2, default=str)) + continue + + try: + status_code, response_text = await _post_submission(api_url, api_key, payload, timeout) + print(f" Status: {status_code}", file=sys.stderr) + if status_code >= 400: + print(f" Response: {response_text}", file=sys.stderr) + overall_result = 1 + else: + print(f" Success: {response_text}", file=sys.stderr) + except httpx.RequestError as e: + print(f" Error posting for '{bench_name}': {e}", file=sys.stderr) + overall_result = 1 + + return overall_result async def main() -> int: - args = parse_args() + args = _parse_args() # Resolve to str (parser already falls back to BENCHMARK_API_URL / BENCHMARK_API_KEY) 
api_url = args.api_url or "" @@ -705,7 +728,7 @@ async def main() -> int: print(f"Error: Input path is not a directory: {args.input_path}", file=sys.stderr) return 1 - result = await process_benchmark_dir( + result = await _process_benchmark_dir( benchmark_dir, sku_name=args.sku_name, storage_configuration_name=args.storage_configuration_name, @@ -721,17 +744,9 @@ async def main() -> int: timeout=args.timeout, upload_logs=args.upload_logs, benchmark_definition_name=args.benchmark_name, - kind=args.kind, - benchmark=args.benchmark, - timestamp=args.timestamp, - execution_number=args.execution_number, - n_workers=args.n_workers, - scale_factor=args.scale_factor, - gpu_count=args.gpu_count, - gpu_name=args.gpu_name, - worker_image=args.worker_image, - num_drivers=args.num_drivers, concurrency_streams=args.concurrency_streams, + config_dir=Path(args.config_dir) if args.config_dir else None, + logs_dir=Path(args.logs_dir) if args.logs_dir else None, ) return result diff --git a/benchmark_reporting_tools/requirements.txt b/benchmark_reporting_tools/requirements.txt new file mode 100644 index 00000000..d4ed1efa --- /dev/null +++ b/benchmark_reporting_tools/requirements.txt @@ -0,0 +1 @@ +httpx>=0.28.1 diff --git a/common/testing/conftest.py b/common/testing/conftest.py index fc3bc52c..830def6d 100644 --- a/common/testing/conftest.py +++ b/common/testing/conftest.py @@ -6,7 +6,7 @@ def pytest_generate_tests(metafunc): TPCH_FIXTURE_NAME = "tpch_query_id" if TPCH_FIXTURE_NAME in metafunc.fixturenames: TPCH_NUM_QUERIES = 22 - set_query_id_param(metafunc, TPCH_FIXTURE_NAME, TPCH_NUM_QUERIES, []) + _set_query_id_param(metafunc, TPCH_FIXTURE_NAME, TPCH_NUM_QUERIES, []) TPCDS_FIXTURE_NAME = "tpcds_query_id" if TPCDS_FIXTURE_NAME in metafunc.fixturenames: @@ -14,22 +14,22 @@ def pytest_generate_tests(metafunc): TPCDS_DISABLED_QUERIES = [ # All queries now pass with SQL fixes ] - set_query_id_param(metafunc, TPCDS_FIXTURE_NAME, TPCDS_NUM_QUERIES, TPCDS_DISABLED_QUERIES) + 
_set_query_id_param(metafunc, TPCDS_FIXTURE_NAME, TPCDS_NUM_QUERIES, TPCDS_DISABLED_QUERIES) -def set_query_id_param(metafunc, param_name, num_queries, disabled_queries): +def _set_query_id_param(metafunc, param_name, num_queries, disabled_queries): queries = metafunc.config.getoption("--queries") - metafunc.parametrize(param_name, get_query_ids(num_queries, queries, disabled_queries)) + metafunc.parametrize(param_name, _get_query_ids(num_queries, queries, disabled_queries)) -def get_query_ids(num_queries, selected_query_ids, disabled_queries): - query_ids = parse_selected_query_ids(selected_query_ids, num_queries) +def _get_query_ids(num_queries, selected_query_ids, disabled_queries): + query_ids = _parse_selected_query_ids(selected_query_ids, num_queries) if len(query_ids) == 0: query_ids = [id for id in range(1, num_queries + 1) if id not in disabled_queries] - return format_query_ids(query_ids) + return _format_query_ids(query_ids) -def parse_selected_query_ids(selected_query_ids, num_queries): +def _parse_selected_query_ids(selected_query_ids, num_queries): query_ids = [] if selected_query_ids and selected_query_ids.strip(): for id_str in selected_query_ids.split(","): @@ -40,5 +40,5 @@ def parse_selected_query_ids(selected_query_ids, num_queries): return query_ids -def format_query_ids(query_ids): +def _format_query_ids(query_ids): return [f"Q{query_id}" for query_id in query_ids] diff --git a/common/testing/performance_benchmarks/benchmark_keys.py b/common/testing/performance_benchmarks/benchmark_keys.py index 634af971..90f1ce35 100644 --- a/common/testing/performance_benchmarks/benchmark_keys.py +++ b/common/testing/performance_benchmarks/benchmark_keys.py @@ -21,4 +21,12 @@ class BenchmarkKeys(str, Enum): CONTEXT_KEY = "context" ITERATIONS_COUNT_KEY = "iterations_count" SCHEMA_NAME_KEY = "schema_name" + # Run configuration (from run context; written to context in benchmark_result.json) + TIMESTAMP_KEY = "timestamp" + NUM_WORKERS_KEY = "worker_count" + 
GPU_NAME_KEY = "gpu_name" + + ENGINE_KEY = "engine" + KIND_KEY = "kind" + GPU_COUNT_KEY = "gpu_count" DATASET_NAME_KEY = "dataset_name" diff --git a/common/testing/performance_benchmarks/conftest.py b/common/testing/performance_benchmarks/conftest.py index daf23d19..4ff35ffe 100644 --- a/common/testing/performance_benchmarks/conftest.py +++ b/common/testing/performance_benchmarks/conftest.py @@ -31,17 +31,17 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): for benchmark_type, result in terminalreporter._session.benchmark_results.items(): assert BenchmarkKeys.AGGREGATE_TIMES_KEY in result - write_line(terminalreporter, text_report, "") - write_section( + _write_line(terminalreporter, text_report, "") + _write_section( terminalreporter, text_report, f"{benchmark_type} Benchmark Summary", sep="-", bold=True, yellow=True ) - write_line(terminalreporter, text_report, "") - write_line(terminalreporter, text_report, f"Iterations Count: {iterations}") - write_line(terminalreporter, text_report, f"{data_location_display_name} Name: {data_location_name}") + _write_line(terminalreporter, text_report, "") + _write_line(terminalreporter, text_report, f"Iterations Count: {iterations}") + _write_line(terminalreporter, text_report, f"{data_location_display_name} Name: {data_location_name}") if tag: - write_line(terminalreporter, text_report, f"Tag: {tag}") - write_line(terminalreporter, text_report, "") + _write_line(terminalreporter, text_report, f"Tag: {tag}") + _write_line(terminalreporter, text_report, "") if iterations > 1: AGG_HEADERS = [ @@ -59,9 +59,9 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): header = " Query ID " for agg_header in AGG_HEADERS: header += f"|{agg_header:^{width}}" - write_line(terminalreporter, text_report, "-" * len(header), bold=True, yellow=True) - write_line(terminalreporter, text_report, header) - write_line(terminalreporter, text_report, "-" * len(header), bold=True, yellow=True) + 
_write_line(terminalreporter, text_report, "-" * len(header), bold=True, yellow=True) + _write_line(terminalreporter, text_report, header) + _write_line(terminalreporter, text_report, "-" * len(header), bold=True, yellow=True) for query_id, agg_timings in result[BenchmarkKeys.AGGREGATE_TIMES_KEY].items(): line = f"{query_id:^10}" if agg_timings: @@ -70,10 +70,10 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): line += f"|{agg_timing:^{width}}" else: line += f"|{'NULL':^{width}}" * len(AGG_HEADERS) - write_line(terminalreporter, text_report, line) + _write_line(terminalreporter, text_report, line) # Print SUM row. - write_line(terminalreporter, text_report, "-" * len(header)) + _write_line(terminalreporter, text_report, "-" * len(header)) agg_sums = result[BenchmarkKeys.AGGREGATE_TIMES_SUM_KEY] line = f"{'SUM':^10}" if agg_sums: @@ -83,8 +83,8 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): else: line += f"|{'NULL':^{width}}" * len(AGG_HEADERS) - write_line(terminalreporter, text_report, line) - write_line(terminalreporter, text_report, "") + _write_line(terminalreporter, text_report, line) + _write_line(terminalreporter, text_report, "") bench_output_dir = get_output_dir(config) assert bench_output_dir.is_dir() @@ -93,34 +93,25 @@ def pytest_terminal_summary(terminalreporter, exitstatus, config): file.write(f"{report_text}\n") -def write_line(terminalreporter, text_report, content, **kwargs): +def _write_line(terminalreporter, text_report, content, **kwargs): terminalreporter.write_line(content, **kwargs) text_report.append(content) -def write_section(terminalreporter, text_report, content, **kwargs): +def _write_section(terminalreporter, text_report, content, **kwargs): terminalreporter.section(content, **kwargs) sep = kwargs.get("sep", " ") text_report.append(f" {content} ".center(120, sep)) -def pytest_sessionfinish(session, exitstatus): - iterations = session.config.getoption("--iterations") - - data_location_option = 
pytest.data_location.option_name - data_location_key = pytest.data_location.option_name - data_location_name = session.config.getoption(data_location_option) - json_result = { - BenchmarkKeys.CONTEXT_KEY: { - BenchmarkKeys.ITERATIONS_COUNT_KEY: iterations, - data_location_key: data_location_name, - }, - } +def build_and_write_benchmark_result(session, json_result): + """Compute aggregate timings and write benchmark_result.json. - tag = session.config.getoption("--tag") - if tag: - json_result[BenchmarkKeys.CONTEXT_KEY][BenchmarkKeys.TAG_KEY] = tag + Callers populate json_result[CONTEXT_KEY] with their own context + before calling this function. + """ + iterations = session.config.getoption("--iterations") bench_output_dir = get_output_dir(session.config) bench_output_dir.mkdir(parents=True, exist_ok=True) @@ -162,6 +153,34 @@ def pytest_sessionfinish(session, exitstatus): file.write("\n") +def pytest_sessionfinish(session, exitstatus): + iterations = session.config.getoption("--iterations") + + data_location_option = pytest.data_location.option_name + data_location_key = pytest.data_location.key + data_location_name = session.config.getoption(data_location_option) + json_result = { + BenchmarkKeys.CONTEXT_KEY: { + BenchmarkKeys.ITERATIONS_COUNT_KEY: iterations, + data_location_key: data_location_name, + }, + } + + tag = session.config.getoption("--tag") + if tag: + json_result[BenchmarkKeys.CONTEXT_KEY][BenchmarkKeys.TAG_KEY] = tag + + if hasattr(session, "run_context"): + for key, value in session.run_context.items(): + json_result[BenchmarkKeys.CONTEXT_KEY][key] = value + + if hasattr(session, "benchmark_results"): + benchmark_types = list(session.benchmark_results.keys()) + json_result[BenchmarkKeys.CONTEXT_KEY]["benchmark"] = benchmark_types + + build_and_write_benchmark_result(session, json_result) + + def get_output_dir(config): bench_output_dir = config.getoption("--output-dir") tag = config.getoption("--tag") diff --git 
a/presto/docker/docker-compose.common.yml b/presto/docker/docker-compose.common.yml index 281a1f68..5b4fd06a 100644 --- a/presto/docker/docker-compose.common.yml +++ b/presto/docker/docker-compose.common.yml @@ -4,6 +4,8 @@ services: - ./.hive_metastore:/var/lib/presto/data/hive/metastore - ../../common/testing/integration_tests/data:/var/lib/presto/data/hive/data/integration_test - ${PRESTO_DATA_DIR:-/dev/null}:/var/lib/presto/data/hive/data/user_data + - ${LOGS_DIR:-/dev/null}:/opt/presto-server/logs + - ./launch_presto_servers.sh:/opt/launch_presto_servers.sh:ro presto-base-java: extends: @@ -18,6 +20,12 @@ services: image: presto-coordinator:${PRESTO_IMAGE_TAG} ports: - 8080:8080 + environment: + - RUN_TIMESTAMP=${RUN_TIMESTAMP:-} + volumes: + - ./launch_coordinator.sh:/opt/launch_coordinator.sh:ro + entrypoint: ["bash", "/opt/launch_coordinator.sh"] + command: [] presto-base-native-worker: extends: @@ -27,3 +35,4 @@ services: dockerfile: velox-testing/presto/docker/native_build.dockerfile environment: - GLOG_logtostderr=1 + - RUN_TIMESTAMP=${RUN_TIMESTAMP:-} diff --git a/presto/docker/docker-compose.java.yml b/presto/docker/docker-compose.java.yml index e728c291..963e22a5 100644 --- a/presto/docker/docker-compose.java.yml +++ b/presto/docker/docker-compose.java.yml @@ -15,10 +15,15 @@ services: service: presto-base-java container_name: presto-java-worker image: presto-java-worker:${PRESTO_IMAGE_TAG} + environment: + - RUN_TIMESTAMP=${RUN_TIMESTAMP:-} volumes: - ./config/generated/java/etc_common:/opt/presto-server/etc - ./config/generated/java/etc_worker/config_java.properties:/opt/presto-server/etc/config.properties - ./config/generated/java/etc_worker/node.properties:/opt/presto-server/etc/node.properties - ./config/generated/java/etc_worker/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties + - ./launch_java_worker.sh:/opt/launch_java_worker.sh:ro + entrypoint: ["bash", "/opt/launch_java_worker.sh"] + command: [] depends_on: - 
presto-coordinator diff --git a/presto/docker/docker-compose.native-cpu.yml b/presto/docker/docker-compose.native-cpu.yml index b017dc81..3ba16cdd 100644 --- a/presto/docker/docker-compose.native-cpu.yml +++ b/presto/docker/docker-compose.native-cpu.yml @@ -18,6 +18,8 @@ services: build: args: - GPU=OFF + entrypoint: ["bash", "/opt/launch_presto_servers.sh"] + command: [] depends_on: - presto-coordinator volumes: diff --git a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja index d5ad844d..0997ffc7 100644 --- a/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja +++ b/presto/docker/docker-compose/template/docker-compose.native-gpu.yml.jinja @@ -68,9 +68,11 @@ services: <<: *gpu_worker_base container_name: presto-native-worker-gpu-{{ gpu_id }} environment: + WORKER_ID: {{ gpu_id }} NVIDIA_VISIBLE_DEVICES: all PROFILE: ${PROFILE} PROFILE_ARGS: ${PROFILE_ARGS} + RUN_TIMESTAMP: ${RUN_TIMESTAMP:-} UCX_LOG_LEVEL: info #debug UCX_RNDV_PIPELINE_ERROR_HANDLING: y UCX_TLS: tcp,cuda_copy,cuda_ipc @@ -82,6 +84,7 @@ services: KVIKIO_NTHREADS: {{ kvikio_threads }} CUDA_VISIBLE_DEVICES: {{ gpu_id }} volumes: + - ../../config/generated/gpu/etc_common:/opt/presto-server/etc - ../../config/generated/gpu/etc_worker_{{ gpu_id }}/node.properties:/opt/presto-server/etc/node.properties - ../../config/generated/gpu/etc_worker_{{ gpu_id }}/config_native.properties:/opt/presto-server/etc/config.properties - ../../config/generated/gpu/etc_worker_{{ gpu_id }}/catalog/hive.properties:/opt/presto-server/etc/catalog/hive.properties @@ -97,6 +100,7 @@ services: NVIDIA_VISIBLE_DEVICES: all PROFILE: ${PROFILE} PROFILE_ARGS: ${PROFILE_ARGS} + RUN_TIMESTAMP: ${RUN_TIMESTAMP:-} UCX_LOG_LEVEL: info #debug UCX_RNDV_PIPELINE_ERROR_HANDLING: y UCX_TLS: tcp,cuda_copy,cuda_ipc @@ -112,6 +116,7 @@ services: NVIDIA_VISIBLE_DEVICES: all PROFILE: ${PROFILE} PROFILE_ARGS: ${PROFILE_ARGS} + 
RUN_TIMESTAMP: ${RUN_TIMESTAMP:-} UCX_LOG_LEVEL: info #debug UCX_RNDV_PIPELINE_ERROR_HANDLING: y UCX_TLS: tcp,cuda_copy,cuda_ipc diff --git a/presto/docker/launch_coordinator.sh b/presto/docker/launch_coordinator.sh new file mode 100755 index 00000000..1ac7537e --- /dev/null +++ b/presto/docker/launch_coordinator.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -e + +LOGS_DIR="/opt/presto-server/logs" +mkdir -p "${LOGS_DIR}" +RUN_TIMESTAMP="${RUN_TIMESTAMP:-$(date +"%Y%m%dT%H%M%S")}" +log_file="${LOGS_DIR}/coordinator_${RUN_TIMESTAMP}.log" + +exec /opt/presto-server/bin/launcher run >> "${log_file}" 2>&1 diff --git a/presto/docker/launch_java_worker.sh b/presto/docker/launch_java_worker.sh new file mode 100755 index 00000000..8cdeec47 --- /dev/null +++ b/presto/docker/launch_java_worker.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -e + +LOGS_DIR="/opt/presto-server/logs" +mkdir -p "${LOGS_DIR}" +RUN_TIMESTAMP="${RUN_TIMESTAMP:-$(date +"%Y%m%dT%H%M%S")}" +log_file="${LOGS_DIR}/worker_0_${RUN_TIMESTAMP}.log" + +exec /opt/presto-server/bin/launcher run >> "${log_file}" 2>&1 diff --git a/presto/docker/launch_presto_servers.sh b/presto/docker/launch_presto_servers.sh index 16dfd352..913f1118 100644 --- a/presto/docker/launch_presto_servers.sh +++ b/presto/docker/launch_presto_servers.sh @@ -1,17 +1,35 @@ #!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 set -e # Run ldconfig once -echo ldconfig +ldconfig + +LOGS_DIR="/opt/presto-server/logs" +mkdir -p "${LOGS_DIR}" +RUN_TIMESTAMP="${RUN_TIMESTAMP:-$(date +"%Y%m%dT%H%M%S")}" if [ $# -eq 0 ]; then - presto_server --etc-dir="/opt/presto-server/etc/" & + # Single worker mode. Use WORKER_ID env var for log filename (defaults to 0). + local_id="${WORKER_ID:-0}" + log_file="${LOGS_DIR}/worker_${local_id}_${RUN_TIMESTAMP}.log" + gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1)" + echo "GPU Name: ${gpu_name:-unknown}" > "${log_file}" + presto_server --etc-dir="/opt/presto-server/etc/" >> "${log_file}" 2>&1 & else -# Launch workers in parallel, each pinned to a different GPU -# The GPU IDs are passed as command-line arguments -for gpu_id in "$@"; do - CUDA_VISIBLE_DEVICES=$gpu_id presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" & -done + # Multi-worker single-container mode. Each GPU ID is an argument. + worker_id=0 + for gpu_id in "$@"; do + ( + export CUDA_VISIBLE_DEVICES=$gpu_id + log_file="${LOGS_DIR}/worker_${worker_id}_${RUN_TIMESTAMP}.log" + gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1)" + echo "GPU Name: ${gpu_name:-unknown}" > "${log_file}" + exec presto_server --etc-dir="/opt/presto-server/etc${gpu_id}" >> "${log_file}" 2>&1 + ) & + worker_id=$((worker_id + 1)) + done fi # Wait for all background processes diff --git a/presto/scripts/generate_presto_config.sh b/presto/scripts/generate_presto_config.sh index cc20b82a..7bbbe771 100755 --- a/presto/scripts/generate_presto_config.sh +++ b/presto/scripts/generate_presto_config.sh @@ -133,16 +133,10 @@ EOF # optimizer.joins-not-null-inference-strategy=USE_FUNCTION_METADATA # optimizer.default-filter-factor-enabled=true sed -i 's/\#optimizer/optimizer/g' ${COORD_CONFIG} - - if [[ ${NUM_WORKERS} -eq 1 ]]; then - # Adds a cluster tag for gpu variant - echo "cluster-tag=native-gpu" >> 
${COORD_CONFIG} - fi + echo "cluster-tag=native-gpu" >> ${COORD_CONFIG} fi - # now perform other variant-specific modifications to the generated configs if [[ "${VARIANT_TYPE}" == "cpu" ]]; then - # Adds a cluster tag for cpu variant echo "cluster-tag=native-cpu" >> ${COORD_CONFIG} fi diff --git a/presto/scripts/run_benchmark.sh b/presto/scripts/run_benchmark.sh index 2a4d7462..f54aca7e 100755 --- a/presto/scripts/run_benchmark.sh +++ b/presto/scripts/run_benchmark.sh @@ -8,6 +8,10 @@ set -e # Compute the directory where this script resides SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# LOGS_DIR points to the directory where server log files (including nvidia-smi +# output) are written so that run_context.py can parse GPU info. +export LOGS_DIR="${LOGS_DIR:-${SCRIPT_DIR}/presto_logs}" + source "${SCRIPT_DIR}/presto_connection_defaults.sh" print_help() { @@ -38,6 +42,9 @@ OPTIONS: --skip-drop-cache Skip dropping system caches before each benchmark query (dropped by default). -m, --metrics Collect detailed metrics from Presto REST API after each query. Metrics are stored in query-specific directories. + -v, --verbose Print debug logs for worker/engine detection + (e.g. node URIs, cluster-tag, GPU model). + Use when engine is misdetected or the run fails. 
EXAMPLES: $0 -b tpch -s bench_sf100 @@ -46,7 +53,7 @@ EXAMPLES: $0 -b tpch -s bench_sf100 $0 -b tpch -s bench_sf100 -t gh200_cpu_sf100 $0 -b tpch -s bench_sf100 --profile $0 -b tpch -s bench_sf100 --metrics - $0 -h + $0 -b tpch -s bench_sf100 --verbose EOF } @@ -160,6 +167,10 @@ parse_args() { METRICS=true shift ;; + -v|--verbose) + export PRESTO_BENCHMARK_DEBUG=1 + shift + ;; *) echo "Error: Unknown argument $1" print_help @@ -225,7 +236,7 @@ if [[ -n ${TAG} ]]; then fi if [[ "${PROFILE}" == "true" ]]; then - PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f ./profiler_functions.sh)") + PYTEST_ARGS+=("--profile --profile-script-path $(readlink -f "${SCRIPT_DIR}/profiler_functions.sh")") fi if [[ "${METRICS}" == "true" ]]; then diff --git a/presto/scripts/start_presto_helper.sh b/presto/scripts/start_presto_helper.sh index 089fcceb..9145f53f 100755 --- a/presto/scripts/start_presto_helper.sh +++ b/presto/scripts/start_presto_helper.sh @@ -140,6 +140,18 @@ else echo "Internal error: unexpected VARIANT_TYPE value: $VARIANT_TYPE" fi +# Set up LOGS_DIR before any docker compose commands (stop, build, or up) +# since docker-compose.common.yml mounts it via ${LOGS_DIR:-/dev/null}. +LOGS_DIR="${LOGS_DIR:-${SCRIPT_DIR}/presto_logs}" +[ -L "${LOGS_DIR}" ] && rm -f "${LOGS_DIR}" +mkdir -p "${LOGS_DIR}" +if compgen -G "${LOGS_DIR}/*.log" > /dev/null 2>&1; then + mkdir -p "${LOGS_DIR}/archive" + mv "${LOGS_DIR}"/*.log "${LOGS_DIR}/archive/" +fi +export RUN_TIMESTAMP="$(date +"%Y%m%dT%H%M%S")" +export LOGS_DIR + "${SCRIPT_DIR}/stop_presto.sh" "${SCRIPT_DIR}/generate_presto_config.sh" diff --git a/presto/scripts/start_presto_helper_parse_args.sh b/presto/scripts/start_presto_helper_parse_args.sh index 20f4889c..289ff927 100644 --- a/presto/scripts/start_presto_helper_parse_args.sh +++ b/presto/scripts/start_presto_helper_parse_args.sh @@ -33,6 +33,8 @@ OPTIONS: --profile-args Arguments to pass to the profiler when it launches the Presto server. This will override the default arguments.
--overwrite-config Force config to be regenerated (will overwrite local changes). + --logs-dir Directory for server log files (default: <script dir>/presto_logs). + Old log files are archived to an archive/ subdirectory on each startup. --sccache Enable sccache distributed compilation caching (requires auth files in ~/.sccache-auth/). Run scripts/sccache/setup_sccache_auth.sh first. --sccache-version Install a specific version of rapidsai/sccache, e.g. "0.12.0-rapids.1" @@ -68,6 +70,7 @@ export PROFILE=OFF export NUM_WORKERS=1 export KVIKIO_THREADS=8 export VCPU_PER_WORKER="" +LOGS_DIR="" ENABLE_SCCACHE=false SCCACHE_AUTH_DIR="${SCCACHE_AUTH_DIR:-$HOME/.sccache-auth}" SCCACHE_ENABLE_DIST=false @@ -172,6 +175,15 @@ parse_args() { OVERWRITE_CONFIG=true shift ;; + --logs-dir) + if [[ -n $2 ]]; then + LOGS_DIR=$2 + shift 2 + else + echo "Error: --logs-dir requires a value" + exit 1 + fi + ;; --sccache) ENABLE_SCCACHE=true shift diff --git a/presto/slurm/presto-nvl72/README.md b/presto/slurm/presto-nvl72/README.md index b2fb24e3..007f3bcd 100644 --- a/presto/slurm/presto-nvl72/README.md +++ b/presto/slurm/presto-nvl72/README.md @@ -56,7 +56,7 @@ Key variables: - NUM_ITERATIONS: required by the job; launcher defaults to 1 (`-i/--iterations` to override) - NUM_NODES: derived from Slurm allocation; provided via `-n/--nodes` to launcher - REPO_ROOT: auto-detected from script location -- LOGS: `${SCRIPT_DIR}/logs` by default +- LOGS_DIR: `${SCRIPT_DIR}/logs` by default (log files are timestamped; old logs archived to `logs/archive/`) - IMAGE_DIR, DATA, CONFIGS: see below or override via environment if needed Other defaults: @@ -82,10 +82,10 @@ squeue -u $USER # Monitor job output tail -f presto-tpch-run_n_sf_i_.out -# Check logs during execution -tail -f logs/coord.log -tail -f logs/cli.log -tail -f logs/worker_0.log +# Check logs during execution (filenames include a run timestamp) +tail -f logs/coord_*.log +tail -f logs/cli_*.log +tail -f logs/worker_0_*.log ``` ## Coordinator IP and
Web UI @@ -124,7 +124,7 @@ Results are saved to: ### Coordinator fails to start Check coordinator logs: ```bash -cat logs/coord.log +cat logs/coord_*.log ``` ### Workers not registering diff --git a/presto/slurm/presto-nvl72/functions.sh b/presto/slurm/presto-nvl72/functions.sh index 016c35b3..7c4144df 100755 --- a/presto/slurm/presto-nvl72/functions.sh +++ b/presto/slurm/presto-nvl72/functions.sh @@ -9,7 +9,7 @@ function setup { [ -z "$SLURM_JOB_PARTITION" ] && echo "required argument '--partition' not specified" && exit 1 [ -z "$SLURM_NNODES" ] && echo "required argument '--nodes' not specified" && exit 1 [ -z "$IMAGE_DIR" ] && echo "IMAGE_DIR must be set" && exit 1 - [ -z "$LOGS" ] && echo "LOGS must be set" && exit 1 + [ -z "$LOGS_DIR" ] && echo "LOGS_DIR must be set" && exit 1 [ -z "$CONFIGS" ] && echo "CONFIGS must be set" && exit 1 [ -z "$NUM_NODES" ] && echo "NUM_NODES must be set" && exit 1 [ -z "$NUM_GPUS_PER_NODE" ] && echo "NUM_GPUS_PER_NODE env variable must be set" && exit 1 @@ -58,11 +58,11 @@ function validate_environment_preconditions { # Execute script through the coordinator image (used for coordinator and cli executables) function run_coord_image { [ $# -ne 2 ] && echo_error "$0 expected one argument for '