From 849815215bf6ab39d715a48b165246cc194e3030 Mon Sep 17 00:00:00 2001 From: Chang chen Date: Mon, 9 Feb 2026 17:10:34 +0800 Subject: [PATCH 1/7] Upgrade protobuf-maven-plugin from 0.5.1 to 0.6.1 and enable checkStaleness Enable true in all protobuf-maven-plugin executions (gluten-core, gluten-substrait, backends-velox) so protobuf compilation is skipped when .proto files haven't changed, improving incremental build speed. --- backends-velox/pom.xml | 1 + gluten-core/pom.xml | 1 + gluten-substrait/pom.xml | 1 + pom.xml | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/backends-velox/pom.xml b/backends-velox/pom.xml index fa4e52817e88..72bd47305281 100644 --- a/backends-velox/pom.xml +++ b/backends-velox/pom.xml @@ -298,6 +298,7 @@ com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} src/main/resources/org/apache/gluten/proto false + true diff --git a/gluten-core/pom.xml b/gluten-core/pom.xml index 0f817143dccb..80bc53ebf140 100644 --- a/gluten-core/pom.xml +++ b/gluten-core/pom.xml @@ -171,6 +171,7 @@ com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} src/main/resources/org/apache/gluten/proto false + true diff --git a/gluten-substrait/pom.xml b/gluten-substrait/pom.xml index 7ee22dd12773..345f3a43b3fa 100644 --- a/gluten-substrait/pom.xml +++ b/gluten-substrait/pom.xml @@ -256,6 +256,7 @@ com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} src/main/resources/substrait/proto false + true diff --git a/pom.xml b/pom.xml index 21145c3d7d51..aaf97be33791 100644 --- a/pom.xml +++ b/pom.xml @@ -901,7 +901,7 @@ org.xolstice.maven.plugins protobuf-maven-plugin - 0.5.1 + 0.6.1 org.antlr From 7b70fe4ed6124a799073963ba22278bc38afea8c Mon Sep 17 00:00:00 2001 From: Chang chen Date: Mon, 9 Feb 2026 17:17:36 +0800 Subject: [PATCH 2/7] Enable Scala incremental compilation 1. Upgrade scala-maven-plugin from 4.8.0 to 4.9.2 (aligned with Spark) 2. 
Change scala.recompile.mode from 'all' to 'incremental' 3. Skip javac compilation - Zinc already handles Java sources in incremental mode (same approach as Apache Spark) 4. Add -Ybackend-parallelism 8 for both Scala 2.12 and 2.13 profiles 5. Update gluten-it to use incremental mode and 4.9.2 (hardcoded since it's a standalone third-party module without parent POM properties) --- pom.xml | 17 +++++++++++------ tools/gluten-it/common/pom.xml | 6 ++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index aaf97be33791..f92d8f798f5b 100644 --- a/pom.xml +++ b/pom.xml @@ -97,7 +97,7 @@ --> unknown - all + incremental 2.15.0 4.13.1 @@ -140,7 +140,7 @@ 3.6.0 - 4.8.0 + 4.9.2 3.14.1 3.2.2 1.0.0 @@ -570,10 +570,11 @@ UTF-8 1024m - true - - -Xlint:all,-serial,-path - + + true + true @@ -634,6 +635,8 @@ -feature -Wconf:cat=deprecation:wv,any:e -P:wartremover:traverser:io.github.zhztheplayer.scalawarts.InheritFromCaseClass + -Ybackend-parallelism + 8 @@ -1057,6 +1060,8 @@ -Wconf:cat=unchecked&msg=eliminated by erasure:s -Wconf:msg=^(?=.*?a value of type)(?=.*?cannot also be).+$:s -P:wartremover:traverser:io.github.zhztheplayer.scalawarts.InheritFromCaseClass + -Ybackend-parallelism + 8 diff --git a/tools/gluten-it/common/pom.xml b/tools/gluten-it/common/pom.xml index ba1485a17bd6..dd8caa1b4ffd 100644 --- a/tools/gluten-it/common/pom.xml +++ b/tools/gluten-it/common/pom.xml @@ -113,9 +113,11 @@ net.alchim31.maven scala-maven-plugin - 4.8.0 + + 4.9.2 - all + incremental From 5a95de1a97fd5663b162993ab8929cbe49e6b393 Mon Sep 17 00:00:00 2001 From: Chang chen Date: Mon, 9 Feb 2026 17:22:52 +0800 Subject: [PATCH 3/7] Consolidate build-info generation into a single location Merge build-info and build-info-with-backends into a single execution in gluten-core, eliminating the separate call from gluten-substrait: - Remove build-info-with-backends execution from gluten-substrait/pom.xml - Remove redundant backend profile definitions from gluten-substrait 
- Add --backend parameter to gluten-core's build-info execution - Modify gluten-build-info.sh to compute backend paths internally based on backend_type (no longer needs external path argument) - Remove DO_REMOVAL flag; always regenerate the file from scratch --- dev/gluten-build-info.sh | 19 ++++++++++--------- gluten-core/pom.xml | 3 ++- gluten-substrait/pom.xml | 41 ---------------------------------------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/dev/gluten-build-info.sh b/dev/gluten-build-info.sh index d4bd424d6714..22db64372894 100755 --- a/dev/gluten-build-info.sh +++ b/dev/gluten-build-info.sh @@ -21,10 +21,9 @@ GLUTEN_ROOT=$(cd $(dirname -- $0)/..; pwd -P) EXTRA_RESOURCE_DIR=$GLUTEN_ROOT/gluten-core/target/generated-resources BUILD_INFO="$EXTRA_RESOURCE_DIR"/gluten-build-info.properties -DO_REMOVAL="$1" && shift -if [ "true" = "$DO_REMOVAL" ]; then - rm -rf "$BUILD_INFO" -fi + +# Delete old build-info file before regenerating +rm -f "$BUILD_INFO" mkdir -p "$EXTRA_RESOURCE_DIR" function echo_revision_info() { @@ -56,13 +55,15 @@ while (( "$#" )); do echo gluten_version="$2" >> "$BUILD_INFO" ;; --backend) - echo backend_type="$2" >> "$BUILD_INFO" - if [ "velox" = "$2" ]; then - echo_velox_revision_info "$3" >> "$BUILD_INFO" - elif [ "ch" = "$2" ]; then + BACKEND_TYPE="$2" + echo backend_type="$BACKEND_TYPE" >> "$BUILD_INFO" + # Compute backend home path based on type + if [ "velox" = "$BACKEND_TYPE" ]; then + BACKEND_HOME="$GLUTEN_ROOT/ep/build-velox/build/velox_ep" + echo_velox_revision_info "$BACKEND_HOME" >> "$BUILD_INFO" + elif [ "ch" = "$BACKEND_TYPE" ] || [ "clickhouse" = "$BACKEND_TYPE" ]; then echo_clickhouse_revision_info >> "$BUILD_INFO" fi - shift ;; --java) echo java_version="$2" >> "$BUILD_INFO" diff --git a/gluten-core/pom.xml b/gluten-core/pom.xml index 80bc53ebf140..ad55c158fa0c 100644 --- a/gluten-core/pom.xml +++ b/gluten-core/pom.xml @@ -190,7 +190,6 @@ - @@ -201,6 +200,8 @@ + + diff --git a/gluten-substrait/pom.xml 
b/gluten-substrait/pom.xml index 345f3a43b3fa..543021dc9e0f 100644 --- a/gluten-substrait/pom.xml +++ b/gluten-substrait/pom.xml @@ -216,30 +216,6 @@ - - org.apache.maven.plugins - maven-antrun-plugin - - - build-info-with-backends - - run - - generate-resources - - - - - - - - - - - - - - org.xolstice.maven.plugins @@ -314,21 +290,4 @@ target/scala-${scala.binary.version}/classes target/scala-${scala.binary.version}/test-classes - - - - backends-velox - - velox - ${project.basedir}/../ep/build-velox/build/velox_ep - - - - backends-clickhouse - - ch - ${project.basedir}/../cpp-ch/ClickHouse - - - From 58015127b410537a100a7ce9335a17276ebd4a6e Mon Sep 17 00:00:00 2001 From: Chang chen Date: Mon, 9 Feb 2026 18:19:35 +0800 Subject: [PATCH 4/7] Add dev tooling: run-scala-test.sh, mvnd wrapper, and build profiler - dev/run-scala-test.sh: Run ScalaTest like IntelliJ IDEA from CLI with auto classpath resolution, profiler support, and mvnd integration - build/mvnd: Maven Daemon wrapper (auto-downloads mvnd 1.0.3) for persistent JVM that keeps Zinc's JIT caches across builds - build/mvn: Increase ReservedCodeCacheSize from 1g to 2g - dev/analyze-build-profile.py: Analyze Maven profiler JSON reports - .gitignore: Add build/mvnd, .run-scala-test-cache/, .profiler/, .mvn/ --- .gitignore | 12 +- build/mvn | 2 +- build/mvnd | 179 ++++++++++ dev/analyze-build-profile.py | 513 ++++++++++++++++++++++++++ dev/run-scala-test.sh | 673 +++++++++++++++++++++++++++++++++++ 5 files changed, 1377 insertions(+), 2 deletions(-) create mode 100755 build/mvnd create mode 100755 dev/analyze-build-profile.py create mode 100755 dev/run-scala-test.sh diff --git a/.gitignore b/.gitignore index 8a794e386545..a679fc3002ea 100644 --- a/.gitignore +++ b/.gitignore @@ -27,8 +27,9 @@ CMakeFiles/ CMakeCache.txt CTestTestfile.cmake cmake_install.cmake -!build/mvn build/ +!build/mvn +!build/mvnd *-build/ Testing/ cmake-build-*/ @@ -76,3 +77,12 @@ metastore_db/ # GitHub Copilot config (allow local 
customization) .github/copilot-instructions.md .github/copilot-setup-steps.yml + +# Dev script build cache +.run-scala-test-cache/ + +# Maven profiler reports +.profiler/ + +# Maven extensions (local config) +.mvn/ diff --git a/build/mvn b/build/mvn index f20ea7f7b82b..550f1d1435b8 100755 --- a/build/mvn +++ b/build/mvn @@ -127,7 +127,7 @@ if [ ! -f "${MVN_BIN}" ]; then exit 1 fi -_COMPILE_JVM_OPTS="-Xss128m -Xmx4g -XX:ReservedCodeCacheSize=1g" +_COMPILE_JVM_OPTS="-Xss128m -Xmx4g -XX:ReservedCodeCacheSize=2g" # Set any `mvn` options if not already present export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} diff --git a/build/mvnd b/build/mvnd new file mode 100755 index 000000000000..166c505f8162 --- /dev/null +++ b/build/mvnd @@ -0,0 +1,179 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script downloads and installs Maven Daemon (mvnd) if it's not already installed. +# mvnd keeps a persistent JVM that preserves Zinc's JIT-optimized code and +# classloader caches across builds, significantly speeding up incremental compilation. +# Usage: build/mvnd +# + +# Determine the current working directory +GLUTEN_HOME="$(cd "$(dirname "$0")"/.. 
&& pwd)" || exit 1 +DOWNLOAD_DIR="${GLUTEN_HOME}/build" +MVND_DOWNLOAD_DIR="${DOWNLOAD_DIR}/.mvnd" + +# mvnd version to download if not found on system +MVND_VERSION="1.0.3" + +# Global variable for mvnd binary path +MVND_BIN="" + +# Detect OS and architecture for mvnd download +detect_platform() { + local os arch + os="$(uname -s)" + arch="$(uname -m)" + + case "$os" in + Linux) os="linux" ;; + Darwin) os="darwin" ;; + *) + echo "ERROR: Unsupported OS: $os. mvnd binaries are available for Linux and macOS." >&2 + exit 1 + ;; + esac + + case "$arch" in + x86_64|amd64) arch="amd64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "ERROR: Unsupported architecture: $arch. mvnd binaries are available for amd64 and aarch64." >&2 + exit 1 + ;; + esac + + echo "${os}-${arch}" +} + +install_mvnd() { + # Check for system mvnd first + local SYSTEM_MVND + SYSTEM_MVND="$(command -v mvnd)" + + if [ -n "$SYSTEM_MVND" ]; then + local MVND_DETECTED_VERSION + MVND_DETECTED_VERSION="$(mvnd --version 2>/dev/null | grep -i 'mvnd' | head -1 | awk '{print $NF}')" + echo "Using system mvnd: $SYSTEM_MVND (version $MVND_DETECTED_VERSION)" >&2 + MVND_BIN="$SYSTEM_MVND" + return 0 + fi + + # Detect platform + local PLATFORM + PLATFORM="$(detect_platform)" + + local MVND_DIR_NAME="maven-mvnd-${MVND_VERSION}-${PLATFORM}" + local MVND_LOCAL_BIN="${MVND_DOWNLOAD_DIR}/${MVND_DIR_NAME}/bin/mvnd" + + if [ ! -f "${MVND_LOCAL_BIN}" ]; then + echo "mvnd ${MVND_VERSION} not found locally. Downloading..." >&2 + + # Create download directory + mkdir -p "${MVND_DOWNLOAD_DIR}" + + local MVND_TAR="${MVND_DOWNLOAD_DIR}/${MVND_DIR_NAME}.tar.gz" + + if [ ! -f "${MVND_TAR}" ]; then + # Download from Apache + local DOWNLOAD_URL="https://downloads.apache.org/maven/mvnd/${MVND_VERSION}/${MVND_DIR_NAME}.tar.gz" + + echo "Downloading mvnd ${MVND_VERSION} for ${PLATFORM}..." 
>&2 + echo "URL: ${DOWNLOAD_URL}" >&2 + + if command -v curl > /dev/null 2>&1; then + curl -f -L --retry 3 --retry-delay 3 \ + --connect-timeout 30 --max-time 600 \ + -o "${MVND_TAR}" "${DOWNLOAD_URL}" || { + echo "ERROR: Failed to download mvnd from ${DOWNLOAD_URL}" >&2 + rm -f "${MVND_TAR}" + exit 1 + } + elif command -v wget > /dev/null 2>&1; then + wget --tries=3 --waitretry=3 \ + --connect-timeout=30 --read-timeout=600 \ + -O "${MVND_TAR}" "${DOWNLOAD_URL}" || { + echo "ERROR: Failed to download mvnd from ${DOWNLOAD_URL}" >&2 + rm -f "${MVND_TAR}" + exit 1 + } + else + echo "ERROR: Neither curl nor wget found. Please install one of them or install mvnd manually." >&2 + exit 1 + fi + + echo "Download completed successfully" >&2 + fi + + # Extract mvnd + echo "Extracting mvnd to ${MVND_DOWNLOAD_DIR}..." >&2 + if ! tar -xzf "${MVND_TAR}" -C "${MVND_DOWNLOAD_DIR}"; then + echo "ERROR: Failed to extract mvnd" >&2 + rm -f "${MVND_TAR}" + exit 1 + fi + + # Clean up tar file + rm -f "${MVND_TAR}" + + # Configure mvnd daemon JVM settings for Scala compilation + # ReservedCodeCacheSize=2g: Scala compiler generates enormous JIT code; + # default 240M fills up, causing ~10x slowdown in interpreted mode. 
+ local MVND_PROPS="${MVND_DOWNLOAD_DIR}/${MVND_DIR_NAME}/conf/mvnd.properties" + if [ -f "${MVND_PROPS}" ]; then + echo "" >> "${MVND_PROPS}" + echo "# Gluten: tuned for Scala compilation workloads" >> "${MVND_PROPS}" + echo "mvnd.maxHeapSize = 30G" >> "${MVND_PROPS}" + echo "mvnd.threadStackSize = 128M" >> "${MVND_PROPS}" + echo "mvnd.jvmArgs = -XX:ReservedCodeCacheSize=2g" >> "${MVND_PROPS}" + echo "Configured mvnd daemon JVM settings in ${MVND_PROPS}" >&2 + fi + + echo "mvnd ${MVND_VERSION} installed successfully to ${MVND_DOWNLOAD_DIR}/${MVND_DIR_NAME}" >&2 + else + echo "Using downloaded mvnd: ${MVND_LOCAL_BIN} (version ${MVND_VERSION})" >&2 + fi + + # Set global variable + MVND_BIN="${MVND_LOCAL_BIN}" +} + +# Install mvnd if needed +install_mvnd + +# Verify mvnd binary is set +if [ -z "${MVND_BIN}" ]; then + echo "ERROR: mvnd binary not found. Please install mvnd or check your installation." >&2 + exit 1 +fi + +# Verify mvnd binary exists +if [ ! -f "${MVND_BIN}" ]; then + echo "ERROR: mvnd binary does not exist: ${MVND_BIN}" >&2 + exit 1 +fi + +_COMPILE_JVM_OPTS="-Xss128m -Xmx4g -XX:ReservedCodeCacheSize=2g" +# Set any `mvn` options if not already present +export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} + +echo "MAVEN_OPTS: ${MAVEN_OPTS}" >&2 + +"${MVND_BIN}" "$@" + +MVND_RETCODE=$? + +exit $MVND_RETCODE diff --git a/dev/analyze-build-profile.py b/dev/analyze-build-profile.py new file mode 100755 index 000000000000..2306579eb648 --- /dev/null +++ b/dev/analyze-build-profile.py @@ -0,0 +1,513 @@ +#!/usr/bin/env python3 +""" +Analyze maven-profiler JSON reports and output Markdown summary. 
+ +Usage: + # Single report analysis + ./dev/analyze-build-profile.py .profiler/profiler-report-*.json + + # Compare two reports (e.g., clean vs incremental) + ./dev/analyze-build-profile.py --compare clean.json incremental.json + + # Auto-pick latest report(s) + ./dev/analyze-build-profile.py --latest + ./dev/analyze-build-profile.py --latest --compare +""" + +import argparse +import glob +import json +import os +import re +import sys +from collections import defaultdict +from pathlib import Path + + +# ============================================================================= +# Parsing +# ============================================================================= + +def parse_time_ms(time_str: str) -> int: + """Parse '49612 ms' -> 49612.""" + m = re.match(r"(\d+)\s*ms", time_str.strip()) + return int(m.group(1)) if m else 0 + + +def parse_mojo_name(mojo_str: str): + """ + Parse 'groupId:artifactId:version:goal {execution: id}' + Returns (short_plugin, goal, execution_id). + """ + exec_id = "" + m = re.search(r"\{execution:\s*([^}]+)\}", mojo_str) + if m: + exec_id = m.group(1).strip() + + parts_str = re.sub(r"\s*\{.*\}", "", mojo_str).strip() + parts = parts_str.split(":") + if len(parts) >= 4: + artifact = parts[1] + goal = parts[3] + elif len(parts) >= 2: + artifact = parts[0] + goal = parts[-1] + else: + artifact = parts_str + goal = "" + + return artifact, goal, exec_id + + +def load_report(path: str) -> dict: + """Load and normalize a profiler JSON report.""" + with open(path) as f: + data = json.load(f) + + report = { + "file": os.path.basename(path), + "name": data.get("name", ""), + "date": data.get("date", ""), + "goals": data.get("goals", ""), + "total_ms": parse_time_ms(data.get("time", "0 ms")), + "modules": [], + } + + for proj in data.get("projects", []): + module = { + "name": proj["project"], + "time_ms": parse_time_ms(proj.get("time", "0 ms")), + "mojos": [], + } + for mojo in proj.get("mojos", []): + plugin, goal, exec_id = 
parse_mojo_name(mojo["mojo"]) + module["mojos"].append({ + "plugin": plugin, + "goal": goal, + "exec_id": exec_id, + "time_ms": parse_time_ms(mojo.get("time", "0 ms")), + "raw": mojo["mojo"], + }) + report["modules"].append(module) + + return report + + +# ============================================================================= +# Formatting helpers +# ============================================================================= + +def fmt_ms(ms: int) -> str: + """Format milliseconds to human-readable string.""" + if ms >= 60000: + return f"{ms / 60000:.1f}m" + elif ms >= 1000: + return f"{ms / 1000:.1f}s" + else: + return f"{ms}ms" + + +def bar(ratio: float, width: int = 20) -> str: + """Generate a text bar chart.""" + filled = int(ratio * width) + return "█" * filled + "░" * (width - filled) + + +def pct(part: int, total: int) -> str: + if total == 0: + return "0.0%" + return f"{part / total * 100:.1f}%" + + +# ============================================================================= +# Analysis: Single Report +# ============================================================================= + +def section_header(report: dict) -> str: + lines = [ + "# Maven Build Profile Analysis", + "", + "| Item | Value |", + "|------|-------|", + f"| **Project** | {report['name']} |", + f"| **Date** | {report['date']} |", + f"| **Goals** | `{report['goals']}` |", + f"| **Total Time** | **{fmt_ms(report['total_ms'])}** ({report['total_ms']}ms) |", + f"| **Modules** | {len(report['modules'])} |", + "", + ] + return "\n".join(lines) + + +def section_modules(report: dict) -> str: + """Module time ranking.""" + total = report["total_ms"] + modules = sorted(report["modules"], key=lambda m: m["time_ms"], reverse=True) + max_time = modules[0]["time_ms"] if modules else 1 + + lines = [ + "## 1. 
Module Time Ranking", + "", + "| # | Module | Time | % | Distribution |", + "|---|--------|------|---|-------------|", + ] + for i, m in enumerate(modules, 1): + ratio = m["time_ms"] / max_time if max_time > 0 else 0 + lines.append( + f"| {i} | {m['name']} | {fmt_ms(m['time_ms'])} | {pct(m['time_ms'], total)} | `{bar(ratio)}` |" + ) + + sum_modules = sum(m["time_ms"] for m in modules) + overhead = total - sum_modules + if overhead > 100: + lines.append(f"| | *Maven overhead* | {fmt_ms(overhead)} | {pct(overhead, total)} | |") + + lines.append("") + return "\n".join(lines) + + +def section_top_mojos(report: dict, top_n: int = 15) -> str: + """Top N slowest mojo executions across all modules.""" + all_mojos = [] + for m in report["modules"]: + for mojo in m["mojos"]: + if mojo["time_ms"] > 0: + all_mojos.append({ + "module": m["name"], + "plugin": mojo["plugin"], + "goal": mojo["goal"], + "exec_id": mojo["exec_id"], + "time_ms": mojo["time_ms"], + }) + + all_mojos.sort(key=lambda x: x["time_ms"], reverse=True) + total = report["total_ms"] + + lines = [ + f"## 2. Top {top_n} Slowest Mojo Executions", + "", + "| # | Module | Plugin:Goal | Time | % |", + "|---|--------|-------------|------|---|", + ] + for i, mojo in enumerate(all_mojos[:top_n], 1): + lines.append( + f"| {i} | {mojo['module']} | `{mojo['plugin']}:{mojo['goal']}` | {fmt_ms(mojo['time_ms'])} | {pct(mojo['time_ms'], total)} |" + ) + lines.append("") + return "\n".join(lines) + + +def section_goal_aggregate(report: dict) -> str: + """Aggregate time by goal type across all modules.""" + goal_times = defaultdict(int) + for m in report["modules"]: + for mojo in m["mojos"]: + key = f"{mojo['plugin']}:{mojo['goal']}" + goal_times[key] += mojo["time_ms"] + + sorted_goals = sorted(goal_times.items(), key=lambda x: x[1], reverse=True) + total = report["total_ms"] + + lines = [ + "## 3. 
Time by Goal Type (Aggregated)", + "", + "| Goal | Total Time | % |", + "|------|-----------|---|", + ] + for goal, ms in sorted_goals: + if ms > 0: + lines.append(f"| `{goal}` | {fmt_ms(ms)} | {pct(ms, total)} |") + lines.append("") + return "\n".join(lines) + + +def section_category_breakdown(report: dict) -> str: + """Categorize time into compile/testCompile/other.""" + categories = { + "scala:compile": 0, + "scala:testCompile": 0, + "java:compile": 0, + "java:testCompile": 0, + "other": 0, + } + + for m in report["modules"]: + for mojo in m["mojos"]: + if mojo["plugin"] == "scala-maven-plugin" and mojo["goal"] == "compile": + categories["scala:compile"] += mojo["time_ms"] + elif mojo["plugin"] == "scala-maven-plugin" and mojo["goal"] == "testCompile": + categories["scala:testCompile"] += mojo["time_ms"] + elif mojo["plugin"] == "maven-compiler-plugin" and mojo["goal"] == "compile": + categories["java:compile"] += mojo["time_ms"] + elif mojo["plugin"] == "maven-compiler-plugin" and mojo["goal"] == "testCompile": + categories["java:testCompile"] += mojo["time_ms"] + else: + categories["other"] += mojo["time_ms"] + + total = report["total_ms"] + sum_cat = sum(categories.values()) + overhead = total - sum_cat + + lines = [ + "## 4. 
Time by Category", + "", + "| Category | Time | % | Bar |", + "|----------|------|---|-----|", + ] + max_cat = max(categories.values()) if categories else 1 + for cat, ms in sorted(categories.items(), key=lambda x: x[1], reverse=True): + ratio = ms / max_cat if max_cat > 0 else 0 + lines.append(f"| **{cat}** | {fmt_ms(ms)} | {pct(ms, total)} | `{bar(ratio, 15)}` |") + + if overhead > 100: + lines.append(f"| *maven overhead* | {fmt_ms(overhead)} | {pct(overhead, total)} | |") + + scala_total = categories["scala:compile"] + categories["scala:testCompile"] + lines.extend([ + "", + f"> **Scala compilation**: {fmt_ms(scala_total)} ({pct(scala_total, total)} of total)", + ]) + lines.append("") + return "\n".join(lines) + + +def section_per_module_breakdown(report: dict) -> str: + """Per-module compile vs testCompile breakdown.""" + modules = sorted(report["modules"], key=lambda m: m["time_ms"], reverse=True) + + lines = [ + "## 5. Per-Module Compile Breakdown", + "", + "| Module | compile | testCompile | other | total |", + "|--------|---------|-------------|-------|-------|", + ] + for m in modules: + if m["time_ms"] < 100: + continue + compile_ms = sum( + mj["time_ms"] for mj in m["mojos"] + if mj["plugin"] == "scala-maven-plugin" and mj["goal"] == "compile" + ) + test_compile_ms = sum( + mj["time_ms"] for mj in m["mojos"] + if mj["plugin"] == "scala-maven-plugin" and mj["goal"] == "testCompile" + ) + other_ms = m["time_ms"] - compile_ms - test_compile_ms + lines.append( + f"| {m['name']} | {fmt_ms(compile_ms)} | {fmt_ms(test_compile_ms)} | {fmt_ms(other_ms)} | {fmt_ms(m['time_ms'])} |" + ) + lines.append("") + return "\n".join(lines) + + +def analyze_single(report: dict) -> str: + """Full single-report analysis.""" + return "\n".join([ + section_header(report), + section_modules(report), + section_top_mojos(report), + section_goal_aggregate(report), + section_category_breakdown(report), + section_per_module_breakdown(report), + ]) + + +# 
============================================================================= +# Analysis: Compare Two Reports +# ============================================================================= + +def _delta(a_ms: int, b_ms: int) -> str: + """Format delta with sign.""" + diff = b_ms - a_ms + if diff == 0: + return "—" + sign = "+" if diff > 0 else "" + return f"{sign}{fmt_ms(diff)}" + + +def _speedup(a_ms: int, b_ms: int) -> str: + """Calculate speedup ratio.""" + if a_ms == 0: + return "—" + if b_ms == 0: + return "∞" + ratio = a_ms / b_ms + if ratio > 1: + return f"**{ratio:.1f}x** faster" + elif ratio < 1: + return f"{1/ratio:.1f}x slower" + else: + return "—" + + +def analyze_compare(report_a: dict, report_b: dict) -> str: + """Compare two profiler reports side by side.""" + lines = [ + "# Maven Build Profile Comparison", + "", + "| | A (Baseline) | B (Current) | Δ |", + "|---|---|---|---|", + f"| **File** | `{report_a['file']}` | `{report_b['file']}` | |", + f"| **Date** | {report_a['date']} | {report_b['date']} | |", + f"| **Total** | {fmt_ms(report_a['total_ms'])} | {fmt_ms(report_b['total_ms'])} | {_delta(report_a['total_ms'], report_b['total_ms'])} |", + "", + ] + + # Module comparison + lines.extend([ + "## Module Comparison", + "", + "| Module | A | B | Δ | Speedup |", + "|--------|---|---|---|---------|", + ]) + + mods_a = {m["name"]: m for m in report_a["modules"]} + mods_b = {m["name"]: m for m in report_b["modules"]} + all_names = list(dict.fromkeys( + [m["name"] for m in report_a["modules"]] + + [m["name"] for m in report_b["modules"]] + )) + + for name in all_names: + a_ms = mods_a[name]["time_ms"] if name in mods_a else 0 + b_ms = mods_b[name]["time_ms"] if name in mods_b else 0 + if a_ms == 0 and b_ms == 0: + continue + lines.append( + f"| {name} | {fmt_ms(a_ms)} | {fmt_ms(b_ms)} | {_delta(a_ms, b_ms)} | {_speedup(a_ms, b_ms)} |" + ) + + lines.append("") + + # Category comparison + lines.extend([ + "## Category Comparison", + "", + "| Category | A 
| B | Δ |", + "|----------|---|---|---|", + ]) + + for cat_name, cat_filter in [ + ("scala:compile", lambda mj: mj["plugin"] == "scala-maven-plugin" and mj["goal"] == "compile"), + ("scala:testCompile", lambda mj: mj["plugin"] == "scala-maven-plugin" and mj["goal"] == "testCompile"), + ("other", lambda mj: not (mj["plugin"] == "scala-maven-plugin" and mj["goal"] in ("compile", "testCompile"))), + ]: + a_ms = sum(mj["time_ms"] for m in report_a["modules"] for mj in m["mojos"] if cat_filter(mj)) + b_ms = sum(mj["time_ms"] for m in report_b["modules"] for mj in m["mojos"] if cat_filter(mj)) + lines.append(f"| **{cat_name}** | {fmt_ms(a_ms)} | {fmt_ms(b_ms)} | {_delta(a_ms, b_ms)} |") + + lines.append("") + + # Per-module mojo diff (only significant changes) + lines.extend([ + "## Significant Changes (|Δ| > 500ms)", + "", + "| Module | Goal | A | B | Δ |", + "|--------|------|---|---|---|", + ]) + + for name in all_names: + mojos_a = {(mj["plugin"], mj["goal"], mj["exec_id"]): mj["time_ms"] + for mj in mods_a.get(name, {}).get("mojos", [])} if name in mods_a else {} + mojos_b = {(mj["plugin"], mj["goal"], mj["exec_id"]): mj["time_ms"] + for mj in mods_b.get(name, {}).get("mojos", [])} if name in mods_b else {} + all_keys = set(mojos_a.keys()) | set(mojos_b.keys()) + for key in sorted(all_keys, key=lambda k: abs(mojos_a.get(k, 0) - mojos_b.get(k, 0)), reverse=True): + a_ms = mojos_a.get(key, 0) + b_ms = mojos_b.get(key, 0) + if abs(b_ms - a_ms) > 500: + lines.append( + f"| {name} | `{key[0]}:{key[1]}` | {fmt_ms(a_ms)} | {fmt_ms(b_ms)} | {_delta(a_ms, b_ms)} |" + ) + + lines.append("") + return "\n".join(lines) + + +# ============================================================================= +# File Discovery +# ============================================================================= + +def find_latest_reports(base_dir: str, count: int = 1) -> list: + """Find the latest N profiler JSON reports.""" + pattern = os.path.join(base_dir, ".profiler", 
"profiler-report-*.json") + files = sorted(glob.glob(pattern), reverse=True) + if not files: + print(f"No JSON reports found in {base_dir}/.profiler/", file=sys.stderr) + print("Run build with: -Dprofile -DprofileFormat=JSON", file=sys.stderr) + sys.exit(1) + return files[:count] + + +# ============================================================================= +# CLI +# ============================================================================= + +def main(): + parser = argparse.ArgumentParser( + description="Analyze maven-profiler JSON reports and output Markdown.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s .profiler/report.json # Analyze single report + %(prog)s --compare clean.json incremental.json # Compare two reports + %(prog)s --latest # Analyze latest report + %(prog)s --latest --compare # Compare latest two reports + %(prog)s report.json -o report.md # Save to file + """, + ) + parser.add_argument("files", nargs="*", help="JSON report file(s)") + parser.add_argument("--compare", action="store_true", + help="Compare two reports (A=baseline, B=current)") + parser.add_argument("--latest", action="store_true", + help="Auto-pick latest report(s) from .profiler/") + parser.add_argument("--dir", default=".", + help="Project root directory (default: .)") + parser.add_argument("-o", "--output", + help="Output file (default: stdout)") + parser.add_argument("-n", "--top", type=int, default=15, + help="Top N mojos to show (default: 15)") + + args = parser.parse_args() + + # Resolve input files + if args.latest: + count = 2 if args.compare else 1 + files = find_latest_reports(args.dir, count) + if args.compare and len(files) < 2: + print("Need at least 2 JSON reports for comparison.", + file=sys.stderr) + sys.exit(1) + elif args.files: + files = args.files + else: + files = find_latest_reports(args.dir, 2 if args.compare else 1) + + # Validate files exist + for f in files: + if not os.path.isfile(f): + print(f"File 
not found: {f}", file=sys.stderr) + sys.exit(1) + + # Run analysis + if args.compare: + if len(files) < 2: + print("Need exactly 2 files for comparison.", file=sys.stderr) + sys.exit(1) + report_a = load_report(files[1]) # older = baseline + report_b = load_report(files[0]) # newer = current + output = analyze_compare(report_a, report_b) + else: + report = load_report(files[0]) + output = analyze_single(report) + + # Output + if args.output: + Path(args.output).write_text(output) + print(f"Report written to {args.output}", file=sys.stderr) + else: + print(output) + + +if __name__ == "__main__": + main() diff --git a/dev/run-scala-test.sh b/dev/run-scala-test.sh new file mode 100755 index 000000000000..4cc190808507 --- /dev/null +++ b/dev/run-scala-test.sh @@ -0,0 +1,673 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ============================================================================= +# run-scala-test.sh - Run ScalaTest like IntelliJ IDEA +# ============================================================================= +# +# This script simulates IntelliJ IDEA's ScalaTest execution to allow running +# individual test methods, which is not possible with standard Maven commands +# due to the conflict between -Dsuites and -am parameters. +# +# Usage: +# ./dev/run-scala-test.sh [options] -P -pl -s [-t "test name"] +# +# Examples: +# # Run entire suite +# ./dev/run-scala-test.sh \ +# -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \ +# -pl gluten-ut/spark40 \ +# -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite +# +# # Run single test method +# ./dev/run-scala-test.sh \ +# -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \ +# -pl gluten-ut/spark40 \ +# -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite \ +# -t "typed aggregation: class input with reordering" +# +# # With Maven Daemon (mvnd) for faster builds +# ./dev/run-scala-test.sh \ +# -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \ +# -pl gluten-ut/spark40 \ +# -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite \ +# --mvnd +# +# # With Maven profiler enabled +# ./dev/run-scala-test.sh \ +# -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \ +# -pl gluten-ut/spark40 \ +# -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite \ +# --profile +# +# # Export classpath only (no test execution) +# ./dev/run-scala-test.sh \ +# -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \ +# -pl gluten-ut/spark40 \ +# -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite \ +# --export-only +# +# ============================================================================= + +set -e + +# Get script directory and project root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 
+GLUTEN_HOME="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+CYAN='\033[0;36m'
+MAGENTA='\033[0;35m'
+
+# =============================================================================
+# Timing helpers
+# =============================================================================
+
+# Current wall-clock time in nanoseconds since the epoch.
+# NOTE(review): '%N' is a GNU date extension; on BSD/macOS date it prints a
+# literal 'N', which would break the bc arithmetic below — confirm this
+# script is Linux-only before relying on it elsewhere.
+timer_now() {
+  date +%s%N
+}
+
+# Returns elapsed time in seconds (with milliseconds)
+# $1 - start timestamp in ns, $2 - end timestamp in ns (both from timer_now).
+# Prints the difference in seconds with one decimal place on stdout.
+timer_elapsed() {
+  local start=$1 end=$2
+  echo "scale=1; ($end - $start) / 1000000000" | bc
+}
+
+# Pretty-print a duration given in (possibly fractional) seconds,
+# e.g. "75.3" -> "1m 15.3s", "9.8" -> "9.8s".
+format_duration() {
+  local secs=$1
+  local mins=$(echo "$secs / 60" | bc)
+  local remaining=$(echo "scale=1; $secs - $mins * 60" | bc)
+  if [[ "$mins" -gt 0 ]]; then
+    printf "%dm %.1fs" "$mins" "$remaining"
+  else
+    printf "%.1fs" "$remaining"
+  fi
+}
+
+# Emit a "[TIME] label: duration" line on stdout, but only when --profile was
+# given (ENABLE_PROFILER=true); otherwise a silent no-op.
+# $1 - label text, $2 - elapsed seconds (as produced by timer_elapsed).
+log_timing() {
+  [[ "$ENABLE_PROFILER" != "true" ]] && return
+  local label=$1 elapsed=$2
+  local formatted=$(format_duration "$elapsed")
+  echo -e "${CYAN}[TIME]${NC} ${label}: ${MAGENTA}${formatted}${NC}"
+}
+
+# Print timing summary table.
Args: include_step4 (true/false) +print_timing_summary() { + [[ "$ENABLE_PROFILER" != "true" ]] && return + local include_step4=${1:-false} + local mvn_total=$(echo "$TIMING_STEP1 + $TIMING_STEP2" | bc) + local overall=$(echo "$mvn_total + $TIMING_STEP3" | bc) + local cached_tag="" + [[ "$CACHE_HIT" == "true" ]] && cached_tag=" (cached)" + + echo "" + echo -e "${CYAN}==========================================${NC}" + echo -e "${CYAN} ⏱ Timing Summary${NC}" + echo -e "${CYAN}==========================================${NC}" + printf " %-40s %s\n" "Step 1 - Compile + Classpath (mvn):" "$(format_duration $TIMING_STEP1)${cached_tag}" + printf " %-40s %s\n" "Step 2 - Evaluate JVM args (mvn):" "$(format_duration $TIMING_STEP2)${cached_tag}" + printf " %-40s %s\n" "Step 3 - Resolve classpath:" "$(format_duration $TIMING_STEP3)" + if [[ "$include_step4" == "true" ]]; then + printf " %-40s %s\n" "Step 4 - ScalaTest execution:" "$(format_duration $TIMING_STEP4)" + overall=$(echo "$overall + $TIMING_STEP4" | bc) + fi + echo " ------------------------------------------" + printf " %-40s %s\n" "Maven total:" "$(format_duration $mvn_total)" + if [[ "$include_step4" == "true" ]]; then + printf " %-40s %s\n" "Test execution:" "$(format_duration $TIMING_STEP4)" + fi + printf " %-40s %s\n" "Overall:" "$(format_duration $overall)" + echo -e "${CYAN}==========================================${NC}" +} + +# ============================================================================= +# Module Mapping: artifactId -> directory path +# ============================================================================= +# Format: "artifactId:directory:type" +# type: "scala" for target/scala-X.XX/classes, "java" for target/classes +declare -A MODULE_MAP=( + # Core modules + ["gluten-core"]="gluten-core:scala" + ["gluten-substrait"]="gluten-substrait:scala" + ["gluten-ui"]="gluten-ui:scala" + ["gluten-arrow"]="gluten-arrow:scala" + + # Backend modules + ["backends-velox"]="backends-velox:scala" + 
["backends-clickhouse"]="backends-clickhouse:scala" + + # RAS modules + ["gluten-ras-common"]="gluten-ras/common:scala" + ["gluten-ras-planner"]="gluten-ras/planner:scala" + + # Shims modules (Java only, no scala subdirectory) + ["spark-sql-columnar-shims-common"]="shims/common:java" + ["spark-sql-columnar-shims-spark32"]="shims/spark32:java" + ["spark-sql-columnar-shims-spark33"]="shims/spark33:java" + ["spark-sql-columnar-shims-spark34"]="shims/spark34:java" + ["spark-sql-columnar-shims-spark35"]="shims/spark35:java" + ["spark-sql-columnar-shims-spark40"]="shims/spark40:java" + ["spark-sql-columnar-shims-spark41"]="shims/spark41:java" + + # Unit test modules + ["gluten-ut-common"]="gluten-ut/common:scala" + ["gluten-ut-test"]="gluten-ut/test:scala" + ["gluten-ut-spark32"]="gluten-ut/spark32:scala" + ["gluten-ut-spark33"]="gluten-ut/spark33:scala" + ["gluten-ut-spark34"]="gluten-ut/spark34:scala" + ["gluten-ut-spark35"]="gluten-ut/spark35:scala" + ["gluten-ut-spark40"]="gluten-ut/spark40:scala" + ["gluten-ut-spark41"]="gluten-ut/spark41:scala" + + # Data lake modules + ["gluten-delta"]="gluten-delta:scala" + ["gluten-iceberg"]="gluten-iceberg:scala" + ["gluten-hudi"]="gluten-hudi:scala" + ["gluten-paimon"]="gluten-paimon:scala" + + # Shuffle modules + ["gluten-celeborn"]="gluten-celeborn:scala" + ["gluten-uniffle"]="gluten-uniffle:scala" + + # Other modules + ["gluten-kafka"]="gluten-kafka:scala" +) + +# ============================================================================= +# Functions +# ============================================================================= + +print_usage() { + cat << EOF +Usage: $0 [options] -P -pl -s [-t "test name"] + +Required: + -P Maven profiles (e.g., -Pjava-17,spark-4.0,scala-2.13,backends-velox) + -pl Target module (e.g., gluten-ut/spark40) + -s Full suite class name + +Optional: + -t "test name" Specific test method name to run + --mvnd Use Maven Daemon (mvnd) instead of ./build/mvn + --clean Run 'mvn clean' before 
compiling + --force Force Maven rebuild, bypass build cache + --profile Enable Maven profiler (reports in .profiler/) + --export-only Export classpath and exit (no test execution) + --help Show this help message + +Examples: + # Run entire suite + $0 -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \\ + -pl gluten-ut/spark40 \\ + -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite + + # Run single test method + $0 -Pjava-17,spark-4.0,scala-2.13,backends-velox,hadoop-3.3,spark-ut \\ + -pl gluten-ut/spark40 \\ + -s org.apache.spark.sql.GlutenDeprecatedDatasetAggregatorSuite \\ + -t "typed aggregation: class input with reordering" +EOF +} + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +log_step() { + echo -e "${BLUE}[STEP]${NC} $1" +} + +# Detect Scala version from profiles +detect_scala_version() { + local profiles="$1" + if [[ "$profiles" == *"scala-2.13"* ]]; then + echo "2.13" + else + echo "2.12" + fi +} + +# Get target classes directory for a module +get_target_dir() { + local artifact_id="$1" + local scala_version="$2" + local class_type="$3" + + local mapping="${MODULE_MAP[$artifact_id]}" + if [[ -z "$mapping" ]]; then + return 1 + fi + + local dir_path="${mapping%%:*}" + local module_type="${mapping##*:}" + + local target_path="${GLUTEN_HOME}/${dir_path}/target" + + if [[ "$module_type" == "scala" ]]; then + echo "${target_path}/scala-${scala_version}/${class_type}" + else + echo "${target_path}/${class_type}" + fi +} + +# Replace gluten jar paths with target/classes directories +replace_gluten_paths() { + local classpath="$1" + local scala_version="$2" + local result="" + local added_paths="" + + IFS=':' read -ra CP_ENTRIES <<< "$classpath" + for entry in "${CP_ENTRIES[@]}"; do + if [[ "$entry" != *"/org/apache/gluten/"* && "$entry" != *"/gluten-"*"/target/"* ]]; then + result="${result}:${entry}" + continue + fi + 
+ # Local target directories are already in the desired form from reactor builds + if [[ -d "$entry" && "$entry" == *"/target/"* ]]; then + result="${result}:${entry}" + continue + fi + + local filename=$(basename "$entry") + local artifact_id="" + for known_artifact in "${!MODULE_MAP[@]}"; do + if [[ "$filename" == "${known_artifact}-"* ]]; then + artifact_id="$known_artifact" + break + fi + done + + if [[ -z "$artifact_id" ]]; then + log_error "Unknown gluten module in classpath: $entry" >&2 + log_error "Please add it to MODULE_MAP in this script." >&2 + exit 1 + fi + + local class_type="classes" + [[ "$filename" == *"-tests.jar" ]] && class_type="test-classes" + + local target_dir=$(get_target_dir "$artifact_id" "$scala_version" "$class_type") + if [[ -d "$target_dir" && "$added_paths" != *"$target_dir"* ]]; then + result="${result}:${target_dir}" + added_paths="${added_paths}:${target_dir}" + fi + done + + echo "${result#:}" +} + +# ============================================================================= +# Parse Arguments +# ============================================================================= + +PROFILES="" +MODULE="" +SUITE="" +TEST_METHOD="" +EXTRA_MVN_ARGS="" +ENABLE_PROFILER=false +EXPORT_ONLY=false +ENABLE_CLEAN=false +FORCE_BUILD=false +USE_MVND=false + +while [[ $# -gt 0 ]]; do + case $1 in + -P*) + PROFILES="${1#-P}" + shift + ;; + -pl) + MODULE="$2" + shift 2 + ;; + -s) + SUITE="$2" + shift 2 + ;; + -t) + TEST_METHOD="$2" + shift 2 + ;; + --profile) + ENABLE_PROFILER=true + shift + ;; + --clean) + ENABLE_CLEAN=true + shift + ;; + --force) + FORCE_BUILD=true + shift + ;; + --mvnd) + USE_MVND=true + shift + ;; + --export-only) + EXPORT_ONLY=true + shift + ;; + --help) + print_usage + exit 0 + ;; + *) + # Collect other arguments for Maven + EXTRA_MVN_ARGS="${EXTRA_MVN_ARGS} $1" + shift + ;; + esac +done + +# Validate required arguments +if [[ -z "$PROFILES" ]]; then + log_error "Missing required argument: -P" + print_usage + exit 1 +fi + 
+if [[ -z "$MODULE" ]]; then + log_error "Missing required argument: -pl " + print_usage + exit 1 +fi + +if [[ -z "$SUITE" ]]; then + log_error "Missing required argument: -s " + print_usage + exit 1 +fi + +# Detect Scala version +SCALA_VERSION=$(detect_scala_version "$PROFILES") +log_info "Detected Scala version: ${SCALA_VERSION}" + +# ============================================================================= +# Build Cache - Skip Maven when source files haven't changed +# ============================================================================= +# +# After a successful Maven build, a sentinel file is touched. On subsequent +# runs, if no .scala/.java/pom.xml files are newer than the sentinel, Maven +# is skipped entirely and cached classpath/JVM args are reused. This saves +# ~52s of Zinc analysis loading overhead per run. +# +# Use --force to bypass the cache, or --clean which implicitly bypasses it. +# ============================================================================= + +CACHE_DIR="${GLUTEN_HOME}/.run-scala-test-cache" +mkdir -p "$CACHE_DIR" +CACHE_KEY=$(echo "${PROFILES}__${MODULE}" | md5sum | cut -d' ' -f1) +BUILD_SENTINEL="${CACHE_DIR}/sentinel_${CACHE_KEY}" +CLASSPATH_CACHE="${CACHE_DIR}/classpath_${CACHE_KEY}.txt" +JVM_ARGS_CACHE="${CACHE_DIR}/jvm_args_${CACHE_KEY}.txt" + +can_skip_maven() { + [[ "$ENABLE_CLEAN" == "true" ]] && return 1 + [[ "$FORCE_BUILD" == "true" ]] && return 1 + [[ ! -f "$BUILD_SENTINEL" || ! -f "$CLASSPATH_CACHE" || ! 
-f "$JVM_ARGS_CACHE" ]] && return 1 + + # Check if any source file or pom.xml changed since last successful build + local changed + changed=$(find "${GLUTEN_HOME}" \ + \( -path "*/target" -o -path "*/.git" -o -path "*/.run-scala-test-cache" \ + -o -path "*/.profiler" -o -path "*/.mvn" \) -prune -o \ + -newer "$BUILD_SENTINEL" \( \ + -name "pom.xml" -o \ + \( -path "*/src/*" \( -name "*.scala" -o -name "*.java" \) \) \ + \) -print 2>/dev/null | head -1) + + [[ -z "$changed" ]] +} + +save_build_cache() { + echo "$RAW_CLASSPATH" > "$CLASSPATH_CACHE" + echo "$JVM_ARGS" > "$JVM_ARGS_CACHE" + touch "$BUILD_SENTINEL" + log_info "Build cache saved" +} + +if can_skip_maven; then + log_step "Steps 1-2: Using cached build (no source changes detected)" + RAW_CLASSPATH=$(cat "$CLASSPATH_CACHE") + JVM_ARGS=$(cat "$JVM_ARGS_CACHE") + TIMING_STEP1=0 + TIMING_STEP2=0 + CACHE_HIT=true + log_info "Cached classpath (${#RAW_CLASSPATH} chars)" + log_info "Cached JVM args: ${JVM_ARGS:0:100}..." + log_info "Maven compilation skipped — use --force to rebuild" +else + +# ============================================================================= +# Step 0: Ensure maven-profiler extension is installed (optional) +# ============================================================================= + +MVN_EXTENSIONS_FILE="${GLUTEN_HOME}/.mvn/extensions.xml" +if [[ ! -f "${MVN_EXTENSIONS_FILE}" ]]; then + log_info "Creating .mvn/extensions.xml for maven-profiler..." + mkdir -p "${GLUTEN_HOME}/.mvn" + cat > "${MVN_EXTENSIONS_FILE}" << 'EXTENSIONS_EOF' + + + + fr.jcgay.maven + maven-profiler + 3.3 + + +EXTENSIONS_EOF +fi + +# ============================================================================= +# Step 1: Compile and get classpath +# ============================================================================= + +log_step "Step 1: Compiling and getting classpath..." 
+
+TIMER_STEP1_START=$(timer_now)
+
+# Use an unpredictable temp file (was a fixed /tmp/...-$$.txt name, which is
+# guessable and symlink-attackable) and remove it on any exit path.
+# NOTE(review): no other EXIT trap is visible in this script — confirm before
+# merging, since bash keeps only one EXIT trap.
+CLASSPATH_FILE="$(mktemp "${TMPDIR:-/tmp}/gluten-test-classpath-XXXXXX")"
+trap 'rm -f -- "${CLASSPATH_FILE}"' EXIT
+
+cd "${GLUTEN_HOME}"
+
+log_info "Running: ./build/mvn test-compile dependency:build-classpath -pl ${MODULE} -am -P${PROFILES} ..."
+
+# Enable Maven profiler if requested
+if [[ "$ENABLE_PROFILER" == "true" ]]; then
+  EXTRA_MVN_ARGS="${EXTRA_MVN_ARGS} -Dprofile -DprofileFormat=JSON"
+  log_info "Profiler enabled - timing summary will be printed; JSON reports in .profiler/"
+fi
+
+# Build Maven goals
+MVN_GOALS="test-compile dependency:build-classpath"
+if [[ "$ENABLE_CLEAN" == "true" ]]; then
+  MVN_GOALS="clean ${MVN_GOALS}"
+  log_info "Clean build requested"
+fi
+
+# Select Maven command: --mvnd uses Maven Daemon (persistent JVM that
+# keeps Zinc's JIT-optimized code and classloader caches across builds).
+if [[ "$USE_MVND" == "true" ]]; then
+  MVN_CMD="./build/mvnd"
+  log_info "Using Maven Daemon (mvnd) for faster builds"
+else
+  MVN_CMD="./build/mvn"
+fi
+
+# MVN_GOALS and EXTRA_MVN_ARGS are intentionally left unquoted: they hold
+# space-separated words that must undergo word-splitting into arguments.
+${MVN_CMD} ${MVN_GOALS} \
+  -T 1C -q \
+  -pl "${MODULE}" -am \
+  -P"${PROFILES}" \
+  -DincludeScope=test \
+  -Dmdep.outputFile="${CLASSPATH_FILE}" \
+  -Dspotless.check.skip=true \
+  -Dscalastyle.skip=true \
+  -Dcheckstyle.skip=true \
+  -Dmaven.gitcommitid.skip=true \
+  -Dremoteresources.skip=true \
+  ${EXTRA_MVN_ARGS}
+
+# mktemp pre-creates the file, so test for non-empty content (-s) rather than
+# mere existence (-f) to detect a failed dependency:build-classpath run.
+if [[ ! -s "${CLASSPATH_FILE}" ]]; then
+  log_error "Failed to generate classpath file"
+  exit 1
+fi
+
+RAW_CLASSPATH=$(cat "${CLASSPATH_FILE}")
+log_info "Got raw classpath (${#RAW_CLASSPATH} chars)"
+
+TIMER_STEP1_END=$(timer_now)
+TIMING_STEP1=$(timer_elapsed "$TIMER_STEP1_START" "$TIMER_STEP1_END")
+log_timing "Step 1 - Compile + Classpath (mvn)" "$TIMING_STEP1"
+
+# =============================================================================
+# Step 2: Get JVM arguments
+# =============================================================================
+
+log_step "Step 2: Getting JVM arguments from pom.xml..."
+ +TIMER_STEP2_START=$(timer_now) + +JVM_ARGS_RAW=$(${MVN_CMD} help:evaluate \ + -Dexpression=extraJavaTestArgs \ + -q -DforceStdout \ + -P"${PROFILES}" 2>/dev/null || echo "") + +# Clean up JVM args: remove comments, trim whitespace, convert to single line +JVM_ARGS=$(echo "$JVM_ARGS_RAW" | \ + grep -v '