Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
65e1ff5
from_protobuf
thirtiseven Feb 26, 2026
a225f9a
address self review comments
thirtiseven Feb 26, 2026
e517e43
fix schema projection
thirtiseven Feb 26, 2026
bc1bee7
address comments
thirtiseven Feb 28, 2026
62679af
bug fix and clean up
thirtiseven Mar 2, 2026
f7d9551
bug fix and clean up
thirtiseven Mar 3, 2026
802488d
address cc comments
thirtiseven Mar 3, 2026
030fdf8
codex review and address
thirtiseven Mar 3, 2026
73ce21b
verify and fix shim build error
thirtiseven Mar 3, 2026
685c885
Update copyright year in pom.xml
thirtiseven Mar 3, 2026
781e639
Update copyright year in pom.xml
thirtiseven Mar 3, 2026
43a4c08
clean up
thirtiseven Mar 3, 2026
47897e9
Merge branch 'main' into from_protobuf_nested
thirtiseven Mar 3, 2026
c1ef9fb
address comments
thirtiseven Mar 5, 2026
8a2c007
address comments
thirtiseven Mar 5, 2026
035fd1d
address comments
thirtiseven Mar 5, 2026
c812754
fix shim
thirtiseven Mar 5, 2026
4551f28
address comments
thirtiseven Mar 5, 2026
ca3b7c5
address comments
thirtiseven Mar 5, 2026
6860c91
address comments
thirtiseven Mar 5, 2026
c1c919b
address comments
thirtiseven Mar 5, 2026
d71f7a1
address comments
thirtiseven Mar 5, 2026
0c9385a
address comments
thirtiseven Mar 6, 2026
303b4f8
address comments
thirtiseven Mar 6, 2026
981b75c
address comments
thirtiseven Mar 6, 2026
539af83
Batch merging after schema projection
thirtiseven Mar 11, 2026
46861ec
Merge remote-tracking branch 'origin/main' into from_protobuf_nested
thirtiseven Mar 11, 2026
19eb983
fix the enum bug
thirtiseven Mar 11, 2026
3059a50
reflection refactor
thirtiseven Mar 12, 2026
2c47aee
equals and hashcode
thirtiseven Mar 12, 2026
cea7594
address comments
thirtiseven Mar 12, 2026
9467ebc
address comments
thirtiseven Mar 12, 2026
c647776
address comments
thirtiseven Mar 12, 2026
be7bdc6
address comments
thirtiseven Mar 12, 2026
16f2f6e
address commit
thirtiseven Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ abstract class DeltaProviderBase extends DeltaIOProvider {
//
case dvRoot @ GpuProjectExec(outputList,
dvFilter @ GpuFilterExec(condition,
dvFilterInput @ GpuProjectExec(inputList, fsse: GpuFileSourceScanExec, _)), _)
dvFilterInput @ GpuProjectExec(inputList, fsse: GpuFileSourceScanExec, _, _)), _, _)
if condition.references.exists(_.name == IS_ROW_DELETED_COLUMN_NAME) &&
!outputList.exists(_.name == "_metadata") && inputList.exists(_.name == "_metadata") =>
dvRoot.withNewChildren(Seq(
Expand Down Expand Up @@ -256,7 +256,7 @@ object DVPredicatePushdown extends ShimPredicateHelper {

def pruneIsRowDeletedColumn(plan: SparkPlan): SparkPlan = {
plan.transformUp {
case project @ GpuProjectExec(projectList, _, _) =>
case project @ GpuProjectExec(projectList, _, _, _) =>
val newProjList = projectList.filterNot(isRowDeletedColumnRef(_))
project.copy(projectList = newProjList)
case fsse: GpuFileSourceScanExec =>
Expand Down Expand Up @@ -307,11 +307,16 @@ object DVPredicatePushdown extends ShimPredicateHelper {
def mergeIdenticalProjects(plan: SparkPlan): SparkPlan = {
plan.transformUp {
case p @ GpuProjectExec(projList1,
GpuProjectExec(projList2, child, enablePreSplit1), enablePreSplit2) =>
GpuProjectExec(projList2, child, enablePreSplit1, forcePostProjectCoalesce1),
enablePreSplit2, forcePostProjectCoalesce2) =>
val projSet1 = projList1.map(_.exprId).toSet
val projSet2 = projList2.map(_.exprId).toSet
if (projSet1 == projSet2) {
GpuProjectExec(projList1, child, enablePreSplit1 && enablePreSplit2)
GpuProjectExec(
projList1,
child,
enablePreSplit1 && enablePreSplit2,
forcePostProjectCoalesce1 || forcePostProjectCoalesce2)
} else {
p
}
Expand Down
4 changes: 2 additions & 2 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -8267,7 +8267,7 @@ are limited.
<td> </td>
<td> </td>
<td> </td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT, DAYTIME, YEARMONTH</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types CALENDAR, UDT, DAYTIME, YEARMONTH</em></td>
<td> </td>
<td> </td>
<td> </td>
Expand All @@ -8290,7 +8290,7 @@ are limited.
<td> </td>
<td> </td>
<td> </td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT, DAYTIME, YEARMONTH</em></td>
<td><em>PS<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types CALENDAR, UDT, DAYTIME, YEARMONTH</em></td>
<td> </td>
<td> </td>
<td> </td>
Expand Down
81 changes: 79 additions & 2 deletions integration_tests/run_pyspark_from_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
# To run all tests, including Avro tests:
# INCLUDE_SPARK_AVRO_JAR=true ./run_pyspark_from_build.sh
#
# To run tests WITHOUT Protobuf tests (protobuf is included by default):
# INCLUDE_SPARK_PROTOBUF_JAR=false ./run_pyspark_from_build.sh
#
# To run a specific test:
# TEST=my_test ./run_pyspark_from_build.sh
#
Expand Down Expand Up @@ -141,9 +144,82 @@ else
AVRO_JARS=""
fi

# ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar if they exist
# Protobuf support: Include spark-protobuf jar by default for protobuf_test.py
# Set INCLUDE_SPARK_PROTOBUF_JAR=false to disable
# NOTE(review): this section relies on TARGET_DIR, SCALA_VERSION, VERSION_STRING
# and SPARK_HOME being set earlier in the script — they are defined outside this view.
PROTOBUF_JARS=""
# Any value other than (case-insensitive) "false" — including unset — enables protobuf.
if [[ $( echo ${INCLUDE_SPARK_PROTOBUF_JAR} | tr '[:upper:]' '[:lower:]' ) != "false" ]];
then
# Normalize the flag so child processes and later checks see a canonical "true".
export INCLUDE_SPARK_PROTOBUF_JAR=true
mkdir -p "${TARGET_DIR}/dependency"

# Download spark-protobuf jar if not already in target/dependency
PROTOBUF_JAR_NAME="spark-protobuf_${SCALA_VERSION}-${VERSION_STRING}.jar"
PROTOBUF_JAR_PATH="${TARGET_DIR}/dependency/${PROTOBUF_JAR_NAME}"

if [[ ! -f "$PROTOBUF_JAR_PATH" ]]; then
echo "Downloading spark-protobuf jar..."
PROTOBUF_MAVEN_URL="https://repo1.maven.org/maven2/org/apache/spark/spark-protobuf_${SCALA_VERSION}/${VERSION_STRING}/${PROTOBUF_JAR_NAME}"
if curl -fsL -o "$PROTOBUF_JAR_PATH" "$PROTOBUF_MAVEN_URL"; then
echo "Downloaded spark-protobuf jar to $PROTOBUF_JAR_PATH"
else
# Download failure is non-fatal: only protobuf tests will be affected.
# Remove any partial file so the existence check retries on the next run.
echo "WARNING: Failed to download spark-protobuf jar from $PROTOBUF_MAVEN_URL"
rm -f "$PROTOBUF_JAR_PATH"
fi
fi

# Also download protobuf-java jar (required dependency).
# Detect version from the jar bundled with Spark, fall back to version mapping.
PROTOBUF_JAVA_VERSION=""
# Prefer the version Spark itself bundles to avoid a runtime protobuf version skew.
BUNDLED_PB_JAR=$(ls "$SPARK_HOME"/jars/protobuf-java-[0-9]*.jar 2>/dev/null | head -1)
if [[ -n "$BUNDLED_PB_JAR" ]]; then
# Strip the "protobuf-java-" prefix and ".jar" suffix to get the bare version string.
PROTOBUF_JAVA_VERSION=$(basename "$BUNDLED_PB_JAR" | sed 's/protobuf-java-\(.*\)\.jar/\1/')
echo "Detected protobuf-java version $PROTOBUF_JAVA_VERSION from SPARK_HOME"
fi
if [[ -z "$PROTOBUF_JAVA_VERSION" ]]; then
# Fallback mapping from Spark version to a known-compatible protobuf-java release.
case "$VERSION_STRING" in
3.4.*) PROTOBUF_JAVA_VERSION="3.25.1" ;;
3.5.*) PROTOBUF_JAVA_VERSION="3.25.1" ;;
4.0.*) PROTOBUF_JAVA_VERSION="4.29.3" ;;
*) PROTOBUF_JAVA_VERSION="3.25.1" ;;
esac
echo "Using protobuf-java version $PROTOBUF_JAVA_VERSION based on Spark $VERSION_STRING"
fi
PROTOBUF_JAVA_JAR_NAME="protobuf-java-${PROTOBUF_JAVA_VERSION}.jar"
PROTOBUF_JAVA_JAR_PATH="${TARGET_DIR}/dependency/${PROTOBUF_JAVA_JAR_NAME}"

if [[ ! -f "$PROTOBUF_JAVA_JAR_PATH" ]]; then
echo "Downloading protobuf-java jar..."
PROTOBUF_JAVA_MAVEN_URL="https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/${PROTOBUF_JAVA_VERSION}/${PROTOBUF_JAVA_JAR_NAME}"
if curl -fsL -o "$PROTOBUF_JAVA_JAR_PATH" "$PROTOBUF_JAVA_MAVEN_URL"; then
echo "Downloaded protobuf-java jar to $PROTOBUF_JAVA_JAR_PATH"
else
# Same non-fatal policy as above; clean up any partial download.
echo "WARNING: Failed to download protobuf-java jar from $PROTOBUF_JAVA_MAVEN_URL"
rm -f "$PROTOBUF_JAVA_JAR_PATH"
fi
fi

# Only include jars that actually exist (either download above may have failed).
if [[ -f "$PROTOBUF_JAR_PATH" ]]; then
PROTOBUF_JARS="$PROTOBUF_JAR_PATH"
echo "Including spark-protobuf jar: $PROTOBUF_JAR_PATH"
fi
if [[ -f "$PROTOBUF_JAVA_JAR_PATH" ]]; then
# ${var:+...} adds the space separator only when PROTOBUF_JARS is already non-empty.
PROTOBUF_JARS="${PROTOBUF_JARS:+$PROTOBUF_JARS }$PROTOBUF_JAVA_JAR_PATH"
echo "Including protobuf-java jar: $PROTOBUF_JAVA_JAR_PATH"
fi
# Also add protobuf jars to driver classpath for Class.forName() to work
# This is needed because --jars only adds to executor classpath
if [[ -n "$PROTOBUF_JARS" ]]; then
# Convert the space-separated jar list to ':'-separated classpath form and
# append to any pre-existing driver extraClassPath rather than replacing it.
PROTOBUF_DRIVER_CP=$(echo "$PROTOBUF_JARS" | tr ' ' ':')
export PYSP_TEST_spark_driver_extraClassPath="${PYSP_TEST_spark_driver_extraClassPath:+${PYSP_TEST_spark_driver_extraClassPath}:}${PROTOBUF_DRIVER_CP}"
echo "Added protobuf jars to driver classpath"
fi
else
# Record the disabled state explicitly so downstream consumers see a definite value.
export INCLUDE_SPARK_PROTOBUF_JAR=false
fi

# ALL_JARS includes dist.jar integration-test.jar avro.jar parquet.jar protobuf.jar if they exist
# Remove non-existing paths and canonicalize the paths including get rid of links and `..`
ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS || true)
ALL_JARS=$(readlink -e $PLUGIN_JAR $TEST_JARS $AVRO_JARS $PARQUET_HADOOP_TESTS $PROTOBUF_JARS || true)
# `:` separated jars
ALL_JARS="${ALL_JARS//$'\n'/:}"

Expand Down Expand Up @@ -411,6 +487,7 @@ else
export PYSP_TEST_spark_gluten_loadLibFromJar=true
fi


SPARK_SHELL_SMOKE_TEST="${SPARK_SHELL_SMOKE_TEST:-0}"
EXPLAIN_ONLY_CPU_SMOKE_TEST="${EXPLAIN_ONLY_CPU_SMOKE_TEST:-0}"
SPARK_CONNECT_SMOKE_TEST="${SPARK_CONNECT_SMOKE_TEST:-0}"
Expand Down
Loading
Loading