53 changes: 51 additions & 2 deletions native/core/src/execution/planner.rs
@@ -123,8 +123,8 @@ use datafusion_comet_proto::{
use datafusion_comet_spark_expr::{
ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow, Correlation, Covariance, CreateNamedStruct,
DecimalRescaleCheckOverflow, GetArrayStructFields, GetStructField, IfExpr, ListExtract,
-    NormalizeNaNAndZero, SparkCastOptions, Stddev, SumDecimal, ToJson, UnboundColumn, Variance,
-    WideDecimalBinaryExpr, WideDecimalOp,
+    NormalizeNaNAndZero, Percentile, SparkCastOptions, Stddev, SumDecimal, ToJson, UnboundColumn,
+    Variance, WideDecimalBinaryExpr, WideDecimalOp,
};
use itertools::Itertools;
use jni::objects::GlobalRef;
@@ -2267,6 +2267,55 @@ impl PhysicalPlanner {
));
Self::create_aggr_func_expr("bloom_filter_agg", schema, vec![child], func)
}
AggExprStruct::PercentileCont(expr) => {
let return_type = to_arrow_datatype(expr.datatype.as_ref().unwrap());
let child = self.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&schema))?;

// Cast input to Float64 for numeric types
let child =
Arc::new(CastExpr::new(child, DataType::Float64, None)) as Arc<dyn PhysicalExpr>;

Casting the input to Float64 here can change ordering and collapse distinct values for high-precision DecimalType or large Long values (e.g., >2^53), so percentile_cont can diverge from Spark, which orders on the original type.

Severity: medium

Other Locations
  • spark/src/main/scala/org/apache/comet/serde/aggregates.scala:697
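The precision loss the reviewer describes is easy to reproduce in plain Rust: `f64` has a 53-bit mantissa, so distinct `i64` values above 2^53 collapse after the cast.

```rust
// Demonstrates the review concern: an i64-to-f64 cast cannot distinguish
// neighboring integers above 2^53, so a Float64 cast before aggregation
// can merge values that Spark would treat as distinct.
fn main() {
    let a: i64 = (1i64 << 53) + 1; // 9007199254740993
    let b: i64 = 1i64 << 53;       // 9007199254740992
    assert_ne!(a, b);
    // After the cast, both map to the same f64 value.
    assert_eq!(a as f64, b as f64);
}
```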



// Extract the literal percentile value
let percentile_expr =
self.create_expr(expr.percentile.as_ref().unwrap(), Arc::clone(&schema))?;
Comment on lines +2271 to +2280

Severity: medium

The use of .unwrap() on optional fields from the protobuf expression (expr.datatype, expr.child, expr.percentile) can lead to a panic if any of these fields are unexpectedly None. While the Scala-side serialization logic seems to ensure these fields are present, it's safer to handle the None case gracefully by returning an ExecutionError.

For example:

let datatype = expr.datatype.as_ref().ok_or_else(|| {
    ExecutionError::GeneralError("Datatype for PercentileCont is missing".into())
})?;
let return_type = to_arrow_datatype(datatype);

This practice should be applied to expr.child and expr.percentile as well for robustness.
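The suggested `ok_or_else` pattern can be exercised standalone; here a plain `String` stands in for Comet's `ExecutionError` so the sketch compiles on its own.

```rust
// Converts a missing protobuf field (Option) into a descriptive error
// instead of panicking via unwrap(). String is a stand-in error type.
fn require<T>(field: Option<T>, name: &str) -> Result<T, String> {
    field.ok_or_else(|| format!("{name} for PercentileCont is missing"))
}

fn main() {
    // Present field passes through untouched.
    assert_eq!(require(Some(42), "datatype"), Ok(42));
    // Missing field yields an error rather than a panic.
    assert!(require::<i32>(None, "datatype").is_err());
}
```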

let percentile_value = percentile_expr
.as_any()
.downcast_ref::<DataFusionLiteral>()
.ok_or_else(|| {
ExecutionError::GeneralError("percentile must be a literal".into())
})?
.value()
.clone();

let percentile = match percentile_value {
ScalarValue::Float64(Some(p)) => p,
ScalarValue::Float32(Some(p)) => p as f64,
ScalarValue::Int64(Some(p)) => p as f64,
ScalarValue::Int32(Some(p)) => p as f64,
_ => {
return Err(ExecutionError::GeneralError(format!(
"percentile must be a numeric literal, got {:?}",
percentile_value
)))
}
};

// Custom Spark-compatible Percentile implementation
let func = AggregateUDF::new_from_impl(Percentile::new(
"spark_percentile",
percentile,
expr.reverse,
return_type,
));

AggregateExprBuilder::new(Arc::new(func), vec![child])
.schema(schema)
.alias("spark_percentile")
.with_ignore_nulls(false)
.with_distinct(false)
.build()
.map_err(|e| ExecutionError::DataFusionError(e.to_string()))

Inline builder duplicates existing helper function

Severity: low

The PercentileCont arm manually constructs an AggregateExprBuilder chain, but the existing Self::create_aggr_func_expr helper (used by all other aggregate expressions like BloomFilterAgg, Variance, Stddev, Correlation) does exactly the same thing. The inline version also uses a different error mapping (.map_err(|e| ExecutionError::DataFusionError(e.to_string()))) compared to the helper's .map_err(|e| e.into()), which loses error context. A single call to Self::create_aggr_func_expr("spark_percentile", schema, vec![child], func) would replace 7 lines.


}
}
}
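For reference, `percentile_cont` computes a linearly interpolated percentile over the sorted input. A standalone sketch of those semantics (independent of the `Percentile` UDAF this PR adds, whose internals are not shown in this diff; NaN handling and type widening are simplified):

```rust
// Linear-interpolation percentile over f64 values: sort, take rank
// p * (n - 1), and interpolate between the two surrounding values.
fn percentile_cont(values: &mut Vec<f64>, p: f64) -> Option<f64> {
    if values.is_empty() || !(0.0..=1.0).contains(&p) {
        return None;
    }
    values.sort_by(|a, b| a.partial_cmp(b).unwrap()); // assumes no NaNs
    let rank = p * (values.len() - 1) as f64;
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    let frac = rank - lo as f64;
    Some(values[lo] + (values[hi] - values[lo]) * frac)
}

fn main() {
    let mut v = vec![1.0, 2.0, 3.0, 4.0];
    // Median of four values falls between the 2nd and 3rd: 2.5.
    assert_eq!(percentile_cont(&mut v, 0.5), Some(2.5));
    assert_eq!(percentile_cont(&mut v, 0.0), Some(1.0));
    assert_eq!(percentile_cont(&mut v, 1.0), Some(4.0));
}
```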

6 changes: 6 additions & 0 deletions native/core/src/execution/serde.rs
@@ -168,5 +168,11 @@ pub fn to_arrow_datatype(dt_value: &DataType) -> ArrowDataType {
}
_ => unreachable!(),
},
DataTypeId::YearMonthInterval => {
ArrowDataType::Interval(arrow::datatypes::IntervalUnit::YearMonth)
}
DataTypeId::DayTimeInterval => {
ArrowDataType::Interval(arrow::datatypes::IntervalUnit::DayTime)
}
}
}
8 changes: 8 additions & 0 deletions native/proto/src/proto/expr.proto
@@ -139,6 +139,7 @@ message AggExpr {
Stddev stddev = 14;
Correlation correlation = 15;
BloomFilterAgg bloomFilterAgg = 16;
PercentileCont percentileCont = 17;
}

// Optional QueryContext for error reporting (contains SQL text and position)
@@ -243,6 +244,13 @@ message BloomFilterAgg {
DataType datatype = 4;
}

message PercentileCont {
Expr child = 1; // The column to compute percentile on
Expr percentile = 2; // The percentile value (0.0-1.0)
DataType datatype = 3; // Return type
bool reverse = 4; // True if ORDER BY DESC
}

enum EvalMode {
LEGACY = 0;
TRY = 1;
2 changes: 2 additions & 0 deletions native/proto/src/proto/types.proto
@@ -59,6 +59,8 @@ message DataType {
LIST = 14;
MAP = 15;
STRUCT = 16;
YEAR_MONTH_INTERVAL = 17;
DAY_TIME_INTERVAL = 18;
}
DataTypeId type_id = 1;

2 changes: 2 additions & 0 deletions native/spark-expr/src/agg_funcs/mod.rs
@@ -19,6 +19,7 @@ mod avg;
mod avg_decimal;
mod correlation;
mod covariance;
mod percentile;
mod stddev;
mod sum_decimal;
mod sum_int;
@@ -28,6 +29,7 @@ pub use avg::Avg;
pub use avg_decimal::AvgDecimal;
pub use correlation::Correlation;
pub use covariance::Covariance;
pub use percentile::Percentile;
pub use stddev::Stddev;
pub use sum_decimal::SumDecimal;
pub use sum_int::SumInteger;