-
Notifications
You must be signed in to change notification settings - Fork 0
3757: feat: Support Spark expression: percentile_cont #49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -123,8 +123,8 @@ use datafusion_comet_proto::{ | |
| use datafusion_comet_spark_expr::{ | ||
| ArrayInsert, Avg, AvgDecimal, Cast, CheckOverflow, Correlation, Covariance, CreateNamedStruct, | ||
| DecimalRescaleCheckOverflow, GetArrayStructFields, GetStructField, IfExpr, ListExtract, | ||
| NormalizeNaNAndZero, SparkCastOptions, Stddev, SumDecimal, ToJson, UnboundColumn, Variance, | ||
| WideDecimalBinaryExpr, WideDecimalOp, | ||
| NormalizeNaNAndZero, Percentile, SparkCastOptions, Stddev, SumDecimal, ToJson, UnboundColumn, | ||
| Variance, WideDecimalBinaryExpr, WideDecimalOp, | ||
| }; | ||
| use itertools::Itertools; | ||
| use jni::objects::GlobalRef; | ||
|
|
@@ -2267,6 +2267,55 @@ impl PhysicalPlanner { | |
| )); | ||
| Self::create_aggr_func_expr("bloom_filter_agg", schema, vec![child], func) | ||
| } | ||
| AggExprStruct::PercentileCont(expr) => { | ||
| let return_type = to_arrow_datatype(expr.datatype.as_ref().unwrap()); | ||
| let child = self.create_expr(expr.child.as_ref().unwrap(), Arc::clone(&schema))?; | ||
|
|
||
| // Cast input to Float64 for numeric types | ||
| let child = | ||
| Arc::new(CastExpr::new(child, DataType::Float64, None)) as Arc<dyn PhysicalExpr>; | ||
|
|
||
| // Extract the literal percentile value | ||
| let percentile_expr = | ||
| self.create_expr(expr.percentile.as_ref().unwrap(), Arc::clone(&schema))?; | ||
|
Comment on lines
+2271
to
+2280
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The use of `.unwrap()` on optional protobuf fields can panic at runtime; prefer returning an explicit error instead. For example: let datatype = expr.datatype.as_ref().ok_or_else(|| {
ExecutionError::GeneralError("Datatype for PercentileCont is missing".into())
})?;
let return_type = to_arrow_datatype(datatype); This practice should be applied to the other `.unwrap()` calls in this block as well. |
||
| let percentile_value = percentile_expr | ||
| .as_any() | ||
| .downcast_ref::<DataFusionLiteral>() | ||
| .ok_or_else(|| { | ||
| ExecutionError::GeneralError("percentile must be a literal".into()) | ||
| })? | ||
| .value() | ||
| .clone(); | ||
|
|
||
| let percentile = match percentile_value { | ||
| ScalarValue::Float64(Some(p)) => p, | ||
| ScalarValue::Float32(Some(p)) => p as f64, | ||
| ScalarValue::Int64(Some(p)) => p as f64, | ||
| ScalarValue::Int32(Some(p)) => p as f64, | ||
| _ => { | ||
| return Err(ExecutionError::GeneralError(format!( | ||
| "percentile must be a numeric literal, got {:?}", | ||
| percentile_value | ||
| ))) | ||
| } | ||
| }; | ||
|
|
||
| // Custom Spark-compatible Percentile implementation | ||
| let func = AggregateUDF::new_from_impl(Percentile::new( | ||
| "spark_percentile", | ||
| percentile, | ||
| expr.reverse, | ||
| return_type, | ||
| )); | ||
|
|
||
| AggregateExprBuilder::new(Arc::new(func), vec![child]) | ||
| .schema(schema) | ||
| .alias("spark_percentile") | ||
| .with_ignore_nulls(false) | ||
| .with_distinct(false) | ||
| .build() | ||
| .map_err(|e| ExecutionError::DataFusionError(e.to_string())) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inline builder duplicates existing helper function — low severity. The inline `AggregateExprBuilder` usage here duplicates logic already provided by an existing helper (presumably `Self::create_aggr_func_expr`, used for the other aggregate expressions above — confirm). Additional locations: (1) |
||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Casting the input to
`Float64` here can change ordering / collapse distinct values for high-precision `DecimalType` or large `Long` values (e.g., >2^53), which can make `percentile_cont` diverge from Spark's behavior that orders on the original type. Severity: medium
Other Locations
spark/src/main/scala/org/apache/comet/serde/aggregates.scala:697🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.