-
Notifications
You must be signed in to change notification settings - Fork 0
21577: feat: Extend sqllogictest framework to uptake custom datafusion.format.* settings
#308
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| The AI agents should never link to any issue or a pull request | ||
| in any GitHub repository in the code reviews! | ||
|
|
||
| The AI agents should not review AI agents' config files like CLAUDE.md or AGENTS.md! | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,10 +20,9 @@ use super::error::{DFSqlLogicTestError, Result}; | |
| use crate::engines::output::DFColumnType; | ||
| use arrow::array::{Array, AsArray}; | ||
| use arrow::datatypes::{Fields, Schema}; | ||
| use arrow::util::display::ArrayFormatter; | ||
| use arrow::util::display::{ArrayFormatter, FormatOptions}; | ||
| use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch}; | ||
| use datafusion::common::internal_datafusion_err; | ||
| use datafusion::config::ConfigField; | ||
| use std::path::PathBuf; | ||
| use std::sync::LazyLock; | ||
|
|
||
|
|
@@ -32,6 +31,7 @@ pub fn convert_batches( | |
| schema: &Schema, | ||
| batches: Vec<RecordBatch>, | ||
| is_spark_path: bool, | ||
| format_options: &FormatOptions<'_>, | ||
| ) -> Result<Vec<Vec<String>>> { | ||
| let mut rows = vec![]; | ||
| for batch in batches { | ||
|
|
@@ -50,7 +50,7 @@ pub fn convert_batches( | |
| batch | ||
| .columns() | ||
| .iter() | ||
| .map(|col| cell_to_string(col, row, is_spark_path)) | ||
| .map(|col| cell_to_string(col, row, is_spark_path, format_options)) | ||
| .collect::<Result<Vec<String>>>() | ||
| }) | ||
| .collect::<Result<Vec<Vec<String>>>>()? | ||
|
|
@@ -185,7 +185,12 @@ macro_rules! get_row_value { | |
| /// [NULL Values and empty strings]: https://duckdb.org/dev/sqllogictest/result_verification#null-values-and-empty-strings | ||
| /// | ||
| /// Floating numbers are rounded to have a consistent representation with the Postgres runner. | ||
| pub fn cell_to_string(col: &ArrayRef, row: usize, is_spark_path: bool) -> Result<String> { | ||
| pub fn cell_to_string( | ||
| col: &ArrayRef, | ||
| row: usize, | ||
| is_spark_path: bool, | ||
| format_options: &FormatOptions<'_>, | ||
| ) -> Result<String> { | ||
| if col.is_null(row) { | ||
| // represent any null value with the string "NULL" | ||
| Ok(NULL_STR.to_string()) | ||
|
|
@@ -233,18 +238,15 @@ pub fn cell_to_string(col: &ArrayRef, row: usize, is_spark_path: bool) -> Result | |
| DataType::Dictionary(_, _) => { | ||
| let dict = col.as_any_dictionary(); | ||
| let key = dict.normalized_keys()[row]; | ||
| Ok(cell_to_string(dict.values(), key, is_spark_path)?) | ||
| Ok(cell_to_string( | ||
| dict.values(), | ||
| key, | ||
| is_spark_path, | ||
| format_options, | ||
| )?) | ||
| } | ||
| _ => { | ||
| let mut datafusion_format_options = | ||
| datafusion::config::FormatOptions::default(); | ||
|
|
||
| datafusion_format_options.set("null", "NULL").unwrap(); | ||
|
|
||
| let arrow_format_options: arrow::util::display::FormatOptions = | ||
| (&datafusion_format_options).try_into().unwrap(); | ||
|
|
||
| let f = ArrayFormatter::try_new(col.as_ref(), &arrow_format_options)?; | ||
| let f = ArrayFormatter::try_new(col.as_ref(), format_options)?; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:good-to-have; category:bug; feedback: The Gemini AI reviewer is correct! The same formatter could be used for all values in a column, so it could be reused per column instead of re-created for each value. This will reduce the memory pressure for tests with many results. |
||
|
|
||
| Ok(f.value(row).to_string()) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -21,9 +21,12 @@ use std::{path::PathBuf, time::Duration}; | |||||
| use crate::engines::currently_executed_sql::CurrentlyExecutingSqlTracker; | ||||||
| use crate::engines::datafusion_engine::Result; | ||||||
| use crate::engines::output::{DFColumnType, DFOutput}; | ||||||
| use crate::{DFSqlLogicTestError, convert_batches, convert_schema_to_types}; | ||||||
| use crate::{ | ||||||
| DFSqlLogicTestError, convert_batches, convert_schema_to_types, get_format_options, | ||||||
| }; | ||||||
| use arrow::record_batch::RecordBatch; | ||||||
| use async_trait::async_trait; | ||||||
| use datafusion::config::ConfigField; | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Description: Check if ConfigField is used in this file beyond the import
rg -n 'ConfigField' datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/runner.rsRepository: martin-augment/datafusion Length of output: 108 🏁 Script executed: cd datafusion/sqllogictest && cargo clippy --all-targets --all-features -- -D warnings 2>&1 | grep -A 5 -B 5 "ConfigField\|unused" | head -100Repository: martin-augment/datafusion Length of output: 155 🏁 Script executed: # Read the entire file to check for ConfigField usage in context
cat -n datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/runner.rs | head -60Repository: martin-augment/datafusion Length of output: 2777 🏁 Script executed: # Also search more broadly for ConfigField in the entire directory
rg 'ConfigField' datafusion/sqllogictest/src/engines/datafusion_substrait_roundtrip_engine/Repository: martin-augment/datafusion Length of output: 189 Remove unused import. The Proposed fix-use datafusion::config::ConfigField;📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||
| use datafusion::logical_expr::LogicalPlan; | ||||||
| use datafusion::physical_plan::common::collect; | ||||||
| use datafusion::physical_plan::execute_stream; | ||||||
|
|
@@ -166,7 +169,11 @@ async fn run_query_substrait_round_trip( | |||||
| let stream = execute_stream(physical_plan, task_ctx)?; | ||||||
| let types = convert_schema_to_types(stream.schema().fields()); | ||||||
| let results: Vec<RecordBatch> = collect(stream).await?; | ||||||
| let rows = convert_batches(&schema, results, false)?; | ||||||
|
|
||||||
| let df_format = get_format_options(ctx)?; | ||||||
| let arrow_format: arrow::util::display::FormatOptions<'_> = | ||||||
| (&df_format).try_into()?; | ||||||
| let rows = convert_batches(&schema, results, false, &arrow_format)?; | ||||||
|
|
||||||
| if rows.is_empty() && types.is_empty() { | ||||||
| Ok(DBOutput::StatementComplete(0)) | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,8 @@ | |
| // under the License. | ||
|
|
||
| use datafusion::common::{Result, exec_datafusion_err}; | ||
| use datafusion::config::{ConfigField, FormatOptions}; | ||
| use datafusion::prelude::SessionContext; | ||
| use itertools::Itertools; | ||
| use log::Level::Warn; | ||
| use log::{info, log_enabled, warn}; | ||
|
|
@@ -141,6 +143,13 @@ pub fn is_spark_path(relative_path: &Path) -> bool { | |
| relative_path.starts_with("spark/") | ||
| } | ||
|
|
||
| // Get passed custom FormatOptions by SessionContext to be used for sqllogictest | ||
| pub fn get_format_options(ctx: &SessionContext) -> Result<FormatOptions> { | ||
| let mut df_format = ctx.state().config().options().format.clone(); | ||
| df_format.set("null", "NULL")?; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line explicitly overrides the user's
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:useful; category:bug; feedback: The Gemini AI reviewer is correct! This will set the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Severity: low 🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:useful; category:bug; feedback: The Augment AI reviewer is correct! This will set the format.null option to NULL, to be backward compatible with the NULL_STR constant. But to make this option customizable like the date/time formatting this should be done only if the current value is "", i.e. the default value from FormatOptions::default(). A custom value set with SET datafusion.format.null=... should not be overwritten. |
||
| Ok(df_format) | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -395,7 +395,7 @@ datafusion.format.timestamp_format | |
| datafusion.format.timestamp_tz_format | ||
| datafusion.format.types_info | ||
|
|
||
| # date_format: SET / SHOW / RESET / SHOW | ||
| # date_format: query result display uses session format (default: %Y-%m-%d) | ||
| statement ok | ||
| SET datafusion.format.date_format = '%d-%m-%Y' | ||
|
|
||
|
|
@@ -404,6 +404,11 @@ SHOW datafusion.format.date_format | |
| ---- | ||
| datafusion.format.date_format %d-%m-%Y | ||
|
|
||
| query D | ||
| SELECT DATE '2026-04-07' | ||
| ---- | ||
| 07-04-2026 | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.date_format | ||
|
|
||
|
|
@@ -412,14 +417,23 @@ SHOW datafusion.format.date_format | |
| ---- | ||
| datafusion.format.date_format %Y-%m-%d | ||
|
|
||
| # datetime_format | ||
| query D | ||
| SELECT DATE '2026-04-07' | ||
| ---- | ||
| 2026-04-07 | ||
|
|
||
| # datetime_format (default: %Y-%m-%dT%H:%M:%S%.f) | ||
| statement ok | ||
| SET datafusion.format.datetime_format = '%Y/%m/%d %H:%M:%S' | ||
| SET datafusion.format.datetime_format = '%d-%m-%YT%H:%M:%S' | ||
|
|
||
| query TT | ||
| SHOW datafusion.format.datetime_format | ||
| ---- | ||
| datafusion.format.datetime_format %Y/%m/%d %H:%M:%S | ||
| datafusion.format.datetime_format %d-%m-%YT%H:%M:%S | ||
|
|
||
| # DATETIME literals are not implemented in the SQL parser yet. | ||
| query error DataFusion error: This feature is not implemented: Unsupported SQL type DATETIME | ||
| SELECT DATETIME '2026-04-07 00:10:00'; | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.datetime_format | ||
|
|
@@ -429,14 +443,19 @@ SHOW datafusion.format.datetime_format | |
| ---- | ||
| datafusion.format.datetime_format %Y-%m-%dT%H:%M:%S%.f | ||
|
|
||
| # timestamp_format | ||
| # timestamp_format (default: %Y-%m-%dT%H:%M:%S%.f) | ||
| statement ok | ||
| SET datafusion.format.timestamp_format = '%FT%H:%M:%S' | ||
| SET datafusion.format.timestamp_format = '%d-%m-%YT%H:%M:%S' | ||
|
|
||
| query TT | ||
| SHOW datafusion.format.timestamp_format | ||
| ---- | ||
| datafusion.format.timestamp_format %FT%H:%M:%S | ||
| datafusion.format.timestamp_format %d-%m-%YT%H:%M:%S | ||
|
|
||
| query P | ||
| SELECT TIMESTAMP '2026-04-07 13:31:00'; | ||
| ---- | ||
| 07-04-2026T13:31:00 | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.timestamp_format | ||
|
|
@@ -446,7 +465,12 @@ SHOW datafusion.format.timestamp_format | |
| ---- | ||
| datafusion.format.timestamp_format %Y-%m-%dT%H:%M:%S%.f | ||
|
|
||
| # timestamp_tz_format (default NULL) | ||
| query P | ||
| SELECT TIMESTAMP '2026-04-07 13:31:00'; | ||
| ---- | ||
| 2026-04-07T13:31:00 | ||
|
|
||
| # timestamp_tz_format (default: NULL) | ||
| statement ok | ||
| SET datafusion.format.timestamp_tz_format = '%Y-%m-%d %H:%M:%S %z' | ||
|
|
||
|
|
@@ -455,6 +479,11 @@ SHOW datafusion.format.timestamp_tz_format | |
| ---- | ||
| datafusion.format.timestamp_tz_format %Y-%m-%d %H:%M:%S %z | ||
|
|
||
| query P | ||
| SELECT TIMESTAMPTZ '2026-04-07 13:31:00'; | ||
| ---- | ||
| 2026-04-07T13:31:00 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Severity: medium 🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:useful; category:bug; feedback: The Augment AI reviewer is correct! The test validates that the formatting setting is not used. It should be fixed to verify that the custom date formatting is used! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test for
|
||
|
|
||
| statement ok | ||
| RESET datafusion.format.timestamp_tz_format | ||
|
|
||
|
|
@@ -463,14 +492,19 @@ SHOW datafusion.format.timestamp_tz_format | |
| ---- | ||
| datafusion.format.timestamp_tz_format NULL | ||
|
|
||
| # time_format | ||
| # time_format (default: %H:%M:%S%.f) | ||
| statement ok | ||
| SET datafusion.format.time_format = '%H-%M-%S' | ||
| SET datafusion.format.time_format = '%S-%M-%H' | ||
|
|
||
| query TT | ||
| SHOW datafusion.format.time_format | ||
| ---- | ||
| datafusion.format.time_format %H-%M-%S | ||
| datafusion.format.time_format %S-%M-%H | ||
|
|
||
| query D | ||
| SELECT TIME '01:02:12.123' AS time; | ||
| ---- | ||
| 12-02-01 | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.time_format | ||
|
|
@@ -480,7 +514,12 @@ SHOW datafusion.format.time_format | |
| ---- | ||
| datafusion.format.time_format %H:%M:%S%.f | ||
|
|
||
| # duration_format: values are normalized to lowercase; ISO8601 and pretty are valid | ||
| query D | ||
| SELECT TIME '01:02:12.123' AS time; | ||
| ---- | ||
| 01:02:12.123 | ||
|
|
||
| # duration_format: (default: pretty) values are normalized to lowercase; ISO8601 and pretty are valid | ||
| statement ok | ||
| SET datafusion.format.duration_format = ISO8601 | ||
|
|
||
|
|
@@ -489,6 +528,12 @@ SHOW datafusion.format.duration_format | |
| ---- | ||
| datafusion.format.duration_format iso8601 | ||
|
|
||
| # Session duration_format controls display of Duration columns (not SQL INTERVAL) | ||
| query ? | ||
| SELECT arrow_cast(3661, 'Duration(Second)'); | ||
| ---- | ||
| PT3661S | ||
|
|
||
| statement ok | ||
| SET datafusion.format.duration_format to 'PRETTY' | ||
|
|
||
|
|
@@ -497,6 +542,11 @@ SHOW datafusion.format.duration_format | |
| ---- | ||
| datafusion.format.duration_format pretty | ||
|
|
||
| query ? | ||
| SELECT arrow_cast(3661, 'Duration(Second)'); | ||
| ---- | ||
| 0 days 1 hours 1 mins 1 secs | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.duration_format | ||
|
|
||
|
|
@@ -505,7 +555,29 @@ SHOW datafusion.format.duration_format | |
| ---- | ||
| datafusion.format.duration_format pretty | ||
|
|
||
| # null display string | ||
| query ? | ||
| SELECT arrow_cast(3661, 'Duration(Second)'); | ||
| ---- | ||
| 0 days 1 hours 1 mins 1 secs | ||
|
|
||
| # Case-insensitive duration_format variable name | ||
| statement ok | ||
| SET datafusion.FORMAT.DURATION_FORMAT = 'ISO8601' | ||
|
|
||
| query TT | ||
| SHOW datafusion.format.duration_format | ||
| ---- | ||
| datafusion.format.duration_format iso8601 | ||
|
|
||
| query ? | ||
| SELECT arrow_cast(61, 'Duration(Second)'); | ||
| ---- | ||
| PT61S | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.duration_format | ||
|
|
||
| # null display string (default: (empty)) | ||
| statement ok | ||
| SET datafusion.format.null = 'NuLL' | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test verifies that the
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:useful; category:bug; feedback: The Gemini AI reviewer is correct! The test just verifies that the formatting option is properly set but it does not verify that it is being used when a SELECT query returns a null value. An additional test should be added. |
||
|
|
||
|
|
@@ -522,7 +594,7 @@ SHOW datafusion.format.null | |
| ---- | ||
| datafusion.format.null (empty) | ||
|
|
||
| # safe | ||
| # safe (default: true) | ||
| statement ok | ||
| SET datafusion.format.safe = false | ||
|
|
||
|
|
@@ -539,7 +611,7 @@ SHOW datafusion.format.safe | |
| ---- | ||
| datafusion.format.safe true | ||
|
|
||
| # types_info | ||
| # types_info (default: false) | ||
| statement ok | ||
| SET datafusion.format.types_info to true | ||
|
|
||
|
|
@@ -565,6 +637,11 @@ SHOW datafusion.format.date_format | |
| ---- | ||
| datafusion.format.date_format %m/%d/%Y | ||
|
|
||
| query D | ||
| SELECT DATE '2026-04-07'; | ||
| ---- | ||
| 04/07/2026 | ||
|
|
||
| statement ok | ||
| RESET datafusion.format.date_format | ||
|
|
||
|
|
@@ -573,6 +650,11 @@ SHOW datafusion.format.date_format | |
| ---- | ||
| datafusion.format.date_format %Y-%m-%d | ||
|
|
||
| query D | ||
| SELECT DATE '2026-04-07'; | ||
| ---- | ||
| 2026-04-07 | ||
|
|
||
| # Invalid format option name | ||
| statement error DataFusion error: Invalid or Unsupported Configuration: Config value "unknown_option" not found on FormatOptions | ||
| SET datafusion.format.unknown_option = true | ||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While `format_options` is now passed to `cell_to_string`, the function's implementation (specifically for null values) still uses the hardcoded `NULL_STR` ("NULL"). To fully support the `datafusion.format.null` setting as intended by this PR, the implementation should be updated to use `format_options.null_label` for both top-level and nested null values.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
value:useful; category:bug; feedback: The Gemini AI reviewer is correct! There are several usages of the NULL_STR constant still in use. They should be replaced with `format_options.null` so that the output follows the configured formatting.