Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 232 additions & 18 deletions datafusion/functions/src/datetime/date_part.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,23 @@ use std::any::Any;
use std::str::FromStr;
use std::sync::Arc;

use arrow::array::{Array, ArrayRef, Float64Array, Int32Array};
use arrow::array::timezone::Tz;
use arrow::array::{Array, ArrayRef, Float64Array, Int32Array, PrimitiveBuilder};
use arrow::compute::kernels::cast_utils::IntervalUnit;
use arrow::compute::{binary, date_part, DatePart};
use arrow::datatypes::DataType::{
Date32, Date64, Duration, Interval, Time32, Time64, Timestamp,
};
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit};
use arrow::datatypes::{
ArrowTimestampType, DataType, Field, FieldRef, TimeUnit, TimestampMicrosecondType,
TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
};

use datafusion_common::cast::as_primitive_array;
use datafusion_common::types::{logical_date, NativeType};

use super::adjust_to_local_time;
use datafusion_common::{
cast::{
as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array,
Expand Down Expand Up @@ -56,7 +63,7 @@ use datafusion_macros::user_doc;
argument(
name = "part",
description = r#"Part of the date to return. The following date parts are supported:

- year
- quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in)
- month
Expand Down Expand Up @@ -124,7 +131,7 @@ impl DatePartFunc {
],
Volatility::Immutable,
),
aliases: vec![String::from("datepart")],
aliases: vec![String::from("datepart"), String::from("extract")],
}
}
}
Expand Down Expand Up @@ -173,6 +180,7 @@ impl ScalarUDFImpl for DatePartFunc {
&self,
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
let config = &args.config_options;
let args = args.args;
let [part, array] = take_function_args(self.name(), args)?;

Expand All @@ -193,12 +201,83 @@ impl ScalarUDFImpl for DatePartFunc {
ColumnarValue::Scalar(scalar) => scalar.to_array()?,
};

let (is_timezone_aware, tz_str_opt) = match array.data_type() {
Timestamp(_, Some(tz_str)) => (true, Some(Arc::clone(tz_str))),
_ => (false, None),
};

let part_trim = part_normalization(&part);
let is_epoch = is_epoch(&part);

// Epoch is timezone-independent - it always returns seconds since 1970-01-01 UTC
let array = if is_epoch {
array
} else if is_timezone_aware {
// For timezone-aware timestamps, extract in their own timezone
match tz_str_opt.as_ref() {
Some(tz_str) => {
let tz = match tz_str.parse::<Tz>() {
Ok(tz) => tz,
Err(_) => return exec_err!("Invalid timezone"),
};
match array.data_type() {
Timestamp(time_unit, _) => match time_unit {
Nanosecond => adjust_timestamp_array::<
TimestampNanosecondType,
>(&array, tz)?,
Microsecond => adjust_timestamp_array::<
TimestampMicrosecondType,
>(&array, tz)?,
Millisecond => adjust_timestamp_array::<
TimestampMillisecondType,
>(&array, tz)?,
Second => {
adjust_timestamp_array::<TimestampSecondType>(&array, tz)?
}
},
_ => array,
}
}
None => array,
}
} else if let Timestamp(time_unit, None) = array.data_type() {
// For naive timestamps, interpret in session timezone if available
match config.execution.time_zone.as_ref() {
Some(tz_str) => {
let tz = match tz_str.parse::<Tz>() {
Ok(tz) => tz,
Err(_) => return exec_err!("Invalid timezone"),
};

match time_unit {
Nanosecond => {
adjust_timestamp_array::<TimestampNanosecondType>(&array, tz)?
}
Microsecond => {
adjust_timestamp_array::<TimestampMicrosecondType>(
&array, tz,
)?
}
Millisecond => {
adjust_timestamp_array::<TimestampMillisecondType>(
&array, tz,
)?
}
Second => {
adjust_timestamp_array::<TimestampSecondType>(&array, tz)?
}
}
}
None => array,
}
} else {
array
};
Comment on lines +204 to +275
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify EXTRACT timezone behavior aligns with SQL standards.

The implementation extracts timezone-aware timestamps in their own timezone (line 216) and naive timestamps in the session timezone (line 244). Please confirm this matches the intended SQL semantics for EXTRACT.

In standard SQL and most databases (PostgreSQL, etc.), EXTRACT(HOUR FROM timestamp_with_tz) typically extracts the value in the session timezone, not the timestamp's embedded timezone. The current logic may produce unexpected results.

For example, with session timezone +00:00:

  • EXTRACT(HOUR FROM TIMESTAMP '2025-10-30 10:45:30+02:00')
  • Should extract in UTC (+00:00): result = 8 (since 10:45:30+02:00 = 08:45:30 UTC)
  • Current logic extracts in +02:00: result = 10

🏁 Script executed:

#!/bin/bash
# Verify how PostgreSQL handles EXTRACT with timezone-aware timestamps
# This will help confirm the expected behavior

echo "Checking PostgreSQL EXTRACT behavior with timezones..."
echo ""
echo "Expected behavior for EXTRACT(HOUR FROM TIMESTAMP '2025-10-30 10:45:30+02:00'):"
echo "- When session timezone is UTC: should return 8"
echo "- When session timezone is +02:00: should return 10"
echo ""
echo "Please verify test expectations match this behavior"

Length of output: 288


Now I need to examine the actual code to verify what adjust_timestamp_array does and whether the logic in the review matches what PostgreSQL actually does:

Based on my research, when the input value is of type timestamp with time zone, operations are performed with respect to a particular time zone; by default, operations are done with respect to the current TimeZone setting. Additionally, the originally stated or assumed time zone is not retained in PostgreSQL's timestamptz type—timestamps with time zones are converted to UTC, and the original timezone is not stored.

The review comment's concern is valid. The code's logic of extracting "in the timestamp's own timezone" deviates from standard SQL semantics and PostgreSQL's behavior. When PostgreSQL extracts a field like HOUR from a timezone-aware timestamp, it does so using the session's current timezone setting, not the timestamp's (non-existent) original timezone.

Your EXTRACT timezone behavior is incorrect; extract in session timezone, not timestamp's timezone.

The code extracts timezone-aware timestamps in their embedded timezone offset (lines 218-228) rather than the session timezone as PostgreSQL does by default. PostgreSQL performs such operations with respect to the current TimeZone setting by default, not the timestamp's original timezone. The adjust_timestamp_array call converts the timestamp to a specific timezone, but this differs from PostgreSQL semantics where the session timezone is used unless explicitly overridden with AT TIME ZONE.

For example, with session timezone UTC:

  • EXTRACT(HOUR FROM '2025-10-30 10:45:30+02:00') should extract in UTC → result: 8 (since 10:45:30+02:00 = 08:45:30 UTC)
  • Your code extracts in +02:00 → result: 10

Align extraction with session timezone semantics for PostgreSQL compatibility.


// using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds")
// and synonyms ( like "ms,msec,msecond,millisecond") to Arrow
let arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) {
match interval_unit {
let extracted = match interval_unit {
IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?,
IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?,
IntervalUnit::Week => date_part(array.as_ref(), DatePart::Week)?,
Expand All @@ -209,8 +288,42 @@ impl ScalarUDFImpl for DatePartFunc {
IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?,
IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?,
IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?,
// century and decade are not supported by `DatePart`, although they are supported in postgres
_ => return exec_err!("Date part '{part}' not supported"),
};

// For fixed offsets (like +04:00, -05:30), apply the offset to extract values
if is_timezone_aware {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For timezone-aware timestamps, the array is already adjusted to local time via adjust_timestamp_array; applying a fixed offset again to extracted Hour/Minute/Day values will double-apply the offset and produce incorrect results. Consider removing this post-extraction adjustment for tz-aware inputs (also applies to other locations in this block).

🤖 Was this useful? React with 👍 or 👎

if let Some(tz_str) = tz_str_opt.as_ref() {
let tz_str = tz_str.as_ref();
if is_fixed_offset(tz_str) {
if let Some(offset_info) = extract_offset_components(tz_str) {
match interval_unit {
IntervalUnit::Hour => apply_hour_offset(
extracted.as_ref(),
offset_info.hours,
offset_info.minutes,
)?,
IntervalUnit::Minute => apply_minute_offset(
extracted.as_ref(),
offset_info.minutes,
)?,
IntervalUnit::Day => apply_day_offset(
extracted.as_ref(),
offset_info.hours,
)?,
_ => extracted,
}
} else {
extracted
}
} else {
extracted
}
} else {
extracted
}
} else {
extracted
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Double Timezone Offset Corrupts Data

The timezone offset is being applied twice for fixed offset timezones. The timestamp is first adjusted to local time via adjust_timestamp_array (lines 215-242), then the offset is incorrectly applied again via apply_hour_offset, apply_minute_offset, or apply_day_offset (lines 294-327). This double-adjustment corrupts the extracted values for timestamps with fixed offset timezones like +04:00 or -05:30.

Fix in Cursor Fix in Web

}
} else {
// special cases that can be extracted (in postgres) but are not interval units
Expand Down Expand Up @@ -240,23 +353,129 @@ impl ScalarUDFImpl for DatePartFunc {
}
}

// Rebuilds a timestamp array of unit `T` by passing every non-null value
// through `adjust_to_local_time` for `tz`; null slots stay null.
//
// Returns an error if `array` cannot be viewed as a primitive array of `T`
// or if adjusting any single value fails.
//
// NOTE(review): `adjust_to_local_time` is defined in the parent module and is
// presumably what shifts the value to wall-clock time in `tz` -- confirm there.
fn adjust_timestamp_array<T: ArrowTimestampType>(
    array: &ArrayRef,
    tz: Tz,
) -> Result<ArrayRef> {
    let timestamps = as_primitive_array::<T>(array)?;
    let mut local = PrimitiveBuilder::<T>::new();
    for slot in timestamps.iter() {
        if let Some(ts) = slot {
            local.append_value(adjust_to_local_time::<T>(ts, tz)?);
        } else {
            local.append_null();
        }
    }
    Ok(Arc::new(local.finish()))
}

// Returns true when `part`, after `part_normalization` has been applied to it,
// equals "epoch" once lowercased.
fn is_epoch(part: &str) -> bool {
    let unquoted = part_normalization(part);
    unquoted.to_lowercase() == "epoch"
}

// Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error
// Reports whether a timezone string looks like a fixed offset, i.e. whether
// its first character is an explicit `+` or `-` sign. Nothing after the sign
// is inspected, and the empty string yields `false`.
fn is_fixed_offset(tz_str: &str) -> bool {
    matches!(tz_str.as_bytes().first(), Some(b'+' | b'-'))
}

// Holds the components of a timezone offset (hours and minutes).
// Both fields carry the sign of the offset, so "-05:30" is stored as
// `hours: -5, minutes: -30`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct OffsetInfo {
    hours: i32,
    minutes: i32,
}

// Extracts the offset components from a timezone string like "+04:00" or "-05:30".
//
// Only the exact `±HH:MM` shape is accepted: a sign, two ASCII digits, a colon
// and two more ASCII digits. Anything else (named zones, `+05`, `+0530`,
// signed digit fields such as "+-5:30", trailing characters, non-ASCII text)
// yields `None`. The digit values themselves are not range-checked.
//
// This function never panics, whatever the input.
fn extract_offset_components(tz_str: &str) -> Option<OffsetInfo> {
    // Matching on bytes instead of slicing the `&str` avoids a panic when an
    // index would fall inside a multi-byte character, and lets the pattern
    // insist on the `:` separator and the exact length.
    let (sign, hour_digits, minute_digits) = match tz_str.as_bytes() {
        [b'+', h1, h0, b':', m1, m0] => (1, [*h1, *h0], [*m1, *m0]),
        [b'-', h1, h0, b':', m1, m0] => (-1, [*h1, *h0], [*m1, *m0]),
        _ => return None,
    };

    // Parse by hand: `str::parse::<i32>` would also accept a leading sign,
    // letting inputs such as "+-5:30" slip through with a flipped direction.
    let two_digits = |digits: [u8; 2]| -> Option<i32> {
        digits
            .iter()
            .all(u8::is_ascii_digit)
            .then(|| i32::from(digits[0] - b'0') * 10 + i32::from(digits[1] - b'0'))
    };

    let hours = two_digits(hour_digits)?;
    let minutes = two_digits(minute_digits)?;

    Some(OffsetInfo {
        hours: sign * hours,
        minutes: sign * minutes,
    })
}
Comment on lines +379 to +409
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify fixed-offset timezone format handling.

The extract_offset_components function assumes a specific format (+HH:MM with 6 characters). Please verify this handles all valid fixed-offset timezone representations that Arrow might use.

Potential concerns:

  • Does Arrow always use the +HH:MM format with leading zeros?
  • What about formats like +4:00 or +04 (without minutes)?
  • Consider adding format validation and clearer error messages

🏁 Script executed:

#!/bin/bash
# Check what timezone string formats Arrow's Tz type accepts and produces

echo "Verifying Arrow timezone string formats..."
rg -n "parse::<Tz>" --type rust -A 2 -B 2
echo ""
echo "Check Arrow timezone documentation for valid formats"

Length of output: 1959


[/run_scripts]

<web_search>

</web_search>


🏁 Script executed:

#!/bin/bash
# Find where Tz is imported from and check its definition
echo "=== Finding Tz import in date_part.rs ==="
head -50 datafusion/functions/src/datetime/date_part.rs | grep -E "^use|^extern"

echo ""
echo "=== Searching for Tz type definition ==="
rg -n "^use.*\bTz\b" --type rust datafusion/functions/src/datetime/

echo ""
echo "=== Check if extract_offset_components is actually used ==="
rg -n "extract_offset_components" --type rust

echo ""
echo "=== Look for timezone format patterns in tests/examples ==="
rg -n "\\+[0-9]|\\-[0-9]" --type rust -B2 -A2 datafusion/functions/src/datetime/ | head -30

Length of output: 3866


🌐 Web query:

Apache Arrow timezone type Tz valid formats fixed offset

💡 Result:

Apache Arrow accepts two kinds of timezone strings for the Timestamp (Tz) type:

  • IANA/Olson zone names (e.g. "America/New_York", "UTC") — full tz database names. [1][3]
  • Fixed (absolute) offsets, in hour/minute form (e.g. "+07:30", "-05:00"). Shorter variants like "-09" or "+0930" are also accepted in some Arrow builds. When chrono-tz/zone DB support is disabled, only fixed-offset forms are allowed. [2][3]

References: [1] Arrow FlatBuffers Timestamp docs (timezone accepted forms). [2] arrow-rs datatype docs (fixed-offset parsing examples). [3] Arrow Rust/schema & Python docs (timestamp timezone semantics).


[/run_scripts]


🏁 Script executed:

#!/bin/bash
# Check line 299 context where extract_offset_components is called
echo "=== Context around line 299 where extract_offset_components is called ==="
sed -n '290,310p' datafusion/functions/src/datetime/date_part.rs

echo ""
echo "=== Look for any timezone format tests ==="
rg -n "\\+[0-9][0-9]" --type rust datafusion/functions/src/datetime/ -B1 -A1 | head -40

Length of output: 4975


Fix extract_offset_components to handle all valid Arrow fixed-offset formats.

The function rejects valid timezone formats that Arrow accepts. Test cases in the codebase (date_trunc.rs) use formats like +05, +08, +00, but the current implementation requires at least 6 characters and assumes the format +HH:MM. This causes silent failures when offset components cannot be extracted.

Update the parsing logic to handle:

  • Short formats: +05, -09 (sign + hours only)
  • Formats without colon: +0530, -0545 (sign + hours + minutes)
  • Standard formats: +05:30, -09:45 (sign + hours + colon + minutes)

Consider using a regex or more flexible parsing approach, and add explicit error handling instead of silently returning None.


// Applies the timezone offset to hour values in an array.
//
// Every non-null hour has `offset_hours` added to it; when the minute part of
// the offset is 30 or more in magnitude, one further hour is added in the
// direction of that minute offset. The sum is wrapped into `0..24`.
// Null slots are preserved. Fails if `array` is not an Int32 array.
fn apply_hour_offset(
    array: &dyn Array,
    offset_hours: i32,
    offset_minutes: i32,
) -> Result<ArrayRef> {
    // Half an hour or more of offset counts as a whole extra hour.
    let carry = if offset_minutes.abs() >= 30 {
        offset_minutes.signum()
    } else {
        0
    };

    let shifted: Int32Array = as_int32_array(array)?
        .iter()
        .map(|slot| slot.map(|h| (h + offset_hours + carry).rem_euclid(24)))
        .collect();
    Ok(Arc::new(shifted))
}

// Applies the timezone offset to minute values in an array.
//
// Adds `offset_minutes` to every non-null minute and wraps the sum into
// `0..60`; null slots are preserved. Fails if `array` is not an Int32 array.
fn apply_minute_offset(array: &dyn Array, offset_minutes: i32) -> Result<ArrayRef> {
    let shifted: Int32Array = as_int32_array(array)?
        .iter()
        .map(|slot| slot.map(|m| (m + offset_minutes).rem_euclid(60)))
        .collect();
    Ok(Arc::new(shifted))
}
Comment on lines +433 to +446
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Similar offset direction concern for minutes.

This function has the same potential logic issue as apply_hour_offset - it adds the offset rather than subtracting it. See the comment on lines 411-431 for details.

🤖 Prompt for AI Agents
In datafusion/functions/src/datetime/date_part.rs around lines 433 to 446, the
minute offset function adds offset_minutes to each minute which reverses
timezone adjustment; change the logic to subtract the offset (use m -
offset_minutes) and then normalize into 0..59 using the same ((value % 60) + 60)
% 60 pattern while preserving nulls; update any variable names/comments if
needed to reflect subtraction so behavior matches apply_hour_offset.


// Applies the timezone offset to day values in an array.
//
// Null slots are preserved. For each non-null day:
//   * if `offset_hours` is 24 or more in magnitude, the day is shifted by
//     `offset_hours / 24` (integer division);
//   * otherwise a positive offset adds one day, a negative offset subtracts
//     one day, and a zero offset leaves the day unchanged.
// Fails if `array` is not an Int32 array.
//
// NOTE(review): the shift is derived from the offset alone -- the time of day
// is never consulted and the result is not wrapped to a valid day-of-month.
// Confirm this is intended before relying on the output.
fn apply_day_offset(array: &dyn Array, offset_hours: i32) -> Result<ArrayRef> {
let day_array = as_int32_array(array)?;
let result: Int32Array = day_array
.iter()
.map(|day| {
day.map(|d| {
if offset_hours >= 24 || offset_hours <= -24 {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

apply_day_offset adjusts the day solely based on offset hours, but day changes depend on whether the time crosses midnight; this can yield wrong day values and seems redundant if timestamps were already adjusted before extraction. What do you think about avoiding this adjustment or basing it on actual timestamp boundaries?

🤖 Was this useful? React with 👍 or 👎

d + (offset_hours / 24)
} else if offset_hours > 0 {
d + 1
} else if offset_hours < 0 {
d - 1
} else {
d
}
})
})
.collect();
Ok(Arc::new(result))
}
Comment on lines +448 to +468
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Day offset logic appears oversimplified.

The apply_day_offset function has several concerns:

  1. Offset direction: Same concern as hour/minute offset functions - should this add or subtract?

  2. Partial-day offset handling: For offsets between 1-23 hours, the function simply adds/subtracts 1 day. This ignores the actual time-of-day which determines whether a day boundary is crossed.

For example:

  • Timestamp at 01:00 with -5 hour offset: same day
  • Timestamp at 23:00 with -5 hour offset: previous day

The current logic would treat both identically, which is incorrect.

Consider calculating the day adjustment more precisely based on the actual hour value combined with the offset.


// Strips one leading and one trailing quote character (`'` or `"`) from `part`.
// The two quotes do not have to be the same kind. If either end lacks a quote,
// the input is returned untouched.
fn part_normalization(part: &str) -> &str {
    const QUOTES: &[char] = &['\'', '"'];

    match part.strip_prefix(QUOTES) {
        Some(rest) => rest.strip_suffix(QUOTES).unwrap_or(part),
        None => part,
    }
}

/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the
/// result to a total number of seconds, milliseconds, microseconds or
/// nanoseconds
// Converts seconds to i32 with the specified time unit.
fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
// Nanosecond is neither supported in Postgres nor DuckDB, to avoid dealing
// with overflow and precision issue we don't support nanosecond
if unit == Nanosecond {
return not_impl_err!("Date part {unit:?} not supported");
Expand All @@ -277,7 +496,6 @@ fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
};

let secs = date_part(array, DatePart::Second)?;
// This assumes array is primitive and not a dictionary
let secs = as_int32_array(secs.as_ref())?;
let subsecs = date_part(array, DatePart::Nanosecond)?;
let subsecs = as_int32_array(subsecs.as_ref())?;
Expand Down Expand Up @@ -305,11 +523,8 @@ fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
}
}

/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the
/// result to a total number of seconds, milliseconds, microseconds or
/// nanoseconds
///
/// Given epoch return f64, this is a duplicated function to optimize for f64 type
// Converts seconds to f64 with the specified time unit.
// Used for Interval and Duration types that need floating-point precision.
fn seconds(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
let sf = match unit {
Second => 1_f64,
Expand All @@ -318,7 +533,6 @@ fn seconds(array: &dyn Array, unit: TimeUnit) -> Result<ArrayRef> {
Nanosecond => 1_000_000_000_f64,
};
let secs = date_part(array, DatePart::Second)?;
// This assumes array is primitive and not a dictionary
let secs = as_int32_array(secs.as_ref())?;
let subsecs = date_part(array, DatePart::Nanosecond)?;
let subsecs = as_int32_array(subsecs.as_ref())?;
Expand Down
Loading
Loading