From fe2e440d6eec1f852caa95c39218db2f7854c2c0 Mon Sep 17 00:00:00 2001 From: zml1206 Date: Fri, 19 Sep 2025 17:20:54 +0800 Subject: [PATCH 1/5] feat: Add Spark months_between function --- velox/docs/functions/spark/datetime.rst | 13 +++++++ velox/functions/lib/DateTimeUtil.h | 28 +++++++++++++++ velox/functions/lib/TimeUtils.h | 3 ++ velox/functions/sparksql/DateTimeFunctions.h | 27 ++++++++++++++ .../registration/RegisterDatetime.cpp | 2 ++ .../sparksql/tests/DateTimeFunctionsTest.cpp | 36 +++++++++++++++++++ 6 files changed, 109 insertions(+) diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index d87f5819885a..15f8f85d89a5 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -244,6 +244,19 @@ These functions support TIMESTAMP and DATE input types. SELECT month('2009-07-30'); -- 7 +.. spark:function:: months_between(timestamp1, timestamp2, roundOff) -> double + + Returns number of months between times ``timestamp1`` and ``timestamp2``. + If ``timestamp1`` is later than ``timestamp2``, the result is positive. + If ``timestamp1`` and ``timestamp2`` are on the same day of month, or both are the + last day of month, time of day will be ignored. Otherwise, the difference is calculated + based on 31 days per month, and rounded to 8 digits unless roundOff=false. :: + + SELECT months_between('1997-02-28 10:30:00', '1996-10-30', true); -- 3.94959677 + SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false); -- 3.9495967741935485 + SELECT months_between('1997-02-28 10:30:00', '1996-03-31 11:00:00', true); -- 11 + SELECT months_between('1997-02-21 10:30:00', '1996-03-21 11:00:00', true); -- 11 + .. spark:function:: next_day(startDate, dayOfWeek) -> date Returns the first date which is later than ``startDate`` and named as ``dayOfWeek``. diff --git a/velox/functions/lib/DateTimeUtil.h b/velox/functions/lib/DateTimeUtil.h index e0064cd4a2e6..cf9cf4428caf 100644 --- a/velox/functions/lib/DateTimeUtil.h +++ b/velox/functions/lib/DateTimeUtil.h @@ -17,6 +17,7 @@ #include "velox/external/date/date.h" #include "velox/functions/lib/DateTimeFormatter.h" +#include "velox/functions/lib/TimeUtils.h" #include "velox/type/Timestamp.h" #include "velox/type/tz/TimeZoneMap.h" @@ -373,4 +374,31 @@ FOLLY_ALWAYS_INLINE Timestamp addToTimestamp( return result; } +FOLLY_ALWAYS_INLINE bool isEndDayOfMonth(const std::tm& tm) { + const auto endDay = util::getMaxDayOfMonth(getYear(tm), getMonth(tm)); + return tm.tm_mday == endDay; +} + +FOLLY_ALWAYS_INLINE double +monthsBetween(const std::tm& tm1, const std::tm& tm2, const bool roundOff) { + const double monthDiff = + (tm1.tm_year - tm2.tm_year) * kMonthInYear + tm1.tm_mon - tm2.tm_mon; + if (tm1.tm_mday == tm2.tm_mday || + (isEndDayOfMonth(tm1) && isEndDayOfMonth(tm2))) { + return monthDiff; + } + const auto secondsInDay1 = + tm1.tm_hour * kSecondsInHour + tm1.tm_min * kSecondsInMinute + tm1.tm_sec; + const auto secondsInDay2 = + tm2.tm_hour * kSecondsInHour + tm2.tm_min * kSecondsInMinute + tm2.tm_sec; + const auto secondsDiff = (tm1.tm_mday - tm2.tm_mday) * kSecondsInDay + + secondsInDay1 - secondsInDay2; + const auto diff = + monthDiff + static_cast(secondsDiff) / kSecondsInMonth; + if (roundOff) { + return round(diff * 1e8) / 1e8; + } + return diff; +} + } // namespace facebook::velox::functions diff --git a/velox/functions/lib/TimeUtils.h b/velox/functions/lib/TimeUtils.h index 424a4ba3d3a2..5feaa017f0e3 100644 --- a/velox/functions/lib/TimeUtils.h +++ b/velox/functions/lib/TimeUtils.h @@ -27,7 +27,10 @@ namespace facebook::velox::functions { +inline constexpr int64_t kSecondsInMinute = 60; +inline constexpr int64_t kSecondsInHour = 3600; inline constexpr int64_t kSecondsInDay = 86'400; +inline constexpr int64_t kSecondsInMonth = 2'678'400; inline constexpr int64_t kDaysInWeek = 7; extern const folly::F14FastMap kDayOfWeekNames; diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 5dba18c95d76..bc0e170a583b 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -1082,4 +1082,31 @@ struct TimestampAddFunction { std::optional unit_ = std::nullopt; }; +template +struct MonthsBetweenFunction { + VELOX_DEFINE_FUNCTION_TYPES(T); + + FOLLY_ALWAYS_INLINE void initialize( + const std::vector& /*inputTypes*/, + const core::QueryConfig& config, + const arg_type* /*timestamp1*/, + const arg_type* /*timestamp2*/, + const arg_type* /*roundOff*/) { + sessionTimeZone_ = getTimeZoneFromConfig(config); + } + + FOLLY_ALWAYS_INLINE void call( + out_type& result, + const arg_type& timestamp1, + const arg_type& timestamp2, + const arg_type& roundOff) { + const auto dateTime1 = getDateTime(timestamp1, sessionTimeZone_); + const auto dateTime2 = getDateTime(timestamp2, sessionTimeZone_); + result = monthsBetween(dateTime1, dateTime2, roundOff); + } + + private: + const tz::TimeZone* sessionTimeZone_ = nullptr; +}; + } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/registration/RegisterDatetime.cpp b/velox/functions/sparksql/registration/RegisterDatetime.cpp index c3b758ce3f89..455262d95980 100644 --- a/velox/functions/sparksql/registration/RegisterDatetime.cpp +++ b/velox/functions/sparksql/registration/RegisterDatetime.cpp @@ -111,6 +111,8 @@ void registerDatetimeFunctions(const std::string& prefix) { Varchar, int32_t, Timestamp>({prefix + "timestampadd"}); + registerFunction( + {prefix + "months_between"}); } } // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index f26cbc020afa..089fb9979fe3 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -1796,5 +1796,41 @@ TEST_F(DateTimeFunctionsTest, timestampadd) { 10, Timestamp(1582970400, 500'999'999) /*2020-02-29 10:00:00.500*/)); } + +TEST_F(DateTimeFunctionsTest, monthsBetween) { + const auto monthsBetween = [&](std::optional timestamp1, + std::optional timestamp2, + std::optional roundOff) { + return evaluateOnce( + "months_between(c0, c1, c2)", timestamp1, timestamp2, roundOff); + }; + + EXPECT_EQ( + 3.94959677, + monthsBetween( + parseTimestamp("1997-02-28 10:30:00"), + parseTimestamp("1996-10-30"), + true)); + EXPECT_EQ( + 3.9495967741935485, + monthsBetween( + parseTimestamp("1997-02-28 10:30:00"), + parseTimestamp("1996-10-30"), + false)); + // `timestamp1` and `timestamp2` both are the last day of month. + EXPECT_EQ( + 11, + monthsBetween( + parseTimestamp("1997-02-28 10:30:00"), + parseTimestamp("1996-03-31 11:00:00"), + true)); + // `timestamp1` and `timestamp2` are on the same day of month. + EXPECT_EQ( + 11, + monthsBetween( + parseTimestamp("1997-02-21 10:30:00"), + parseTimestamp("1996-03-21 11:00:00"), + true)); +} } // namespace } // namespace facebook::velox::functions::sparksql::test From b047490b93ba78ddb882d6c4d05ff6c065feae45 Mon Sep 17 00:00:00 2001 From: zml1206 Date: Fri, 19 Sep 2025 18:46:50 +0800 Subject: [PATCH 2/5] address comments --- velox/docs/functions/spark/datetime.rst | 4 ++-- velox/functions/lib/DateTimeUtil.h | 9 ++------- velox/functions/lib/TimeUtils.h | 2 +- velox/functions/sparksql/DateTimeFunctions.h | 4 ++-- .../sparksql/tests/DateTimeFunctionsTest.cpp | 12 ++++++++++++ 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 15f8f85d89a5..467b491fa6f5 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -254,8 +254,8 @@ These functions support TIMESTAMP and DATE input types. SELECT months_between('1997-02-28 10:30:00', '1996-10-30', true); -- 3.94959677 SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false); -- 3.9495967741935485 - SELECT months_between('1997-02-28 10:30:00', '1996-03-31 11:00:00', true); -- 11 - SELECT months_between('1997-02-21 10:30:00', '1996-03-21 11:00:00', true); -- 11 + SELECT months_between('1997-02-28 10:30:00', '1996-03-31 11:00:00', true); -- 11.0 + SELECT months_between('1997-02-21 10:30:00', '1996-03-21 11:00:00', true); -- 11.0 .. spark:function:: next_day(startDate, dayOfWeek) -> date diff --git a/velox/functions/lib/DateTimeUtil.h b/velox/functions/lib/DateTimeUtil.h index cf9cf4428caf..fca0dfdaf978 100644 --- a/velox/functions/lib/DateTimeUtil.h +++ b/velox/functions/lib/DateTimeUtil.h @@ -15,11 +15,7 @@ */ #pragma once -#include "velox/external/date/date.h" -#include "velox/functions/lib/DateTimeFormatter.h" #include "velox/functions/lib/TimeUtils.h" -#include "velox/type/Timestamp.h" -#include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::functions { @@ -375,12 +371,11 @@ FOLLY_ALWAYS_INLINE Timestamp addToTimestamp( } FOLLY_ALWAYS_INLINE bool isEndDayOfMonth(const std::tm& tm) { - const auto endDay = util::getMaxDayOfMonth(getYear(tm), getMonth(tm)); - return tm.tm_mday == endDay; + return tm.tm_mday == util::getMaxDayOfMonth(getYear(tm), getMonth(tm)); } FOLLY_ALWAYS_INLINE double -monthsBetween(const std::tm& tm1, const std::tm& tm2, const bool roundOff) { +monthsBetween(const std::tm& tm1, const std::tm& tm2, bool roundOff) { const double monthDiff = (tm1.tm_year - tm2.tm_year) * kMonthInYear + tm1.tm_mon - tm2.tm_mon; if (tm1.tm_mday == tm2.tm_mday || diff --git a/velox/functions/lib/TimeUtils.h b/velox/functions/lib/TimeUtils.h index 5feaa017f0e3..7241a00c3a56 100644 --- a/velox/functions/lib/TimeUtils.h +++ b/velox/functions/lib/TimeUtils.h @@ -30,7 +30,7 @@ namespace facebook::velox::functions { inline constexpr int64_t kSecondsInMinute = 60; inline constexpr int64_t kSecondsInHour = 3600; inline constexpr int64_t kSecondsInDay = 86'400; -inline constexpr int64_t kSecondsInMonth = 2'678'400; +inline constexpr int64_t kSecondsInMonth = kSecondsInDay * 31; inline constexpr int64_t kDaysInWeek = 7; extern const folly::F14FastMap kDayOfWeekNames; diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index bc0e170a583b..e19bbd43bab1 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -1082,9 +1082,9 @@ struct TimestampAddFunction { std::optional unit_ = std::nullopt; }; -template +template struct MonthsBetweenFunction { - VELOX_DEFINE_FUNCTION_TYPES(T); + VELOX_DEFINE_FUNCTION_TYPES(TExec); FOLLY_ALWAYS_INLINE void initialize( const std::vector& /*inputTypes*/, diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index 089fb9979fe3..a2588464dcec 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -1817,6 +1817,18 @@ TEST_F(DateTimeFunctionsTest, monthsBetween) { parseTimestamp("1997-02-28 10:30:00"), parseTimestamp("1996-10-30"), false)); + EXPECT_EQ( + 3.9495949074074073, + monthsBetween( + parseTimestamp("1997-02-28 10:30:00"), + parseTimestamp("1996-10-30 00:00:05"), + false)); + EXPECT_EQ( + -3.9495949074074073, + monthsBetween( + parseTimestamp("1996-10-30 00:00:05"), + parseTimestamp("1997-02-28 10:30:00"), + false)); // `timestamp1` and `timestamp2` both are the last day of month. EXPECT_EQ( 11, From 6e3dd9b389c0f574ffb0deb2d4aed1046d7d4b2d Mon Sep 17 00:00:00 2001 From: zml1206 Date: Sat, 20 Sep 2025 08:17:07 +0800 Subject: [PATCH 3/5] move to MonthsBetweenFunction --- velox/functions/lib/DateTimeUtil.h | 31 +++----------------- velox/functions/sparksql/DateTimeFunctions.h | 26 ++++++++++++++++ 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/velox/functions/lib/DateTimeUtil.h b/velox/functions/lib/DateTimeUtil.h index fca0dfdaf978..e0064cd4a2e6 100644 --- a/velox/functions/lib/DateTimeUtil.h +++ b/velox/functions/lib/DateTimeUtil.h @@ -15,7 +15,10 @@ */ #pragma once -#include "velox/functions/lib/TimeUtils.h" +#include "velox/external/date/date.h" +#include "velox/functions/lib/DateTimeFormatter.h" +#include "velox/type/Timestamp.h" +#include "velox/type/tz/TimeZoneMap.h" namespace facebook::velox::functions { @@ -370,30 +373,4 @@ FOLLY_ALWAYS_INLINE Timestamp addToTimestamp( return result; } -FOLLY_ALWAYS_INLINE bool isEndDayOfMonth(const std::tm& tm) { - return tm.tm_mday == util::getMaxDayOfMonth(getYear(tm), getMonth(tm)); -} - -FOLLY_ALWAYS_INLINE double -monthsBetween(const std::tm& tm1, const std::tm& tm2, bool roundOff) { - const double monthDiff = - (tm1.tm_year - tm2.tm_year) * kMonthInYear + tm1.tm_mon - tm2.tm_mon; - if (tm1.tm_mday == tm2.tm_mday || - (isEndDayOfMonth(tm1) && isEndDayOfMonth(tm2))) { - return monthDiff; - } - const auto secondsInDay1 = - tm1.tm_hour * kSecondsInHour + tm1.tm_min * kSecondsInMinute + tm1.tm_sec; - const auto secondsInDay2 = - tm2.tm_hour * kSecondsInHour + tm2.tm_min * kSecondsInMinute + tm2.tm_sec; - const auto secondsDiff = (tm1.tm_mday - tm2.tm_mday) * kSecondsInDay + - secondsInDay1 - secondsInDay2; - const auto diff = - monthDiff + static_cast(secondsDiff) / kSecondsInMonth; - if (roundOff) { - return round(diff * 1e8) / 1e8; - } - return diff; -} - } // namespace facebook::velox::functions diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index e19bbd43bab1..31e5caeb18f9 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -1106,6 +1106,32 @@ struct MonthsBetweenFunction { } private: + FOLLY_ALWAYS_INLINE bool isEndDayOfMonth(const std::tm& tm) { + return tm.tm_mday == util::getMaxDayOfMonth(getYear(tm), getMonth(tm)); + } + + FOLLY_ALWAYS_INLINE double + monthsBetween(const std::tm& tm1, const std::tm& tm2, bool roundOff) { + const double monthDiff = + (tm1.tm_year - tm2.tm_year) * kMonthInYear + tm1.tm_mon - tm2.tm_mon; + if (tm1.tm_mday == tm2.tm_mday || + (isEndDayOfMonth(tm1) && isEndDayOfMonth(tm2))) { + return monthDiff; + } + const auto secondsInDay1 = tm1.tm_hour * kSecondsInHour + + tm1.tm_min * kSecondsInMinute + tm1.tm_sec; + const auto secondsInDay2 = tm2.tm_hour * kSecondsInHour + + tm2.tm_min * kSecondsInMinute + tm2.tm_sec; + const auto secondsDiff = (tm1.tm_mday - tm2.tm_mday) * kSecondsInDay + + secondsInDay1 - secondsInDay2; + const auto diff = + monthDiff + static_cast(secondsDiff) / kSecondsInMonth; + if (roundOff) { + return round(diff * 1e8) / 1e8; + } + return diff; + } + const tz::TimeZone* sessionTimeZone_ = nullptr; }; From 429b40c0bae4e55f819a8b35d9b28c11ffeb6c9a Mon Sep 17 00:00:00 2001 From: zml1206 Date: Thu, 25 Sep 2025 09:09:51 +0800 Subject: [PATCH 4/5] address comments address comments --- velox/functions/lib/TimeUtils.h | 4 ++-- velox/functions/sparksql/DateTimeFunctions.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/velox/functions/lib/TimeUtils.h b/velox/functions/lib/TimeUtils.h index 7241a00c3a56..4e1c1908d26c 100644 --- a/velox/functions/lib/TimeUtils.h +++ b/velox/functions/lib/TimeUtils.h @@ -28,9 +28,9 @@ namespace facebook::velox::functions { inline constexpr int64_t kSecondsInMinute = 60; -inline constexpr int64_t kSecondsInHour = 3600; +inline constexpr int64_t kMinutesInHour = 60; +inline constexpr int64_t kSecondsInHour = kSecondsInMinute * kMinutesInHour; inline constexpr int64_t kSecondsInDay = 86'400; -inline constexpr int64_t kSecondsInMonth = kSecondsInDay * 31; inline constexpr int64_t kDaysInWeek = 7; extern const folly::F14FastMap kDayOfWeekNames; diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 31e5caeb18f9..c3f9a2889eca 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -1127,11 +1127,14 @@ struct MonthsBetweenFunction { const auto diff = monthDiff + static_cast(secondsDiff) / kSecondsInMonth; if (roundOff) { - return round(diff * 1e8) / 1e8; + return round(diff * kRoundingPrecision) / kRoundingPrecision; } return diff; } + // Precision factor for 8 decimal places rounding. + static constexpr int64_t kRoundingPrecision = 1e8; + static constexpr int64_t kSecondsInMonth = kSecondsInDay * 31; const tz::TimeZone* sessionTimeZone_ = nullptr; }; From d3caae8d12bdf5e0dd126fb8104defb1605a236d Mon Sep 17 00:00:00 2001 From: zml1206 Date: Fri, 26 Sep 2025 05:42:20 +0800 Subject: [PATCH 5/5] update --- velox/docs/functions/spark/datetime.rst | 3 ++- velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/velox/docs/functions/spark/datetime.rst b/velox/docs/functions/spark/datetime.rst index 467b491fa6f5..09446378407a 100644 --- a/velox/docs/functions/spark/datetime.rst +++ b/velox/docs/functions/spark/datetime.rst @@ -250,11 +250,12 @@ These functions support TIMESTAMP and DATE input types. If ``timestamp1`` is later than ``timestamp2``, the result is positive. If ``timestamp1`` and ``timestamp2`` are on the same day of month, or both are the last day of month, time of day will be ignored. Otherwise, the difference is calculated - based on 31 days per month, and rounded to 8 digits unless roundOff=false. :: + based on 31 days per month, and rounded to 8 digits unless ``roundOff`` is false. :: SELECT months_between('1997-02-28 10:30:00', '1996-10-30', true); -- 3.94959677 SELECT months_between('1997-02-28 10:30:00', '1996-10-30', false); -- 3.9495967741935485 SELECT months_between('1997-02-28 10:30:00', '1996-03-31 11:00:00', true); -- 11.0 + SELECT months_between('1997-02-28 10:30:00', '1996-03-28 11:00:00', true); -- 11.0 SELECT months_between('1997-02-21 10:30:00', '1996-03-21 11:00:00', true); -- 11.0 .. spark:function:: next_day(startDate, dayOfWeek) -> date diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index a2588464dcec..ee0ef748e9d9 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -1837,6 +1837,12 @@ TEST_F(DateTimeFunctionsTest, monthsBetween) { parseTimestamp("1996-03-31 11:00:00"), true)); // `timestamp1` and `timestamp2` are on the same day of month. + EXPECT_EQ( + 11, + monthsBetween( + parseTimestamp("1997-02-28 10:30:00"), + parseTimestamp("1996-03-28 11:00:00"), + true)); EXPECT_EQ( 11, monthsBetween(