From 5862a4dc4b76823b1e3a768fb64473b11ce2c961 Mon Sep 17 00:00:00 2001 From: zml1206 Date: Mon, 30 Sep 2024 23:15:41 -0700 Subject: [PATCH] Fix Spark WeekFunction on long years (#10713) Summary: Similar to https://github.com/facebookincubator/velox/issues/10599, in Spark, a long year is any year ending on Thursday and any leap year ending on Friday. Since the only difference between Presto and Spark is on return type, this PR extracts the 'getWeek' function to functions/lib and changed to use 'weeknum' from external/date library for the week calculation. Pull Request resolved: https://github.com/facebookincubator/velox/pull/10713 Reviewed By: DanielHunte Differential Revision: D63269551 Pulled By: pedroerp fbshipit-source-id: cf8cf8b4f2a917d7a7c16a03cc8feed5f3fed5d8 --- velox/functions/lib/TimeUtils.h | 19 ++++++++ velox/functions/prestosql/DateTimeFunctions.h | 47 ++----------------- .../prestosql/tests/DateTimeFunctionsTest.cpp | 12 ++--- velox/functions/sparksql/DateTimeFunctions.h | 47 +------------------ velox/functions/sparksql/Register.cpp | 1 - .../sparksql/tests/DateTimeFunctionsTest.cpp | 16 +++++++ velox/type/Timestamp.cpp | 6 +++ velox/type/Timestamp.h | 3 ++ 8 files changed, 54 insertions(+), 97 deletions(-) diff --git a/velox/functions/lib/TimeUtils.h b/velox/functions/lib/TimeUtils.h index 39892124e0a7..2924ced00481 100644 --- a/velox/functions/lib/TimeUtils.h +++ b/velox/functions/lib/TimeUtils.h @@ -17,6 +17,8 @@ #include #include "velox/core/QueryConfig.h" +#include "velox/external/date/date.h" +#include "velox/external/date/iso_week.h" #include "velox/functions/Macros.h" #include "velox/type/tz/TimeZoneMap.h" @@ -92,6 +94,23 @@ FOLLY_ALWAYS_INLINE int32_t getDayOfYear(const std::tm& time) { return time.tm_yday + 1; } +FOLLY_ALWAYS_INLINE uint32_t getWeek( + const Timestamp& timestamp, + const tz::TimeZone* timezone, + bool allowOverflow) { + // The computation of ISO week from date follows the algorithm here: + // https://en.wikipedia.org/wiki/ISO_week_date + Timestamp t = timestamp; + if (timezone) { + t.toTimezone(*timezone); + } + const auto timePoint = t.toTimePointMs(allowOverflow); + const auto daysTimePoint = date::floor(timePoint); + const date::year_month_day calDate(daysTimePoint); + auto weekNum = date::iso_week::year_weeknum_weekday{calDate}.weeknum(); + return (uint32_t)weekNum; +} + template struct InitSessionTimezone { VELOX_DEFINE_FUNCTION_TYPES(T); diff --git a/velox/functions/prestosql/DateTimeFunctions.h b/velox/functions/prestosql/DateTimeFunctions.h index 7672a2184bcb..cb7150126887 100644 --- a/velox/functions/prestosql/DateTimeFunctions.h +++ b/velox/functions/prestosql/DateTimeFunctions.h @@ -196,62 +196,21 @@ struct WeekFunction : public InitSessionTimezone, public TimestampWithTimezoneSupport { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE int64_t getWeek(const std::tm& time) { - // The computation of ISO week from date follows the algorithm here: - // https://en.wikipedia.org/wiki/ISO_week_date - int64_t week = floor( - 10 + (time.tm_yday + 1) - - (time.tm_wday ? time.tm_wday : kDaysInWeek)) / - kDaysInWeek; - - if (week == 0) { - // Distance in days between the first day of the current year and the - // Monday of the current week. - auto mondayOfWeek = - time.tm_yday + 1 - (time.tm_wday + kDaysInWeek - 1) % kDaysInWeek; - // Distance in days between the first day and the first Monday of the - // current year. - auto firstMondayOfYear = - 1 + (mondayOfWeek + kDaysInWeek - 1) % kDaysInWeek; - - // A long year is any year ending on Thursday and any leap year ending on - // Friday. - if ((util::isLeapYear(time.tm_year + 1900 - 1) && - firstMondayOfYear == 3) || - firstMondayOfYear == 4) { - week = 53; - } else { - week = 52; - } - } else if (week == 53) { - // Distance in days between the first day of the current year and the - // Monday of the current week. - auto mondayOfWeek = - time.tm_yday + 1 - (time.tm_wday + kDaysInWeek - 1) % kDaysInWeek; - auto daysInYear = util::isLeapYear(time.tm_year + 1900) ? 366 : 365; - if (daysInYear - mondayOfWeek < 3) { - week = 1; - } - } - - return week; - } - FOLLY_ALWAYS_INLINE void call( int64_t& result, const arg_type& timestamp) { - result = getWeek(getDateTime(timestamp, this->timeZone_)); + result = getWeek(timestamp, this->timeZone_, false); } FOLLY_ALWAYS_INLINE void call(int64_t& result, const arg_type& date) { - result = getWeek(getDateTime(date)); + result = getWeek(Timestamp::fromDate(date), nullptr, false); } FOLLY_ALWAYS_INLINE void call( int64_t& result, const arg_type& timestampWithTimezone) { auto timestamp = this->toTimestamp(timestampWithTimezone); - result = getWeek(getDateTime(timestamp, nullptr)); + result = getWeek(timestamp, nullptr, false); } }; diff --git a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp index 03e64f68b8c3..777cd92bc995 100644 --- a/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/DateTimeFunctionsTest.cpp @@ -486,7 +486,7 @@ TEST_F(DateTimeFunctionsTest, week) { VELOX_USER_FAIL("{}", status.message()); }); auto timestamp = - std::make_optional(Timestamp(ts.getSeconds() * 100'000'000, 0)); + std::make_optional(Timestamp(ts.getSeconds() * 100'000, 0)); auto week = evaluateOnce("week(c0)", timestamp).value(); auto weekOfYear = @@ -497,11 +497,11 @@ TEST_F(DateTimeFunctionsTest, week) { }; EXPECT_EQ(1, weekTimestamp("T00:00:00")); - EXPECT_EQ(10, weekTimestamp("T11:59:59")); - EXPECT_EQ(51, weekTimestamp("T06:01:01")); - EXPECT_EQ(24, weekTimestamp("T06:59:59")); - EXPECT_EQ(27, weekTimestamp("T12:00:01")); - EXPECT_EQ(7, weekTimestamp("T12:59:59")); + EXPECT_EQ(47, weekTimestamp("T11:59:59")); + EXPECT_EQ(33, weekTimestamp("T06:01:01")); + EXPECT_EQ(44, weekTimestamp("T06:59:59")); + EXPECT_EQ(47, weekTimestamp("T12:00:01")); + EXPECT_EQ(16, weekTimestamp("T12:59:59")); } TEST_F(DateTimeFunctionsTest, weekTimestampWithTimezone) { diff --git a/velox/functions/sparksql/DateTimeFunctions.h b/velox/functions/sparksql/DateTimeFunctions.h index 89facdd01503..ebb6cc8ec83c 100644 --- a/velox/functions/sparksql/DateTimeFunctions.h +++ b/velox/functions/sparksql/DateTimeFunctions.h @@ -64,53 +64,8 @@ template struct WeekFunction : public InitSessionTimezone { VELOX_DEFINE_FUNCTION_TYPES(T); - FOLLY_ALWAYS_INLINE int32_t getWeek(const std::tm& time) { - // The computation of ISO week from date follows the algorithm here: - // https://en.wikipedia.org/wiki/ISO_week_date - int32_t week = floor( - 10 + (time.tm_yday + 1) - - (time.tm_wday ? time.tm_wday : kDaysInWeek)) / - kDaysInWeek; - - if (week == 0) { - // Distance in days between the first day of the current year and the - // Monday of the current week. - auto mondayOfWeek = - time.tm_yday + 1 - (time.tm_wday + kDaysInWeek - 1) % kDaysInWeek; - // Distance in days between the first day and the first Monday of the - // current year. - auto firstMondayOfYear = - 1 + (mondayOfWeek + kDaysInWeek - 1) % kDaysInWeek; - - if ((util::isLeapYear(time.tm_year + 1900 - 1) && - firstMondayOfYear == 2) || - firstMondayOfYear == 3 || firstMondayOfYear == 4) { - week = 53; - } else { - week = 52; - } - } else if (week == 53) { - // Distance in days between the first day of the current year and the - // Monday of the current week. - auto mondayOfWeek = - time.tm_yday + 1 - (time.tm_wday + kDaysInWeek - 1) % kDaysInWeek; - auto daysInYear = util::isLeapYear(time.tm_year + 1900) ? 366 : 365; - if (daysInYear - mondayOfWeek < 3) { - week = 1; - } - } - - return week; - } - - FOLLY_ALWAYS_INLINE void call( - int32_t& result, - const arg_type& timestamp) { - result = getWeek(getDateTime(timestamp, this->timeZone_)); - } - FOLLY_ALWAYS_INLINE void call(int32_t& result, const arg_type& date) { - result = getWeek(getDateTime(date)); + result = getWeek(Timestamp::fromDate(date), nullptr, false); } }; diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp index 4b7e638dacd9..3172f7dc572e 100644 --- a/velox/functions/sparksql/Register.cpp +++ b/velox/functions/sparksql/Register.cpp @@ -364,7 +364,6 @@ void registerFunctions(const std::string& prefix) { // Register date functions. registerFunction({prefix + "year"}); registerFunction({prefix + "year"}); - registerFunction({prefix + "week_of_year"}); registerFunction({prefix + "week_of_year"}); registerFunction( {prefix + "year_of_week"}); diff --git a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp index 482686ac84c3..ca5f9f4fb2c7 100644 --- a/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp +++ b/velox/functions/sparksql/tests/DateTimeFunctionsTest.cpp @@ -206,6 +206,22 @@ TEST_F(DateTimeFunctionsTest, weekOfYear) { EXPECT_EQ(8, weekOfYear("2008-02-20")); EXPECT_EQ(15, weekOfYear("2015-04-08")); EXPECT_EQ(15, weekOfYear("2013-04-08")); + + // Test various cases where the last week of the previous year extends into + // the next year. + + // Leap year that ends on Thursday. + EXPECT_EQ(53, weekOfYear("2021-01-01")); + // Leap year that ends on Friday. + EXPECT_EQ(53, weekOfYear("2005-01-01")); + // Leap year that ends on Saturday. + EXPECT_EQ(52, weekOfYear("2017-01-01")); + // Common year that ends on Thursday. + EXPECT_EQ(53, weekOfYear("2016-01-01")); + // Common year that ends on Friday. + EXPECT_EQ(52, weekOfYear("2022-01-01")); + // Common year that ends on Saturday. + EXPECT_EQ(52, weekOfYear("2023-01-01")); } TEST_F(DateTimeFunctionsTest, unixDate) { diff --git a/velox/type/Timestamp.cpp b/velox/type/Timestamp.cpp index 4d84b8b7071e..ba5318def972 100644 --- a/velox/type/Timestamp.cpp +++ b/velox/type/Timestamp.cpp @@ -34,6 +34,12 @@ Timestamp Timestamp::fromDaysAndNanos(int32_t days, int64_t nanos) { return Timestamp(seconds, remainingNanos); } +// static +Timestamp Timestamp::fromDate(int32_t date) { + int64_t seconds = (int64_t)date * kSecondsInDay; + return Timestamp(seconds, 0); +} + // static Timestamp Timestamp::now() { auto now = std::chrono::system_clock::now(); diff --git a/velox/type/Timestamp.h b/velox/type/Timestamp.h index f3b4c4cd1340..88a31e72f62e 100644 --- a/velox/type/Timestamp.h +++ b/velox/type/Timestamp.h @@ -117,6 +117,9 @@ struct Timestamp { /// and the number of nanoseconds. static Timestamp fromDaysAndNanos(int32_t days, int64_t nanos); + // date is the number of days since unix epoch. + static Timestamp fromDate(int32_t date); + // Returns the current unix timestamp (ms precision). static Timestamp now();