Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement date_part for intervals #6071

Merged
merged 2 commits into from
Jul 27, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
304 changes: 299 additions & 5 deletions arrow-arith/src/temporal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
use std::sync::Arc;

use arrow_array::cast::AsArray;
use cast::as_primitive_array;
use chrono::{Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc};

use arrow_array::temporal_conversions::{
Expand All @@ -31,7 +32,7 @@ use arrow_array::timezone::Tz;
use arrow_array::types::*;
use arrow_array::*;
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use arrow_schema::{ArrowError, DataType, IntervalUnit};

/// Valid parts to extract from date/time/timestamp arrays.
///
Expand Down Expand Up @@ -111,6 +112,7 @@ where
/// - Date32/Date64
/// - Time32/Time64
/// - Timestamp
/// - Interval
///
/// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns
/// the dictionary but with this function applied onto its values.
Expand All @@ -137,10 +139,21 @@ pub fn date_part(array: &dyn Array, part: DatePart) -> Result<ArrayRef, ArrowErr
let array = Arc::new(array) as ArrayRef;
Ok(array)
}
// TODO: support interval
// DataType::Interval(_) => {
// todo!();
// }
DataType::Interval(IntervalUnit::YearMonth) => {
let array = as_primitive_array::<IntervalYearMonthType>(array).date_part(part)?;
let array = Arc::new(array) as ArrayRef;
Ok(array)
}
DataType::Interval(IntervalUnit::DayTime) => {
let array = as_primitive_array::<IntervalDayTimeType>(array).date_part(part)?;
let array = Arc::new(array) as ArrayRef;
Ok(array)
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
let array = as_primitive_array::<IntervalMonthDayNanoType>(array).date_part(part)?;
let array = Arc::new(array) as ArrayRef;
Ok(array)
}
DataType::Dictionary(_, _) => {
let array = array.as_any_dictionary();
let values = date_part(array.values(), part)?;
Expand Down Expand Up @@ -387,6 +400,88 @@ impl ExtractDatePartExt for PrimitiveArray<TimestampNanosecondType> {
}
}

impl ExtractDatePartExt for PrimitiveArray<IntervalYearMonthType> {
fn date_part(&self, part: DatePart) -> Result<Int32Array, ArrowError> {
match part {
DatePart::Year => Ok(self.unary_opt(|d| Some(d / 12))),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense as IntervalYearMonth stores the number of whole months: https://docs.rs/arrow/latest/arrow/datatypes/enum.IntervalUnit.html

👍

DatePart::Month => Ok(self.unary_opt(|d| Some(d % 12))),

DatePart::Quarter
| DatePart::Week
| DatePart::Day
| DatePart::DayOfWeekSunday0
| DatePart::DayOfWeekMonday0
| DatePart::DayOfYear
| DatePart::Hour
| DatePart::Minute
| DatePart::Second
| DatePart::Millisecond
| DatePart::Microsecond
| DatePart::Nanosecond => {
return_compute_error_with!(format!("{part} does not support"), self.data_type())
}
}
}
}

impl ExtractDatePartExt for PrimitiveArray<IntervalDayTimeType> {
fn date_part(&self, part: DatePart) -> Result<Int32Array, ArrowError> {
match part {
DatePart::Week => Ok(self.unary_opt(|d| Some(d.days / 7))),
DatePart::Day => Ok(self.unary_opt(|d| Some(d.days))),
DatePart::Hour => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 60 * 1_000)))),
DatePart::Minute => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 1_000)))),
DatePart::Second => Ok(self.unary_opt(|d| Some(d.milliseconds / 1_000))),
DatePart::Millisecond => Ok(self.unary_opt(|d| Some(d.milliseconds))),
DatePart::Microsecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000))),
DatePart::Nanosecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000_000))),

DatePart::Quarter
| DatePart::Year
| DatePart::Month
| DatePart::DayOfWeekSunday0
| DatePart::DayOfWeekMonday0
| DatePart::DayOfYear => {
return_compute_error_with!(format!("{part} does not support"), self.data_type())
}
}
}
}

impl ExtractDatePartExt for PrimitiveArray<IntervalMonthDayNanoType> {
fn date_part(&self, part: DatePart) -> Result<Int32Array, ArrowError> {
match part {
DatePart::Year => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months / 12))),
DatePart::Month => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months))),
DatePart::Week => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days / 7))),
DatePart::Day => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days))),
DatePart::Hour => {
Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 60 * 1_000_000_000)).try_into().ok()))
}
DatePart::Minute => {
Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 1_000_000_000)).try_into().ok()))
}
DatePart::Second => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000_000).try_into().ok()))
}
DatePart::Millisecond => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000).try_into().ok()))
}
DatePart::Microsecond => {
Ok(self.unary_opt(|d| (d.nanoseconds / 1_000).try_into().ok()))
}
DatePart::Nanosecond => Ok(self.unary_opt(|d| d.nanoseconds.try_into().ok())),

DatePart::Quarter
| DatePart::DayOfWeekSunday0
| DatePart::DayOfWeekMonday0
| DatePart::DayOfYear => {
return_compute_error_with!(format!("{part} does not support"), self.data_type())
}
}
}
}

macro_rules! return_compute_error_with {
($msg:expr, $param:expr) => {
return { Err(ArrowError::ComputeError(format!("{}: {:?}", $msg, $param))) }
Expand Down Expand Up @@ -1500,4 +1595,203 @@ mod tests {
ensure_returns_error(&Time64MicrosecondArray::from(vec![0]));
ensure_returns_error(&Time64NanosecondArray::from(vec![0]));
}

// IntervalDayTimeType week, day, hour, miute, second, mili, u, nano; invalid month, year; ignores the other part
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these comments seem like they belong with the other tests test_interval_day_time_array and test_interval_month_day_nano_array 🤔

// IntervalMonthDayNanoType year -> nano; days don't affect months, time doesn't affect days, time doesn't affect months (and vice versa)
#[test]
fn test_interval_year_month_array() {
let input: IntervalYearMonthArray = vec![0, 5, 24].into();

let actual = date_part(&input, DatePart::Year).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(2, actual.value(2));

let actual = date_part(&input, DatePart::Month).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(5, actual.value(1));
assert_eq!(0, actual.value(2));

assert!(date_part(&input, DatePart::Day).is_err());
assert!(date_part(&input, DatePart::Week).is_err());
}

#[test]
fn test_interval_day_time_array() {
let input: IntervalDayTimeArray = vec![
IntervalDayTime::ZERO,
IntervalDayTime::new(10, 42),
IntervalDayTime::new(10, 1042),
IntervalDayTime::new(10, MILLISECONDS_IN_DAY as i32 + 1),
]
.into();

// Time doesn't affect days.
let actual = date_part(&input, DatePart::Day).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(10, actual.value(1));
assert_eq!(10, actual.value(2));
assert_eq!(10, actual.value(3));

let actual = date_part(&input, DatePart::Week).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(1, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(1, actual.value(3));

// Days doesn't affect time.
let actual = date_part(&input, DatePart::Nanosecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42_000_000, actual.value(1));
assert_eq!(1_042_000_000, actual.value(2));
// Overflow returns zero.
assert_eq!(0, actual.value(3));

let actual = date_part(&input, DatePart::Microsecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42_000, actual.value(1));
assert_eq!(1_042_000, actual.value(2));
// Overflow returns zero.
assert_eq!(0, actual.value(3));

let actual = date_part(&input, DatePart::Millisecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42, actual.value(1));
assert_eq!(1042, actual.value(2));
assert_eq!(MILLISECONDS_IN_DAY as i32 + 1, actual.value(3));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find it somewhat strange that only for MILLISECONDS_IN_DAY we have a defined constant 🤔


let actual = date_part(&input, DatePart::Second).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(1, actual.value(2));
assert_eq!(24 * 60 * 60, actual.value(3));

let actual = date_part(&input, DatePart::Minute).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(24 * 60, actual.value(3));

let actual = date_part(&input, DatePart::Hour).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(0, actual.value(2));
assert_eq!(24, actual.value(3));

// Month and year are not valid (since days in month varies).
assert!(date_part(&input, DatePart::Month).is_err());
assert!(date_part(&input, DatePart::Year).is_err());
}

#[test]
fn test_interval_month_day_nano_array() {
let input: IntervalMonthDayNanoArray = vec![
IntervalMonthDayNano::ZERO,
IntervalMonthDayNano::new(5, 10, 42),
IntervalMonthDayNano::new(16, 35, MILLISECONDS_IN_DAY * 1_000_000 + 1),
]
.into();

// Year and month follow from month, but are not affected by days or nanos.
let actual = date_part(&input, DatePart::Year).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(1, actual.value(2));

let actual = date_part(&input, DatePart::Month).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(5, actual.value(1));
assert_eq!(16, actual.value(2));

// Week and day follow from day, but are not affected by months or nanos.
let actual = date_part(&input, DatePart::Week).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(1, actual.value(1));
assert_eq!(5, actual.value(2));

let actual = date_part(&input, DatePart::Day).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(10, actual.value(1));
assert_eq!(35, actual.value(2));

// Times follow from nanos, but are not affected by months or dats.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Times follow from nanos, but are not affected by months or dats.
// Times follow from nanos, but are not affected by months or days.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Proposed to fix in #6140

let actual = date_part(&input, DatePart::Hour).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24, actual.value(2));

let actual = date_part(&input, DatePart::Minute).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60, actual.value(2));

let actual = date_part(&input, DatePart::Second).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60 * 60, actual.value(2));

let actual = date_part(&input, DatePart::Millisecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
assert_eq!(24 * 60 * 60 * 1_000, actual.value(2));

let actual = date_part(&input, DatePart::Microsecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(0, actual.value(1));
// Overflow gives zero.
assert_eq!(0, actual.value(2));

let actual = date_part(&input, DatePart::Nanosecond).unwrap();
let actual = actual.as_primitive::<Int32Type>();
assert_eq!(0, actual.value(0));
assert_eq!(42, actual.value(1));
// Overflow gives zero.
assert_eq!(0, actual.value(2));
}

#[test]
fn test_interval_array_invalid_parts() {
fn ensure_returns_error(array: &dyn Array) {
let invalid_parts = [
DatePart::Quarter,
DatePart::DayOfWeekSunday0,
DatePart::DayOfWeekMonday0,
DatePart::DayOfYear,
];

for part in invalid_parts {
let err = date_part(array, part).unwrap_err();
let expected = format!(
"Compute error: {part} does not support: {}",
array.data_type()
);
assert_eq!(expected, err.to_string());
}
}

ensure_returns_error(&IntervalYearMonthArray::from(vec![0]));
ensure_returns_error(&IntervalDayTimeArray::from(vec![IntervalDayTime::ZERO]));
ensure_returns_error(&IntervalMonthDayNanoArray::from(vec![
IntervalMonthDayNano::ZERO,
]));
}
}
Loading