Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cast functions between String and Date16/Date32/DateTime32 #2080

Merged
merged 2 commits into from
Oct 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions common/datavalues/src/arrays/ops/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ impl ArrayCast for DFNullArray {
DataType::List(_) => Ok(DFListArray::full_null(self.len()).into_series()),

_ => Err(ErrorCode::BadDataValueType(format!(
"Unsupported cast_with_type operation for {:?}",
self,
"Unsupported cast_with_type from array: {:?} into data_type: {:?}",
self, data_type,
))),
}
}
Expand Down
39 changes: 19 additions & 20 deletions common/datavalues/src/arrays/upstream_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,23 +34,12 @@ where T: DFPrimitiveType
let iter = iter.into_iter();

let arr: PrimitiveArray<T> = match iter.size_hint() {
(a, Some(b)) if a == b => {
// 2021-02-07: ~40% faster than builder.
// It is unsafe because we cannot be certain that the iterators length can be trusted.
// For most iterators that report the same upper bound as lower bound it is, but still
// somebody can create an iterator that incorrectly gives those bounds.
// This will not lead to UB, but will panic.
unsafe {
let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter);
assert_eq!(arr.len(), a);
arr
}
}
_ => {
// 2021-02-07: ~1.5% slower than builder. Will still use this as it is more idiomatic and will
// likely improve over time.
iter.collect()
}
(a, Some(b)) if a == b => unsafe {
let arr = PrimitiveArray::from_trusted_len_iter_unchecked(iter);
assert_eq!(arr.len(), a);
arr
},
_ => iter.collect(),
};
arr.into()
}
Expand Down Expand Up @@ -92,13 +81,23 @@ impl FromIterator<bool> for NoNull<DFBooleanArray> {
}

// FromIterator implementations for the string array type (DFStringArray).

// Builds a `DFStringArray` from an iterator of optional byte-like values by
// first collecting into a `LargeBinaryArray` and then converting with `into()`.
impl<Ptr> FromIterator<Option<Ptr>> for DFStringArray
where Ptr: AsRef<[u8]>
{
    /// Collects `iter` into a `DFStringArray`.
    ///
    /// When the iterator's `size_hint` reports an exact length (lower bound equal
    /// to the upper bound) the trusted-length constructor is used; otherwise the
    /// items are gathered with a plain `collect()`.
    ///
    /// # Panics
    /// Panics if the array built on the fast path does not have the length that
    /// `size_hint` announced.
    fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
        // Convert exactly once: `iter` is consumed here and only the resulting
        // iterator is used from this point on.
        let iter = iter.into_iter();
        let arr: LargeBinaryArray = match iter.size_hint() {
            (a, Some(b)) if a == b => {
                // 2021-02-07: ~40% faster than builder.
                // SAFETY: relies on `size_hint` being truthful; the assertion below
                // checks the produced length against the hint.
                // TODO confirm that a misreporting iterator cannot cause undefined
                // behaviour inside the constructor itself.
                unsafe {
                    let arr = LargeBinaryArray::from_trusted_len_iter_unchecked(iter);
                    assert_eq!(arr.len(), a);
                    arr
                }
            }
            // No exact length available: fall back to the generic collector.
            _ => iter.collect(),
        };

        arr.into()
    }
}
Expand Down
1 change: 1 addition & 0 deletions common/exception/src/exception.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ build_exceptions! {
TLSConfigurationFailure(52),
UnknownSession(53),
UnexpectedError(54),
DateTimeParseError(55),

// uncategorized
UnexpectedResponseType(600),
Expand Down
134 changes: 133 additions & 1 deletion common/functions/src/scalars/expressions/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,28 @@

use std::fmt;

use common_arrow::arrow::temporal_conversions::EPOCH_DAYS_FROM_CE;
use common_datavalues::chrono::DateTime;
use common_datavalues::chrono::Datelike;
use common_datavalues::chrono::NaiveDate;
use common_datavalues::chrono::NaiveDateTime;
use common_datavalues::chrono::TimeZone;
use common_datavalues::chrono::Utc;
use common_datavalues::columns::DataColumn;
use common_datavalues::prelude::ArrayApply;
use common_datavalues::prelude::DFInt32Array;
use common_datavalues::prelude::DFStringArray;
use common_datavalues::prelude::DFUInt16Array;
use common_datavalues::prelude::DFUInt32Array;
use common_datavalues::prelude::DataColumnsWithField;
use common_datavalues::series::IntoSeries;
use common_datavalues::DataSchema;
use common_datavalues::DataType;
use common_exception::ErrorCode;
use common_exception::Result;

use crate::scalars::Function;
use crate::with_match_primitive_type;

#[derive(Clone)]
pub struct CastFunction {
Expand Down Expand Up @@ -53,8 +68,106 @@ impl Function for CastFunction {
}

/// Casts the first input column to `self.cast_type`.
///
/// * A column whose data type already equals the target type is returned as a
///   clone, untouched.
/// * Casts that have Date16, Date32 or DateTime32 as source or target are handled
///   here. Date16/Date32 values are treated as day counts (they are multiplied by
///   `24 * 3600` to obtain seconds) and DateTime32 values as seconds; strings are
///   rendered/parsed with `%Y-%m-%d` and `%Y-%m-%d %H:%M:%S`.
/// * When the other side of such a cast is one of the types matched by
///   `with_match_primitive_type!`, and for every remaining combination, the work
///   is delegated to `cast_with_type` on the series.
///
/// Input strings that cannot be parsed become `None` entries in the result.
/// The resulting column is passed through `resize_constant(input_rows)`.
///
/// # Errors
/// Returns `ErrorCode::BadDataValueType` for a date/time combination that has no
/// conversion below, and propagates errors from `to_minimal_array`, the typed
/// array accessors and `cast_with_type`.
fn eval(&self, columns: &DataColumnsWithField, input_rows: usize) -> Result<DataColumn> {
    // Nothing to convert when the column already has the requested type.
    if columns[0].data_type() == &self.cast_type {
        return Ok(columns[0].column().clone());
    }

    const DATE_FMT: &str = "%Y-%m-%d";
    const TIME_FMT: &str = "%Y-%m-%d %H:%M:%S";

    let series = columns[0].column().clone().to_minimal_array()?;

    // Built lazily: formatting the whole series with `{:?}` is only worth paying
    // for when the cast really is unsupported.
    let unsupported = || {
        ErrorCode::BadDataValueType(format!(
            "Unsupported cast_with_type from array: {:?} into data_type: {:?}",
            series, self.cast_type,
        ))
    };

    let array = match (columns[0].data_type(), &self.cast_type) {
        // Date/DateTime to others
        (DataType::Date16, _) => with_match_primitive_type!(&self.cast_type, |$T| {
            series.cast_with_type(&self.cast_type)
        }, {
            let arr = series.u16()?;
            match &self.cast_type {
                Date32 => Ok(arr.apply_cast_numeric(|v| v as i32).into_series()),
                // NOTE(review): the closure yields an i64 while DateTime32 is read
                // as u32 elsewhere in this function — confirm the intended type.
                DateTime32(_) => Ok(arr
                    .apply_cast_numeric(|v| Utc.timestamp(v as i64 * 24 * 3600, 0_u32).timestamp())
                    .into_series()),
                String => Ok(DFStringArray::from_iter(arr.into_iter().map(|v| {
                    v.map(|x| datetime_to_string(Utc.timestamp(*x as i64 * 24 * 3600, 0_u32), DATE_FMT))
                }))
                .into_series()),
                _ => Err(unsupported()),
            }
        }),

        (DataType::Date32, _) => with_match_primitive_type!(&self.cast_type, |$T| {
            series.cast_with_type(&self.cast_type)
        }, {
            let arr = series.i32()?;
            match &self.cast_type {
                // NOTE(review): a Date32 target equals the source type here and is
                // already handled by the early return above; a Date16 arm may have
                // been intended — confirm.
                Date32 => Ok(arr.apply_cast_numeric(|v| v as i32).into_series()),
                // NOTE(review): yields i64, see the matching note in the Date16 branch.
                DateTime32(_) => Ok(arr
                    .apply_cast_numeric(|v| Utc.timestamp(v as i64 * 24 * 3600, 0_u32).timestamp())
                    .into_series()),
                String => Ok(DFStringArray::from_iter(arr.into_iter().map(|v| {
                    v.map(|x| datetime_to_string(Utc.timestamp(*x as i64 * 24 * 3600, 0_u32), DATE_FMT))
                }))
                .into_series()),
                _ => Err(unsupported()),
            }
        }),

        (DataType::DateTime32(_), _) => with_match_primitive_type!(&self.cast_type, |$T| {
            series.cast_with_type(&self.cast_type)
        }, {
            let arr = series.u32()?;
            match &self.cast_type {
                // Seconds -> whole days.
                Date16 => Ok(arr.apply_cast_numeric(|v| (v as i64 / 24 / 3600) as u16).into_series()),
                // Date32 is handled as an i32 array everywhere else in this
                // function, so produce i32 (not u32) values here as well.
                Date32 => Ok(arr.apply_cast_numeric(|v| (v as i64 / 24 / 3600) as i32).into_series()),
                String => Ok(DFStringArray::from_iter(arr.into_iter().map(|v| {
                    v.map(|x| datetime_to_string(Utc.timestamp(*x as i64, 0_u32), TIME_FMT))
                }))
                .into_series()),
                _ => Err(unsupported()),
            }
        }),

        // others to Date/DateTime
        (_, DataType::Date16) => with_match_primitive_type!(columns[0].data_type(), |$T| {
            series.cast_with_type(&self.cast_type)
        }, {
            match columns[0].data_type() {
                String => {
                    // Days since the epoch; unparsable strings stay `None`.
                    let it = series.string()?.into_iter().map(|v| {
                        v.and_then(string_to_date)
                            .map(|d| (d.num_days_from_ce() - EPOCH_DAYS_FROM_CE) as u16)
                    });
                    Ok(DFUInt16Array::from_iter(it).into_series())
                },
                _ => Err(unsupported()),
            }
        }),

        (_, DataType::Date32) => with_match_primitive_type!(columns[0].data_type(), |$T| {
            series.cast_with_type(&self.cast_type)
        }, {
            match columns[0].data_type() {
                String => {
                    let it = series.string()?.into_iter().map(|v| {
                        v.and_then(string_to_date)
                            .map(|d| (d.num_days_from_ce() - EPOCH_DAYS_FROM_CE) as i32)
                    });
                    Ok(DFInt32Array::from_iter(it).into_series())
                },
                _ => Err(unsupported()),
            }
        }),

        (_, DataType::DateTime32(_)) => {
            with_match_primitive_type!(columns[0].data_type(), |$T| {
                series.cast_with_type(&self.cast_type)
            }, {
                match columns[0].data_type() {
                    String => {
                        let it = series.string()?.into_iter().map(|v| {
                            v.and_then(string_to_datetime).map(|t| t.timestamp() as u32)
                        });
                        Ok(DFUInt32Array::from_iter(it).into_series())
                    },
                    _ => Err(unsupported()),
                }
            })
        }

        // No date/time type involved: use the generic cast.
        _ => series.cast_with_type(&self.cast_type),
    }?;

    let column: DataColumn = array.into();
    Ok(column.resize_constant(input_rows))
}

Expand All @@ -68,3 +181,22 @@ impl fmt::Display for CastFunction {
write!(f, "CAST")
}
}

/// Renders the UTC timestamp `date` as text according to the format pattern
/// `fmt` and returns the result as an owned `String`.
#[inline]
fn datetime_to_string(date: DateTime<Utc>, fmt: &str) -> String {
    let rendered = date.format(fmt);
    rendered.to_string()
}

// UTC is currently used by default.
// TODO: support timezones.
/// Parses bytes holding a `%Y-%m-%d %H:%M:%S` timestamp into a `NaiveDateTime`.
///
/// Returns `None` when the bytes are not valid UTF-8 or do not match the pattern.
#[inline]
fn string_to_datetime(date_str: impl AsRef<[u8]>) -> Option<NaiveDateTime> {
    let text = std::str::from_utf8(date_str.as_ref()).ok()?;
    NaiveDateTime::parse_from_str(text, "%Y-%m-%d %H:%M:%S").ok()
}

/// Parses bytes holding a calendar date into a `NaiveDate` using the type's
/// `FromStr` implementation (presumably ISO `YYYY-MM-DD` — confirm against chrono).
///
/// Returns `None` when the bytes are not valid UTF-8 or cannot be parsed.
#[inline]
fn string_to_date(date_str: impl AsRef<[u8]>) -> Option<NaiveDate> {
    let text = std::str::from_utf8(date_str.as_ref()).ok()?;
    text.parse::<NaiveDate>().ok()
}
66 changes: 55 additions & 11 deletions common/functions/src/scalars/expressions/cast_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ fn test_cast_function() -> Result<()> {
display: &'static str,
nullable: bool,
columns: Vec<DataColumn>,
column_types: Vec<DataType>,
expect: Series,
error: &'static str,
func: Result<Box<dyn Function>>,
Expand All @@ -37,6 +38,7 @@ fn test_cast_function() -> Result<()> {
display: "CAST",
nullable: false,
columns: vec![Series::new(vec![4i64, 3, 2, 4]).into()],
column_types: vec![DataType::Int64],
func: CastFunction::create("toint8".to_string(), DataType::Int8),
expect: Series::new(vec![4i8, 3, 2, 4]),
error: "",
Expand All @@ -46,6 +48,7 @@ fn test_cast_function() -> Result<()> {
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["4", "3", "2", "4"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("toint8".to_string(), DataType::Int8),
expect: Series::new(vec![4i8, 3, 2, 4]),
error: "",
Expand All @@ -55,6 +58,7 @@ fn test_cast_function() -> Result<()> {
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["4", "3", "2", "4"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("toint16".to_string(), DataType::Int16),
expect: Series::new(vec![4i16, 3, 2, 4]),
error: "",
Expand All @@ -64,6 +68,7 @@ fn test_cast_function() -> Result<()> {
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["4", "3", "2", "4"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("toint32".to_string(), DataType::Int32),
expect: Series::new(vec![4i32, 3, 2, 4]),
error: "",
Expand All @@ -73,30 +78,74 @@ fn test_cast_function() -> Result<()> {
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["4", "3", "2", "4"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("toint64".to_string(), DataType::Int64),
expect: Series::new(vec![4i64, 3, 2, 4]),
error: "",
},
Test {
name: "cast-string-to-date16-passed",
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["2021-03-05", "2021-10-24"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("cast".to_string(), DataType::Date16),
expect: Series::new(vec![18691u16, 18924]),
error: "",
},
Test {
name: "cast-string-to-date32-passed",
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["20210305", "20211024"]).into()],
func: CastFunction::create("cast".to_string(), DataType::Int32),
expect: Series::new(vec![20210305i32, 20211024]),
columns: vec![Series::new(vec!["2021-03-05", "2021-10-24"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("cast".to_string(), DataType::Date32),
expect: Series::new(vec![18691i32, 18924]),
error: "",
},
Test {
name: "cast-string-to-datetime32-passed",
display: "CAST",
nullable: false,
columns: vec![Series::new(vec!["2021-03-05 01:01:01", "2021-10-24 10:10:10"]).into()],
column_types: vec![DataType::String],
func: CastFunction::create("cast".to_string(), DataType::DateTime32(None)),
expect: Series::new(vec![1614906061u32, 1635070210]),
error: "",
},
Test {
name: "cast-date32-to-string-passed",
display: "CAST",
nullable: false,
columns: vec![Series::new(vec![18691i32, 18924]).into()],
column_types: vec![DataType::Date32],
func: CastFunction::create("cast".to_string(), DataType::String),
expect: Series::new(vec!["2021-03-05", "2021-10-24"]),
error: "",
},
Test {
name: "cast-datetime-to-string-passed",
display: "CAST",
nullable: false,
columns: vec![Series::new(vec![1614906061u32, 1635070210]).into()],
column_types: vec![DataType::DateTime32(None)],
func: CastFunction::create("cast".to_string(), DataType::String),
expect: Series::new(vec!["2021-03-05 01:01:01", "2021-10-24 10:10:10"]),
error: "",
},
];

let dummy = DataField::new("dummy", DataType::String, false);

for t in tests {
let rows = t.columns[0].len();

let columns: Vec<DataColumnWithField> = t
.columns
.iter()
.map(|c| DataColumnWithField::new(c.clone(), dummy.clone()))
.zip(t.column_types.iter())
.map(|(c, t)| {
let dummy = DataField::new("dummy", t.clone(), false);
DataColumnWithField::new(c.clone(), dummy.clone())
})
.collect();

let func = t.func.unwrap();
Expand All @@ -114,11 +163,6 @@ fn test_cast_function() -> Result<()> {
assert_eq!(expect_null, actual_null);

let v = &(func.eval(&columns, rows)?);
// Type check.
let expect_type = func.return_type(&[])?;
let actual_type = v.data_type();
assert_eq!(expect_type, actual_type);

let c: DataColumn = t.expect.into();
assert_eq!(v, &c);
}
Expand Down
5 changes: 5 additions & 0 deletions tests/suites/0_stateless/02_0002_function_cast.result
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ UInt32
NULL
NULL
NULL
===DATE/DATETIME===
1
1
1
1
6 changes: 6 additions & 0 deletions tests/suites/0_stateless/02_0002_function_cast.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,9 @@ SELECT CAST(CAST(1 + 1 + 1, String) AS Int8);
SELECT CAST(Null as Int64);
SELECT CAST(Null as Boolean);
SELECT CAST(Null as Varchar);

SELECT '===DATE/DATETIME===';
SELECT toDateTime('2021-03-05 01:01:01') + 1 = toDateTime('2021-03-05 01:01:02');
SELECT toDate('2021-03-05') + 1 = toDate('2021-03-06');
SELECT toString(toDate('2021-03-05') + 1) = '2021-03-06';
SELECT toDateTime(toDate('2021-03-05')) = toDateTime('2021-03-05 00:00:00');