From 78bfa14fe1b20a36d7e4a1ed45cf6a4996ebd013 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 3 Mar 2023 18:53:46 +0000 Subject: [PATCH 1/7] Faster timestamp parsing --- arrow-cast/Cargo.toml | 5 + arrow-cast/benches/parse_timestamp.rs | 44 +++++ arrow-cast/src/cast.rs | 10 +- arrow-cast/src/parse.rs | 230 +++++++++++++++++--------- 4 files changed, 207 insertions(+), 82 deletions(-) create mode 100644 arrow-cast/benches/parse_timestamp.rs diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 688e0001f973..c383369c4403 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -48,5 +48,10 @@ num = { version = "0.4", default-features = false, features = ["std"] } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } [dev-dependencies] +criterion = { version = "0.4", default-features = false } [build-dependencies] + +[[bench]] +name = "parse_timestamp" +harness = false diff --git a/arrow-cast/benches/parse_timestamp.rs b/arrow-cast/benches/parse_timestamp.rs new file mode 100644 index 000000000000..d3ab41863e70 --- /dev/null +++ b/arrow-cast/benches/parse_timestamp.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_cast::parse::string_to_timestamp_nanos; +use criterion::*; + +fn criterion_benchmark(c: &mut Criterion) { + let timestamps = [ + "2020-09-08", + "2020-09-08T13:42:29", + "2020-09-08T13:42:29.190", + "2020-09-08T13:42:29.190855", + "2020-09-08T13:42:29.190855999", + "2020-09-08T13:42:29+00:00", + "2020-09-08T13:42:29.190+00:00", + "2020-09-08T13:42:29.190855+00:00", + "2020-09-08T13:42:29.190855999-05:00", + "2020-09-08T13:42:29.190855Z", + ]; + + for timestamp in timestamps { + let t = black_box(timestamp); + c.bench_function(t, |b| { + b.iter(|| string_to_timestamp_nanos(t).unwrap()); + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 8e3bde990fcd..05b802eb2176 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -4785,7 +4785,7 @@ mod tests { let err = cast_with_options(array, &to_type, &options).unwrap_err(); assert_eq!( err.to_string(), - "Cast error: Error parsing 'Not a valid date' as timestamp" + "Parser error: Error parsing timestamp from 'Not a valid date': error parsing date" ); } } @@ -7601,8 +7601,12 @@ mod tests { ]); let array = Arc::new(valid) as ArrayRef; - let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))) - .unwrap(); + let b = cast_with_options( + &array, + &DataType::Timestamp(TimeUnit::Nanosecond, Some(tz)), + &CastOptions { safe: false }, + ) + .unwrap(); let c = b .as_any() diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index a48dd2bac7d2..16bd3a7b944e 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -15,11 +15,117 @@ // specific language governing permissions and limitations // under the License. +use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::ArrowPrimitiveType; use arrow_schema::ArrowError; use chrono::prelude::*; +/// Helper for parsing timestamps +struct TimestampParser { + digits: [u8; 32], + mask: u32, +} + +impl TimestampParser { + fn new(bytes: &[u8]) -> Self { + let mut digits = [0; 32]; + let mut mask = 0; + + for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { + *o = i.wrapping_sub(b'0'); + mask |= ((*o < 10) as u32) << idx + } + + Self { digits, mask } + } + + /// Returns true if the byte at `idx` equals `b` + fn test(&self, idx: usize, b: u8) -> bool { + self.digits[idx] == b.wrapping_sub(b'0') + } + + /// Parses a date of the form `1997-01-31` + fn date(&self) -> Option { + if self.mask & 0b1111111111 != 0b1101101111 + || !self.test(4, b'-') + || !self.test(7, b'-') + { + return None; + } + + let year = self.digits[0] as u16 * 1000 + + self.digits[1] as u16 * 100 + + self.digits[2] as u16 * 10 + + self.digits[3] as u16; + + let month = self.digits[5] * 10 + self.digits[6]; + let day = self.digits[8] * 10 + self.digits[9]; + + NaiveDate::from_ymd_opt(year as _, month as _, day as _) + } + + /// Parses a time of any of forms + /// - `09:26:56` + /// - `09:26:56.123` + /// - `09:26:56.123456` + /// - `09:26:56.123456789` + /// - `092656` + /// + /// Returning the end byte offset + fn time(&self) -> Option<(NaiveTime, usize)> { + match (self.mask >> 11) & 0b11111111 { + // 09:26:56 + 0b11011011 if self.test(13, b':') && self.test(16, b':') => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[14] * 10 + self.digits[15]; + let second = self.digits[17] * 10 + self.digits[18]; + let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; + + let millis = || { + self.digits[20] as u32 * 100_000_000 + + self.digits[21] as u32 * 10_000_000 + + self.digits[22] as u32 * 1_000_000 + }; + + let micros = || { + self.digits[23] as u32 * 100_000 + + self.digits[24] as u32 * 10_000 + + self.digits[25] as u32 * 1_000 + }; + + let nanos = || { + self.digits[26] as u32 * 100 + + self.digits[27] as u32 * 10 + + self.digits[28] as u32 + }; + + match self.test(19, b'.') { + true => match (self.mask >> 20).trailing_ones() { + 3 => Some((time.with_nanosecond(millis())?, 23)), + 6 => Some((time.with_nanosecond(millis() + micros())?, 26)), + 9 => Some(( + time.with_nanosecond(millis() + micros() + nanos())?, + 29, + )), + _ => None, + }, + false => Some((time, 19)), + } + } + // 09:26:56 + 0b111111 => { + let hour = self.digits[11] * 10 + self.digits[12]; + let minute = self.digits[13] * 10 + self.digits[14]; + let second = self.digits[15] * 10 + self.digits[16]; + let time = NaiveTime::from_hms_opt(hour as _, minute as _, second as _)?; + Some((time, 17)) + } + _ => None, + } + } +} + /// Accepts a string and parses it relative to the provided `timezone` /// /// In addition to RFC3339 / ISO8601 standard timestamps, it also @@ -33,6 +139,8 @@ use chrono::prelude::*; /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656` # close to RCF3339, no fractional seconds +/// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator /// * `1997-01-31` # close to RCF3339, only date no time /// /// Some formats that supported by PostgresSql @@ -46,89 +154,52 @@ pub fn string_to_datetime( timezone: &T, s: &str, ) -> Result, ArrowError> { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.with_timezone(timezone)); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - let supported_formats = vec![ - "%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00 - "%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30" - ]; - - for f in supported_formats.iter() { - if let Ok(ts) = DateTime::parse_from_str(s, f) { - return Ok(ts.with_timezone(timezone)); - } - } - - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.with_timezone(timezone)); - } - - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. + let err = |ctx: &str| { + ArrowError::ParseError(format!("Error parsing timestamp from '{s}': {ctx}")) + }; - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + let bytes = s.as_bytes(); + if bytes.len() < 10 { + return Err(err("timestamp must contain at least 10 characters")); } - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } - } + let parser = TimestampParser::new(bytes); + let date = parser.date().ok_or_else(|| err("error parsing date"))?; + if bytes.len() == 10 { + let offset = timezone.offset_from_local_date(&date); + let offset = offset + .single() + .ok_or_else(|| err("error computing offset"))?; - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); + return Ok(DateTime::from_local(date.and_time(time), offset)); } - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } + let (time, tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; + let datetime = date.and_time(time); + if bytes.len() <= tz_offset { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing offset"))?; + return Ok(DateTime::from_local(datetime, offset)); } - // without a timezone specifier as a local time, only date - // Example: 2020-09-08 - if let Ok(dt) = NaiveDate::parse_from_str(s, "%Y-%m-%d") { - if let Some(ts) = dt.and_hms_opt(0, 0, 0) { - if let Some(offset) = timezone.offset_from_local_datetime(&ts).single() { - return Ok(DateTime::from_local(ts, offset)); - } - } + if bytes[tz_offset] == b'z' || bytes[tz_offset] == b'Z' { + let offset = timezone.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing offset"))?; + return Ok(DateTime::from_utc(datetime, offset)); } - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(ArrowError::CastError(format!( - "Error parsing '{s}' as timestamp" - ))) + // Parse remainder of string as timezone + let parsed_tz: Tz = s[tz_offset..].trim_start().parse()?; + let offset = parsed_tz.offset_from_local_datetime(&datetime); + let offset = offset + .single() + .ok_or_else(|| err("error computing offset"))?; + Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) } /// Accepts a string in RFC3339 / ISO8601 standard format and some @@ -416,19 +487,20 @@ const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented a impl Parser for Date32Type { fn parse(string: &str) -> Option { - let date = string.parse::().ok()?; + let parser = TimestampParser::new(string.as_bytes()); + let date = parser.date()?; Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } fn parse_formatted(string: &str, format: &str) -> Option { - let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; + let date = NaiveDate::parse_from_str(string, format).ok()?; Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) } } impl Parser for Date64Type { fn parse(string: &str) -> Option { - let date_time = string.parse::().ok()?; + let date_time = string_to_datetime(&Utc, string).ok()?; Some(date_time.timestamp_millis()) } @@ -565,11 +637,11 @@ mod tests { // Test parsing invalid formats // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing '' as timestamp"); - expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp"); + expect_timestamp_parse_error("", "Error parsing timestamp from '': timestamp must contain at least 10 characters"); + expect_timestamp_parse_error("SS", "Error parsing timestamp from 'SS': timestamp must contain at least 10 characters"); expect_timestamp_parse_error( "Wed, 18 Feb 2015 23:16:09 GMT", - "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp", + "Parser error: Error parsing timestamp from 'Wed, 18 Feb 2015 23:16:09 GMT': error parsing date", ); } From 2e7781d8e50b6ee7431397fd7ee3bdfebf098fd2 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 6 Mar 2023 13:43:48 +0000 Subject: [PATCH 2/7] Faster timezone parsing --- arrow-array/src/timezone.rs | 59 +++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/arrow-array/src/timezone.rs b/arrow-array/src/timezone.rs index 3af76c3dafb7..f56189c46512 100644 --- a/arrow-array/src/timezone.rs +++ b/arrow-array/src/timezone.rs @@ -18,29 +18,34 @@ //! Timezone for timestamp arrays use arrow_schema::ArrowError; -use chrono::format::{parse, Parsed, StrftimeItems}; use chrono::FixedOffset; pub use private::{Tz, TzOffset}; -/// Parses a fixed offset of the form "+09:00" -fn parse_fixed_offset(tz: &str) -> Result { - let mut parsed = Parsed::new(); - - if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%:z")) - .and_then(|_| parsed.to_fixed_offset()) - { - return Ok(fixed_offset); +/// Parses a fixed offset of the form "+09:00", "-09" or "+0930" +fn parse_fixed_offset(tz: &str) -> Option { + let bytes = tz.as_bytes(); + + let mut values = match bytes.len() { + // [+-]XX:XX + 6 if bytes[3] == b':' => [bytes[1], bytes[2], bytes[4], bytes[5]], + // [+-]XXXX + 5 => [bytes[1], bytes[2], bytes[3], bytes[4]], + // [+-]XX + 3 => [bytes[1], bytes[2], b'0', b'0'], + _ => return None, + }; + values.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0')); + if values.iter().any(|x| *x > 9) { + return None; } + let secs = (values[0] * 10 + values[1]) as i32 * 60 * 60 + + (values[2] * 10 + values[3]) as i32 * 60; - if let Ok(fixed_offset) = parse(&mut parsed, tz, StrftimeItems::new("%#z")) - .and_then(|_| parsed.to_fixed_offset()) - { - return Ok(fixed_offset); + match bytes[0] { + b'+' => FixedOffset::east_opt(secs), + b'-' => FixedOffset::west_opt(secs), + _ => None, } - - Err(ArrowError::ParseError(format!( - "Invalid timezone \"{tz}\": Expected format [+-]XX:XX, [+-]XX, or [+-]XXXX" - ))) } #[cfg(feature = "chrono-tz")] @@ -83,12 +88,11 @@ mod private { type Err = ArrowError; fn from_str(tz: &str) -> Result { - if tz.starts_with('+') || tz.starts_with('-') { - Ok(Self(TzInner::Offset(parse_fixed_offset(tz)?))) - } else { - Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { + match parse_fixed_offset(tz) { + Some(offset) => Ok(Self(TzInner::Offset(offset))), + None => Ok(Self(TzInner::Timezone(tz.parse().map_err(|e| { ArrowError::ParseError(format!("Invalid timezone \"{tz}\": {e}")) - })?))) + })?))), } } } @@ -261,13 +265,12 @@ mod private { type Err = ArrowError; fn from_str(tz: &str) -> Result { - if tz.starts_with('+') || tz.starts_with('-') { - Ok(Self(parse_fixed_offset(tz)?)) - } else { - Err(ArrowError::ParseError(format!( + let offset = parse_fixed_offset(tz).ok_or_else(|| { + ArrowError::ParseError(format!( "Invalid timezone \"{tz}\": only offset based timezones supported without chrono-tz feature" - ))) - } + )) + })?; + Ok(Self(offset)) } } From 644eecd39b1f91fac6f1e389da58de38e6a18155 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 9 Mar 2023 10:48:22 +0000 Subject: [PATCH 3/7] More tests --- arrow-cast/src/parse.rs | 83 ++++++++++++++++++++++++++--------------- arrow/Cargo.toml | 4 ++ arrow/tests/timezone.rs | 65 ++++++++++++++++++++++++++++++++ 3 files changed, 121 insertions(+), 31 deletions(-) create mode 100644 arrow/tests/timezone.rs diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 16bd3a7b944e..dededac56507 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -136,6 +136,7 @@ impl TimestampParser { /// * `1997-01-31T09:26:56.123Z` # RCF3339 /// * `1997-01-31T09:26:56.123-05:00` # RCF3339 /// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T +/// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no fractional seconds or time separator /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset /// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds @@ -143,13 +144,21 @@ impl TimestampParser { /// * `1997-01-31 092656+04:00` # close to RCF3339, no fractional seconds or time separator /// * `1997-01-31` # close to RCF3339, only date no time /// -/// Some formats that supported by PostgresSql -/// still not supported by chrono, like -/// "2023-01-01 040506 America/Los_Angeles", -/// "2023-01-01 04:05:06.789 +07:30:00", -/// "2023-01-01 040506 +07:30:00", -/// "2023-01-01 04:05:06.789 PST", -/// "2023-01-01 04:05:06.789 -08", +/// [IANA timezones] are only supported if the `arrow-array/chrono-tz` feature is enabled +/// +/// * `2023-01-01 040506 America/Los_Angeles` +/// +/// If a timestamp is ambiguous, for example as a result of daylight-savings time, an error +/// will be returned +/// +/// Some formats supported by PostgresSql +/// are not supported, like +/// +/// * "2023-01-01 04:05:06.789 +07:30:00", +/// * "2023-01-01 040506 +07:30:00", +/// * "2023-01-01 04:05:06.789 PST", +/// +/// [IANA timezones]: https://www.iana.org/time-zones pub fn string_to_datetime( timezone: &T, s: &str, @@ -169,19 +178,23 @@ pub fn string_to_datetime( let offset = timezone.offset_from_local_date(&date); let offset = offset .single() - .ok_or_else(|| err("error computing offset"))?; + .ok_or_else(|| err("error computing timezone offset"))?; let time = NaiveTime::from_hms_opt(0, 0, 0).unwrap(); return Ok(DateTime::from_local(date.and_time(time), offset)); } + if !parser.test(10, b'T') && !parser.test(10, b' ') { + return Err(err("invalid timestamp separator")); + } + let (time, tz_offset) = parser.time().ok_or_else(|| err("error parsing time"))?; let datetime = date.and_time(time); if bytes.len() <= tz_offset { let offset = timezone.offset_from_local_datetime(&datetime); let offset = offset .single() - .ok_or_else(|| err("error computing offset"))?; + .ok_or_else(|| err("error computing timezone offset"))?; return Ok(DateTime::from_local(datetime, offset)); } @@ -189,7 +202,7 @@ pub fn string_to_datetime( let offset = timezone.offset_from_local_datetime(&datetime); let offset = offset .single() - .ok_or_else(|| err("error computing offset"))?; + .ok_or_else(|| err("error computing timezone offset"))?; return Ok(DateTime::from_utc(datetime, offset)); } @@ -198,7 +211,7 @@ pub fn string_to_datetime( let offset = parsed_tz.offset_from_local_datetime(&datetime); let offset = offset .single() - .ok_or_else(|| err("error computing offset"))?; + .ok_or_else(|| err("error computing timezone offset"))?; Ok(DateTime::::from_local(datetime, offset).with_timezone(timezone)) } @@ -635,14 +648,34 @@ mod tests { #[test] fn string_to_timestamp_invalid() { // Test parsing invalid formats - - // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing timestamp from '': timestamp must contain at least 10 characters"); - expect_timestamp_parse_error("SS", "Error parsing timestamp from 'SS': timestamp must contain at least 10 characters"); - expect_timestamp_parse_error( - "Wed, 18 Feb 2015 23:16:09 GMT", - "Parser error: Error parsing timestamp from 'Wed, 18 Feb 2015 23:16:09 GMT': error parsing date", - ); + let cases = [ + ("", "timestamp must contain at least 10 characters"), + ("SS", "timestamp must contain at least 10 characters"), + ("Wed, 18 Feb 2015 23:16:09 GMT", "error parsing date"), + ("1997-01-31H09:26:56.123Z", "invalid timestamp separator"), + ("1997-01-31 09:26:56.123Z", "error parsing time"), + ("1997:01:31T09:26:56.123Z", "error parsing date"), + ("1997:1:31T09:26:56.123Z", "error parsing date"), + ("1997-01-32T09:26:56.123Z", "error parsing date"), + ("1997-13-32T09:26:56.123Z", "error parsing date"), + ("1997-02-29T09:26:56.123Z", "error parsing date"), + ("2015-02-30T17:35:20-08:00", "error parsing date"), + ("1997-01-10T9:26:56.123Z", "error parsing time"), + ("2015-01-20T25:35:20-08:00", "error parsing time"), + ("1997-01-10T09:61:56.123Z", "error parsing time"), + ("1997-01-10T09:61:90.123Z", "error parsing time"), + ("1997-01-10T12:00:56.12Z", "error parsing time"), + ("1997-01-10T12:00:56.1234Z", "error parsing time"), + ("1997-01-10T12:00:56.12345Z", "error parsing time"), + ("1997-01-10T12:00:6.123Z", "error parsing time"), + ]; + + for (s, ctx) in cases { + let expected = + format!("Parser error: Error parsing timestamp from '{s}': {ctx}"); + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } } // Parse a timestamp to timestamp int with a useful human readable error message @@ -654,18 +687,6 @@ mod tests { result } - fn expect_timestamp_parse_error(s: &str, expected_err: &str) { - match string_to_timestamp_nanos(s) { - Ok(v) => panic!( - "Expected error '{expected_err}' while parsing '{s}', but parsed {v} instead" - ), - Err(e) => { - assert!(e.to_string().contains(expected_err), - "Can not find expected error '{expected_err}' while parsing '{s}'. Actual error '{e}'"); - } - } - } - #[test] fn string_without_timezone_to_timestamp() { // string without timezone should always output the same regardless the local or session timezone diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 08fc5513d64f..587f42d7e58f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -296,3 +296,7 @@ required-features = ["pyarrow"] [[test]] name = "array_cast" required-features = ["chrono-tz"] + +[[test]] +name = "timezone" +required-features = ["chrono-tz"] diff --git a/arrow/tests/timezone.rs b/arrow/tests/timezone.rs new file mode 100644 index 000000000000..1cd25fa6945b --- /dev/null +++ b/arrow/tests/timezone.rs @@ -0,0 +1,65 @@ +use arrow_array::TimestampMillisecondArray; +use arrow_cast::parse::string_to_datetime; +use chrono::Utc; + +#[test] +fn test_parse_timezone() { + let cases = [ + ( + "2023-01-01 040506 America/Los_Angeles", + "2023-01-01T12:05:06+00:00", + ), + ( + "2023-01-01 040506.345 America/Los_Angeles", + "2023-01-01T12:05:06.345+00:00", + ), + ( + "2023-01-01 04:05:06.345 America/Los_Angeles", + "2023-01-01T12:05:06.345+00:00", + ), + ( + "2023-01-01 04:05:06.789 -08", + "2023-01-01T12:05:06.789+00:00", + ), + ( + "2023-03-12 040506 America/Los_Angeles", + "2023-03-12T11:05:06+00:00", + ), // Daylight savings + ]; + + for (s, expected) in cases { + let actual = string_to_datetime(&Utc, s).unwrap().to_rfc3339(); + assert_eq!(actual, expected, "{s}") + } +} + +#[test] +fn test_parse_timezone_invalid() { + let cases = [ + ( + "2015-01-20T17:35:20-24:00", + "Parser error: Invalid timezone \"-24:00\": '-24:00' is not a valid timezone", + ), + ( + "2023-01-01 04:05:06.789 +07:30:00", + "Parser error: Invalid timezone \"+07:30:00\": '+07:30:00' is not a valid timezone" + ), + ( + // Sunday, 12 March 2023, 02:00:00 clocks are turned forward 1 hour to + // Sunday, 12 March 2023, 03:00:00 local daylight time instead. + "2023-03-12 02:05:06 America/Los_Angeles", + "Parser error: Error parsing timestamp from '2023-03-12 02:05:06 America/Los_Angeles': error computing timezone offset", + ), + ( + // Sunday, 5 November 2023, 02:00:00 clocks are turned backward 1 hour to + // Sunday, 5 November 2023, 01:00:00 local standard time instead. + "2023-11-05 01:30:06 America/Los_Angeles", + "Parser error: Error parsing timestamp from '2023-11-05 01:30:06 America/Los_Angeles': error computing timezone offset", + ), + ]; + + for (s, expected) in cases { + let actual = string_to_datetime(&Utc, s).unwrap_err().to_string(); + assert_eq!(actual, expected) + } +} From 763573c511bc735a896b9de1cb38ebf168208a86 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 9 Mar 2023 10:48:52 +0000 Subject: [PATCH 4/7] Review feedback --- arrow-cast/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index dededac56507..bef194ac1b26 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -113,7 +113,7 @@ impl TimestampParser { false => Some((time, 19)), } } - // 09:26:56 + // 092656 0b111111 => { let hour = self.digits[11] * 10 + self.digits[12]; let minute = self.digits[13] * 10 + self.digits[14]; From 0e00ff5aab9ab7ef0f14ef7ab1dbbb79ab9476ec Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 9 Mar 2023 10:55:22 +0000 Subject: [PATCH 5/7] Review feedback --- arrow-cast/src/parse.rs | 7 ++++++- arrow/tests/timezone.rs | 18 +++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index bef194ac1b26..3adbc401e164 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -23,7 +23,11 @@ use chrono::prelude::*; /// Helper for parsing timestamps struct TimestampParser { + /// The timestamp bytes to parse minus `b'0'` + /// + /// This makes interpretation as an integer inexpensive digits: [u8; 32], + /// A mask containing a `1` bit where the corresponding byte is a valid ASCII digit mask: u32, } @@ -32,6 +36,7 @@ impl TimestampParser { let mut digits = [0; 32]; let mut mask = 0; + // Treating all bytes the same way, helps LLVM vectorise this correctly for (idx, (o, i)) in digits.iter_mut().zip(bytes).enumerate() { *o = i.wrapping_sub(b'0'); mask |= ((*o < 10) as u32) << idx @@ -40,7 +45,7 @@ impl TimestampParser { Self { digits, mask } } - /// Returns true if the byte at `idx` equals `b` + /// Returns true if the byte at `idx` in the original string equals `b` fn test(&self, idx: usize, b: u8) -> bool { self.digits[idx] == b.wrapping_sub(b'0') } diff --git a/arrow/tests/timezone.rs b/arrow/tests/timezone.rs index 1cd25fa6945b..bcd2100e41b9 100644 --- a/arrow/tests/timezone.rs +++ b/arrow/tests/timezone.rs @@ -1,4 +1,20 @@ -use arrow_array::TimestampMillisecondArray; +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use arrow_cast::parse::string_to_datetime; use chrono::Utc; From 4bd77d4244e7d6609df4b1cc3545b7abfd1fd946 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 9 Mar 2023 11:08:23 +0000 Subject: [PATCH 6/7] Fix test --- arrow-cast/src/parse.rs | 1 + arrow/tests/timezone.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 3adbc401e164..4e491747ba96 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -673,6 +673,7 @@ mod tests { ("1997-01-10T12:00:56.1234Z", "error parsing time"), ("1997-01-10T12:00:56.12345Z", "error parsing time"), ("1997-01-10T12:00:6.123Z", "error parsing time"), + ("1997-01-31T092656.123Z", "error parsing time") ]; for (s, ctx) in cases { diff --git a/arrow/tests/timezone.rs b/arrow/tests/timezone.rs index bcd2100e41b9..b71d04d64be0 100644 --- a/arrow/tests/timezone.rs +++ b/arrow/tests/timezone.rs @@ -26,7 +26,7 @@ fn test_parse_timezone() { "2023-01-01T12:05:06+00:00", ), ( - "2023-01-01 040506.345 America/Los_Angeles", + "2023-01-01 04:05:06.345 America/Los_Angeles", "2023-01-01T12:05:06.345+00:00", ), ( From fe95670c12df7610d957f6a297379c967702787e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Thu, 9 Mar 2023 11:25:21 +0000 Subject: [PATCH 7/7] Format --- arrow-cast/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 4e491747ba96..9444b9d0e3be 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -673,7 +673,7 @@ mod tests { ("1997-01-10T12:00:56.1234Z", "error parsing time"), ("1997-01-10T12:00:56.12345Z", "error parsing time"), ("1997-01-10T12:00:6.123Z", "error parsing time"), - ("1997-01-31T092656.123Z", "error parsing time") + ("1997-01-31T092656.123Z", "error parsing time"), ]; for (s, ctx) in cases {