Skip to content

Commit

Permalink
update GetTimeStamp, UnixTimeStamp, ToUnixTimeStamp and Cast to throw…
Browse files Browse the repository at this point in the history
… runtime exception when parsing to timestamp fail with ANSI mode on.

Change-Id: Ie76d494906c7615871860c89602896c64ed2d9d6
  • Loading branch information
leanken-zz committed Nov 20, 2020
1 parent 02d410a commit e6f5634
Show file tree
Hide file tree
Showing 9 changed files with 362 additions and 37 deletions.
5 changes: 5 additions & 0 deletions docs/sql-ref-ansi-compliance.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,17 @@ The behavior of some SQL functions can be different under ANSI mode (`spark.sql.
- `element_at`: This function throws `ArrayIndexOutOfBoundsException` if using invalid indices.
- `element_at`: This function throws `NoSuchElementException` if key does not exist in map.
- `elt`: This function throws `ArrayIndexOutOfBoundsException` if using invalid indices.
- `to_date`: This function fails with an exception if the input string can't be parsed, or the pattern string is invalid.
- `to_timestamp`: This function fails with an exception if the input string can't be parsed, or the pattern string is invalid.
- `unix_timestamp`: This function fails with an exception if the input string can't be parsed, or the pattern string is invalid.
- `to_unix_timestamp`: This function fails with an exception if the input string can't be parsed, or the pattern string is invalid.

### SQL Operators

The behavior of some SQL operators can be different under ANSI mode (`spark.sql.ansi.enabled=true`).
- `array_col[index]`: This operator throws `ArrayIndexOutOfBoundsException` if using invalid indices.
- `map_col[key]`: This operator throws `NoSuchElementException` if key does not exist in map.
- `cast to timestamp`: This operator fails with an exception if the input string can't be parsed.

### SQL Keywords

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,8 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
// TimestampConverter
private[this] def castToTimestamp(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, utfs => DateTimeUtils.stringToTimestamp(utfs, zoneId).orNull)
buildCast[UTF8String](_,
utfs => DateTimeUtils.stringToTimestamp(utfs, zoneId, ansiEnabled).orNull)
case BooleanType =>
buildCast[Boolean](_, b => if (b) 1L else 0)
case LongType =>
Expand Down Expand Up @@ -1257,7 +1258,8 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
(c, evPrim, evNull) =>
code"""
scala.Option<Long> $longOpt =
org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $zid);
org.apache.spark.sql.catalyst.util.DateTimeUtils.stringToTimestamp($c, $zid,
$ansiEnabled);
if ($longOpt.isDefined()) {
$evPrim = ((Long) $longOpt.get()).longValue();
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -705,10 +705,12 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
case class ToUnixTimestamp(
timeExp: Expression,
format: Expression,
timeZoneId: Option[String] = None)
timeZoneId: Option[String] = None,
failOnError: Boolean = SQLConf.get.ansiEnabled)
extends UnixTime {

def this(timeExp: Expression, format: Expression) = this(timeExp, format, None)
def this(timeExp: Expression, format: Expression) =
this(timeExp, format, None, SQLConf.get.ansiEnabled)

override def left: Expression = timeExp
override def right: Expression = format
Expand Down Expand Up @@ -752,10 +754,15 @@ case class ToUnixTimestamp(
group = "datetime_funcs",
since = "1.5.0")
// scalastyle:on line.size.limit
case class UnixTimestamp(timeExp: Expression, format: Expression, timeZoneId: Option[String] = None)
case class UnixTimestamp(
timeExp: Expression,
format: Expression,
timeZoneId: Option[String] = None,
failOnError: Boolean = SQLConf.get.ansiEnabled)
extends UnixTime {

def this(timeExp: Expression, format: Expression) = this(timeExp, format, None)
def this(timeExp: Expression, format: Expression) =
this(timeExp, format, None, SQLConf.get.ansiEnabled)

override def left: Expression = timeExp
override def right: Expression = format
Expand Down Expand Up @@ -790,6 +797,15 @@ abstract class ToTimestamp
override def dataType: DataType = LongType
override def nullable: Boolean = true

def failOnError: Boolean

/**
 * Tells whether `e` is one of the exception types this expression treats as a failure
 * to parse its input: `DateTimeParseException`, `DateTimeException` or `ParseException`.
 *
 * @param e the throwable to classify
 * @return true if `e` is an instance of any of the three types above, false otherwise
 */
private def isParseError(e: Throwable): Boolean = e match {
  case _: DateTimeParseException => true
  case _: DateTimeException => true
  case _: ParseException => true
  case _ => false
}

override def eval(input: InternalRow): Any = {
val t = left.eval(input)
if (t == null) {
Expand All @@ -809,9 +825,12 @@ abstract class ToTimestamp
try {
formatter.parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
} catch {
case _: DateTimeParseException |
_: DateTimeException |
_: ParseException => null
case e if isParseError(e) =>
if (failOnError) {
throw e
} else {
null
}
}
}
}
Expand All @@ -820,6 +839,7 @@ abstract class ToTimestamp

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
val javaType = CodeGenerator.javaType(dataType)
val parseErrorBranch = if (failOnError) "throw e;" else s"${ev.isNull} = true;"
left.dataType match {
case StringType => formatterOption.map { fmt =>
val df = classOf[TimestampFormatter].getName
Expand All @@ -829,11 +849,11 @@ abstract class ToTimestamp
|try {
| ${ev.value} = $formatterName.parse($datetimeStr.toString()) / $downScaleFactor;
|} catch (java.time.DateTimeException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|} catch (java.time.format.DateTimeParseException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|} catch (java.text.ParseException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|}
|""".stripMargin)
}.getOrElse {
Expand All @@ -851,11 +871,11 @@ abstract class ToTimestamp
|try {
| ${ev.value} = $timestampFormatter.parse($string.toString()) / $downScaleFactor;
|} catch (java.time.format.DateTimeParseException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|} catch (java.time.DateTimeException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|} catch (java.text.ParseException e) {
| ${ev.isNull} = true;
| $parseErrorBranch
|}
|""".stripMargin)
}
Expand Down Expand Up @@ -1722,7 +1742,8 @@ case class DateDiff(endDate: Expression, startDate: Expression)
private case class GetTimestamp(
left: Expression,
right: Expression,
timeZoneId: Option[String] = None)
timeZoneId: Option[String] = None,
failOnError: Boolean = SQLConf.get.ansiEnabled)
extends ToTimestamp {

override val downScaleFactor = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,15 @@ object DateTimeUtils {
}
}

/**
 * ANSI-aware variant of `stringToTimestamp(s, timeZoneId)`.
 *
 * Delegates to the two-argument overload. When that overload yields no value and
 * `ansiEnabled` is true, a `DateTimeException` is thrown instead of returning `None`.
 *
 * @param s the string to convert
 * @param timeZoneId the zone passed through to the two-argument overload
 * @param ansiEnabled whether an empty result should be reported as an exception
 * @return the result of the two-argument overload; `None` is only returned when
 *         `ansiEnabled` is false
 * @throws DateTimeException if `ansiEnabled` is true and the conversion produced no value
 */
def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId, ansiEnabled: Boolean): Option[Long] = {
  stringToTimestamp(s, timeZoneId) match {
    case None if ansiEnabled =>
      throw new DateTimeException(s"Cannot cast $s to TimestampType.")
    case parsed => parsed
  }
}

/**
* Gets the number of microseconds since the epoch of 1970-01-01 00:00:00Z from the given
* instance of `java.time.Instant`. The epoch microsecond count is a simple incrementing count of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
package org.apache.spark.sql.catalyst.expressions

import java.sql.{Date, Timestamp}
import java.text.SimpleDateFormat
import java.time.{Instant, LocalDate, ZoneId}
import java.text.{ParseException, SimpleDateFormat}
import java.time.{DateTimeException, Instant, LocalDate, ZoneId}
import java.time.format.DateTimeParseException
import java.util.{Calendar, Locale, TimeZone}
import java.util.concurrent.TimeUnit._

Expand Down Expand Up @@ -1286,4 +1287,47 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
testIntegralFunc(Long.MaxValue)
testIntegralFunc(Long.MinValue)
}

test("SPARK-33498: GetTimestamp,UnixTimestamp,ToUnixTimestamp with parseError") {
  // Both inputs are expected to fail parsing: the first has a 'T' where the pattern has
  // a space, and the second is not a datetime at all.
  val pattern = "yyyy-MM-dd HH:mm:ss.SSS"
  val mismatched = "2020-01-27T20:06:11.847"
  val garbage = "Unparseable"

  for {
    ansiEnabled <- Seq(true, false)
    policy <- Seq("LEGACY", "CORRECTED", "EXCEPTION")
  } {
    withSQLConf(
      SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy,
      SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {

      // Constructed inside withSQLConf so the expressions are created while the
      // configuration under test is active.
      val expressions = Seq[Expression](
        GetTimestamp(Literal(mismatched), Literal(pattern)),
        GetTimestamp(Literal(garbage), Literal(pattern)),
        UnixTimestamp(Literal(mismatched), Literal(pattern)),
        UnixTimestamp(Literal(garbage), Literal(pattern)),
        ToUnixTimestamp(Literal(mismatched), Literal(pattern)),
        ToUnixTimestamp(Literal(garbage), Literal(pattern))
      )

      (ansiEnabled, policy) match {
        case (false, _) =>
          // ANSI off: a parse failure evaluates to null.
          expressions.foreach(checkEvaluation(_, null))
        case (true, "LEGACY") =>
          // ANSI on with the legacy parser: a ParseException is expected.
          expressions.foreach(checkExceptionInExpression[ParseException](_, "Unparseable"))
        case _ =>
          // ANSI on with the other policies: a DateTimeParseException is expected.
          expressions.foreach(
            checkExceptionInExpression[DateTimeParseException](_, "could not be parsed"))
      }
    }
  }
}

test("SPARK-33498: TimestampType cast with parseError") {
  val input = "Unparseable"
  for (ansiEnabled <- Seq(true, false)) {
    withSQLConf(SQLConf.ANSI_ENABLED.key -> ansiEnabled.toString) {
      // Constructed inside withSQLConf so the cast is created under the ANSI setting
      // being tested.
      val castExpr = Cast(Literal(input), TimestampType, UTC_OPT)
      if (ansiEnabled) {
        // ANSI on: the failed cast must raise instead of yielding null.
        checkExceptionInExpression[DateTimeException](
          castExpr, s"Cannot cast $input to TimestampType.")
      } else {
        // ANSI off: the failed cast evaluates to null.
        checkEvaluation(castExpr, null)
      }
    }
  }
}
}
11 changes: 11 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/datetime.sql
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,14 @@ select from_json('{"t":"26/October/2015"}', 't Timestamp', map('timestampFormat'
select from_json('{"d":"26/October/2015"}', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy'));
select from_csv('26/October/2015', 't Timestamp', map('timestampFormat', 'dd/MMMMM/yyyy'));
select from_csv('26/October/2015', 'd Date', map('dateFormat', 'dd/MMMMM/yyyy'));

-- Timestamp type parse error
-- Each function is exercised with two inputs that are expected to fail parsing:
-- a 'T'-separated value that does not match the space-separated pattern, and
-- text that is not a datetime at all. The final query covers a plain string-to-timestamp cast.
select to_date("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select to_date("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select to_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select to_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select to_unix_timestamp("2020-01-27T20:06:11.847", "yyyy-MM-dd HH:mm:ss.SSS");
select to_unix_timestamp("Unparseable", "yyyy-MM-dd HH:mm:ss.SSS");
select cast("Unparseable" as timestamp);
Loading

0 comments on commit e6f5634

Please sign in to comment.