Skip to content

Commit

Permalink
[SPARK-38590][SQL] New SQL function: try_to_binary
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

Add a new SQL function: `try_to_binary`. It is identical to the function `to_binary`, except that it returns NULL results instead of throwing an exception on encoding errors.
There is a similar function in Snowflake: https://docs.snowflake.com/en/sql-reference/functions/try_to_binary.html

### Why are the changes needed?

Users can manage to finish queries without interruptions by encoding errors.

### Does this PR introduce _any_ user-facing change?

Yes, adding a new SQL function: `try_to_binary`. It is identical to the function `to_binary`, except that it returns NULL results instead of throwing an exception on encoding errors.

### How was this patch tested?

UT

Closes #35897 from gengliangwang/try_to_binary.

Authored-by: Gengliang Wang <gengliang@apache.org>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
gengliangwang authored and MaxGekk committed Apr 7, 2022
1 parent b57c93b commit becda33
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ object FunctionRegistry {
expression[TryMultiply]("try_multiply"),
expression[TryElementAt]("try_element_at"),
expression[TrySum]("try_sum"),
expression[TryToBinary]("try_to_binary"),

// aggregate functions
expression[HyperLogLogPlusPlus]("approx_count_distinct"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,38 @@ case class TryMultiply(left: Expression, right: Expression, replacement: Express
override protected def withNewChildInternal(newChild: Expression): Expression =
this.copy(replacement = newChild)
}

// scalastyle:off line.size.limit
@ExpressionDescription(
usage = "_FUNC_(str[, fmt]) - This is a special version of `to_binary` that performs the same operation, but returns a NULL value instead of raising an error if the conversion cannot be performed.",
examples = """
Examples:
> SELECT _FUNC_('abc', 'utf-8');
abc
> select _FUNC_('a!', 'base64');
NULL
> select _FUNC_('abc', 'invalidFormat');
NULL
""",
since = "3.3.0",
group = "string_funcs")
// scalastyle:on line.size.limit
case class TryToBinary(
expr: Expression,
format: Option[Expression],
replacement: Expression) extends RuntimeReplaceable
with InheritAnalysisRules {
def this(expr: Expression) =
this(expr, None, TryEval(ToBinary(expr, None, nullOnInvalidFormat = true)))

def this(expr: Expression, formatExpression: Expression) =
this(expr, Some(formatExpression),
TryEval(ToBinary(expr, Some(formatExpression), nullOnInvalidFormat = true)))

override def prettyName: String = "try_to_binary"

override def parameters: Seq[Expression] = expr +: format.toSeq

override protected def withNewChildInternal(newChild: Expression): Expression =
this.copy(replacement = newChild)
}
Original file line number Diff line number Diff line change
Expand Up @@ -2638,7 +2638,10 @@ case class Encode(value: Expression, charset: Expression)
since = "3.3.0",
group = "string_funcs")
// scalastyle:on line.size.limit
case class ToBinary(expr: Expression, format: Option[Expression]) extends RuntimeReplaceable
case class ToBinary(
expr: Expression,
format: Option[Expression],
nullOnInvalidFormat: Boolean = false) extends RuntimeReplaceable
with ImplicitCastInputTypes {

override lazy val replacement: Expression = format.map { f =>
Expand All @@ -2651,6 +2654,7 @@ case class ToBinary(expr: Expression, format: Option[Expression]) extends Runtim
case "hex" => Unhex(expr)
case "utf-8" => Encode(expr, Literal("UTF-8"))
case "base64" => UnBase64(expr)
case _ if nullOnInvalidFormat => Literal(null, BinaryType)
case other => throw QueryCompilationErrors.invalidStringLiteralParameter(
"to_binary", "format", other,
Some("The value has to be a case-insensitive string literal of " +
Expand All @@ -2659,16 +2663,18 @@ case class ToBinary(expr: Expression, format: Option[Expression]) extends Runtim
}
}.getOrElse(Unhex(expr))

def this(expr: Expression) = this(expr, None)
def this(expr: Expression) = this(expr, None, false)

def this(expr: Expression, format: Expression) = this(expr, Some({
// We perform this check in the constructor to make it eager and not go through type coercion.
if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) {
format
} else {
throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string")
}
}))
// We perform this check in the constructor to make it eager and not go through type coercion.
if (format.foldable && (format.dataType == StringType || format.dataType == NullType)) {
format
} else {
throw QueryCompilationErrors.requireLiteralParameter("to_binary", "format", "string")
}
}),
false
)

override def prettyName: String = "to_binary"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<!-- Automatically generated by ExpressionsSchemaSuite -->
## Summary
- Number of queries: 385
- Number of queries: 386
- Number of expressions that missing example: 12
- Expressions missing examples: bigint,binary,boolean,date,decimal,double,float,int,smallint,string,timestamp,tinyint
## Schema of Built-in Functions
Expand Down Expand Up @@ -316,6 +316,7 @@
| org.apache.spark.sql.catalyst.expressions.TryElementAt | try_element_at | SELECT try_element_at(array(1, 2, 3), 2) | struct<try_element_at(array(1, 2, 3), 2):int> |
| org.apache.spark.sql.catalyst.expressions.TryMultiply | try_multiply | SELECT try_multiply(2, 3) | struct<try_multiply(2, 3):int> |
| org.apache.spark.sql.catalyst.expressions.TrySubtract | try_subtract | SELECT try_subtract(2, 1) | struct<try_subtract(2, 1):int> |
| org.apache.spark.sql.catalyst.expressions.TryToBinary | try_to_binary | SELECT try_to_binary('abc', 'utf-8') | struct<try_to_binary(abc, utf-8):binary> |
| org.apache.spark.sql.catalyst.expressions.TypeOf | typeof | SELECT typeof(1) | struct<typeof(1):string> |
| org.apache.spark.sql.catalyst.expressions.UnBase64 | unbase64 | SELECT unbase64('U3BhcmsgU1FM') | struct<unbase64(U3BhcmsgU1FM):binary> |
| org.apache.spark.sql.catalyst.expressions.UnaryMinus | negative | SELECT negative(1) | struct<negative(1):int> |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,5 +184,7 @@ select to_binary(null, cast(null as string));
-- 'format' parameter must be string type or void type.
select to_binary(null, cast(null as int));
select to_binary('abc', 1);
-- invalid inputs.
-- invalid format
select to_binary('abc', 'invalidFormat');
-- invalid string input
select to_binary('a!', 'base64');
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- try_to_binary
select try_to_binary('abc');
select try_to_binary('abc', 'utf-8');
select try_to_binary('abc', 'base64');
select try_to_binary('abc', 'hex');
-- 'format' parameter can be any foldable string value, not just literal.
select try_to_binary('abc', concat('utf', '-8'));
-- 'format' parameter is case insensitive.
select try_to_binary('abc', 'Hex');
-- null inputs lead to null result.
select try_to_binary('abc', null);
select try_to_binary(null, 'utf-8');
select try_to_binary(null, null);
select try_to_binary(null, cast(null as string));
-- 'format' parameter must be string type or void type.
select try_to_binary(null, cast(null as int));
select try_to_binary('abc', 1);
-- invalid format
select try_to_binary('abc', 'invalidFormat');
-- invalid string input
select try_to_binary('a!', 'base64');
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 141
-- Number of queries: 142


-- !query
Expand Down Expand Up @@ -1140,3 +1140,12 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Invalid value for the 'format' parameter of function 'to_binary': invalidformat. The value has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'.


-- !query
select to_binary('a!', 'base64')
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
Last unit does not have enough valid bits
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 141
-- Number of queries: 142


-- !query
Expand Down Expand Up @@ -1136,3 +1136,12 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Invalid value for the 'format' parameter of function 'to_binary': invalidformat. The value has to be a case-insensitive string literal of 'hex', 'utf-8', or 'base64'.


-- !query
select to_binary('a!', 'base64')
-- !query schema
struct<>
-- !query output
java.lang.IllegalArgumentException
Last unit does not have enough valid bits
Binary file not shown.

0 comments on commit becda33

Please sign in to comment.