[SPARK-49909][SQL] Fix the pretty name of some expressions #48385

Closed
wants to merge 2 commits
80 changes: 40 additions & 40 deletions python/pyspark/sql/functions/builtin.py
@@ -4921,44 +4921,44 @@ def array_agg(col: "ColumnOrName") -> Column:
>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([[1],[1],[2]], ["c"])
>>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
-+---------------------------------+
-|sort_array(collect_list(c), true)|
-+---------------------------------+
-|                        [1, 1, 2]|
-+---------------------------------+
++------------------------------+
+|sort_array(array_agg(c), true)|
++------------------------------+
+|                     [1, 1, 2]|
++------------------------------+

Example 2: Using array_agg function on a string column

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([["apple"],["apple"],["banana"]], ["c"])
>>> df.agg(sf.sort_array(sf.array_agg('c'))).show(truncate=False)
-+---------------------------------+
-|sort_array(collect_list(c), true)|
-+---------------------------------+
-|[apple, apple, banana]           |
-+---------------------------------+
++------------------------------+
+|sort_array(array_agg(c), true)|
++------------------------------+
+|[apple, apple, banana]        |
++------------------------------+

Example 3: Using array_agg function on a column with null values

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([[1],[None],[2]], ["c"])
>>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
-+---------------------------------+
-|sort_array(collect_list(c), true)|
-+---------------------------------+
-|                           [1, 2]|
-+---------------------------------+
++------------------------------+
+|sort_array(array_agg(c), true)|
++------------------------------+
+|                        [1, 2]|
++------------------------------+

Example 4: Using array_agg function on a column with different data types

>>> from pyspark.sql import functions as sf
>>> df = spark.createDataFrame([[1],["apple"],[2]], ["c"])
>>> df.agg(sf.sort_array(sf.array_agg('c'))).show()
-+---------------------------------+
-|sort_array(collect_list(c), true)|
-+---------------------------------+
-|                    [1, 2, apple]|
-+---------------------------------+
++------------------------------+
+|sort_array(array_agg(c), true)|
++------------------------------+
+|                 [1, 2, apple]|
++------------------------------+
"""
return _invoke_function_over_columns("array_agg", col)

@@ -8712,31 +8712,31 @@ def dateadd(start: "ColumnOrName", days: Union["ColumnOrName", int]) -> Column:
>>> spark.createDataFrame(
... [('2015-04-08', 2,)], ['dt', 'add']
... ).select(sf.dateadd("dt", 1)).show()
-+---------------+
-|date_add(dt, 1)|
-+---------------+
-|     2015-04-09|
-+---------------+
++--------------+
+|dateadd(dt, 1)|
++--------------+
+|    2015-04-09|
++--------------+

>>> import pyspark.sql.functions as sf
>>> spark.createDataFrame(
... [('2015-04-08', 2,)], ['dt', 'add']
... ).select(sf.dateadd("dt", sf.lit(2))).show()
-+---------------+
-|date_add(dt, 2)|
-+---------------+
-|     2015-04-10|
-+---------------+
++--------------+
+|dateadd(dt, 2)|
++--------------+
+|    2015-04-10|
++--------------+

>>> import pyspark.sql.functions as sf
>>> spark.createDataFrame(
... [('2015-04-08', 2,)], ['dt', 'add']
... ).select(sf.dateadd("dt", -1)).show()
-+----------------+
-|date_add(dt, -1)|
-+----------------+
-|      2015-04-07|
-+----------------+
++---------------+
+|dateadd(dt, -1)|
++---------------+
+|     2015-04-07|
++---------------+
"""
days = _enum_to_value(days)
days = lit(days) if isinstance(days, int) else days
@@ -10343,11 +10343,11 @@ def current_database() -> Column:
Examples
--------
>>> spark.range(1).select(current_database()).show()
-+----------------+
-|current_schema()|
-+----------------+
-|         default|
-+----------------+
++------------------+
+|current_database()|
++------------------+
+|           default|
++------------------+
"""
return _invoke_function("current_database")

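The docstring updates above show the user-visible effect of this PR: the analyzer now labels result columns with the function name the caller actually used, rather than the canonical name of the underlying expression. A minimal way to observe this from the Scala side (a sketch: assumes a local SparkSession running a build with this patch applied; the view name `t` is purely illustrative):

import org.apache.spark.sql.SparkSession

object AliasNameDemo extends App {
  val spark = SparkSession.builder().master("local[1]").appName("alias-demo").getOrCreate()
  import spark.implicits._

  Seq(1, 1, 2).toDF("c").createOrReplaceTempView("t")

  // Before this patch the result column was named collect_list(c) even though
  // the query said array_agg; with the patch the alias is preserved.
  spark.sql("SELECT array_agg(c) FROM t").show()
  // +------------+
  // |array_agg(c)|
  // +------------+
  // |   [1, 1, 2]|   (element order is not guaranteed)
  // +------------+

  spark.stop()
}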
--- next file ---
@@ -21,7 +21,7 @@ import scala.collection.mutable
import scala.collection.mutable.Growable

import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.trees.UnaryLike
@@ -118,7 +118,8 @@ case class CollectList(

override def createAggregationBuffer(): mutable.ArrayBuffer[Any] = mutable.ArrayBuffer.empty

-override def prettyName: String = "collect_list"
+override def prettyName: String =
+  getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("collect_list")

override def eval(buffer: mutable.ArrayBuffer[Any]): Any = {
new GenericArrayData(buffer.toArray)
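The change above is the pattern the rest of this PR repeats: `FunctionRegistry` records the name a function was looked up under in the `FUNC_ALIAS` tree-node tag, and `prettyName` now reads that tag before falling back to the canonical name. A self-contained sketch of the mechanism (the `TreeNodeTag`/`TaggedNode`/`MyCollectList` scaffolding below is simplified stand-in code, not Spark's actual TreeNode machinery):

import scala.collection.mutable

// Stand-in for org.apache.spark.sql.catalyst.trees.TreeNodeTag.
case class TreeNodeTag[T](name: String)

trait TaggedNode {
  private val tags = mutable.Map.empty[TreeNodeTag[_], Any]
  def setTagValue[T](tag: TreeNodeTag[T], value: T): Unit = tags(tag) = value
  def getTagValue[T](tag: TreeNodeTag[T]): Option[T] =
    tags.get(tag).map(_.asInstanceOf[T])
}

// Stand-in for FunctionRegistry.FUNC_ALIAS.
object Registry {
  val FUNC_ALIAS: TreeNodeTag[String] = TreeNodeTag[String]("functionAliasName")
}

case class MyCollectList() extends TaggedNode {
  // The pattern from this PR: report the alias the user called, if any.
  def prettyName: String =
    getTagValue(Registry.FUNC_ALIAS).getOrElse("collect_list")
}

object FuncAliasDemo extends App {
  val direct = MyCollectList()
  println(direct.prettyName) // collect_list

  val viaAlias = MyCollectList()
  // This mimics what the registry does when the function is invoked as array_agg.
  viaAlias.setTagValue(Registry.FUNC_ALIAS, "array_agg")
  println(viaAlias.prettyName) // array_agg
}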
--- next file ---
@@ -150,7 +150,8 @@ case class CurrentDate(timeZoneId: Option[String] = None)
override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
copy(timeZoneId = Option(timeZoneId))

-override def prettyName: String = "current_date"
+override def prettyName: String =
+  getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("current_date")
}

// scalastyle:off line.size.limit
@@ -329,7 +330,7 @@ case class DateAdd(startDate: Expression, days: Expression)
})
}

-override def prettyName: String = "date_add"
+override def prettyName: String = getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("date_add")

override protected def withNewChildrenInternal(
newLeft: Expression, newRight: Expression): DateAdd = copy(startDate = newLeft, days = newRight)
--- next file ---
@@ -202,7 +202,8 @@ object AssertTrue {
case class CurrentDatabase() extends LeafExpression with Unevaluable {
override def dataType: DataType = SQLConf.get.defaultStringType
override def nullable: Boolean = false
-override def prettyName: String = "current_schema"
+override def prettyName: String =
+  getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("current_database")
final override val nodePatterns: Seq[TreePattern] = Seq(CURRENT_LIKE)
}

--- next file ---
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions

import java.util.Locale

-import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, TypeCheckResult}
+import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions.Cast._
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
@@ -307,7 +307,10 @@ case class ToCharacter(left: Expression, right: Expression)
inputTypeCheck
}
}
-override def prettyName: String = "to_char"
+
+override def prettyName: String =
+  getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("to_char")
+
override def nullSafeEval(decimal: Any, format: Any): Any = {
val input = decimal.asInstanceOf[Decimal]
numberFormatter.format(input)
--- next file ---
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, UnresolvedSeed}
+import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, TypeCheckResult, UnresolvedSeed}
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions.ExpectsInputTypes.{ordinalNumber, toSQLExpr, toSQLType}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
@@ -128,8 +128,12 @@ case class Rand(child: Expression, hideSeed: Boolean = false) extends Nondetermi
}

override def flatArguments: Iterator[Any] = Iterator(child)
+
+override def prettyName: String =
+  getTagValue(FunctionRegistry.FUNC_ALIAS).getOrElse("rand")
+
override def sql: String = {
-  s"rand(${if (hideSeed) "" else child.sql})"
+  s"$prettyName(${if (hideSeed) "" else child.sql})"
}

override protected def withNewChildInternal(newChild: Expression): Rand = copy(child = newChild)
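Rand also gets its `sql` rendering fixed: it previously hard-coded `rand(...)`, so a query written as `random(1)` would render back as `rand(1)`. Interpolating `$prettyName` keeps the generated SQL in sync with whichever alias was used. A reduced sketch of that behavior (a stand-in class, not the real `Rand`):

case class RandSketch(seedSql: String, hideSeed: Boolean, alias: Option[String]) {
  def prettyName: String = alias.getOrElse("rand")
  // Mirrors the patched sql method: the rendered name now tracks prettyName.
  def sql: String = s"$prettyName(${if (hideSeed) "" else seedSql})"
}

object RandSqlDemo extends App {
  println(RandSketch("1", hideSeed = false, alias = None).sql)           // rand(1)
  println(RandSketch("1", hideSeed = false, alias = Some("random")).sql) // random(1)
  println(RandSketch("1", hideSeed = true, alias = Some("random")).sql)  // random()
}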
--- next file ---
@@ -1,2 +1,2 @@
-Aggregate [collect_list(a#0, 0, 0) AS collect_list(a)#0]
+Aggregate [array_agg(a#0, 0, 0) AS array_agg(a)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
--- next file ---
@@ -1,2 +1,2 @@
-Project [current_date(Some(America/Los_Angeles)) AS current_date()#0]
+Project [curdate(Some(America/Los_Angeles)) AS curdate()#0]
 +- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
--- next file ---
@@ -1,2 +1,2 @@
-Project [current_schema() AS current_schema()#0]
+Project [current_database() AS current_database()#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
--- next file ---
@@ -1,2 +1,2 @@
-Project [date_add(d#0, 2) AS date_add(d, 2)#0]
+Project [dateadd(d#0, 2) AS dateadd(d, 2)#0]
 +- LocalRelation <empty>, [d#0, t#0, s#0, x#0L, wt#0]
--- next file ---
@@ -1,2 +1,2 @@
-Project [random(1) AS rand(1)#0]
+Project [random(1) AS random(1)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
--- next file ---
@@ -1,2 +1,2 @@
-Project [to_char(cast(b#0 as decimal(30,15)), $99.99) AS to_char(b, $99.99)#0]
+Project [to_varchar(cast(b#0 as decimal(30,15)), $99.99) AS to_varchar(b, $99.99)#0]
 +- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
--- next file ---
@@ -99,9 +99,9 @@
| org.apache.spark.sql.catalyst.expressions.Csc | csc | SELECT csc(1) | struct<CSC(1):double> |
| org.apache.spark.sql.catalyst.expressions.CsvToStructs | from_csv | SELECT from_csv('1, 0.8', 'a INT, b DOUBLE') | struct<from_csv(1, 0.8):struct<a:int,b:double>> |
| org.apache.spark.sql.catalyst.expressions.CumeDist | cume_dist | SELECT a, b, cume_dist() OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct<a:string,b:int,cume_dist() OVER (PARTITION BY a ORDER BY b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):double> |
-| org.apache.spark.sql.catalyst.expressions.CurDateExpressionBuilder | curdate | SELECT curdate() | struct<current_date():date> |
+| org.apache.spark.sql.catalyst.expressions.CurDateExpressionBuilder | curdate | SELECT curdate() | struct<curdate():date> |
| org.apache.spark.sql.catalyst.expressions.CurrentCatalog | current_catalog | SELECT current_catalog() | struct<current_catalog():string> |
-| org.apache.spark.sql.catalyst.expressions.CurrentDatabase | current_database | SELECT current_database() | struct<current_schema():string> |
+| org.apache.spark.sql.catalyst.expressions.CurrentDatabase | current_database | SELECT current_database() | struct<current_database():string> |
| org.apache.spark.sql.catalyst.expressions.CurrentDatabase | current_schema | SELECT current_schema() | struct<current_schema():string> |
| org.apache.spark.sql.catalyst.expressions.CurrentDate | current_date | SELECT current_date() | struct<current_date():date> |
| org.apache.spark.sql.catalyst.expressions.CurrentTimeZone | current_timezone | SELECT current_timezone() | struct<current_timezone():string> |
@@ -110,7 +110,7 @@
| org.apache.spark.sql.catalyst.expressions.CurrentUser | session_user | SELECT session_user() | struct<session_user():string> |
| org.apache.spark.sql.catalyst.expressions.CurrentUser | user | SELECT user() | struct<user():string> |
| org.apache.spark.sql.catalyst.expressions.DateAdd | date_add | SELECT date_add('2016-07-30', 1) | struct<date_add(2016-07-30, 1):date> |
-| org.apache.spark.sql.catalyst.expressions.DateAdd | dateadd | SELECT dateadd('2016-07-30', 1) | struct<date_add(2016-07-30, 1):date> |
+| org.apache.spark.sql.catalyst.expressions.DateAdd | dateadd | SELECT dateadd('2016-07-30', 1) | struct<dateadd(2016-07-30, 1):date> |
| org.apache.spark.sql.catalyst.expressions.DateDiff | date_diff | SELECT date_diff('2009-07-31', '2009-07-30') | struct<date_diff(2009-07-31, 2009-07-30):int> |
| org.apache.spark.sql.catalyst.expressions.DateDiff | datediff | SELECT datediff('2009-07-31', '2009-07-30') | struct<datediff(2009-07-31, 2009-07-30):int> |
| org.apache.spark.sql.catalyst.expressions.DateFormatClass | date_format | SELECT date_format('2016-04-08', 'y') | struct<date_format(2016-04-08, y):string> |
@@ -264,7 +264,7 @@
| org.apache.spark.sql.catalyst.expressions.RPadExpressionBuilder | rpad | SELECT rpad('hi', 5, '??') | struct<rpad(hi, 5, ??):string> |
| org.apache.spark.sql.catalyst.expressions.RaiseErrorExpressionBuilder | raise_error | SELECT raise_error('custom error message') | struct<raise_error(USER_RAISED_EXCEPTION, map(errorMessage, custom error message)):void> |
| org.apache.spark.sql.catalyst.expressions.Rand | rand | SELECT rand() | struct<rand():double> |
-| org.apache.spark.sql.catalyst.expressions.Rand | random | SELECT random() | struct<rand():double> |
+| org.apache.spark.sql.catalyst.expressions.Rand | random | SELECT random() | struct<random():double> |
| org.apache.spark.sql.catalyst.expressions.RandStr | randstr | SELECT randstr(3, 0) AS result | struct<result:string> |
| org.apache.spark.sql.catalyst.expressions.Randn | randn | SELECT randn() | struct<randn():double> |
| org.apache.spark.sql.catalyst.expressions.Rank | rank | SELECT a, b, rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct<a:string,b:int,RANK() OVER (PARTITION BY a ORDER BY b ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):int> |
@@ -340,7 +340,7 @@
| org.apache.spark.sql.catalyst.expressions.TimeWindow | window | SELECT a, window.start, window.end, count(*) as cnt FROM VALUES ('A1', '2021-01-01 00:00:00'), ('A1', '2021-01-01 00:04:30'), ('A1', '2021-01-01 00:06:00'), ('A2', '2021-01-01 00:01:00') AS tab(a, b) GROUP by a, window(b, '5 minutes') ORDER BY a, start | struct<a:string,start:timestamp,end:timestamp,cnt:bigint> |
| org.apache.spark.sql.catalyst.expressions.ToBinary | to_binary | SELECT to_binary('abc', 'utf-8') | struct<to_binary(abc, utf-8):binary> |
| org.apache.spark.sql.catalyst.expressions.ToCharacterBuilder | to_char | SELECT to_char(454, '999') | struct<to_char(454, 999):string> |
-| org.apache.spark.sql.catalyst.expressions.ToCharacterBuilder | to_varchar | SELECT to_varchar(454, '999') | struct<to_char(454, 999):string> |
+| org.apache.spark.sql.catalyst.expressions.ToCharacterBuilder | to_varchar | SELECT to_varchar(454, '999') | struct<to_varchar(454, 999):string> |
| org.apache.spark.sql.catalyst.expressions.ToDegrees | degrees | SELECT degrees(3.141592653589793) | struct<DEGREES(3.141592653589793):double> |
| org.apache.spark.sql.catalyst.expressions.ToNumber | to_number | SELECT to_number('454', '999') | struct<to_number(454, 999):decimal(3,0)> |
| org.apache.spark.sql.catalyst.expressions.ToRadians | radians | SELECT radians(180) | struct<RADIANS(180):double> |
@@ -402,7 +402,7 @@
| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | any | SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col) | struct<any(col):boolean> |
| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | bool_or | SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col) | struct<bool_or(col):boolean> |
| org.apache.spark.sql.catalyst.expressions.aggregate.BoolOr | some | SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col) | struct<some(col):boolean> |
-| org.apache.spark.sql.catalyst.expressions.aggregate.CollectList | array_agg | SELECT array_agg(col) FROM VALUES (1), (2), (1) AS tab(col) | struct<collect_list(col):array<int>> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.CollectList | array_agg | SELECT array_agg(col) FROM VALUES (1), (2), (1) AS tab(col) | struct<array_agg(col):array<int>> |
| org.apache.spark.sql.catalyst.expressions.aggregate.CollectList | collect_list | SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col) | struct<collect_list(col):array<int>> |
| org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet | collect_set | SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col) | struct<collect_set(col):array<int>> |
| org.apache.spark.sql.catalyst.expressions.aggregate.Corr | corr | SELECT corr(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2) | struct<corr(c1, c2):double> |
--- next file ---
@@ -722,19 +722,19 @@ Project [chr(cast(167 as bigint)) AS chr(167)#x, chr(cast(247 as bigint)) AS chr
-- !query
SELECT to_varchar(78.12, '$99.99')
-- !query analysis
-Project [to_char(78.12, $99.99) AS to_char(78.12, $99.99)#x]
+Project [to_varchar(78.12, $99.99) AS to_varchar(78.12, $99.99)#x]
 +- OneRowRelation


-- !query
SELECT to_varchar(111.11, '99.9')
-- !query analysis
-Project [to_char(111.11, 99.9) AS to_char(111.11, 99.9)#x]
+Project [to_varchar(111.11, 99.9) AS to_varchar(111.11, 99.9)#x]
 +- OneRowRelation


-- !query
SELECT to_varchar(12454.8, '99,999.9S')
-- !query analysis
-Project [to_char(12454.8, 99,999.9S) AS to_char(12454.8, 99,999.9S)#x]
+Project [to_varchar(12454.8, 99,999.9S) AS to_varchar(12454.8, 99,999.9S)#x]
 +- OneRowRelation
--- next file ---
@@ -2,5 +2,5 @@
-- !query
select current_database(), current_schema(), current_catalog()
-- !query analysis
-Project [current_schema() AS current_schema()#x, current_schema() AS current_schema()#x, current_catalog() AS current_catalog()#x]
+Project [current_database() AS current_database()#x, current_schema() AS current_schema()#x, current_catalog() AS current_catalog()#x]
 +- OneRowRelation
--- next file ---
@@ -1133,7 +1133,7 @@ SELECT
FROM VALUES
(1), (2), (1) AS tab(col)
-- !query analysis
-Aggregate [collect_list(col#x, 0, 0) AS collect_list(col)#x, collect_list(col#x, 0, 0) AS collect_list(col)#x]
+Aggregate [collect_list(col#x, 0, 0) AS collect_list(col)#x, array_agg(col#x, 0, 0) AS array_agg(col)#x]
 +- SubqueryAlias tab
    +- LocalRelation [col#x]

@@ -1147,7 +1147,7 @@ FROM VALUES
(1,4),(2,3),(1,4),(2,4) AS v(a,b)
GROUP BY a
-- !query analysis
-Aggregate [a#x], [a#x, collect_list(b#x, 0, 0) AS collect_list(b)#x, collect_list(b#x, 0, 0) AS collect_list(b)#x]
+Aggregate [a#x], [a#x, collect_list(b#x, 0, 0) AS collect_list(b)#x, array_agg(b#x, 0, 0) AS array_agg(b)#x]
 +- SubqueryAlias v
    +- LocalRelation [a#x, b#x]

--- next file ---
@@ -776,7 +776,7 @@ Project [NULL AS Expected#x, variablereference(system.session.var1=CAST(NULL AS
-- !query
DECLARE OR REPLACE VARIABLE var1 STRING DEFAULT CURRENT_DATABASE()
-- !query analysis
-CreateVariable defaultvalueexpression(cast(current_schema() as string), CURRENT_DATABASE()), true
+CreateVariable defaultvalueexpression(cast(current_database() as string), CURRENT_DATABASE()), true
 +- ResolvedIdentifier org.apache.spark.sql.catalyst.analysis.FakeSystemCatalog$@xxxxxxxx, session.var1


--- next file ---
@@ -1235,22 +1235,22 @@ struct<chr(167):string,chr(247):string,chr(215):string>
-- !query
SELECT to_varchar(78.12, '$99.99')
-- !query schema
-struct<to_char(78.12, $99.99):string>
+struct<to_varchar(78.12, $99.99):string>
-- !query output
$78.12


-- !query
SELECT to_varchar(111.11, '99.9')
-- !query schema
-struct<to_char(111.11, 99.9):string>
+struct<to_varchar(111.11, 99.9):string>
-- !query output
##.#


-- !query
SELECT to_varchar(12454.8, '99,999.9S')
-- !query schema
-struct<to_char(12454.8, 99,999.9S):string>
+struct<to_varchar(12454.8, 99,999.9S):string>
-- !query output
12,454.8+
--- next file ---
@@ -2,6 +2,6 @@
-- !query
select current_database(), current_schema(), current_catalog()
-- !query schema
-struct<current_schema():string,current_schema():string,current_catalog():string>
+struct<current_database():string,current_schema():string,current_catalog():string>
-- !query output
default default spark_catalog