Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[SPARK-24543][SQL] Support any type as DDL string for from_json's schema #21550

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions python/pyspark/sql/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2168,8 +2168,7 @@ def from_json(col, schema, options={}):
[Row(json=Row(a=1))]
>>> df.select(from_json(df.value, "a INT").alias("json")).collect()
[Row(json=Row(a=1))]
>>> schema = MapType(StringType(), IntegerType())
>>> df.select(from_json(df.value, schema).alias("json")).collect()
>>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
[Row(json={u'a': 1})]
>>> data = [(1, '''[{"a": 1}]''')]
>>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.json._
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, BadRecordException, FailFastMode, GenericArrayData, MapData}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
Expand Down Expand Up @@ -747,8 +746,8 @@ case class StructsToJson(

object JsonExprUtils {

def validateSchemaLiteral(exp: Expression): StructType = exp match {
case Literal(s, StringType) => CatalystSqlParser.parseTableSchema(s.toString)
/**
 * Extracts a schema from a foldable string expression, for use as the target
 * type of `from_json`.
 *
 * @param exp expected to be a string literal containing a schema in DDL form,
 *            e.g. "a INT, b STRING" or "MAP&lt;STRING, INT&gt;"
 * @return the parsed [[DataType]] (any type, not just StructType — see DataType.fromDDL)
 * @throws AnalysisException if `exp` is not a string literal
 */
def validateSchemaLiteral(exp: Expression): DataType = exp match {
case Literal(s, StringType) => DataType.fromDDL(s.toString)
// Non-literal schemas cannot be resolved at analysis time, so reject them.
case e => throw new AnalysisException(s"Expected a string literal instead of $e")
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,16 @@ package org.apache.spark.sql.types

import java.util.Locale

import scala.util.control.NonFatal

import org.json4s._
import org.json4s.JsonAST.JValue
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.Utils

Expand Down Expand Up @@ -110,6 +113,14 @@ abstract class DataType extends AbstractDataType {
@InterfaceStability.Stable
object DataType {

/**
 * Parses a DDL-formatted string into a [[DataType]].
 *
 * First attempts the single-data-type syntax (e.g. "MAP&lt;STRING, INT&gt;"); if that
 * fails with a non-fatal error, falls back to the table-schema syntax
 * (e.g. "a INT, b STRING"), which yields a StructType.
 *
 * @param ddl the schema string in DDL format
 * @return the parsed data type
 */
def fromDDL(ddl: String): DataType = {
  try CatalystSqlParser.parseDataType(ddl)
  catch {
    case NonFatal(_) =>
      // Not a bare data type; retry as a comma-separated column list.
      CatalystSqlParser.parseTableSchema(ddl)
  }
}

def fromJson(json: String): DataType = parseDataType(parse(json))

private val nonDecimalNameToType = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3369,7 +3369,7 @@ object functions {
val dataType = try {
DataType.fromJson(schema)
} catch {
case NonFatal(_) => StructType.fromDDL(schema)
case NonFatal(_) => DataType.fromDDL(schema)
}
from_json(e, dataType, options)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,7 @@ CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1,
SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable;
-- Clean up
DROP VIEW IF EXISTS jsonTable;

-- from_json - complex types
select from_json('{"a":1, "b":2}', 'map<string, int>');
select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>');
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 26
-- Number of queries: 28


-- !query 0
Expand Down Expand Up @@ -258,3 +258,19 @@ DROP VIEW IF EXISTS jsonTable
struct<>
-- !query 25 output



-- !query 26
select from_json('{"a":1, "b":2}', 'map<string, int>')
-- !query 26 schema
struct<entries:map<string,int>>
-- !query 26 output
{"a":1,"b":2}


-- !query 27
select from_json('{"a":1, "b":"2"}', 'struct<a:int,b:string>')
-- !query 27 schema
struct<jsontostructs({"a":1, "b":"2"}):struct<a:int,b:string>>
-- !query 27 output
{"a":1,"b":"2"}
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,8 @@ class JsonFunctionsSuite extends QueryTest with SharedSQLContext {

test("SPARK-24027: from_json - map<string, map<string, int>>") {
val in = Seq("""{"a": {"b": 1}}""").toDS()
val schema = MapType(StringType, MapType(StringType, IntegerType))
val out = in.select(from_json($"value", schema))
val schema = "map<string, map<string, int>>"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A general suggestion. Create a new test case for these changes, instead of modifying the existing ones.

val out = in.select(from_json($"value", schema, Map.empty[String, String]))

checkAnswer(out, Row(Map("a" -> Map("b" -> 1))))
}
Expand Down