Skip to content

Commit

Permalink
Implement v2 CreateTableAsSelect.
Browse files Browse the repository at this point in the history
  • Loading branch information
rdblue committed May 9, 2019
1 parent 09422f5 commit b13a8e2
Show file tree
Hide file tree
Showing 14 changed files with 487 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ unsupportedHiveNativeCommands
;

createTableHeader
: CREATE TEMPORARY? EXTERNAL? TABLE (IF NOT EXISTS)? tableIdentifier
: CREATE TEMPORARY? EXTERNAL? TABLE (IF NOT EXISTS)? multipartIdentifier
;

bucketSpec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ import org.apache.spark.sql.types._
*/
trait CheckAnalysis extends PredicateHelper {

import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._

/**
* Override to provide additional checks for correct analysis.
* These rules will be evaluated after our built-in check rules.
Expand Down Expand Up @@ -296,6 +298,21 @@ trait CheckAnalysis extends PredicateHelper {
}
}

case CreateTableAsSelect(_, _, partitioning, query, _, _, _) =>
val references = partitioning.flatMap(_.references).toSet
val badReferences = references.map(_.fieldNames).flatMap { column =>
query.schema.findNestedField(column).map(_.dataType) match {
case Some(_) =>
None
case _ =>
Some(s"${column.quoted} is missing or is in a map or array")
}
}

if (badReferences.nonEmpty) {
failAnalysis(s"Invalid partitioning: ${badReferences.mkString(", ")}")
}

case _ => // Fallbacks to the following checks
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2019,7 +2019,7 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
/**
* Type to keep track of a table header: (identifier, isTemporary, ifNotExists, isExternal).
*/
type TableHeader = (TableIdentifier, Boolean, Boolean, Boolean)
type TableHeader = (Seq[String], Boolean, Boolean, Boolean)

/**
* Validate a create table statement and return the [[TableIdentifier]].
Expand All @@ -2031,7 +2031,8 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
if (temporary && ifNotExists) {
operationNotAllowed("CREATE TEMPORARY TABLE ... IF NOT EXISTS", ctx)
}
(visitTableIdentifier(ctx.tableIdentifier), temporary, ifNotExists, ctx.EXTERNAL != null)
val multipartIdentifier: Seq[String] = ctx.multipartIdentifier.parts.asScala.map(_.getText)
(multipartIdentifier, temporary, ifNotExists, ctx.EXTERNAL != null)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog}
import org.apache.spark.sql.catalog.v2.expressions.Transform
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, NamedRelation}
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable}
Expand Down Expand Up @@ -402,6 +404,35 @@ trait V2WriteCommand extends Command {
}
}

/**
* Create a new table from a select query with a v2 catalog.
*/
/**
 * Create a new table from a select query with a v2 catalog.
 *
 * @param catalog the v2 catalog that will create the table
 * @param tableName the identifier of the table to create within the catalog
 * @param partitioning partitioning transforms; every column they reference must exist in the
 *                     query's output schema
 * @param query the query whose result populates (and defines the schema of) the new table
 * @param properties table properties passed to the catalog on creation
 * @param writeOptions options for the write that populates the table
 * @param ignoreIfExists if true, do not fail when the table already exists (IF NOT EXISTS)
 */
case class CreateTableAsSelect(
    catalog: TableCatalog,
    tableName: Identifier,
    partitioning: Seq[Transform],
    query: LogicalPlan,
    properties: Map[String, String],
    writeOptions: Map[String, String],
    ignoreIfExists: Boolean) extends Command {

  override def children: Seq[LogicalPlan] = Seq(query)

  override lazy val resolved: Boolean = {
    // The table schema is created from the query schema, so the only additional resolution
    // needed is to check that the columns referenced by the table's partitioning exist in the
    // query schema. Guard with childrenResolved first: query.schema is only well-defined once
    // the query itself is resolved.
    childrenResolved && {
      val references = partitioning.flatMap(_.references).toSet
      references.forall { ref =>
        // findNestedField does not look inside maps or arrays, so references into those
        // (which are unsupported for partitioning) are correctly reported as unresolved.
        query.schema.findNestedField(ref.fieldNames).isDefined
      }
    }
  }
}

/**
* Append data to an existing table.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.spark.sql.catalyst.plans.logical.sql

import org.apache.spark.sql.catalog.v2.expressions.Transform
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
Expand All @@ -30,7 +29,7 @@ import org.apache.spark.sql.types.StructType
* This is a metadata-only command and is not used to write data to the created table.
*/
case class CreateTableStatement(
table: TableIdentifier,
tableName: Seq[String],
tableSchema: StructType,
partitioning: Seq[Transform],
bucketSpec: Option[BucketSpec],
Expand All @@ -50,7 +49,7 @@ case class CreateTableStatement(
* A CREATE TABLE AS SELECT command, as parsed from SQL.
*/
case class CreateTableAsSelectStatement(
table: TableIdentifier,
tableName: Seq[String],
asSelect: LogicalPlan,
partitioning: Seq[Transform],
bucketSpec: Option[BucketSpec],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,29 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
nameToIndex.get(name)
}

/**
* Returns a field in this struct and its child structs.
*
* This does not support finding fields nested in maps or arrays.
*/
/**
 * Looks up a field in this struct by a path of field names, descending through nested structs.
 *
 * Only struct nesting is traversed; fields nested inside maps or arrays cannot be found.
 *
 * @param fieldNames the path to the field, outermost name first
 * @return the matching field, or None if the path is empty, a name is missing, or the path
 *         crosses a non-struct type
 */
private[sql] def findNestedField(fieldNames: Seq[String]): Option[StructField] = {
  if (fieldNames.isEmpty) {
    None
  } else {
    val remaining = fieldNames.tail
    nameToField.get(fieldNames.head).flatMap { field =>
      if (remaining.isEmpty) {
        Some(field)
      } else {
        // More path segments to consume: only a struct can be descended into.
        field.dataType match {
          case nested: StructType => nested.findNestedField(remaining)
          case _ => None
        }
      }
    }
  }
}

protected[sql] def toAttributes: Seq[AttributeReference] =
map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)())

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog, TestTableCatalog}
import org.apache.spark.sql.catalog.v2.expressions.{LogicalExpressions, Transform}
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, LeafNode}
import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap

/**
 * Tests that CreateTableAsSelect validates its partitioning transforms against the query schema:
 * references to missing columns (top-level or nested) fail analysis, while references to
 * existing columns — including nested struct fields and whole struct columns — succeed.
 */
class CreateTablePartitioningValidationSuite extends AnalysisTest {
  import CreateTablePartitioningValidationSuite._

  /** Builds a CTAS plan over TestRelation2 partitioned by the given transforms. */
  private def ctas(partitioning: Seq[Transform]): CreateTableAsSelect = {
    CreateTableAsSelect(
      catalog,
      Identifier.of(Array(), "table_name"),
      partitioning,
      TestRelation2,
      Map.empty,
      Map.empty,
      ignoreIfExists = false)
  }

  /** Asserts the plan is unresolved and fails analysis naming each offending column. */
  private def assertInvalidPartitioning(plan: CreateTableAsSelect, columns: String*): Unit = {
    assert(!plan.resolved)
    assertAnalysisError(plan,
      "Invalid partitioning" +: columns.map(col => s"$col is missing or is in a map or array"))
  }

  test("CreateTableAsSelect: fail missing top-level column") {
    assertInvalidPartitioning(
      ctas(LogicalExpressions.bucket(4, "does_not_exist") :: Nil),
      "does_not_exist")
  }

  test("CreateTableAsSelect: fail missing top-level column nested reference") {
    assertInvalidPartitioning(
      ctas(LogicalExpressions.bucket(4, "does_not_exist.z") :: Nil),
      "does_not_exist.z")
  }

  test("CreateTableAsSelect: fail missing nested column") {
    assertInvalidPartitioning(
      ctas(LogicalExpressions.bucket(4, "point.z") :: Nil),
      "point.z")
  }

  test("CreateTableAsSelect: fail with multiple errors") {
    // A single bucket transform may reference several columns; both bad references are reported.
    assertInvalidPartitioning(
      ctas(LogicalExpressions.bucket(4, "does_not_exist", "point.z") :: Nil),
      "point.z", "does_not_exist")
  }

  test("CreateTableAsSelect: success with top-level column") {
    assertAnalysisSuccess(ctas(LogicalExpressions.bucket(4, "id") :: Nil))
  }

  test("CreateTableAsSelect: success using nested column") {
    assertAnalysisSuccess(ctas(LogicalExpressions.bucket(4, "point.x") :: Nil))
  }

  test("CreateTableAsSelect: success using complex column") {
    // Partitioning by a whole struct column is allowed.
    assertAnalysisSuccess(ctas(LogicalExpressions.bucket(4, "point") :: Nil))
  }
}

private object CreateTablePartitioningValidationSuite {
  // Shared v2 catalog instance, initialized once for all test cases in the suite.
  val catalog: TableCatalog = {
    val testCatalog = new TestTableCatalog()
    testCatalog.initialize("test", CaseInsensitiveStringMap.empty())
    testCatalog
  }

  // Source schema: two flat columns plus a nested struct, so tests can reference
  // top-level, nested, and whole-struct columns.
  val schema: StructType = {
    val point = new StructType()
      .add("x", DoubleType)
      .add("y", DoubleType)
    new StructType()
      .add("id", LongType)
      .add("data", StringType)
      .add("point", point)
  }
}

// Leaf relation used as the CTAS source in the suite above; its output attributes are derived
// from the companion object's schema (id, data, and the nested point struct).
private case object TestRelation2 extends LeafNode with NamedRelation {
  override def name: String = "source_relation"
  override def output: Seq[AttributeReference] =
    CreateTablePartitioningValidationSuite.schema.toAttributes
}

Loading

0 comments on commit b13a8e2

Please sign in to comment.