diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 034223e115389..5c79c6905801c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -51,7 +50,7 @@ private[libsvm] class LibSVMOutputWriter( new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 71c3bd31e02e4..e32d30178eeb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -971,7 +971,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { // Storage format val defaultStorage: CatalogStorageFormat = { val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile") - val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType, conf) + val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType) CatalogStorageFormat( locationUri = None, inputFormat = defaultHiveSerde.flatMap(_.inputFormat) @@ -1115,7 +1115,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { override def visitGenericFileFormat( ctx: GenericFileFormatContext): CatalogStorageFormat = withOrigin(ctx) { val source = ctx.identifier.getText - HiveSerDe.sourceToSerDe(source, conf) match { + HiveSerDe.sourceToSerDe(source) match { case Some(s) => CatalogStorageFormat.empty.copy( inputFormat = s.inputFormat, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala index 7b028e72ed0a8..7400a0e7bb1f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala @@ -17,10 +17,6 @@ package org.apache.spark.sql.execution.command -import scala.collection.mutable -import scala.util.control.NonFatal - -import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases @@ -28,7 +24,6 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.QueryPlan import 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation} import org.apache.spark.sql.types._ @@ -97,16 +92,19 @@ case class CreateDataSourceTableCommand( } } - CreateDataSourceTableUtils.createDataSourceTable( - sparkSession = sparkSession, - tableIdent = tableIdent, + val table = CatalogTable( + identifier = tableIdent, + tableType = if (isExternal) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty.copy(properties = optionsWithPath), schema = dataSource.schema, - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - provider = provider, - options = optionsWithPath, - isExternal = isExternal) - + provider = Some(provider), + partitionColumnNames = partitionColumns, + bucketSpec = bucketSpec + ) + + // We will return Nil or throw exception at the beginning if the table already exists, so when + // we reach here, the table should not exist and we should set `ignoreIfExists` to false. + sessionState.catalog.createTable(table, ignoreIfExists = false) Seq.empty[Row] } } @@ -193,7 +191,7 @@ case class CreateDataSourceTableAsSelectCommand( } existingSchema = Some(l.schema) case s: SimpleCatalogRelation if DDLUtils.isDatasourceTable(s.metadata) => - existingSchema = Some(DDLUtils.getSchemaFromTableProperties(s.metadata)) + existingSchema = Some(s.metadata.schema) case o => throw new AnalysisException(s"Saving data in ${o.toString} is not supported.") } @@ -233,15 +231,17 @@ case class CreateDataSourceTableAsSelectCommand( // We will use the schema of resolved.relation as the schema of the table (instead of // the schema of df). It is important since the nullability may be changed by the relation // provider (for example, see org.apache.spark.sql.parquet.DefaultSource). - CreateDataSourceTableUtils.createDataSourceTable( - sparkSession = sparkSession, - tableIdent = tableIdent, - schema = result.schema, - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - provider = provider, - options = optionsWithPath, - isExternal = isExternal) + val schema = result.schema + val table = CatalogTable( + identifier = tableIdent, + tableType = if (isExternal) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty.copy(properties = optionsWithPath), + schema = schema, + provider = Some(provider), + partitionColumnNames = partitionColumns, + bucketSpec = bucketSpec + ) + sessionState.catalog.createTable(table, ignoreIfExists = false) } // Refresh the cache of the table in the catalog. @@ -249,210 +249,3 @@ case class CreateDataSourceTableAsSelectCommand( Seq.empty[Row] } } - - -object CreateDataSourceTableUtils extends Logging { - - val DATASOURCE_PREFIX = "spark.sql.sources." - val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" - val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID" - val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path" - val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" - val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "." 
- val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts" - val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols" - val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols" - val DATASOURCE_SCHEMA_NUMBUCKETS = DATASOURCE_SCHEMA_PREFIX + "numBuckets" - val DATASOURCE_SCHEMA_NUMBUCKETCOLS = DATASOURCE_SCHEMA_PREFIX + "numBucketCols" - val DATASOURCE_SCHEMA_PART_PREFIX = DATASOURCE_SCHEMA_PREFIX + "part." - val DATASOURCE_SCHEMA_PARTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "partCol." - val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol." - val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol." - - def createDataSourceTable( - sparkSession: SparkSession, - tableIdent: TableIdentifier, - schema: StructType, - partitionColumns: Array[String], - bucketSpec: Option[BucketSpec], - provider: String, - options: Map[String, String], - isExternal: Boolean): Unit = { - val tableProperties = new mutable.HashMap[String, String] - tableProperties.put(DATASOURCE_PROVIDER, provider) - - // Serialized JSON schema string may be too long to be stored into a single metastore table - // property. In this case, we split the JSON string and store each part as a separate table - // property. - val threshold = sparkSession.sessionState.conf.schemaStringLengthThreshold - val schemaJsonString = schema.json - // Split the JSON string. - val parts = schemaJsonString.grouped(threshold).toSeq - tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString) - parts.zipWithIndex.foreach { case (part, index) => - tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part) - } - - if (partitionColumns.length > 0) { - tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString) - partitionColumns.zipWithIndex.foreach { case (partCol, index) => - tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol) - } - } - - if (bucketSpec.isDefined) { - val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get - - tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString) - tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETCOLS, bucketColumnNames.length.toString) - bucketColumnNames.zipWithIndex.foreach { case (bucketCol, index) => - tableProperties.put(s"$DATASOURCE_SCHEMA_BUCKETCOL_PREFIX$index", bucketCol) - } - - if (sortColumnNames.nonEmpty) { - tableProperties.put(DATASOURCE_SCHEMA_NUMSORTCOLS, sortColumnNames.length.toString) - sortColumnNames.zipWithIndex.foreach { case (sortCol, index) => - tableProperties.put(s"$DATASOURCE_SCHEMA_SORTCOL_PREFIX$index", sortCol) - } - } - } - - val tableType = if (isExternal) { - tableProperties.put("EXTERNAL", "TRUE") - CatalogTableType.EXTERNAL - } else { - tableProperties.put("EXTERNAL", "FALSE") - CatalogTableType.MANAGED - } - - val maybeSerDe = HiveSerDe.sourceToSerDe(provider, sparkSession.sessionState.conf) - val dataSource = - DataSource( - sparkSession, - userSpecifiedSchema = Some(schema), - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - className = provider, - options = options) - - def newSparkSQLSpecificMetastoreTable(): CatalogTable = { - CatalogTable( - identifier = tableIdent, - tableType = tableType, - schema = new StructType, - provider = Some(provider), - storage = CatalogStorageFormat( - locationUri = None, - inputFormat = None, - outputFormat = None, - serde = None, - compressed = false, - properties = options - ), - properties = tableProperties.toMap) - } - - def 
newHiveCompatibleMetastoreTable( - relation: HadoopFsRelation, - serde: HiveSerDe): CatalogTable = { - assert(partitionColumns.isEmpty) - assert(relation.partitionSchema.isEmpty) - - CatalogTable( - identifier = tableIdent, - tableType = tableType, - storage = CatalogStorageFormat( - locationUri = Some(relation.location.paths.map(_.toUri.toString).head), - inputFormat = serde.inputFormat, - outputFormat = serde.outputFormat, - serde = serde.serde, - compressed = false, - properties = options - ), - schema = relation.schema, - provider = Some(provider), - properties = tableProperties.toMap, - viewText = None) - } - - // TODO: Support persisting partitioned data source relations in Hive compatible format - val qualifiedTableName = tableIdent.quotedString - val skipHiveMetadata = options.getOrElse("skipHiveMetadata", "false").toBoolean - val resolvedRelation = dataSource.resolveRelation(checkPathExist = false) - val (hiveCompatibleTable, logMessage) = (maybeSerDe, resolvedRelation) match { - case _ if skipHiveMetadata => - val message = - s"Persisting partitioned data source relation $qualifiedTableName into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." - (None, message) - - case (Some(serde), relation: HadoopFsRelation) if relation.location.paths.length == 1 && - relation.partitionSchema.isEmpty && relation.bucketSpec.isEmpty => - val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) - val message = - s"Persisting data source relation $qualifiedTableName with a single input path " + - s"into Hive metastore in Hive compatible format. Input path: " + - s"${relation.location.paths.head}." - (Some(hiveTable), message) - - case (Some(serde), relation: HadoopFsRelation) if relation.partitionSchema.nonEmpty => - val message = - s"Persisting partitioned data source relation $qualifiedTableName into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + - "Input path(s): " + relation.location.paths.mkString("\n", "\n", "") - (None, message) - - case (Some(serde), relation: HadoopFsRelation) if relation.bucketSpec.nonEmpty => - val message = - s"Persisting bucketed data source relation $qualifiedTableName into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + - "Input path(s): " + relation.location.paths.mkString("\n", "\n", "") - (None, message) - - case (Some(serde), relation: HadoopFsRelation) => - val message = - s"Persisting data source relation $qualifiedTableName with multiple input paths into " + - "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + - s"Input paths: " + relation.location.paths.mkString("\n", "\n", "") - (None, message) - - case (Some(serde), _) => - val message = - s"Data source relation $qualifiedTableName is not a " + - s"${classOf[HadoopFsRelation].getSimpleName}. Persisting it into Hive metastore " + - "in Spark SQL specific format, which is NOT compatible with Hive." - (None, message) - - case _ => - val message = - s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + - s"Persisting data source relation $qualifiedTableName into Hive metastore in " + - s"Spark SQL specific format, which is NOT compatible with Hive." - (None, message) - } - - (hiveCompatibleTable, logMessage) match { - case (Some(table), message) => - // We first try to save the metadata of the table in a Hive compatible way. 
- // If Hive throws an error, we fall back to save its metadata in the Spark SQL - // specific way. - try { - logInfo(message) - sparkSession.sessionState.catalog.createTable(table, ignoreIfExists = false) - } catch { - case NonFatal(e) => - val warningMessage = - s"Could not persist $qualifiedTableName in a Hive compatible way. Persisting " + - s"it into Hive metastore in Spark SQL specific format." - logWarning(warningMessage, e) - val table = newSparkSQLSpecificMetastoreTable() - sparkSession.sessionState.catalog.createTable(table, ignoreIfExists = false) - } - - case (None, message) => - logWarning(message) - val table = newSparkSQLSpecificMetastoreTable() - sparkSession.sessionState.catalog.createTable(table, ignoreIfExists = false) - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 2eff9337bc14a..3817f919f3a5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -27,10 +27,9 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog} -import org.apache.spark.sql.catalyst.catalog.CatalogTypes._ +import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable, CatalogTablePartition, CatalogTableType, SessionCatalog} +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.datasources.PartitioningUtils import org.apache.spark.sql.types._ @@ -234,10 +233,8 @@ case class AlterTableSetPropertiesCommand( extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - val ident = if (isView) "VIEW" else "TABLE" val catalog = sparkSession.sessionState.catalog DDLUtils.verifyAlterTableType(catalog, tableName, isView) - DDLUtils.verifyTableProperties(properties.keys.toSeq, s"ALTER $ident") val table = catalog.getTableMetadata(tableName) // This overrides old properties val newTable = table.copy(properties = table.properties ++ properties) @@ -264,10 +261,8 @@ case class AlterTableUnsetPropertiesCommand( extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - val ident = if (isView) "VIEW" else "TABLE" val catalog = sparkSession.sessionState.catalog DDLUtils.verifyAlterTableType(catalog, tableName, isView) - DDLUtils.verifyTableProperties(propKeys, s"ALTER $ident") val table = catalog.getTableMetadata(tableName) if (!ifExists) { propKeys.foreach { k => @@ -445,11 +440,11 @@ case class AlterTableRecoverPartitionsCommand( if (!catalog.tableExists(tableName)) { throw new AnalysisException(s"Table $tableName in $cmd does not exist.") } - val table = catalog.getTableMetadata(tableName) if (catalog.isTemporaryTable(tableName)) { throw new AnalysisException( s"Operation not allowed: $cmd on temporary tables: $tableName") } + val table = catalog.getTableMetadata(tableName) if (DDLUtils.isDatasourceTable(table)) { throw new AnalysisException( s"Operation not allowed: $cmd on datasource tables: $tableName") @@ -458,7 +453,7 @@ 
case class AlterTableRecoverPartitionsCommand( throw new AnalysisException( s"Operation not allowed: $cmd only works on external tables: $tableName") } - if (!DDLUtils.isTablePartitioned(table)) { + if (table.partitionColumnNames.isEmpty) { throw new AnalysisException( s"Operation not allowed: $cmd only works on partitioned tables: $tableName") } @@ -584,13 +579,8 @@ case class AlterTableSetLocationCommand( object DDLUtils { - - def isDatasourceTable(props: Map[String, String]): Boolean = { - props.contains(DATASOURCE_PROVIDER) - } - def isDatasourceTable(table: CatalogTable): Boolean = { - isDatasourceTable(table.properties) + table.provider.isDefined && table.provider.get != "hive" } /** @@ -611,78 +601,4 @@ object DDLUtils { case _ => }) } - - /** - * If the given table properties (or SerDe properties) contains datasource properties, - * throw an exception. - */ - def verifyTableProperties(propKeys: Seq[String], operation: String): Unit = { - val datasourceKeys = propKeys.filter(_.startsWith(DATASOURCE_PREFIX)) - if (datasourceKeys.nonEmpty) { - throw new AnalysisException(s"Operation not allowed: $operation property keys may not " + - s"start with '$DATASOURCE_PREFIX': ${datasourceKeys.mkString("[", ", ", "]")}") - } - } - - def isTablePartitioned(table: CatalogTable): Boolean = { - table.partitionColumnNames.nonEmpty || table.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS) - } - - // A persisted data source table always store its schema in the catalog. - def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { - require(isDatasourceTable(metadata)) - val msgSchemaCorrupted = "Could not read schema from the metastore because it is corrupted." - val props = metadata.properties - props.get(DATASOURCE_SCHEMA).map { schema => - // Originally, we used spark.sql.sources.schema to store the schema of a data source table. - // After SPARK-6024, we removed this flag. - // Although we are not using spark.sql.sources.schema any more, we need to still support. - DataType.fromJson(schema).asInstanceOf[StructType] - } getOrElse { - props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => - val parts = (0 until numParts.toInt).map { index => - val part = metadata.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull - if (part == null) { - throw new AnalysisException(msgSchemaCorrupted + - s" (missing part $index of the schema, $numParts parts are expected).") - } - part - } - // Stick all parts back to a single schema string. - DataType.fromJson(parts.mkString).asInstanceOf[StructType] - } getOrElse(throw new AnalysisException(msgSchemaCorrupted)) - } - } - - private def getColumnNamesByType( - props: Map[String, String], colType: String, typeName: String): Seq[String] = { - require(isDatasourceTable(props)) - - for { - numCols <- props.get(s"spark.sql.sources.schema.num${colType.capitalize}Cols").toSeq - index <- 0 until numCols.toInt - } yield props.getOrElse( - s"$DATASOURCE_SCHEMA_PREFIX${colType}Col.$index", - throw new AnalysisException( - s"Corrupted $typeName in catalog: $numCols parts expected, but part $index is missing." 
- ) - ) - } - - def getPartitionColumnsFromTableProperties(metadata: CatalogTable): Seq[String] = { - getColumnNamesByType(metadata.properties, "part", "partitioning columns") - } - - def getBucketSpecFromTableProperties(metadata: CatalogTable): Option[BucketSpec] = { - if (isDatasourceTable(metadata)) { - metadata.properties.get(DATASOURCE_SCHEMA_NUMBUCKETS).map { numBuckets => - BucketSpec( - numBuckets.toInt, - getColumnNamesByType(metadata.properties, "bucket", "bucketing columns"), - getColumnNamesByType(metadata.properties, "sort", "sorting columns")) - } - } else { - None - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 720399ecc596a..af2b5ffd1c427 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -119,11 +119,9 @@ case class CreateTableLikeCommand( case class CreateTableCommand(table: CatalogTable, ifNotExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - DDLUtils.verifyTableProperties(table.properties.keys.toSeq, "CREATE TABLE") sparkSession.sessionState.catalog.createTable(table, ifNotExists) Seq.empty[Row] } - } @@ -414,8 +412,8 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF describeSchema(catalog.lookupRelation(table).schema, result) } else { val metadata = catalog.getTableMetadata(table) + describeSchema(metadata.schema, result) - describeSchema(metadata, result) if (isExtended) { describeExtended(metadata, result) } else if (isFormatted) { @@ -429,20 +427,10 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF } private def describePartitionInfo(table: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { - if (DDLUtils.isDatasourceTable(table)) { - val partColNames = DDLUtils.getPartitionColumnsFromTableProperties(table) - if (partColNames.nonEmpty) { - val userSpecifiedSchema = DDLUtils.getSchemaFromTableProperties(table) - append(buffer, "# Partition Information", "", "") - append(buffer, s"# ${output.head.name}", output(1).name, output(2).name) - describeSchema(StructType(partColNames.map(userSpecifiedSchema(_))), buffer) - } - } else { - if (table.partitionColumnNames.nonEmpty) { - append(buffer, "# Partition Information", "", "") - append(buffer, s"# ${output.head.name}", output(1).name, output(2).name) - describeSchema(table.partitionSchema, buffer) - } + if (table.partitionColumnNames.nonEmpty) { + append(buffer, "# Partition Information", "", "") + append(buffer, s"# ${output.head.name}", output(1).name, output(2).name) + describeSchema(table.partitionSchema, buffer) } } @@ -466,11 +454,7 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF append(buffer, "Table Type:", table.tableType.name, "") append(buffer, "Table Parameters:", "", "") - table.properties.filterNot { - // Hides schema properties that hold user-defined schema, partition columns, and bucketing - // information since they are already extracted and shown in other parts. 
- case (key, _) => key.startsWith(CreateDataSourceTableUtils.DATASOURCE_SCHEMA) - }.foreach { case (key, value) => + table.properties.foreach { case (key, value) => append(buffer, s" $key", value, "") } @@ -493,7 +477,7 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF } private def describeBucketingInfo(metadata: CatalogTable, buffer: ArrayBuffer[Row]): Unit = { - def appendBucketInfo(bucketSpec: Option[BucketSpec]) = bucketSpec match { + metadata.bucketSpec match { case Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) => append(buffer, "Num Buckets:", numBuckets.toString, "") append(buffer, "Bucket Columns:", bucketColumnNames.mkString("[", ", ", "]"), "") @@ -501,23 +485,6 @@ case class DescribeTableCommand(table: TableIdentifier, isExtended: Boolean, isF case _ => } - - if (DDLUtils.isDatasourceTable(metadata)) { - appendBucketInfo(DDLUtils.getBucketSpecFromTableProperties(metadata)) - } else { - appendBucketInfo(metadata.bucketSpec) - } - } - - private def describeSchema( - tableDesc: CatalogTable, - buffer: ArrayBuffer[Row]): Unit = { - if (DDLUtils.isDatasourceTable(tableDesc)) { - val schema = DDLUtils.getSchemaFromTableProperties(tableDesc) - describeSchema(schema, buffer) - } else { - describeSchema(tableDesc.schema, buffer) - } } private def describeSchema(schema: StructType, buffer: ArrayBuffer[Row]): Unit = { @@ -678,7 +645,7 @@ case class ShowPartitionsCommand( s"SHOW PARTITIONS is not allowed on a view or index table: ${tab.qualifiedName}") } - if (!DDLUtils.isTablePartitioned(tab)) { + if (tab.partitionColumnNames.isEmpty) { throw new AnalysisException( s"SHOW PARTITIONS is not allowed on a table that is not partitioned: ${tab.qualifiedName}") } @@ -729,6 +696,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman val tableMetadata = catalog.getTableMetadata(table) + // TODO: unify this after we unify the CREATE TABLE syntax for hive serde and data source table. 
val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { showCreateDataSourceTable(tableMetadata) } else { @@ -872,15 +840,14 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman private def showDataSourceTableDataColumns( metadata: CatalogTable, builder: StringBuilder): Unit = { - val schema = DDLUtils.getSchemaFromTableProperties(metadata) - val columns = schema.fields.map(f => s"${quoteIdentifier(f.name)} ${f.dataType.sql}") + val columns = metadata.schema.fields.map(f => s"${quoteIdentifier(f.name)} ${f.dataType.sql}") builder ++= columns.mkString("(", ", ", ")\n") } private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = { val props = metadata.properties - builder ++= s"USING ${props(CreateDataSourceTableUtils.DATASOURCE_PROVIDER)}\n" + builder ++= s"USING ${metadata.provider.get}\n" val dataSourceOptions = metadata.storage.properties.filterNot { case (key, value) => @@ -900,12 +867,12 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman private def showDataSourceTableNonDataColumns( metadata: CatalogTable, builder: StringBuilder): Unit = { - val partCols = DDLUtils.getPartitionColumnsFromTableProperties(metadata) + val partCols = metadata.partitionColumnNames if (partCols.nonEmpty) { builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n" } - DDLUtils.getBucketSpecFromTableProperties(metadata).foreach { spec => + metadata.bucketSpec.foreach { spec => if (spec.bucketColumnNames.nonEmpty) { builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 733ba185287e1..5eba7df060c4e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} -import org.apache.spark.sql.execution.command.{CreateDataSourceTableUtils, DDLUtils, ExecutedCommandExec} +import org.apache.spark.sql.execution.command.{DDLUtils, ExecutedCommandExec} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -204,24 +204,14 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { */ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] { private def readDataSourceTable(sparkSession: SparkSession, table: CatalogTable): LogicalPlan = { - val schema = DDLUtils.getSchemaFromTableProperties(table) - - // We only need names at here since userSpecifiedSchema we loaded from the metastore - // contains partition columns. We can always get datatypes of partitioning columns - // from userSpecifiedSchema. 
- val partitionColumns = DDLUtils.getPartitionColumnsFromTableProperties(table) - - val bucketSpec = DDLUtils.getBucketSpecFromTableProperties(table) - - val options = table.storage.properties val dataSource = DataSource( sparkSession, - userSpecifiedSchema = Some(schema), - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - className = table.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER), - options = options) + userSpecifiedSchema = Some(table.schema), + partitionColumns = table.partitionColumnNames, + bucketSpec = table.bucketSpec, + className = table.provider.get, + options = table.storage.properties) LogicalRelation( dataSource.resolveRelation(), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala index 447c237e3a1b0..7880c7cfa16f8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/WriterContainer.scala @@ -33,7 +33,6 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.UnsafeKVExternalSorter -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -48,6 +47,11 @@ private[datasources] case class WriteRelation( prepareJobForWrite: Job => OutputWriterFactory, bucketSpec: Option[BucketSpec]) +object WriterContainer { + val DATASOURCE_WRITEJOBUUID = "spark.sql.sources.writeJobUUID" + val DATASOURCE_OUTPUTPATH = "spark.sql.sources.output.path" +} + private[datasources] abstract class BaseWriterContainer( @transient val relation: WriteRelation, @transient private val job: Job, @@ -94,7 +98,7 @@ private[datasources] abstract class BaseWriterContainer( // This UUID is sent to executor side together with the serialized `Configuration` object within // the `Job` instance. `OutputWriters` on the executor side should use this UUID to generate // unique task output files. - job.getConfiguration.set(DATASOURCE_WRITEJOBUUID, uniqueWriteJobId.toString) + job.getConfiguration.set(WriterContainer.DATASOURCE_WRITEJOBUUID, uniqueWriteJobId.toString) // Order of the following two lines is important. For Hadoop 1, TaskAttemptContext constructor // clones the Configuration object passed in. 
If we initialize the TaskAttemptContext first, @@ -244,7 +248,7 @@ private[datasources] class DefaultWriterContainer( def writeRows(taskContext: TaskContext, iterator: Iterator[InternalRow]): Unit = { executorSideSetup(taskContext) val configuration = taskAttemptContext.getConfiguration - configuration.set(DATASOURCE_OUTPUTPATH, outputPath) + configuration.set(WriterContainer.DATASOURCE_OUTPUTPATH, outputPath) var writer = newOutputWriter(getWorkPath) writer.initConverter(dataSchema) @@ -352,10 +356,12 @@ private[datasources] class DynamicPartitionWriterContainer( val configuration = taskAttemptContext.getConfiguration val path = if (partitionColumns.nonEmpty) { val partitionPath = getPartitionString(key).getString(0) - configuration.set(DATASOURCE_OUTPUTPATH, new Path(outputPath, partitionPath).toString) + configuration.set( + WriterContainer.DATASOURCE_OUTPUTPATH, + new Path(outputPath, partitionPath).toString) new Path(getWorkPath, partitionPath).toString } else { - configuration.set(DATASOURCE_OUTPUTPATH, outputPath) + configuration.set(WriterContainer.DATASOURCE_OUTPUTPATH, outputPath) getWorkPath } val bucketId = getBucketIdFromKey(key) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index 6b2f9fc61e677..de2d633c0bcf4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -30,8 +30,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils -import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile} +import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory, PartitionedFile, WriterContainer} import org.apache.spark.sql.types._ object CSVRelation extends Logging { @@ -192,7 +191,7 @@ private[csv] class CsvOutputWriter( new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId.csv$extension") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 27910e2cddad8..16150b91d6452 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala @@ -31,7 +31,6 @@ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import 
org.apache.spark.sql.types.StructType @@ -164,7 +163,7 @@ private[json] class JsonOutputWriter( new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 9c4778acf53d7..9208c82179d8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -44,7 +44,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf @@ -547,8 +546,7 @@ private[parquet] class ParquetOutputWriter( // partitions in the case of dynamic partitioning. override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get( - CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index abb6059f75ba8..a0c3fd53fb53b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -27,7 +27,6 @@ import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} @@ -131,7 +130,7 @@ class TextOutputWriter(path: String, dataSchema: StructType, context: TaskAttemp new TextOutputFormat[NullWritable, Text]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = 
configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(path, f"part-r-$split%05d-$uniqueWriteJobId.txt$extension") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index ad69137f7401b..52e648a917d8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -28,10 +28,9 @@ object HiveSerDe { * * @param source Currently the source abbreviation can be one of the following: * SequenceFile, RCFile, ORC, PARQUET, and case insensitive. - * @param conf SQLConf * @return HiveSerDe associated with the specified source */ - def sourceToSerDe(source: String, conf: SQLConf): Option[HiveSerDe] = { + def sourceToSerDe(source: String): Option[HiveSerDe] = { val serdeMap = Map( "sequencefile" -> HiveSerDe( @@ -42,8 +41,7 @@ object HiveSerDe { HiveSerDe( inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"), outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"), - serde = Option(conf.getConfString("hive.default.rcfile.serde", - "org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe"))), + serde = Option("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")), "orc" -> HiveSerDe( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index be1bccbd990a0..8dd883b37bde0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -243,7 +243,7 @@ class DDLCommandSuite extends PlanTest { allSources.foreach { s => val query = s"CREATE TABLE my_tab STORED AS $s" val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s, new SQLConf) + val hiveSerde = HiveSerDe.sourceToSerDe(s) assert(hiveSerde.isDefined) assert(ct.tableDesc.storage.serde == hiveSerde.get.serde) assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) @@ -276,7 +276,7 @@ class DDLCommandSuite extends PlanTest { val query = s"CREATE TABLE my_tab ROW FORMAT SERDE 'anything' STORED AS $s" if (supportedSources.contains(s)) { val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s, new SQLConf) + val hiveSerde = HiveSerDe.sourceToSerDe(s) assert(hiveSerde.isDefined) assert(ct.tableDesc.storage.serde == Some("anything")) assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) @@ -295,7 +295,7 @@ class DDLCommandSuite extends PlanTest { val query = s"CREATE TABLE my_tab ROW FORMAT DELIMITED FIELDS TERMINATED BY ' ' STORED AS $s" if (supportedSources.contains(s)) { val ct = parseAs[CreateTable](query) - val hiveSerde = HiveSerDe.sourceToSerDe(s, new SQLConf) + val hiveSerde = HiveSerDe.sourceToSerDe(s) assert(hiveSerde.isDefined) assert(ct.tableDesc.storage.serde == hiveSerde.get.serde) assert(ct.tableDesc.storage.inputFormat == hiveSerde.get.inputFormat) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 0f7fda7666a3b..e6ae42258d4c5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -30,7 +30,6 @@ import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, Catal import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition, SessionCatalog} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -93,7 +92,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { .add("col2", "string") .add("a", "int") .add("b", "int"), - provider = Some("parquet"), + provider = Some("hive"), partitionColumnNames = Seq("a", "b"), createTime = 0L) } @@ -277,10 +276,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { """.stripMargin) val tableMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tabName)) - assert(expectedSchema == - DDLUtils.getSchemaFromTableProperties(tableMetadata)) - assert(expectedPartitionCols == - DDLUtils.getPartitionColumnsFromTableProperties(tableMetadata)) + assert(expectedSchema == tableMetadata.schema) + assert(expectedPartitionCols == tableMetadata.partitionColumnNames) } } @@ -399,41 +396,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { assert(e.message == "Found duplicate column(s) in bucket: a") } - test("Describe Table with Corrupted Schema") { - import testImplicits._ - - val tabName = "tab1" - withTempPath { dir => - val path = dir.getCanonicalPath - val df = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)).toDF("col1", "col2") - df.write.format("json").save(path) - val uri = dir.toURI - - withTable(tabName) { - sql( - s""" - |CREATE TABLE $tabName - |USING json - |OPTIONS ( - | path '$uri' - |) - """.stripMargin) - - val catalog = spark.sessionState.catalog - val table = catalog.getTableMetadata(TableIdentifier(tabName)) - val newProperties = table.properties.filterKeys(key => - key != CreateDataSourceTableUtils.DATASOURCE_SCHEMA_NUMPARTS) - val newTable = table.copy(properties = newProperties) - catalog.alterTable(newTable) - - val e = intercept[AnalysisException] { - sql(s"DESC $tabName") - }.getMessage - assert(e.contains(s"Could not read schema from the metastore because it is corrupted")) - } - } - } - test("Refresh table after changing the data source table partitioning") { import testImplicits._ @@ -460,10 +422,8 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { |) """.stripMargin) val tableMetadata = catalog.getTableMetadata(TableIdentifier(tabName)) - val tableSchema = DDLUtils.getSchemaFromTableProperties(tableMetadata) - assert(tableSchema == schema) - val partCols = DDLUtils.getPartitionColumnsFromTableProperties(tableMetadata) - assert(partCols == partitionCols) + assert(tableMetadata.schema == schema) + assert(tableMetadata.partitionColumnNames == partitionCols) // Change the schema val newDF = sparkContext.parallelize(1 to 10).map(i => (i, i.toString)) @@ -472,23 +432,15 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { // No change on the schema val tableMetadataBeforeRefresh = catalog.getTableMetadata(TableIdentifier(tabName)) - val tableSchemaBeforeRefresh = - DDLUtils.getSchemaFromTableProperties(tableMetadataBeforeRefresh) - 
assert(tableSchemaBeforeRefresh == schema) - val partColsBeforeRefresh = - DDLUtils.getPartitionColumnsFromTableProperties(tableMetadataBeforeRefresh) - assert(partColsBeforeRefresh == partitionCols) + assert(tableMetadataBeforeRefresh.schema == schema) + assert(tableMetadataBeforeRefresh.partitionColumnNames == partitionCols) // Refresh does not affect the schema spark.catalog.refreshTable(tabName) val tableMetadataAfterRefresh = catalog.getTableMetadata(TableIdentifier(tabName)) - val tableSchemaAfterRefresh = - DDLUtils.getSchemaFromTableProperties(tableMetadataAfterRefresh) - assert(tableSchemaAfterRefresh == schema) - val partColsAfterRefresh = - DDLUtils.getPartitionColumnsFromTableProperties(tableMetadataAfterRefresh) - assert(partColsAfterRefresh == partitionCols) + assert(tableMetadataAfterRefresh.schema == schema) + assert(tableMetadataAfterRefresh.partitionColumnNames == partitionCols) } } } @@ -641,7 +593,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { val table = catalog.getTableMetadata(TableIdentifier("tbl")) assert(table.tableType == CatalogTableType.MANAGED) assert(table.schema == new StructType().add("a", "int").add("b", "int")) - assert(table.properties(DATASOURCE_PROVIDER) == "parquet") + assert(table.provider == Some("parquet")) } } @@ -651,12 +603,9 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { sql("CREATE TABLE tbl(a INT, b INT) USING parquet PARTITIONED BY (a)") val table = catalog.getTableMetadata(TableIdentifier("tbl")) assert(table.tableType == CatalogTableType.MANAGED) - assert(table.schema.isEmpty) // partitioned datasource table is not hive-compatible - assert(table.properties(DATASOURCE_PROVIDER) == "parquet") - assert(DDLUtils.getSchemaFromTableProperties(table) == - new StructType().add("a", IntegerType).add("b", IntegerType)) - assert(DDLUtils.getPartitionColumnsFromTableProperties(table) == - Seq("a")) + assert(table.provider == Some("parquet")) + assert(table.schema == new StructType().add("a", IntegerType).add("b", IntegerType)) + assert(table.partitionColumnNames == Seq("a")) } } @@ -667,12 +616,9 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { "CLUSTERED BY (a) SORTED BY (b) INTO 5 BUCKETS") val table = catalog.getTableMetadata(TableIdentifier("tbl")) assert(table.tableType == CatalogTableType.MANAGED) - assert(table.schema.isEmpty) // partitioned datasource table is not hive-compatible - assert(table.properties(DATASOURCE_PROVIDER) == "parquet") - assert(DDLUtils.getSchemaFromTableProperties(table) == - new StructType().add("a", IntegerType).add("b", IntegerType)) - assert(DDLUtils.getBucketSpecFromTableProperties(table) == - Some(BucketSpec(5, Seq("a"), Seq("b")))) + assert(table.provider == Some("parquet")) + assert(table.schema == new StructType().add("a", IntegerType).add("b", IntegerType)) + assert(table.bucketSpec == Some(BucketSpec(5, Seq("a"), Seq("b")))) } } @@ -1096,7 +1042,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { catalog: SessionCatalog, tableIdent: TableIdentifier): Unit = { catalog.alterTable(catalog.getTableMetadata(tableIdent).copy( - properties = Map(DATASOURCE_PROVIDER -> "csv"))) + provider = Some("csv"))) } private def testSetProperties(isDatasourceTable: Boolean): Unit = { @@ -1108,9 +1054,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } def getProps: Map[String, String] = { - 
catalog.getTableMetadata(tableIdent).properties.filterKeys { k => - !isDatasourceTable || !k.startsWith(DATASOURCE_PREFIX) - } + catalog.getTableMetadata(tableIdent).properties } assert(getProps.isEmpty) // set table properties @@ -1124,11 +1068,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { intercept[AnalysisException] { sql("ALTER TABLE does_not_exist SET TBLPROPERTIES ('winner' = 'loser')") } - // datasource table property keys are not allowed - val e = intercept[AnalysisException] { - sql(s"ALTER TABLE tab1 SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')") - } - assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo")) } private def testUnsetProperties(isDatasourceTable: Boolean): Unit = { @@ -1140,9 +1079,7 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { convertToDatasourceTable(catalog, tableIdent) } def getProps: Map[String, String] = { - catalog.getTableMetadata(tableIdent).properties.filterKeys { k => - !isDatasourceTable || !k.startsWith(DATASOURCE_PREFIX) - } + catalog.getTableMetadata(tableIdent).properties } // unset table properties sql("ALTER TABLE dbx.tab1 SET TBLPROPERTIES ('j' = 'am', 'p' = 'an', 'c' = 'lan', 'x' = 'y')") @@ -1164,11 +1101,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { // property to unset does not exist, but "IF EXISTS" is specified sql("ALTER TABLE tab1 UNSET TBLPROPERTIES IF EXISTS ('c', 'xyz')") assert(getProps == Map("x" -> "y")) - // datasource table property keys are not allowed - val e2 = intercept[AnalysisException] { - sql(s"ALTER TABLE tab1 UNSET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo')") - } - assert(e2.getMessage.contains(DATASOURCE_PREFIX + "foo")) } private def testSetLocation(isDatasourceTable: Boolean): Unit = { @@ -1573,10 +1505,6 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } - test("create table with datasource properties (not allowed)") { - assertUnsupported("CREATE TABLE my_tab TBLPROPERTIES ('spark.sql.sources.me'='anything')") - } - test("Create Hive Table As Select") { import testImplicits._ withTable("t", "t1") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 49153f77362b7..729c9fdda543e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -201,7 +201,7 @@ class CreateTableAsSelectSuite """.stripMargin ) val table = catalog.getTableMetadata(TableIdentifier("t")) - assert(DDLUtils.getPartitionColumnsFromTableProperties(table) == Seq("a")) + assert(table.partitionColumnNames == Seq("a")) } } @@ -217,8 +217,7 @@ class CreateTableAsSelectSuite """.stripMargin ) val table = catalog.getTableMetadata(TableIdentifier("t")) - assert(DDLUtils.getBucketSpecFromTableProperties(table) == - Option(BucketSpec(5, Seq("a"), Seq("b")))) + assert(table.bucketSpec == Option(BucketSpec(5, Seq("a"), Seq("b")))) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 8302e3e98ad34..de3e60a44d920 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -30,7 +30,11 @@ import 
org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog._ +import org.apache.spark.sql.execution.command.DDLUtils +import org.apache.spark.sql.execution.datasources.CaseInsensitiveMap import org.apache.spark.sql.hive.client.HiveClient +import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.types.{DataType, StructType} /** @@ -41,6 +45,8 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu extends ExternalCatalog with Logging { import CatalogTypes.TablePartitionSpec + import HiveExternalCatalog._ + import CatalogTableType._ // Exceptions thrown by the hive client that we would like to wrap private val clientExceptions = Set( @@ -81,6 +87,20 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu withClient { getTable(db, table) } } + /** + * If the given table properties contain data source properties, throw an exception. We will do + * this check when creating or altering a table, i.e. when we try to write table metadata to Hive + * metastore. + */ + private def verifyTableProperties(table: CatalogTable): Unit = { + val datasourceKeys = table.properties.keys.filter(_.startsWith(DATASOURCE_PREFIX)) + if (datasourceKeys.nonEmpty) { + throw new AnalysisException(s"Cannot persist ${table.qualifiedName} into Hive metastore " + + s"as table property keys may not start with '$DATASOURCE_PREFIX': " + + datasourceKeys.mkString("[", ", ", "]")) + } + } + // -------------------------------------------------------------------------- // Databases // -------------------------------------------------------------------------- @@ -144,16 +164,162 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireDbExists(db) + verifyTableProperties(tableDefinition) + + // Before saving data source table metadata into Hive metastore, we should: + // 1. Put table schema, partition column names and bucket specification in table properties. + // 2. Check if this table is hive compatible + // 2.1 If it's not hive compatible, set schema, partition columns and bucket spec to empty + // and save table metadata to Hive. + // 2.2 If it's hive compatible, set serde information in table metadata and try to save + // it to Hive. If it fails, treat it as not hive compatible and go back to 2.1 + if (DDLUtils.isDatasourceTable(tableDefinition)) { + // A data source table always has a provider; this is guaranteed by `DDLUtils.isDatasourceTable`. + val provider = tableDefinition.provider.get + val partitionColumns = tableDefinition.partitionColumnNames + val bucketSpec = tableDefinition.bucketSpec + + val tableProperties = new scala.collection.mutable.HashMap[String, String] + tableProperties.put(DATASOURCE_PROVIDER, provider) + + // Serialized JSON schema string may be too long to be stored into a single metastore table + // property. In this case, we split the JSON string and store each part as a separate table + // property. + // TODO: the threshold should be set by `spark.sql.sources.schemaStringLengthThreshold`, + // however the current SQLConf is session isolated, which is not applicable to external + // catalog. We should re-enable this conf instead of hard-coding the value here, after we have + // global SQLConf.
+ val threshold = 4000 + val schemaJsonString = tableDefinition.schema.json + // Split the JSON string. + val parts = schemaJsonString.grouped(threshold).toSeq + tableProperties.put(DATASOURCE_SCHEMA_NUMPARTS, parts.size.toString) + parts.zipWithIndex.foreach { case (part, index) => + tableProperties.put(s"$DATASOURCE_SCHEMA_PART_PREFIX$index", part) + } + + if (partitionColumns.nonEmpty) { + tableProperties.put(DATASOURCE_SCHEMA_NUMPARTCOLS, partitionColumns.length.toString) + partitionColumns.zipWithIndex.foreach { case (partCol, index) => + tableProperties.put(s"$DATASOURCE_SCHEMA_PARTCOL_PREFIX$index", partCol) + } + } + + if (bucketSpec.isDefined) { + val BucketSpec(numBuckets, bucketColumnNames, sortColumnNames) = bucketSpec.get + + tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETS, numBuckets.toString) + tableProperties.put(DATASOURCE_SCHEMA_NUMBUCKETCOLS, bucketColumnNames.length.toString) + bucketColumnNames.zipWithIndex.foreach { case (bucketCol, index) => + tableProperties.put(s"$DATASOURCE_SCHEMA_BUCKETCOL_PREFIX$index", bucketCol) + } + + if (sortColumnNames.nonEmpty) { + tableProperties.put(DATASOURCE_SCHEMA_NUMSORTCOLS, sortColumnNames.length.toString) + sortColumnNames.zipWithIndex.foreach { case (sortCol, index) => + tableProperties.put(s"$DATASOURCE_SCHEMA_SORTCOL_PREFIX$index", sortCol) + } + } + } + + // Converts the table metadata to Spark SQL specific format, i.e. sets the schema, partition column + // names and bucket specification to empty. + def newSparkSQLSpecificMetastoreTable(): CatalogTable = { + tableDefinition.copy( + schema = new StructType, + partitionColumnNames = Nil, + bucketSpec = None, + properties = tableDefinition.properties ++ tableProperties) + } + + // Converts the table metadata to Hive compatible format, i.e. sets the serde information. + def newHiveCompatibleMetastoreTable(serde: HiveSerDe, path: String): CatalogTable = { + tableDefinition.copy( + storage = tableDefinition.storage.copy( + locationUri = Some(new Path(path).toUri.toString), + inputFormat = serde.inputFormat, + outputFormat = serde.outputFormat, + serde = serde.serde + ), + properties = tableDefinition.properties ++ tableProperties) + } + + val qualifiedTableName = tableDefinition.identifier.quotedString + val maybeSerde = HiveSerDe.sourceToSerDe(tableDefinition.provider.get) + val maybePath = new CaseInsensitiveMap(tableDefinition.storage.properties).get("path") + val skipHiveMetadata = tableDefinition.storage.properties + .getOrElse("skipHiveMetadata", "false").toBoolean + + val (hiveCompatibleTable, logMessage) = (maybeSerde, maybePath) match { + case _ if skipHiveMetadata => + val message = + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + "Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + + // Our bucketing is incompatible with Hive's (it uses a different hash function). + case _ if tableDefinition.bucketSpec.nonEmpty => + val message = + s"Persisting bucketed data source table $qualifiedTableName into " + + "Hive metastore in Spark SQL specific format, which is NOT compatible with Hive. " + (None, message) + + case (Some(serde), Some(path)) => + val message = + s"Persisting file-based data source table $qualifiedTableName with an input path " + + s"into Hive metastore in Hive compatible format." + (Some(newHiveCompatibleMetastoreTable(serde, path)), message) + + case (Some(_), None) => + val message = + s"Data source table $qualifiedTableName is not file-based.
Persisting it into " + + s"Hive metastore in Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + + case _ => + val provider = tableDefinition.provider.get + val message = + s"Couldn't find corresponding Hive SerDe for data source provider $provider. " + + s"Persisting data source table $qualifiedTableName into Hive metastore in " + + s"Spark SQL specific format, which is NOT compatible with Hive." + (None, message) + } + + (hiveCompatibleTable, logMessage) match { + case (Some(table), message) => + // We first try to save the metadata of the table in a Hive compatible way. + // If Hive throws an error, we fall back to saving its metadata in the Spark SQL + // specific way. + try { + logInfo(message) + saveTableIntoHive(table, ignoreIfExists) + } catch { + case NonFatal(e) => + val warningMessage = + s"Could not persist ${tableDefinition.identifier.quotedString} in a Hive " + + "compatible way. Persisting it into Hive metastore in Spark SQL specific format." + logWarning(warningMessage, e) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + + case (None, message) => + logWarning(message) + saveTableIntoHive(newSparkSQLSpecificMetastoreTable(), ignoreIfExists) + } + } else { + client.createTable(tableDefinition, ignoreIfExists) + } + } - if ( + private def saveTableIntoHive(tableDefinition: CatalogTable, ignoreIfExists: Boolean): Unit = { + assert(DDLUtils.isDatasourceTable(tableDefinition), + "saveTableIntoHive only takes data source tables.") // If this is an external data source table... - tableDefinition.properties.contains("spark.sql.sources.provider") && - tableDefinition.tableType == CatalogTableType.EXTERNAL && - // ... that is not persisted as Hive compatible format (external tables in Hive compatible - // format always set `locationUri` to the actual data location and should NOT be hacked as - // following.) - tableDefinition.storage.locationUri.isEmpty - ) { + if (tableDefinition.tableType == EXTERNAL && + // ... that is not persisted as Hive compatible format (external tables in Hive compatible + // format always set `locationUri` to the actual data location and should NOT be hacked as + // following.) + tableDefinition.storage.locationUri.isEmpty) { // !! HACK ALERT !! // // Due to a restriction of Hive metastore, here we have to set `locationUri` to a temporary @@ -200,22 +366,79 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu * Alter a table whose name that matches the one specified in `tableDefinition`, * assuming the table exists. * - * Note: As of now, this only supports altering table properties, serde properties, - * and num buckets! + * Note: As of now, this doesn't support altering table schema, partition column names and bucket + * specification. We will ignore them even if users do specify different values for these fields. */ override def alterTable(tableDefinition: CatalogTable): Unit = withClient { assert(tableDefinition.identifier.database.isDefined) val db = tableDefinition.identifier.database.get requireTableExists(db, tableDefinition.identifier.table) - client.alterTable(tableDefinition) + verifyTableProperties(tableDefinition) + + if (DDLUtils.isDatasourceTable(tableDefinition)) { + val oldDef = client.getTable(db, tableDefinition.identifier.table) + // Sets the `schema`, `partitionColumnNames` and `bucketSpec` from the old table definition, + // to retain the Spark SQL specific format if the table was saved that way.
Also add the old data source properties to the table + // properties, to retain the data source table format. + val oldDataSourceProps = oldDef.properties.filter(_._1.startsWith(DATASOURCE_PREFIX)) + val newDef = tableDefinition.copy( + schema = oldDef.schema, + partitionColumnNames = oldDef.partitionColumnNames, + bucketSpec = oldDef.bucketSpec, + properties = oldDataSourceProps ++ tableDefinition.properties) + + client.alterTable(newDef) + } else { + client.alterTable(tableDefinition) + } } override def getTable(db: String, table: String): CatalogTable = withClient { - client.getTable(db, table) + restoreTableMetadata(client.getTable(db, table)) } override def getTableOption(db: String, table: String): Option[CatalogTable] = withClient { - client.getTableOption(db, table) + client.getTableOption(db, table).map(restoreTableMetadata) + } + + /** + * Restores table metadata from the table properties if it's a data source table. This method is + * roughly the inverse of [[createTable]]. + * + * It reads the table schema, provider, partition column names and bucket specification from the + * table properties, and filters these special entries out of the table properties. + */ + private def restoreTableMetadata(table: CatalogTable): CatalogTable = { + if (table.tableType == VIEW) { + table + } else { + getProviderFromTableProperties(table).map { provider => + assert(provider != "hive", "Hive serde table should not save provider in table properties.") + // SPARK-15269: Persisted data source tables always store the location URI as a storage + // property named "path" instead of standard Hive `dataLocation`, because Hive only + // allows directory paths as location URIs while Spark SQL data source tables also + // allow file paths. So the standard Hive `dataLocation` is meaningless for Spark SQL + // data source tables. + // Spark SQL may also save an external data source table in Hive compatible format when + // possible, so that these tables can be directly accessed by Hive. For these tables, + // `dataLocation` is still necessary. Here we also check for input format because only + // these Hive compatible tables set this field. + val storage = if (table.tableType == EXTERNAL && table.storage.inputFormat.isEmpty) { + table.storage.copy(locationUri = None) + } else { + table.storage + } + table.copy( + storage = storage, + schema = getSchemaFromTableProperties(table), + provider = Some(provider), + partitionColumnNames = getPartitionColumnsFromTableProperties(table), + bucketSpec = getBucketSpecFromTableProperties(table), + properties = getOriginalTableProperties(table)) + } getOrElse { + table.copy(provider = Some("hive")) + } + } } override def tableExists(db: String, table: String): Boolean = withClient { @@ -363,3 +586,82 @@ private[spark] class HiveExternalCatalog(client: HiveClient, hadoopConf: Configu } } + +object HiveExternalCatalog { + val DATASOURCE_PREFIX = "spark.sql.sources." + val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider" + val DATASOURCE_SCHEMA = DATASOURCE_PREFIX + "schema" + val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_SCHEMA + "." + val DATASOURCE_SCHEMA_NUMPARTS = DATASOURCE_SCHEMA_PREFIX + "numParts" + val DATASOURCE_SCHEMA_NUMPARTCOLS = DATASOURCE_SCHEMA_PREFIX + "numPartCols" + val DATASOURCE_SCHEMA_NUMSORTCOLS = DATASOURCE_SCHEMA_PREFIX + "numSortCols" + val DATASOURCE_SCHEMA_NUMBUCKETS = DATASOURCE_SCHEMA_PREFIX + "numBuckets" + val DATASOURCE_SCHEMA_NUMBUCKETCOLS = DATASOURCE_SCHEMA_PREFIX + "numBucketCols" + val DATASOURCE_SCHEMA_PART_PREFIX = DATASOURCE_SCHEMA_PREFIX + "part."
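+  // The column-name prefixes below are combined with a zero-based index to store one column name + // per table property; for example, the first partition column name is stored under the key + // "spark.sql.sources.schema.partCol.0" (see getColumnNamesByType below for how these keys are + // read back).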
+ val DATASOURCE_SCHEMA_PARTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "partCol." + val DATASOURCE_SCHEMA_BUCKETCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "bucketCol." + val DATASOURCE_SCHEMA_SORTCOL_PREFIX = DATASOURCE_SCHEMA_PREFIX + "sortCol." + + def getProviderFromTableProperties(metadata: CatalogTable): Option[String] = { + metadata.properties.get(DATASOURCE_PROVIDER) + } + + def getOriginalTableProperties(metadata: CatalogTable): Map[String, String] = { + metadata.properties.filterNot { case (key, _) => key.startsWith(DATASOURCE_PREFIX) } + } + + // A persisted data source table always stores its schema in the catalog. + def getSchemaFromTableProperties(metadata: CatalogTable): StructType = { + val errorMessage = "Could not read schema from the hive metastore because it is corrupted." + val props = metadata.properties + props.get(DATASOURCE_SCHEMA).map { schema => + // Originally, we used `spark.sql.sources.schema` to store the schema of a data source table. + // After SPARK-6024, we removed this flag. + // Although we are not using `spark.sql.sources.schema` any more, we still need to support it. + DataType.fromJson(schema).asInstanceOf[StructType] + } getOrElse { + props.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => + val parts = (0 until numParts.toInt).map { index => + val part = metadata.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull + if (part == null) { + throw new AnalysisException(errorMessage + + s" (missing part $index of the schema, $numParts parts are expected).") + } + part + } + // Stitch all parts back into a single schema string. + DataType.fromJson(parts.mkString).asInstanceOf[StructType] + } getOrElse { + throw new AnalysisException(errorMessage) + } + } + } + + private def getColumnNamesByType( + props: Map[String, String], + colType: String, + typeName: String): Seq[String] = { + for { + numCols <- props.get(s"spark.sql.sources.schema.num${colType.capitalize}Cols").toSeq + index <- 0 until numCols.toInt + } yield props.getOrElse( + s"$DATASOURCE_SCHEMA_PREFIX${colType}Col.$index", + throw new AnalysisException( + s"Corrupted $typeName in catalog: $numCols parts expected, but part $index is missing."
+ ) + ) + } + + def getPartitionColumnsFromTableProperties(metadata: CatalogTable): Seq[String] = { + getColumnNamesByType(metadata.properties, "part", "partitioning columns") + } + + def getBucketSpecFromTableProperties(metadata: CatalogTable): Option[BucketSpec] = { + metadata.properties.get(DATASOURCE_SCHEMA_NUMBUCKETS).map { numBuckets => + BucketSpec( + numBuckets.toInt, + getColumnNamesByType(metadata.properties, "bucket", "bucketing columns"), + getColumnNamesByType(metadata.properties, "sort", "sorting columns")) + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 7118edabb83cf..181f470b2a100 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ +import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{Partition => _, _} import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import org.apache.spark.sql.hive.orc.OrcFileFormat @@ -68,64 +68,16 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log val cacheLoader = new CacheLoader[QualifiedTableName, LogicalPlan]() { override def load(in: QualifiedTableName): LogicalPlan = { logDebug(s"Creating new cached data source for $in") - val table = client.getTable(in.database, in.name) + val table = sparkSession.sharedState.externalCatalog.getTable(in.database, in.name) - // TODO: the following code is duplicated with FindDataSourceTable.readDataSourceTable - - def schemaStringFromParts: Option[String] = { - table.properties.get(DATASOURCE_SCHEMA_NUMPARTS).map { numParts => - val parts = (0 until numParts.toInt).map { index => - val part = table.properties.get(s"$DATASOURCE_SCHEMA_PART_PREFIX$index").orNull - if (part == null) { - throw new AnalysisException( - "Could not read schema from the metastore because it is corrupted " + - s"(missing part $index of the schema, $numParts parts are expected).") - } - - part - } - // Stick all parts back to a single schema string. - parts.mkString - } - } - - def getColumnNames(colType: String): Seq[String] = { - table.properties.get(s"$DATASOURCE_SCHEMA.num${colType.capitalize}Cols").map { - numCols => (0 until numCols.toInt).map { index => - table.properties.getOrElse(s"$DATASOURCE_SCHEMA_PREFIX${colType}Col.$index", - throw new AnalysisException( - s"Could not read $colType columns from the metastore because it is corrupted " + - s"(missing part $index of it, $numCols parts are expected).")) - } - }.getOrElse(Nil) - } - - // Originally, we used spark.sql.sources.schema to store the schema of a data source table. - // After SPARK-6024, we removed this flag. - // Although we are not using spark.sql.sources.schema any more, we need to still support. 
- val schemaString = table.properties.get(DATASOURCE_SCHEMA).orElse(schemaStringFromParts) - - val userSpecifiedSchema = - schemaString.map(s => DataType.fromJson(s).asInstanceOf[StructType]) - - // We only need names at here since userSpecifiedSchema we loaded from the metastore - // contains partition columns. We can always get data types of partitioning columns - // from userSpecifiedSchema. - val partitionColumns = getColumnNames("part") - - val bucketSpec = table.properties.get(DATASOURCE_SCHEMA_NUMBUCKETS).map { n => - BucketSpec(n.toInt, getColumnNames("bucket"), getColumnNames("sort")) - } - - val options = table.storage.properties val dataSource = DataSource( sparkSession, - userSpecifiedSchema = userSpecifiedSchema, - partitionColumns = partitionColumns, - bucketSpec = bucketSpec, - className = table.properties(DATASOURCE_PROVIDER), - options = options) + userSpecifiedSchema = Some(table.schema), + partitionColumns = table.partitionColumnNames, + bucketSpec = table.bucketSpec, + className = table.provider.get, + options = table.storage.properties) LogicalRelation( dataSource.resolveRelation(checkPathExist = true), @@ -158,9 +110,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log tableIdent: TableIdentifier, alias: Option[String]): LogicalPlan = { val qualifiedTableName = getQualifiedTableName(tableIdent) - val table = client.getTable(qualifiedTableName.database, qualifiedTableName.name) + val table = sparkSession.sharedState.externalCatalog.getTable( + qualifiedTableName.database, qualifiedTableName.name) - if (table.properties.get(DATASOURCE_PROVIDER).isDefined) { + if (DDLUtils.isDatasourceTable(table)) { val dataSourceTable = cachedDataSourceTables(qualifiedTableName) val qualifiedTable = SubqueryAlias(qualifiedTableName.name, dataSourceTable, None) // Then, if alias is specified, wrap the table with a Subquery using the alias. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index f8204e183f03a..9b7afd462841c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -45,7 +45,6 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} import org.apache.spark.sql.execution.QueryExecutionException -import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.util.{CircularBuffer, Utils} @@ -392,20 +391,7 @@ private[hive] class HiveClientImpl( createTime = h.getTTable.getCreateTime.toLong * 1000, lastAccessTime = h.getLastAccessTime.toLong * 1000, storage = CatalogStorageFormat( - locationUri = shim.getDataLocation(h).filterNot { _ => - // SPARK-15269: Persisted data source tables always store the location URI as a SerDe - // property named "path" instead of standard Hive `dataLocation`, because Hive only - // allows directory paths as location URIs while Spark SQL data source tables also - // allows file paths. So the standard Hive `dataLocation` is meaningless for Spark SQL - // data source tables. 
- DDLUtils.isDatasourceTable(properties) && - h.getTableType == HiveTableType.EXTERNAL_TABLE && - // Spark SQL may also save external data source in Hive compatible format when - // possible, so that these tables can be directly accessed by Hive. For these tables, - // `dataLocation` is still necessary. Here we also check for input format class - // because only these Hive compatible tables set this field. - h.getInputFormatClass == null - }, + locationUri = shim.getDataLocation(h), inputFormat = Option(h.getInputFormatClass).map(_.getName), outputFormat = Option(h.getOutputFormatClass).map(_.getName), serde = Option(h.getSerializationLib), diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index c74d948a6fa52..286197b50e229 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -34,7 +34,6 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hive.{HiveInspectors, HiveShim} import org.apache.spark.sql.sources.{Filter, _} @@ -222,7 +221,7 @@ private[orc] class OrcOutputWriter( private lazy val recordWriter: RecordWriter[NullWritable, Writable] = { recordWriterInstantiated = true - val uniqueWriteJobId = conf.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = conf.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val partition = taskAttemptId.getTaskID.getId val bucketString = bucketId.map(BucketingUtils.bucketIdToString).getOrElse("") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index 3892fe87e2a80..571ba49d115f8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -26,9 +26,9 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} -import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.hive.HiveExternalCatalog._ +import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils @@ -49,6 +49,10 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv jsonFilePath = Utils.getSparkClassLoader.getResource("sample.json").getFile } + // To test `HiveExternalCatalog`, we need to read the raw table metadata(schema, partition + // columns and bucket specification are still in table properties) from hive client. 
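+  // For example, for a table created through the data source API, hiveClient.getTable returns the + // schema split across "spark.sql.sources.schema.*" table properties, whereas + // sharedState.externalCatalog.getTable would return it already restored into CatalogTable.schema.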
+ private def hiveClient: HiveClient = sharedState.asInstanceOf[HiveSharedState].metadataHive + test("persistent JSON table") { withTable("jsonTable") { sql( @@ -697,18 +701,18 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv withTable("wide_schema") { withTempDir { tempDir => // We will need 80 splits for this schema if the threshold is 4000. - val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType, true))) - - // Manually create a metastore data source table. - createDataSourceTable( - sparkSession = spark, - tableIdent = TableIdentifier("wide_schema"), + val schema = StructType((1 to 5000).map(i => StructField(s"c_$i", StringType))) + + val tableDesc = CatalogTable( + identifier = TableIdentifier("wide_schema"), + tableType = CatalogTableType.EXTERNAL, + storage = CatalogStorageFormat.empty.copy( + properties = Map("path" -> tempDir.getCanonicalPath) + ), schema = schema, - partitionColumns = Array.empty[String], - bucketSpec = None, - provider = "json", - options = Map("path" -> tempDir.getCanonicalPath), - isExternal = false) + provider = Some("json") + ) + spark.sessionState.catalog.createTable(tableDesc, ignoreIfExists = false) sessionState.refreshTable("wide_schema") @@ -741,7 +745,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv DATASOURCE_SCHEMA -> schema.json, "EXTERNAL" -> "FALSE")) - sharedState.externalCatalog.createTable(hiveTable, ignoreIfExists = false) + hiveClient.createTable(hiveTable, ignoreIfExists = false) sessionState.refreshTable(tableName) val actualSchema = table(tableName).schema @@ -759,7 +763,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv withTable(tableName) { df.write.format("parquet").partitionBy("d", "b").saveAsTable(tableName) sessionState.refreshTable(tableName) - val metastoreTable = sharedState.externalCatalog.getTable("default", tableName) + val metastoreTable = hiveClient.getTable("default", tableName) val expectedPartitionColumns = StructType(df.schema("d") :: df.schema("b") :: Nil) val numPartCols = metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt @@ -794,7 +798,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv .sortBy("c") .saveAsTable(tableName) sessionState.refreshTable(tableName) - val metastoreTable = sharedState.externalCatalog.getTable("default", tableName) + val metastoreTable = hiveClient.getTable("default", tableName) val expectedBucketByColumns = StructType(df.schema("d") :: df.schema("b") :: Nil) val expectedSortByColumns = StructType(df.schema("c") :: Nil) @@ -985,35 +989,37 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv withTempDir { tempPath => val schema = StructType((1 to 5).map(i => StructField(s"c_$i", StringType))) - createDataSourceTable( - sparkSession = spark, - tableIdent = TableIdentifier("not_skip_hive_metadata"), + val tableDesc1 = CatalogTable( + identifier = TableIdentifier("not_skip_hive_metadata"), + tableType = CatalogTableType.EXTERNAL, + storage = CatalogStorageFormat.empty.copy( + properties = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "false") + ), schema = schema, - partitionColumns = Array.empty[String], - bucketSpec = None, - provider = "parquet", - options = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "false"), - isExternal = false) + provider = Some("parquet") + ) + spark.sessionState.catalog.createTable(tableDesc1, ignoreIfExists = false) // As a proxy 
for verifying that the table was stored in Hive compatible format, // we verify that each column of the table is of native type StringType. - assert(sharedState.externalCatalog.getTable("default", "not_skip_hive_metadata").schema + assert(hiveClient.getTable("default", "not_skip_hive_metadata").schema .forall(_.dataType == StringType)) - createDataSourceTable( - sparkSession = spark, - tableIdent = TableIdentifier("skip_hive_metadata"), + val tableDesc2 = CatalogTable( + identifier = TableIdentifier("skip_hive_metadata", Some("default")), + tableType = CatalogTableType.EXTERNAL, + storage = CatalogStorageFormat.empty.copy( + properties = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "true") + ), schema = schema, - partitionColumns = Array.empty[String], - bucketSpec = None, - provider = "parquet", - options = Map("path" -> tempPath.getCanonicalPath, "skipHiveMetadata" -> "true"), - isExternal = false) + provider = Some("parquet") + ) + spark.sessionState.catalog.createTable(tableDesc2, ignoreIfExists = false) // As a proxy for verifying that the table was stored in SparkSQL format, // we verify that the table has a column type as array of StringType. - assert(sharedState.externalCatalog.getTable("default", "skip_hive_metadata") - .schema.forall(_.dataType == ArrayType(StringType))) + assert(hiveClient.getTable("default", "skip_hive_metadata").schema + .forall(_.dataType == ArrayType(StringType))) } } @@ -1030,7 +1036,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv """.stripMargin ) - val metastoreTable = sharedState.externalCatalog.getTable("default", "t") + val metastoreTable = hiveClient.getTable("default", "t") assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt === 1) assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMBUCKETS)) assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMBUCKETCOLS)) @@ -1054,7 +1060,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv """.stripMargin ) - val metastoreTable = sharedState.externalCatalog.getTable("default", "t") + val metastoreTable = hiveClient.getTable("default", "t") assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS)) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1) @@ -1076,7 +1082,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv """.stripMargin ) - val metastoreTable = sharedState.externalCatalog.getTable("default", "t") + val metastoreTable = hiveClient.getTable("default", "t") assert(!metastoreTable.properties.contains(DATASOURCE_SCHEMA_NUMPARTCOLS)) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1) @@ -1101,7 +1107,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv """.stripMargin ) - val metastoreTable = sharedState.externalCatalog.getTable("default", "t") + val metastoreTable = hiveClient.getTable("default", "t") assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMPARTCOLS).toInt === 1) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETS).toInt === 2) assert(metastoreTable.properties(DATASOURCE_SCHEMA_NUMBUCKETCOLS).toInt === 1) @@ -1168,7 +1174,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv ) sql("insert into t values (2, 3, 4)") checkAnswer(table("t"), 
Seq(Row(1, 2, 3), Row(2, 3, 4))) - val catalogTable = sharedState.externalCatalog.getTable("default", "t") + val catalogTable = hiveClient.getTable("default", "t") // there should not be a lowercase key 'path' now assert(catalogTable.storage.properties.get("path").isEmpty) assert(catalogTable.storage.properties.get("PATH").isDefined) @@ -1188,4 +1194,28 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } } } + + test("read table with corrupted schema") { + try { + val schema = StructType(StructField("int", IntegerType, true) :: Nil) + val hiveTable = CatalogTable( + identifier = TableIdentifier("t", Some("default")), + tableType = CatalogTableType.MANAGED, + schema = new StructType, + storage = CatalogStorageFormat.empty, + properties = Map( + DATASOURCE_PROVIDER -> "json", + // no DATASOURCE_SCHEMA_NUMPARTS + DATASOURCE_SCHEMA_PART_PREFIX + 0 -> schema.json)) + + hiveClient.createTable(hiveTable, ignoreIfExists = false) + + val e = intercept[AnalysisException] { + sharedState.externalCatalog.getTable("default", "t") + }.getMessage + assert(e.contains(s"Could not read schema from the hive metastore because it is corrupted")) + } finally { + hiveClient.dropTable("default", "t", ignoreIfNotExists = true, purge = true) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index 5d510197c4d95..76aa84b19410d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -18,21 +18,32 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.NoSuchTableException -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.StructType class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { import testImplicits._ protected override def beforeAll(): Unit = { super.beforeAll() - sql( - """ - |CREATE TABLE parquet_tab1 (c1 INT, c2 STRING) - |USING org.apache.spark.sql.parquet.DefaultSource - """.stripMargin) + + // Use catalog to create table instead of SQL string here, because we don't support specifying + // table properties for data source table with SQL API now. 
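+    // Note that the "my_key1" -> "v1" property set here is what the SHOW TBLPROPERTIES tests + // below assert on.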
+ hiveContext.sessionState.catalog.createTable( + CatalogTable( + identifier = TableIdentifier("parquet_tab1"), + tableType = CatalogTableType.MANAGED, + storage = CatalogStorageFormat.empty, + schema = new StructType().add("c1", "int").add("c2", "string"), + provider = Some("parquet"), + properties = Map("my_key1" -> "v1") + ), + ignoreIfExists = false + ) sql( """ @@ -101,23 +112,14 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto test("show tblproperties of data source tables - basic") { checkAnswer( - sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = '$DATASOURCE_PROVIDER'"), - Row(DATASOURCE_PROVIDER, "org.apache.spark.sql.parquet.DefaultSource") :: Nil + sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = 'my_key1'"), + Row("my_key1", "v1") :: Nil ) checkAnswer( - sql(s"SHOW TBLPROPERTIES parquet_tab1($DATASOURCE_PROVIDER)"), - Row("org.apache.spark.sql.parquet.DefaultSource") :: Nil + sql(s"SHOW TBLPROPERTIES parquet_tab1('my_key1')"), + Row("v1") :: Nil ) - - checkAnswer( - sql("SHOW TBLPROPERTIES parquet_tab1").filter(s"key = '$DATASOURCE_SCHEMA_NUMPARTS'"), - Row(DATASOURCE_SCHEMA_NUMPARTS, "1") :: Nil - ) - - checkAnswer( - sql(s"SHOW TBLPROPERTIES parquet_tab1('$DATASOURCE_SCHEMA_NUMPARTS')"), - Row("1")) } test("show tblproperties for datasource table - errors") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 970b6885f6254..f00a99b6d0b3d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -692,4 +692,27 @@ class HiveDDLSuite )) } } + + test("datasource table property keys are not allowed") { + import org.apache.spark.sql.hive.HiveExternalCatalog.DATASOURCE_PREFIX + + withTable("tbl") { + sql("CREATE TABLE tbl(a INT) STORED AS parquet") + + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE tbl SET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo' = 'loser')") + } + assert(e.getMessage.contains(DATASOURCE_PREFIX + "foo")) + + val e2 = intercept[AnalysisException] { + sql(s"ALTER TABLE tbl UNSET TBLPROPERTIES ('${DATASOURCE_PREFIX}foo')") + } + assert(e2.getMessage.contains(DATASOURCE_PREFIX + "foo")) + + val e3 = intercept[AnalysisException] { + sql(s"CREATE TABLE tbl TBLPROPERTIES ('${DATASOURCE_PREFIX}foo'='anything')") + } + assert(e3.getMessage.contains(DATASOURCE_PREFIX + "foo")) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index e6fe47aa65f34..4ca882f840a58 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, FunctionRegistry} import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.{HiveUtils, MetastoreRelation} @@ -436,8 +435,7 @@ class SQLQuerySuite extends QueryTest with 
SQLTestUtils with TestHiveSingleton { assert(r.options("path") === location) case None => // OK. } - assert( - catalogTable.properties(CreateDataSourceTableUtils.DATASOURCE_PROVIDER) === format) + assert(catalogTable.provider.get === format) case r: MetastoreRelation => if (isDataSourceParquet) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 67a58a3859b84..906de6bbcbee5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.{sources, Row, SparkSession} import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericInternalRow, InterpretedPredicate, InterpretedProjection, JoinedRow, Literal} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.SerializableConfiguration @@ -145,7 +144,7 @@ class AppendingTextOutputFormat(outputFile: Path) extends TextOutputFormat[NullW override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val configuration = context.getConfiguration - val uniqueWriteJobId = configuration.get(DATASOURCE_WRITEJOBUUID) + val uniqueWriteJobId = configuration.get(WriterContainer.DATASOURCE_WRITEJOBUUID) val taskAttemptId = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId val name = FileOutputFormat.getOutputName(context)