From c6177a3efedce84d6fb9b86e8f5f38688ec5da98 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Mon, 17 Oct 2022 21:21:29 -0700 Subject: [PATCH 01/19] review changes nit --- .../sql/protobuf/ProtobufDeserializer.scala | 25 ++++---- .../sql/protobuf/ProtobufSerializer.scala | 20 +++---- .../sql/protobuf/utils/SchemaConverters.scala | 7 +-- .../sql/protobuf/ProtobufSerdeSuite.scala | 6 +- .../main/resources/error/error-classes.json | 25 ++++++++ .../sql/errors/QueryCompilationErrors.scala | 60 +++++++++++++++++++ 6 files changed, 115 insertions(+), 28 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index 0403b741ebfa7..f31f9b058d2ca 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -22,6 +22,7 @@ import com.google.protobuf.{ByteString, DynamicMessage, Message} import com.google.protobuf.Descriptors._ import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, StructFilters} import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} @@ -29,7 +30,6 @@ import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.protobuf.utils.ProtobufUtils import org.apache.spark.sql.protobuf.utils.ProtobufUtils.ProtoMatchedField import org.apache.spark.sql.protobuf.utils.ProtobufUtils.toFieldStr -import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -61,11 +61,11 @@ private[sql] class ProtobufDeserializer( } } } catch { - case ise: IncompatibleSchemaException => - throw new IncompatibleSchemaException( - s"Cannot convert Protobuf type ${rootDescriptor.getName} " + - s"to SQL type ${rootCatalystType.sql}.", - ise) + case ise: AnalysisException => + throw QueryCompilationErrors.cannotConvertProtobufTypeToTypeError( + rootDescriptor.getName, + rootCatalystType.sql, + ise.getMessage()) } def deserialize(data: Message): Option[InternalRow] = converter(data) @@ -152,11 +152,6 @@ private[sql] class ProtobufDeserializer( catalystType: DataType, protoPath: Seq[String], catalystPath: Seq[String]): (CatalystDataUpdater, Int, Any) => Unit = { - val errorPrefix = s"Cannot convert Protobuf ${toFieldStr(protoPath)} to " + - s"SQL ${toFieldStr(catalystPath)} because " - val incompatibleMsg = errorPrefix + - s"schema is incompatible (protoType = ${protoType} ${protoType.toProto.getLabel} " + - s"${protoType.getJavaType} ${protoType.getType}, sqlType = ${catalystType.sql})" (protoType.getJavaType, catalystType) match { @@ -244,7 +239,13 @@ private[sql] class ProtobufDeserializer( case (ENUM, StringType) => (updater, ordinal, value) => updater.set(ordinal, UTF8String.fromString(value.toString)) - case _ => throw new IncompatibleSchemaException(incompatibleMsg) + case _ => + throw QueryCompilationErrors.cannotConvertProtobufTypeToDataTypeError( + toFieldStr(protoPath), + toFieldStr(catalystPath), + s"${protoType} ${protoType.toProto.getLabel} ${protoType.getJavaType}" + + s" ${protoType.getType}", + catalystType.sql) } } diff 
--git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala index 5d9af92c5c077..ad52e9f784e2d 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecializedGetters import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.protobuf.utils.ProtobufUtils import org.apache.spark.sql.protobuf.utils.ProtobufUtils.{toFieldStr, ProtoMatchedField} import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException @@ -77,8 +78,6 @@ private[sql] class ProtobufSerializer( fieldDescriptor: FieldDescriptor, catalystPath: Seq[String], protoPath: Seq[String]): Converter = { - val errorPrefix = s"Cannot convert SQL ${toFieldStr(catalystPath)} " + - s"to Protobuf ${toFieldStr(protoPath)} because " (catalystType, fieldDescriptor.getJavaType) match { case (NullType, _) => (getter, ordinal) => null @@ -104,10 +103,9 @@ private[sql] class ProtobufSerializer( (getter, ordinal) => val data = getter.getUTF8String(ordinal).toString if (!enumSymbols.contains(data)) { - throw new IncompatibleSchemaException( - errorPrefix + - s""""$data" cannot be written since it's not defined in enum """ + - enumSymbols.mkString("\"", "\", \"", "\"")) + throw QueryCompilationErrors.cannotConvertDataTypeToProtobufEnumTypeError( + toFieldStr(catalystPath), toFieldStr(protoPath), data, + enumSymbols.mkString("\"", "\", \"", "\"")) } fieldDescriptor.getEnumType.findValueByName(data) case (StringType, STRING) => @@ -215,10 +213,12 @@ private[sql] class ProtobufSerializer( duration.build() case _ => - throw new IncompatibleSchemaException( - errorPrefix + - s"schema is incompatible (sqlType = ${catalystType.sql}, " + - s"protoType = ${fieldDescriptor.getJavaType})") + throw QueryCompilationErrors.cannotConvertDataTypeToProtobufTypeError( + toFieldStr(catalystPath), + toFieldStr(protoPath), + catalystType.sql, + s"${fieldDescriptor} ${fieldDescriptor.toProto.getLabel} ${fieldDescriptor.getJavaType}" + + s" ${fieldDescriptor.getType}") } } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index e385b816abe70..cb415b57431c9 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -21,6 +21,7 @@ import scala.collection.JavaConverters._ import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.protobuf.ScalaReflectionLock import org.apache.spark.sql.types._ @@ -96,10 +97,8 @@ object SchemaConverters { .toSeq) .filter(_.nonEmpty) .map(StructType.apply) - case _ => - throw new IncompatibleSchemaException( - s"Cannot convert Protobuf type" + - s" ${fd.getJavaType}") + case other => + throw 
QueryCompilationErrors.protobufTypeUnsupportedYetError(other.toString) } dataType.map(dt => StructField( diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index 37c59743e7714..142bab18ce6fa 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.protobuf import com.google.protobuf.Descriptors.Descriptor import com.google.protobuf.DynamicMessage +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.NoopFilters import org.apache.spark.sql.protobuf.utils.ProtobufUtils -import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StructType} @@ -160,7 +160,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { fieldMatchType: MatchType, expectedCauseMessage: String, catalystSchema: StructType = CATALYST_STRUCT): Unit = { - val e = intercept[IncompatibleSchemaException] { + val e = intercept[AnalysisException] { serdeFactory.create(catalystSchema, protoSchema, fieldMatchType) } val expectMsg = serdeFactory match { @@ -170,6 +170,8 @@ class ProtobufSerdeSuite extends SharedSparkSession { s"Cannot convert SQL type ${catalystSchema.sql} to Protobuf type ${protoSchema.getName}." } + println(s"${e.getMessage}, : ${expectMsg}") + println(s"${e.getCause.getMessage}, : ${expectedCauseMessage}") assert(e.getMessage === expectMsg) assert(e.getCause.getMessage === expectedCauseMessage) } diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 784b8c04d8972..e190861d54727 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -4207,5 +4207,30 @@ "message" : [ "Not enough memory to build and broadcast the table to all worker nodes. As a workaround, you can either disable broadcast by setting to -1 or increase the spark driver memory by setting to a higher value" ] + }, + "_LEGACY_ERROR_TEMP_2251" : { + "message" : [ + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = ." + ] + }, + "_LEGACY_ERROR_TEMP_2252" : { + "message" : [ + "Cannot convert SQL to SQL because schema is incompatible (protobufType = , sqlType = ." + ] + }, + "_LEGACY_ERROR_TEMP_2253" : { + "message" : [ + "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " + ] + }, + "_LEGACY_ERROR_TEMP_2254" : { + "message" : [ + "Unable to convert of Protobuf to data type . " + ] + }, + "_LEGACY_ERROR_TEMP_2255" : { + "message" : [ + "Protobuf type not yet supported: ." 
+ ] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 0c6aeedfc4acb..978130327ad19 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -1751,6 +1751,66 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("field" -> field)) } + def cannotConvertProtobufTypeToDataTypeError( + protobufColumn: String, + sqlColumn: String, + protobufType: String, + sqlType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2251", + messageParameters = Map( + "protobufColumn" -> protobufColumn, + "sqlColumn" -> sqlColumn, + "protobufType" -> protobufType, + "sqlType" -> sqlType)) + } + + def cannotConvertDataTypeToProtobufTypeError( + sqlColumn: String, + protobufColumn: String, + sqlType: String, + protobufType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2252", + messageParameters = Map( + "sqlColumn" -> sqlColumn, + "protobufColumn" -> protobufColumn, + "sqlType" -> sqlType, + "protobufType" -> protobufType)) + } + + def cannotConvertDataTypeToProtobufEnumTypeError( + sqlColumn: String, + protobufColumn: String, + data: String, + enumString: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2253", + messageParameters = Map( + "sqlColumn" -> sqlColumn, + "protobufColumn" -> protobufColumn, + "data" -> data, + "enumString" -> enumString)) + } + + def cannotConvertProtobufTypeToTypeError( + protobufType: String, + sqlType: String, + errorMessage: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2254", + messageParameters = Map( + "protobufType" -> protobufType, + "toType" -> sqlType, + "errorMessage" -> errorMessage)) + } + + def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2255", + messageParameters = Map("protobufType" -> protobufType)) + } + def cannotConvertDataTypeToParquetTypeError(field: StructField): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_1175", From 8fad0180ba856fb883d87ce98c63c818e5dfd62f Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Thu, 20 Oct 2022 21:46:58 -0700 Subject: [PATCH 02/19] adding spark-protobuf error class into spark error-class framework --- .../sql/protobuf/ProtobufDeserializer.scala | 4 +- .../sql/protobuf/ProtobufSerializer.scala | 10 +- .../sql/protobuf/utils/ProtobufUtils.scala | 44 ++++---- .../sql/protobuf/utils/SchemaConverters.scala | 4 +- .../sql/protobuf/ProtobufFunctionsSuite.scala | 1 - .../sql/protobuf/ProtobufSerdeSuite.scala | 5 +- .../main/resources/error/error-classes.json | 61 ++++++++++- .../sql/errors/QueryCompilationErrors.scala | 102 ++++++++++++++++-- 8 files changed, 184 insertions(+), 47 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index f31f9b058d2ca..effa21b55fdd9 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -62,10 +62,10 @@ private[sql] class 
ProtobufDeserializer( } } catch { case ise: AnalysisException => - throw QueryCompilationErrors.cannotConvertProtobufTypeToTypeError( + throw QueryCompilationErrors.cannotConvertProtobufTypeToSqlTypeError( rootDescriptor.getName, rootCatalystType.sql, - ise.getMessage()) + ise) } def deserialize(data: Message): Option[InternalRow] = converter(data) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala index ad52e9f784e2d..bc9aa18062053 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -23,6 +23,7 @@ import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.SpecializedGetters import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} @@ -30,7 +31,6 @@ import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.protobuf.utils.ProtobufUtils import org.apache.spark.sql.protobuf.utils.ProtobufUtils.{toFieldStr, ProtoMatchedField} -import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException import org.apache.spark.sql.types._ /** @@ -54,10 +54,10 @@ private[sql] class ProtobufSerializer( newStructConverter(st, rootDescriptor, Nil, Nil).asInstanceOf[Any => Any] } } catch { - case ise: IncompatibleSchemaException => - throw new IncompatibleSchemaException( - s"Cannot convert SQL type ${rootCatalystType.sql} to Protobuf type " + - s"${rootDescriptor.getName}.", + case ise: AnalysisException => + throw QueryCompilationErrors.cannotConvertSqlTypeToProtobufError( + rootDescriptor.getName, + rootCatalystType.sql, ise) } if (nullable) { (data: Any) => diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 5ad043142a2d2..6f6a4c4c5bdff 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -26,8 +26,8 @@ import com.google.protobuf.{DescriptorProtos, Descriptors, InvalidProtocolBuffer import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} import org.apache.spark.internal.Logging +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException import org.apache.spark.sql.types._ private[sql] object ProtobufUtils extends Logging { @@ -60,9 +60,9 @@ private[sql] object ProtobufUtils extends Logging { protoPath: Seq[String], catalystPath: Seq[String]) { if (descriptor.getName == null) { - throw new IncompatibleSchemaException( - s"Attempting to treat ${descriptor.getName} as a RECORD, " + - s"but it was: ${descriptor.getContainingType}") + throw QueryCompilationErrors.unknownProtobufMessageTypeError( + descriptor.getName, + descriptor.getContainingType) } private[this] val protoFieldArray = 
descriptor.getFields.asScala.toArray @@ -78,7 +78,7 @@ private[sql] object ProtobufUtils extends Logging { /** * Validate that there are no Catalyst fields which don't have a matching Protobuf field, - * throwing [[IncompatibleSchemaException]] if such extra fields are found. If + * throwing [[AnalysisException]] if such extra fields are found. If * `ignoreNullable` is false, consider nullable Catalyst fields to be eligible to be an extra * field; otherwise, ignore nullable Catalyst fields when checking for extras. */ @@ -86,22 +86,21 @@ private[sql] object ProtobufUtils extends Logging { catalystSchema.fields.foreach { sqlField => if (getFieldByName(sqlField.name).isEmpty && (!ignoreNullable || !sqlField.nullable)) { - throw new IncompatibleSchemaException( - s"Cannot find ${toFieldStr(catalystPath :+ sqlField.name)} in Protobuf schema") + throw QueryCompilationErrors.cannotFindCatalystTypeInProtobufSchemaError( + toFieldStr(catalystPath :+ sqlField.name)) } } /** * Validate that there are no Protobuf fields which don't have a matching Catalyst field, - * throwing [[IncompatibleSchemaException]] if such extra fields are found. Only required + * throwing [[AnalysisException]] if such extra fields are found. Only required * (non-nullable) fields are checked; nullable fields are ignored. */ def validateNoExtraRequiredProtoFields(): Unit = { val extraFields = protoFieldArray.toSet -- matchedFields.map(_.fieldDescriptor) extraFields.filterNot(isNullable).foreach { extraField => - throw new IncompatibleSchemaException( - s"Found ${toFieldStr(protoPath :+ extraField.getName())} in Protobuf schema " + - "but there is no match in the SQL schema") + throw QueryCompilationErrors.cannotFindProtobufFieldInInCatalystError( + toFieldStr(protoPath :+ extraField.getName())) } } @@ -124,10 +123,11 @@ private[sql] object ProtobufUtils extends Logging { case Seq(protoField) => Some(protoField) case Seq() => None case matches => - throw new IncompatibleSchemaException( - s"Searching for '$name' in " + - s"Protobuf schema at ${toFieldStr(protoPath)} gave ${matches.size} matches. 
" + - s"Candidates: " + matches.map(_.getName()).mkString("[", ", ", "]")) + throw QueryCompilationErrors.protobufFieldMatchError( + name, + toFieldStr(protoPath), + matches.size, + matches.map(_.getName()).mkString("[", ", ", "]")) } } } @@ -143,7 +143,7 @@ private[sql] object ProtobufUtils extends Logging { } if (null == result) { - throw new RuntimeException("Unable to locate Message '" + messageName + "' in Descriptor"); + throw QueryCompilationErrors.unableToLocateProtobuMessageError(messageName) } result } @@ -155,13 +155,9 @@ private[sql] object ProtobufUtils extends Logging { fileDescriptorSet = DescriptorProtos.FileDescriptorSet.parseFrom(dscFile) } catch { case ex: InvalidProtocolBufferException => - // TODO move all the exceptions to core/src/main/resources/error/error-classes.json - throw new RuntimeException("Error parsing descriptor byte[] into Descriptor object", ex) + throw QueryCompilationErrors.descrioptorParseError(ex) case ex: IOException => - throw new RuntimeException( - "Error reading Protobuf descriptor file at path: " + - descFilePath, - ex) + throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } val descriptorProto: DescriptorProtos.FileDescriptorProto = fileDescriptorSet.getFile(0) @@ -170,12 +166,12 @@ private[sql] object ProtobufUtils extends Logging { descriptorProto, new Array[Descriptors.FileDescriptor](0)) if (fileDescriptor.getMessageTypes().isEmpty()) { - throw new RuntimeException("No MessageTypes returned, " + fileDescriptor.getName()); + throw QueryCompilationErrors.noMessageTypeReturnError(fileDescriptor.getName()) } fileDescriptor } catch { case e: Descriptors.DescriptorValidationException => - throw new RuntimeException("Error constructing FileDescriptor", e) + throw QueryCompilationErrors.failedParsingDescriptorError(e) } } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index cb415b57431c9..cf91227889ac9 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -85,9 +85,7 @@ object SchemaConverters { nullable = false)) case MESSAGE => if (existingRecordNames.contains(fd.getFullName)) { - throw new IncompatibleSchemaException(s""" - |Found recursive reference in Protobuf schema, which can not be processed by Spark: - |${fd.toString()}""".stripMargin) + throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString()) } val newRecordNames = existingRecordNames + fd.getFullName diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala index 4e9bc1c1c287a..1f8022a6f35f2 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala @@ -561,7 +561,6 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri val fromProtoDf = toProtoDf .select(functions.from_protobuf($"to_proto", testFileDesc, "timeStampMsg") as 'timeStampMsg) - fromProtoDf.show(truncate = false) val actualFields = fromProtoDf.schema.fields.toList val expectedFields = inputDf.schema.fields.toList diff --git 
a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index 142bab18ce6fa..9bd775028cdd7 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -165,9 +165,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { } val expectMsg = serdeFactory match { case Deserializer => - s"Cannot convert Protobuf type ${protoSchema.getName} to SQL type ${catalystSchema.sql}." + s"Unable to convert ${protoSchema.getName} of Protobuf to SQL type ${catalystSchema.sql}." case Serializer => - s"Cannot convert SQL type ${catalystSchema.sql} to Protobuf type ${protoSchema.getName}." + s"Unable to convert SQL type ${catalystSchema.sql} to Protobuf type" + + s" ${protoSchema.getName}." } println(s"${e.getMessage}, : ${expectMsg}") diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index e190861d54727..75972a3eb1ac5 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -4210,12 +4210,12 @@ }, "_LEGACY_ERROR_TEMP_2251" : { "message" : [ - "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = ." + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = ." ] }, "_LEGACY_ERROR_TEMP_2252" : { "message" : [ - "Cannot convert SQL to SQL because schema is incompatible (protobufType = , sqlType = ." + "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = ." ] }, "_LEGACY_ERROR_TEMP_2253" : { @@ -4225,12 +4225,67 @@ }, "_LEGACY_ERROR_TEMP_2254" : { "message" : [ - "Unable to convert of Protobuf to data type . " + "Unable to convert of Protobuf to SQL type ." ] }, "_LEGACY_ERROR_TEMP_2255" : { + "message" : [ + "Unable to convert SQL type to Protobuf type ." + ] + }, + "_LEGACY_ERROR_TEMP_2256" : { "message" : [ "Protobuf type not yet supported: ." ] + }, + "_LEGACY_ERROR_TEMP_2257" : { + "message" : [ + "Attempting to treat as a Message, but it was " + ] + }, + "_LEGACY_ERROR_TEMP_2258" : { + "message" : [ + "Cannot find in Protobuf schema" + ] + }, + "_LEGACY_ERROR_TEMP_2259" : { + "message" : [ + "Found in Protobuf schema but there is no match in the SQL schema" + ] + }, + "_LEGACY_ERROR_TEMP_2260" : { + "message" : [ + "Searching for in Protobuf schema at gave matches. 
Candidates: " + ] + }, + "_LEGACY_ERROR_TEMP_2261" : { + "message" : [ + "Unable to locate Message in Descriptor" + ] + }, + "_LEGACY_ERROR_TEMP_2262" : { + "message" : [ + "Error parsing descriptor byte[] into Descriptor object Error: " + ] + }, + "_LEGACY_ERROR_TEMP_2263" : { + "message" : [ + "Error reading Protobuf descriptor file at path: " + ] + }, + "_LEGACY_ERROR_TEMP_2264" : { + "message" : [ + "No MessageTypes returned, " + ] + }, + "_LEGACY_ERROR_TEMP_2265" : { + "message" : [ + "Error constructing FileDescriptor, Error: " + ] + }, + "_LEGACY_ERROR_TEMP_2266" : { + "message" : [ + "Found recursive reference in Protobuf schema, which can not be processed by Spark: " + ] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 978130327ad19..9bdcca533cf02 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -1793,22 +1793,110 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { "enumString" -> enumString)) } - def cannotConvertProtobufTypeToTypeError( + def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2256", + messageParameters = Map("protobufType" -> protobufType)) + } + + def cannotConvertProtobufTypeToSqlTypeError( protobufType: String, sqlType: String, - errorMessage: String): Throwable = { + e1: Throwable): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_2254", messageParameters = Map( "protobufType" -> protobufType, - "toType" -> sqlType, - "errorMessage" -> errorMessage)) + "toType" -> sqlType), + cause = Some(e1.getCause)) } - def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { + def cannotConvertSqlTypeToProtobufError( + protobufType: String, + sqlType: String, + e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2255", - messageParameters = Map("protobufType" -> protobufType)) + errorClass = "_LEGACY_ERROR_TEMP_2256", + messageParameters = Map( + "protobufType" -> protobufType, + "toType" -> sqlType), + cause = Some(e1.getCause)) + } + + def unknownProtobufMessageTypeError( + descriptorName: String, + containingType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2257", + messageParameters = Map( + "descriptorName" -> descriptorName, + "containingType" -> containingType)) + } + + def cannotFindCatalystTypeInProtobufSchemaError(catalystFieldPath: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2258", + messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) + } + + def cannotFindProtobufFieldInInCatalystError(field: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2259", + messageParameters = Map("field" -> field)) + } + + def protobufFieldMatchError( + field: String, + protobufSchema: String, + matchSize: Int, + matches: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2260", + messageParameters = Map( + "field" -> field, + "protobufSchema" -> protobufSchema, + "matchSize" -> matchSize, + "matches" -> matches)) + } + + def unableToLocateProtobuMessageError(messageName: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2261", + 
messageParameters = Map("messageName" -> messageName)) + } + + def descrioptorParseError(e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2262", + messageParameters = Map("errorMessage" -> e1.getMessage()), + cause = Some(e1.getCause)) + } + + def cannotFindDescriptorFileError(filePath: String, e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2263", + messageParameters = Map("filePath" -> filePath), + cause = Some(e1.getCause)) + } + + def noMessageTypeReturnError(descriptorName: String, e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2264", + messageParameters = Map("descriptorName" -> descriptorName), + cause = Some(e1.getCause)) + } + + def failedParsingDescriptorError(e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2265", + messageParameters = Map("errorMessage" -> e1.getMessage()), + cause = Some(e1.getCause)) + } + + def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2266", + messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) } def cannotConvertDataTypeToParquetTypeError(field: StructField): Throwable = { From dac2fa8e202628d34ee4e0066fdf13060e210d4e Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 21 Oct 2022 00:20:46 -0700 Subject: [PATCH 03/19] adding spark-protobuf error class into spark error-class framework --- .../sql/protobuf/utils/ProtobufUtils.scala | 4 ++-- .../sql/protobuf/utils/SchemaConverters.scala | 3 --- .../sql/protobuf/ProtobufFunctionsSuite.scala | 17 +++++++---------- .../spark/sql/protobuf/ProtobufSerdeSuite.scala | 6 +++--- .../sql/errors/QueryCompilationErrors.scala | 9 ++++----- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 6f6a4c4c5bdff..b0985a7fd25af 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -62,7 +62,7 @@ private[sql] object ProtobufUtils extends Logging { if (descriptor.getName == null) { throw QueryCompilationErrors.unknownProtobufMessageTypeError( descriptor.getName, - descriptor.getContainingType) + descriptor.getContainingType().getName) } private[this] val protoFieldArray = descriptor.getFields.asScala.toArray @@ -126,7 +126,7 @@ private[sql] object ProtobufUtils extends Logging { throw QueryCompilationErrors.protobufFieldMatchError( name, toFieldStr(protoPath), - matches.size, + s"${matches.size}", matches.map(_.getName()).mkString("[", ", ", "]")) } } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index cf91227889ac9..9164727236f4b 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -104,7 +104,4 @@ object SchemaConverters { if (fd.isRepeated) ArrayType(dt, containsNull = false) else dt, nullable = !fd.isRequired && !fd.isRepeated)) } - - private[protobuf] class IncompatibleSchemaException(msg: String, ex: Throwable = 
null) - extends Exception(msg, ex) } diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala index 1f8022a6f35f2..7d0a8f33ef535 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala @@ -24,9 +24,9 @@ import scala.collection.JavaConverters._ import com.google.protobuf.{ByteString, DynamicMessage} import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.functions.{lit, struct} import org.apache.spark.sql.protobuf.utils.ProtobufUtils -import org.apache.spark.sql.protobuf.utils.SchemaConverters.IncompatibleSchemaException import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{DayTimeIntervalType, IntegerType, StringType, StructField, StructType, TimestampType} @@ -352,14 +352,13 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri val df = Seq(messageB.toByteArray).toDF("messageB") - val e = intercept[IncompatibleSchemaException] { + val e = intercept[AnalysisException] { df.select( functions.from_protobuf($"messageB", testFileDesc, "recursiveB").as("messageFromProto")) .show() } - val expectedMessage = s""" - |Found recursive reference in Protobuf schema, which can not be processed by Spark: - |org.apache.spark.sql.protobuf.recursiveB.messageA""".stripMargin + val expectedMessage = "Found recursive reference in Protobuf schema, which can not be" + + " processed by Spark: org.apache.spark.sql.protobuf.recursiveB.messageA" assert(e.getMessage == expectedMessage) } @@ -386,15 +385,13 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri val df = Seq(messageD.toByteArray).toDF("messageD") - val e = intercept[IncompatibleSchemaException] { + val e = intercept[AnalysisException] { df.select( functions.from_protobuf($"messageD", testFileDesc, "recursiveD").as("messageFromProto")) .show() } - val expectedMessage = - s""" - |Found recursive reference in Protobuf schema, which can not be processed by Spark: - |org.apache.spark.sql.protobuf.recursiveD.messageC""".stripMargin + val expectedMessage = "Found recursive reference in Protobuf schema, which can not be" + + " processed by Spark: org.apache.spark.sql.protobuf.recursiveD.messageC" assert(e.getMessage == expectedMessage) } diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index 9bd775028cdd7..7b46fb75a9983 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -171,10 +171,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { s" ${protoSchema.getName}." 
} - println(s"${e.getMessage}, : ${expectMsg}") - println(s"${e.getCause.getMessage}, : ${expectedCauseMessage}") assert(e.getMessage === expectMsg) - assert(e.getCause.getMessage === expectedCauseMessage) + if (e.getCause != null) { + assert(e.getCause.getMessage === expectedCauseMessage) + } } def withFieldMatchType(f: MatchType => Unit): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 9bdcca533cf02..fc20d65ac12de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -1816,7 +1816,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { sqlType: String, e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2256", + errorClass = "_LEGACY_ERROR_TEMP_2255", messageParameters = Map( "protobufType" -> protobufType, "toType" -> sqlType), @@ -1848,7 +1848,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def protobufFieldMatchError( field: String, protobufSchema: String, - matchSize: Int, + matchSize: String, matches: String): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_2260", @@ -1879,11 +1879,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { cause = Some(e1.getCause)) } - def noMessageTypeReturnError(descriptorName: String, e1: Throwable): Throwable = { + def noMessageTypeReturnError(descriptorName: String): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_2264", - messageParameters = Map("descriptorName" -> descriptorName), - cause = Some(e1.getCause)) + messageParameters = Map("descriptorName" -> descriptorName)) } def failedParsingDescriptorError(e1: Throwable): Throwable = { From d1c9b1ec47eba93ee3b64919fefeb24002ec1987 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 21 Oct 2022 09:23:17 -0700 Subject: [PATCH 04/19] adding spark-protobuf error class into spark error-class framework, import support --- .../apache/spark/sql/protobuf/utils/ProtobufUtils.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index b0985a7fd25af..474bfe28678f1 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -160,7 +160,14 @@ private[sql] object ProtobufUtils extends Logging { throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } - val descriptorProto: DescriptorProtos.FileDescriptorProto = fileDescriptorSet.getFile(0) + val fileDescriptorSetBuilder = fileDescriptorSet.getFile(0).toBuilder + for (fd <- fileDescriptorSet.getFileList.asScala) { + if (fileDescriptorSetBuilder.getName != fd.getName ) { + fileDescriptorSetBuilder.mergeFrom(fd) + } + } + + val descriptorProto: DescriptorProtos.FileDescriptorProto = fileDescriptorSetBuilder.build() try { val fileDescriptor: Descriptors.FileDescriptor = Descriptors.FileDescriptor.buildFrom( descriptorProto, From 60c2122528c784a5f7f80a3304e73c8d25176bc1 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 21 Oct 2022 13:44:38 -0700 Subject: [PATCH 
05/19] adding spark-protobuf error class into spark error-class framework, import support --- .../sql/protobuf/utils/ProtobufUtils.scala | 14 +- .../sql/protobuf/utils/SchemaConverters.scala | 16 +- .../resources/protobuf/functions_suite.desc | Bin 5958 -> 6569 bytes .../resources/protobuf/functions_suite.proto | 22 +- .../sql/errors/QueryCompilationErrors.scala | 294 +++++++++--------- 5 files changed, 171 insertions(+), 175 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index ab164c5dd82d0..bb8fdbe622e8e 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -200,18 +200,20 @@ private[sql] object ProtobufUtils extends Logging { throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } - val fileDescriptorSetBuilder = fileDescriptorSet.getFile(0).toBuilder + val descriptorProto: DescriptorProtos.FileDescriptorProto = + fileDescriptorSet.getFile(fileDescriptorSet.getFileList.size() - 1) + + var fileDescriptorList = List[Descriptors.FileDescriptor]() for (fd <- fileDescriptorSet.getFileList.asScala) { - if (fileDescriptorSetBuilder.getName != fd.getName ) { - fileDescriptorSetBuilder.mergeFrom(fd) + if (descriptorProto.getName != fd.getName) { + fileDescriptorList = fileDescriptorList ++ + List(Descriptors.FileDescriptor.buildFrom(fd, new Array[Descriptors.FileDescriptor](0))) } } - - val descriptorProto: DescriptorProtos.FileDescriptorProto = fileDescriptorSetBuilder.build() try { val fileDescriptor: Descriptors.FileDescriptor = Descriptors.FileDescriptor.buildFrom( descriptorProto, - new Array[Descriptors.FileDescriptor](0)) + fileDescriptorList.toArray) if (fileDescriptor.getMessageTypes().isEmpty()) { throw QueryCompilationErrors.noMessageTypeReturnError(fileDescriptor.getName()) } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index fe790b9a2e339..286cb36fb7096 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -63,14 +63,16 @@ object SchemaConverters { case STRING => Some(StringType) case BYTE_STRING => Some(BinaryType) case ENUM => Some(StringType) - case MESSAGE if fd.getMessageType.getName == "Duration" => + case MESSAGE if fd.getMessageType.getName == "Duration" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos") => Some(DayTimeIntervalType.defaultConcreteType) - case MESSAGE if fd.getMessageType.getName == "Timestamp" => - Some(TimestampType) - // FIXME: Is the above accurate? Users can have protos named "Timestamp" but are not - // expected to be TimestampType in Spark. How about verifying fields? - // Same for "Duration". Only the Timestamp & Duration protos defined in - // google.protobuf package should default to corresponding Catalylist types. 
+ case MESSAGE if fd.getMessageType.getName == "Timestamp" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos") => + Some(TimestampType) case MESSAGE if fd.isRepeated && fd.getMessageType.getOptions.hasMapEntry => var keyType: DataType = NullType var valueType: DataType = NullType diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.desc b/connector/protobuf/src/test/resources/protobuf/functions_suite.desc index 6e3a3967277290f96900c247c0b25f777bb53a42..86cff961434be6a6e16fc267fd28ca342f6af626 100644 GIT binary patch delta 1687 zcma)-T~E_c7{_ziVB>Tz(7iI_U`T?|q=Rl45+u>NIh0okN}?CoK^euaZQ72PD?$uc z+^FXhXpB)36AU-rn-D*QH^xNy2pTW+^sF6cTL@hC^!I=M&;Oj~w8z|s%n1AkU~gGf z%d(i(bk$T#)z!2qRYb!SDjG+Y*kBE)Jz)TyM8oa0^krK<=A z(6B#QB+-n-c}J8uC-|qeybJs~`sf-3F$wx~1)s0DvLq_iN*wmNt_JwK=mFd7so8Pp z&=olvYL~$Ebr_r%HBrFyv}hPYS!5TXza$va@)8~rYatGB0fy(T0t0q|oCVl|c^|>; zr~c!}E6-Su1_OqvOGeg7rug$h3lo-YEJ3lO&eaCq?@R-4rW? zrkE>W18GbTXo&Eh8O3}1ooU+Jn2#I{*|K`FU~95nA0JzXzLKiSb*zhEMS7!7d_&eo zm9|aMsB_&d<&I|j58dD^dfPo7S%R=OhGd9ceE<*fExKly9Sko+h8}gdMWGkolrv!} zdyOqXI76;O0pA9eg@GA#z&-(+s3XL6O~7{5Tod(p8lS%zk&4h3cd*hLM$(*PVMI2-92$(Kq8xM^ oW<|5hC+uGWlB0>hBgi;k!)c?8hPy+F5$JW6Y(>3yolhx;KBQn*BD-5xAuKM-_Q3)zHh}Z^(6dU$H~N1u!5g3 zE7SK0B?JbbJ!|$LfFOx!u7^o@J98@WyMS+JcFG3iTyJcQhCNGlbk$RZBmrdte#?l$ z0A`JmCpKdsMF5Ea5^RJDf1XHFiVsjEMdpbU6mgajPnwV)n6^rnSs+M|&?fa%@1W!>Ue-}R}l|h}D-;~a54j~*P%^$nQ zQzYf{F!dNclY$EbwM>`%&hNp+=rHA|;98h{El!iWF;Jy-a-@?folNPZ<8@N3ljd}t zNY|ls9ZIJhuTx^3GN)4_okHmpO4kW=;R%vqu_^d3TkUN1K4d+^LbtqN&-h}w#J6F} u^(-fb-YooEOp?b?vGW^ut!nKIr1qc<;D>TFgaoX diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.proto b/connector/protobuf/src/test/resources/protobuf/functions_suite.proto index 60f8c26214153..2ff6a5c1b90e9 100644 --- a/connector/protobuf/src/test/resources/protobuf/functions_suite.proto +++ b/connector/protobuf/src/test/resources/protobuf/functions_suite.proto @@ -16,12 +16,15 @@ */ // To compile and create test class: // protoc --java_out=connector/protobuf/src/test/resources/protobuf/ connector/protobuf/src/test/resources/protobuf/functions_suite.proto -// protoc --descriptor_set_out=connector/protobuf/src/test/resources/protobuf/functions_suite.desc --java_out=connector/protobuf/src/test/resources/protobuf/org/apache/spark/sql/protobuf/ connector/protobuf/src/test/resources/protobuf/functions_suite.proto +// protoc --include_imports --descriptor_set_out=connector/protobuf/src/test/resources/protobuf/functions_suite.desc --java_out=connector/protobuf/src/test/resources/protobuf/org/apache/spark/sql/protobuf/ connector/protobuf/src/test/resources/protobuf/functions_suite.proto syntax = "proto3"; package org.apache.spark.sql.protobuf.protos; +import "google/protobuf/timestamp.proto"; +import "google/protobuf/duration.proto"; + option java_outer_classname = "SimpleMessageProtos"; message SimpleMessageJavaTypes { @@ -119,7 +122,7 @@ message SimpleMessageEnum { string key = 1; string value = 2; enum NestedEnum { - ESTED_NOTHING = 0; // TODO: Fix the name. 
+ NESTED_NOTHING = 0; NESTED_FIRST = 1; NESTED_SECOND = 2; } @@ -168,23 +171,12 @@ message requiredMsg { int32 col_3 = 4; } -// https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/timestamp.proto -message Timestamp { - int64 seconds = 1; - int32 nanos = 2; -} - message timeStampMsg { string key = 1; - Timestamp stmp = 2; -} -// https://github.com/protocolbuffers/protobuf/blob/main/src/google/protobuf/duration.proto -message Duration { - int64 seconds = 1; - int32 nanos = 2; + google.protobuf.Timestamp stmp = 2; } message durationMsg { string key = 1; - Duration duration = 2; + google.protobuf.Duration duration = 2; } \ No newline at end of file diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index b8f5099711065..10132a99fc270 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -1741,153 +1741,6 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("field" -> field)) } - def cannotConvertProtobufTypeToDataTypeError( - protobufColumn: String, - sqlColumn: String, - protobufType: String, - sqlType: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2251", - messageParameters = Map( - "protobufColumn" -> protobufColumn, - "sqlColumn" -> sqlColumn, - "protobufType" -> protobufType, - "sqlType" -> sqlType)) - } - - def cannotConvertDataTypeToProtobufTypeError( - sqlColumn: String, - protobufColumn: String, - sqlType: String, - protobufType: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2252", - messageParameters = Map( - "sqlColumn" -> sqlColumn, - "protobufColumn" -> protobufColumn, - "sqlType" -> sqlType, - "protobufType" -> protobufType)) - } - - def cannotConvertDataTypeToProtobufEnumTypeError( - sqlColumn: String, - protobufColumn: String, - data: String, - enumString: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2253", - messageParameters = Map( - "sqlColumn" -> sqlColumn, - "protobufColumn" -> protobufColumn, - "data" -> data, - "enumString" -> enumString)) - } - - def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2256", - messageParameters = Map("protobufType" -> protobufType)) - } - - def cannotConvertProtobufTypeToSqlTypeError( - protobufType: String, - sqlType: String, - e1: Throwable): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2254", - messageParameters = Map( - "protobufType" -> protobufType, - "toType" -> sqlType), - cause = Some(e1.getCause)) - } - - def cannotConvertSqlTypeToProtobufError( - protobufType: String, - sqlType: String, - e1: Throwable): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2255", - messageParameters = Map( - "protobufType" -> protobufType, - "toType" -> sqlType), - cause = Some(e1.getCause)) - } - - def unknownProtobufMessageTypeError( - descriptorName: String, - containingType: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2257", - messageParameters = Map( - "descriptorName" -> descriptorName, - "containingType" -> containingType)) - } - - def cannotFindCatalystTypeInProtobufSchemaError(catalystFieldPath: String): Throwable = { - new 
AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2258", - messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) - } - - def cannotFindProtobufFieldInInCatalystError(field: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2259", - messageParameters = Map("field" -> field)) - } - - def protobufFieldMatchError( - field: String, - protobufSchema: String, - matchSize: String, - matches: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2260", - messageParameters = Map( - "field" -> field, - "protobufSchema" -> protobufSchema, - "matchSize" -> matchSize, - "matches" -> matches)) - } - - def unableToLocateProtobuMessageError(messageName: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2261", - messageParameters = Map("messageName" -> messageName)) - } - - def descrioptorParseError(e1: Throwable): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2262", - messageParameters = Map("errorMessage" -> e1.getMessage()), - cause = Some(e1.getCause)) - } - - def cannotFindDescriptorFileError(filePath: String, e1: Throwable): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2263", - messageParameters = Map("filePath" -> filePath), - cause = Some(e1.getCause)) - } - - def noMessageTypeReturnError(descriptorName: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2264", - messageParameters = Map("descriptorName" -> descriptorName)) - } - - def failedParsingDescriptorError(e1: Throwable): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2265", - messageParameters = Map("errorMessage" -> e1.getMessage()), - cause = Some(e1.getCause)) - } - - def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2266", - messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) - } - def cannotConvertDataTypeToParquetTypeError(field: StructField): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_1175", @@ -3357,4 +3210,151 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("expression" -> toSQLExpr(expression)) ) } + + def cannotConvertProtobufTypeToDataTypeError( + protobufColumn: String, + sqlColumn: String, + protobufType: String, + sqlType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2251", + messageParameters = Map( + "protobufColumn" -> protobufColumn, + "sqlColumn" -> sqlColumn, + "protobufType" -> protobufType, + "sqlType" -> sqlType)) + } + + def cannotConvertDataTypeToProtobufTypeError( + sqlColumn: String, + protobufColumn: String, + sqlType: String, + protobufType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2252", + messageParameters = Map( + "sqlColumn" -> sqlColumn, + "protobufColumn" -> protobufColumn, + "sqlType" -> sqlType, + "protobufType" -> protobufType)) + } + + def cannotConvertDataTypeToProtobufEnumTypeError( + sqlColumn: String, + protobufColumn: String, + data: String, + enumString: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2253", + messageParameters = Map( + "sqlColumn" -> sqlColumn, + "protobufColumn" -> protobufColumn, + "data" -> data, + "enumString" -> enumString)) + } + + def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2256", 
+ messageParameters = Map("protobufType" -> protobufType)) + } + + def cannotConvertProtobufTypeToSqlTypeError( + protobufType: String, + sqlType: String, + e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2254", + messageParameters = Map( + "protobufType" -> protobufType, + "toType" -> sqlType), + cause = Some(e1.getCause)) + } + + def cannotConvertSqlTypeToProtobufError( + protobufType: String, + sqlType: String, + e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2255", + messageParameters = Map( + "protobufType" -> protobufType, + "toType" -> sqlType), + cause = Some(e1.getCause)) + } + + def unknownProtobufMessageTypeError( + descriptorName: String, + containingType: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2257", + messageParameters = Map( + "descriptorName" -> descriptorName, + "containingType" -> containingType)) + } + + def cannotFindCatalystTypeInProtobufSchemaError(catalystFieldPath: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2258", + messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) + } + + def cannotFindProtobufFieldInInCatalystError(field: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2259", + messageParameters = Map("field" -> field)) + } + + def protobufFieldMatchError( + field: String, + protobufSchema: String, + matchSize: String, + matches: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2260", + messageParameters = Map( + "field" -> field, + "protobufSchema" -> protobufSchema, + "matchSize" -> matchSize, + "matches" -> matches)) + } + + def unableToLocateProtobuMessageError(messageName: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2261", + messageParameters = Map("messageName" -> messageName)) + } + + def descrioptorParseError(e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2262", + messageParameters = Map("errorMessage" -> e1.getMessage()), + cause = Some(e1.getCause)) + } + + def cannotFindDescriptorFileError(filePath: String, e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2263", + messageParameters = Map("filePath" -> filePath), + cause = Some(e1.getCause)) + } + + def noMessageTypeReturnError(descriptorName: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2264", + messageParameters = Map("descriptorName" -> descriptorName)) + } + + def failedParsingDescriptorError(e1: Throwable): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2265", + messageParameters = Map("errorMessage" -> e1.getMessage()), + cause = Some(e1.getCause)) + } + + def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2266", + messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) + } } From e6f3cab129df0100ae924b0cc1d4d4717fedc125 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 21 Oct 2022 19:58:32 -0700 Subject: [PATCH 06/19] spark-protobuf error clss frameworks, import support, timestamp and duration validation --- connector/protobuf/pom.xml | 1 + .../sql/protobuf/ProtobufDataToCatalyst.scala | 22 ++++--------- .../sql/protobuf/ProtobufDeserializer.scala | 2 +- .../sql/protobuf/utils/ProtobufUtils.scala | 8 ++--- .../sql/protobuf/ProtobufFunctionsSuite.scala | 2 +- 
.../main/resources/error/error-classes.json | 20 +++++++++++ .../sql/errors/QueryCompilationErrors.scala | 33 +++++++++++++++++++ 7 files changed, 65 insertions(+), 23 deletions(-) diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index b934c7f831a2b..9296506577ba4 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -123,6 +123,7 @@ com.google.protobuf:protoc:${protobuf.version} ${protobuf.version} + direct src/test/resources/protobuf diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala index cad2442f10c17..a0b7c145926d0 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala @@ -21,11 +21,10 @@ import scala.util.control.NonFatal import com.google.protobuf.DynamicMessage -import org.apache.spark.SparkException -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, SpecificInternalRow, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode} +import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.protobuf.utils.{ProtobufOptions, ProtobufUtils, SchemaConverters} import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, StructType} @@ -71,16 +70,11 @@ private[protobuf] case class ProtobufDataToCatalyst( @transient private lazy val parseMode: ParseMode = { val mode = protobufOptions.parseMode if (mode != PermissiveMode && mode != FailFastMode) { - throw new AnalysisException(unacceptableModeMessage(mode.name)) + throw QueryCompilationErrors.parseModeUnsupportedError("from_protobuf", mode) } mode } - private def unacceptableModeMessage(name: String): String = { - s"from_protobuf() doesn't support the $name mode. " + - s"Acceptable modes are ${PermissiveMode.name} and ${FailFastMode.name}." - } - @transient private lazy val nullResultRow: Any = dataType match { case st: StructType => val resultRow = new SpecificInternalRow(st.map(_.dataType)) @@ -98,13 +92,9 @@ private[protobuf] case class ProtobufDataToCatalyst( case PermissiveMode => nullResultRow case FailFastMode => - throw new SparkException( - "Malformed records are detected in record parsing. " + - s"Current parse Mode: ${FailFastMode.name}. To process malformed records as null " + - "result, try setting the option 'mode' as 'PERMISSIVE'.", - e) + throw QueryCompilationErrors.malformedRecordsDetectedInRecordParsingError(e) case _ => - throw new AnalysisException(unacceptableModeMessage(parseMode.name)) + throw QueryCompilationErrors.parseModeUnsupportedError("from_protobuf", parseMode) } } @@ -119,8 +109,8 @@ private[protobuf] case class ProtobufDataToCatalyst( case Some(number) => // Unknown fields contain a field with same number as a known field. Must be due to // mismatch of schema between writer and reader here. 
- throw new IllegalArgumentException(s"Type mismatch encountered for field:" + - s" ${messageDescriptor.getFields.get(number)}") + throw QueryCompilationErrors.protobufFieldTypeMismatchError( + messageDescriptor.getFields.get(number).toString) case None => } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index effa21b55fdd9..9462b7f5bf071 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -194,7 +194,7 @@ private[sql] class ProtobufDeserializer( (updater, ordinal, value) => val byte_array = value match { case s: ByteString => s.toByteArray - case _ => throw new Exception("Invalid ByteString format") + case _ => throw QueryCompilationErrors.invalidBytStringFormatError() } updater.set(ordinal, byte_array) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index bb8fdbe622e8e..7310be3e15b40 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -159,14 +159,12 @@ private[sql] object ProtobufUtils extends Logging { } catch { case _: ClassNotFoundException => val hasDots = protobufClassName.contains(".") - throw new IllegalArgumentException( - s"Could not load Protobuf class with name '$protobufClassName'" + - (if (hasDots) "" else ". Ensure the class name includes package prefix.") - ) + throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, + if (hasDots) "" else ". Ensure the class name includes package prefix.") } if (!classOf[Message].isAssignableFrom(protobufClass)) { - throw new IllegalArgumentException(s"$protobufClassName is not a Protobuf message type") + throw QueryCompilationErrors.protobufMessageTypeError(protobufClassName) // TODO: Need to support V2. This might work with V2 classes too. 
} diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala index ea96d96228c4c..9f703d1829770 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala @@ -324,7 +324,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri .setField(messageEnumDesc.findFieldByName("value"), "value") .setField( messageEnumDesc.findFieldByName("nested_enum"), - messageEnumDesc.findEnumTypeByName("NestedEnum").findValueByName("ESTED_NOTHING")) + messageEnumDesc.findEnumTypeByName("NestedEnum").findValueByName("NESTED_NOTHING")) .setField( messageEnumDesc.findFieldByName("nested_enum"), messageEnumDesc.findEnumTypeByName("NestedEnum").findValueByName("NESTED_FIRST")) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 5cf6579a2e998..f14f7e3041d3e 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -4355,5 +4355,25 @@ "message" : [ "Found recursive reference in Protobuf schema, which can not be processed by Spark: " ] + }, + "_LEGACY_ERROR_TEMP_2267" : { + "message" : [ + "Type mismatch encountered for field: " + ] + }, + "_LEGACY_ERROR_TEMP_2268" : { + "message" : [ + "Could not load Protobuf class with name " + ] + }, + "_LEGACY_ERROR_TEMP_2269" : { + "message" : [ + " is not a Protobuf message type" + ] + }, + "_LEGACY_ERROR_TEMP_2270" : { + "message" : [ + "Invalid ByteString format" + ] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 10132a99fc270..d541eab11297c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -21,6 +21,7 @@ import scala.collection.mutable import org.apache.hadoop.fs.Path +import org.apache.spark.SparkException import org.apache.spark.SparkThrowableHelper import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, QualifiedTableName, TableIdentifier} @@ -3357,4 +3358,36 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { errorClass = "_LEGACY_ERROR_TEMP_2266", messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) } + + def protobufFieldTypeMismatchError(field: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2267", + messageParameters = Map("field" -> field)) + } + + def protobufClassLoadError(protobufClassName: String, errorMessage: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2268", + messageParameters = Map( + "protobufClassName" -> protobufClassName, + "errorMessage" -> errorMessage)) + } + + def protobufMessageTypeError(protobufClassName: String): Throwable = { + new AnalysisException( + errorClass = "_LEGACY_ERROR_TEMP_2269", + messageParameters = Map("protobufClassName" -> protobufClassName)) + } + + def invalidBytStringFormatError(): Throwable = { + new AnalysisException(errorClass = "_LEGACY_ERROR_TEMP_2270", messageParameters = Map.empty) + } + + def malformedRecordsDetectedInRecordParsingError(e1: 
Throwable): Throwable = { + new SparkException( + errorClass = "_LEGACY_ERROR_TEMP_2177", + messageParameters = Map( + "failFastMode" -> FailFastMode.name), + cause = e1) + } } From 70a598352d843fcdfa5793db356173c75bd1c8f9 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Sun, 23 Oct 2022 19:37:48 -0700 Subject: [PATCH 07/19] fixing typos --- .../spark/sql/protobuf/ProtobufDeserializer.scala | 6 +++--- .../spark/sql/protobuf/ProtobufSerializer.scala | 4 ++-- .../spark/sql/protobuf/utils/ProtobufUtils.scala | 4 ++-- .../spark/sql/errors/QueryCompilationErrors.scala | 14 +++++++------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index 9462b7f5bf071..685f21c1915c3 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -62,7 +62,7 @@ private[sql] class ProtobufDeserializer( } } catch { case ise: AnalysisException => - throw QueryCompilationErrors.cannotConvertProtobufTypeToSqlTypeError( + throw QueryCompilationErrors.cannotConvertProtobufTypeToCatalystTypeError( rootDescriptor.getName, rootCatalystType.sql, ise) @@ -194,7 +194,7 @@ private[sql] class ProtobufDeserializer( (updater, ordinal, value) => val byte_array = value match { case s: ByteString => s.toByteArray - case _ => throw QueryCompilationErrors.invalidBytStringFormatError() + case _ => throw QueryCompilationErrors.invalidByteStringFormatError() } updater.set(ordinal, byte_array) @@ -240,7 +240,7 @@ private[sql] class ProtobufDeserializer( (updater, ordinal, value) => updater.set(ordinal, UTF8String.fromString(value.toString)) case _ => - throw QueryCompilationErrors.cannotConvertProtobufTypeToDataTypeError( + throw QueryCompilationErrors.cannotConvertProtobufTypeToSqlTypeError( toFieldStr(protoPath), toFieldStr(catalystPath), s"${protoType} ${protoType.toProto.getLabel} ${protoType.getJavaType}" + diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala index bc9aa18062053..ee64916dfe14e 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -103,7 +103,7 @@ private[sql] class ProtobufSerializer( (getter, ordinal) => val data = getter.getUTF8String(ordinal).toString if (!enumSymbols.contains(data)) { - throw QueryCompilationErrors.cannotConvertDataTypeToProtobufEnumTypeError( + throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufEnumTypeError( toFieldStr(catalystPath), toFieldStr(protoPath), data, enumSymbols.mkString("\"", "\", \"", "\"")) } @@ -213,7 +213,7 @@ private[sql] class ProtobufSerializer( duration.build() case _ => - throw QueryCompilationErrors.cannotConvertDataTypeToProtobufTypeError( + throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufTypeError( toFieldStr(catalystPath), toFieldStr(protoPath), catalystType.sql, diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 7310be3e15b40..f2127bbf9dc0f 100644 --- 
a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -100,7 +100,7 @@ private[sql] object ProtobufUtils extends Logging { def validateNoExtraRequiredProtoFields(): Unit = { val extraFields = protoFieldArray.toSet -- matchedFields.map(_.fieldDescriptor) extraFields.filterNot(isNullable).foreach { extraField => - throw QueryCompilationErrors.cannotFindProtobufFieldInInCatalystError( + throw QueryCompilationErrors.cannotFindProtobufFieldInCatalystError( toFieldStr(protoPath :+ extraField.getName())) } } @@ -213,7 +213,7 @@ private[sql] object ProtobufUtils extends Logging { descriptorProto, fileDescriptorList.toArray) if (fileDescriptor.getMessageTypes().isEmpty()) { - throw QueryCompilationErrors.noMessageTypeReturnError(fileDescriptor.getName()) + throw QueryCompilationErrors.noProtobufMessageTypeReturnError(fileDescriptor.getName()) } fileDescriptor } catch { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index d541eab11297c..7fc07e5d03ddd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3212,7 +3212,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { ) } - def cannotConvertProtobufTypeToDataTypeError( + def cannotConvertProtobufTypeToSqlTypeError( protobufColumn: String, sqlColumn: String, protobufType: String, @@ -3226,7 +3226,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { "sqlType" -> sqlType)) } - def cannotConvertDataTypeToProtobufTypeError( + def cannotConvertCatalystTypeToProtobufTypeError( sqlColumn: String, protobufColumn: String, sqlType: String, @@ -3240,7 +3240,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { "protobufType" -> protobufType)) } - def cannotConvertDataTypeToProtobufEnumTypeError( + def cannotConvertCatalystTypeToProtobufEnumTypeError( sqlColumn: String, protobufColumn: String, data: String, @@ -3260,7 +3260,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("protobufType" -> protobufType)) } - def cannotConvertProtobufTypeToSqlTypeError( + def cannotConvertProtobufTypeToCatalystTypeError( protobufType: String, sqlType: String, e1: Throwable): Throwable = { @@ -3300,7 +3300,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) } - def cannotFindProtobufFieldInInCatalystError(field: String): Throwable = { + def cannotFindProtobufFieldInCatalystError(field: String): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_2259", messageParameters = Map("field" -> field)) @@ -3340,7 +3340,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { cause = Some(e1.getCause)) } - def noMessageTypeReturnError(descriptorName: String): Throwable = { + def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { new AnalysisException( errorClass = "_LEGACY_ERROR_TEMP_2264", messageParameters = Map("descriptorName" -> descriptorName)) @@ -3379,7 +3379,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("protobufClassName" -> protobufClassName)) } - def 
invalidBytStringFormatError(): Throwable = { + def invalidByteStringFormatError(): Throwable = { new AnalysisException(errorClass = "_LEGACY_ERROR_TEMP_2270", messageParameters = Map.empty) } From 26e471b934309e2299bc1b28f6c79822068fe73f Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Sun, 23 Oct 2022 23:15:54 -0700 Subject: [PATCH 08/19] review changes, name protobuf error class and comment in pom.xml --- connector/protobuf/pom.xml | 1 + .../sql/protobuf/ProtobufDeserializer.scala | 9 +- .../sql/protobuf/ProtobufSerializer.scala | 14 +- .../sql/protobuf/utils/ProtobufUtils.scala | 20 +- .../sql/protobuf/utils/SchemaConverters.scala | 6 +- .../sql/protobuf/ProtobufSerdeSuite.scala | 11 +- .../main/resources/error/error-classes.json | 200 +++++++++--------- .../sql/errors/QueryCompilationErrors.scala | 68 +++--- 8 files changed, 169 insertions(+), 160 deletions(-) diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index 9296506577ba4..14d790072d41c 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -123,6 +123,7 @@ com.google.protobuf:protoc:${protobuf.version} ${protobuf.version} + direct src/test/resources/protobuf diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index 685f21c1915c3..1c6dbf54ad08e 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -64,7 +64,7 @@ private[sql] class ProtobufDeserializer( case ise: AnalysisException => throw QueryCompilationErrors.cannotConvertProtobufTypeToCatalystTypeError( rootDescriptor.getName, - rootCatalystType.sql, + rootCatalystType, ise) } @@ -170,8 +170,9 @@ private[sql] class ProtobufDeserializer( case (INT, ShortType) => (updater, ordinal, value) => updater.setShort(ordinal, value.asInstanceOf[Short]) - case (BOOLEAN | INT | FLOAT | DOUBLE | LONG | STRING | ENUM | BYTE_STRING, - ArrayType(dataType: DataType, containsNull)) if protoType.isRepeated => + case ( + BOOLEAN | INT | FLOAT | DOUBLE | LONG | STRING | ENUM | BYTE_STRING, + ArrayType(dataType: DataType, containsNull)) if protoType.isRepeated => newArrayWriter(protoType, protoPath, catalystPath, dataType, containsNull) case (LONG, LongType) => @@ -245,7 +246,7 @@ private[sql] class ProtobufDeserializer( toFieldStr(catalystPath), s"${protoType} ${protoType.toProto.getLabel} ${protoType.getJavaType}" + s" ${protoType.getType}", - catalystType.sql) + catalystType) } } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala index ee64916dfe14e..b09db6b6b2c23 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -57,7 +57,7 @@ private[sql] class ProtobufSerializer( case ise: AnalysisException => throw QueryCompilationErrors.cannotConvertSqlTypeToProtobufError( rootDescriptor.getName, - rootCatalystType.sql, + rootCatalystType, ise) } if (nullable) { (data: Any) => @@ -104,7 +104,9 @@ private[sql] class ProtobufSerializer( val data = getter.getUTF8String(ordinal).toString if (!enumSymbols.contains(data)) { throw 
QueryCompilationErrors.cannotConvertCatalystTypeToProtobufEnumTypeError( - toFieldStr(catalystPath), toFieldStr(protoPath), data, + toFieldStr(catalystPath), + toFieldStr(protoPath), + data, enumSymbols.mkString("\"", "\", \"", "\"")) } fieldDescriptor.getEnumType.findValueByName(data) @@ -122,7 +124,8 @@ private[sql] class ProtobufSerializer( case (TimestampType, MESSAGE) => (getter, ordinal) => val millis = DateTimeUtils.microsToMillis(getter.getLong(ordinal)) - Timestamp.newBuilder() + Timestamp + .newBuilder() .setSeconds((millis / 1000)) .setNanos(((millis % 1000) * 1000000).toInt) .build() @@ -199,7 +202,8 @@ private[sql] class ProtobufSerializer( val calendarInterval = IntervalUtils.fromIntervalString(dayTimeIntervalString) val millis = DateTimeUtils.microsToMillis(calendarInterval.microseconds) - val duration = Duration.newBuilder() + val duration = Duration + .newBuilder() .setSeconds((millis / 1000)) .setNanos(((millis % 1000) * 1000000).toInt) @@ -216,7 +220,7 @@ private[sql] class ProtobufSerializer( throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufTypeError( toFieldStr(catalystPath), toFieldStr(protoPath), - catalystType.sql, + catalystType, s"${fieldDescriptor} ${fieldDescriptor.toProto.getLabel} ${fieldDescriptor.getJavaType}" + s" ${fieldDescriptor.getType}") } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index f2127bbf9dc0f..5d7c1f033ba3b 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -79,9 +79,9 @@ private[sql] object ProtobufUtils extends Logging { /** * Validate that there are no Catalyst fields which don't have a matching Protobuf field, - * throwing [[AnalysisException]] if such extra fields are found. If - * `ignoreNullable` is false, consider nullable Catalyst fields to be eligible to be an extra - * field; otherwise, ignore nullable Catalyst fields when checking for extras. + * throwing [[AnalysisException]] if such extra fields are found. If `ignoreNullable` is + * false, consider nullable Catalyst fields to be eligible to be an extra field; otherwise, + * ignore nullable Catalyst fields when checking for extras. */ def validateNoExtraCatalystFields(ignoreNullable: Boolean): Unit = catalystSchema.fields.foreach { sqlField => @@ -94,8 +94,8 @@ private[sql] object ProtobufUtils extends Logging { /** * Validate that there are no Protobuf fields which don't have a matching Catalyst field, - * throwing [[AnalysisException]] if such extra fields are found. Only required - * (non-nullable) fields are checked; nullable fields are ignored. + * throwing [[AnalysisException]] if such extra fields are found. Only required (non-nullable) + * fields are checked; nullable fields are ignored. 
*/ def validateNoExtraRequiredProtoFields(): Unit = { val extraFields = protoFieldArray.toSet -- matchedFields.map(_.fieldDescriptor) @@ -183,7 +183,8 @@ private[sql] object ProtobufUtils extends Logging { descriptor match { case Some(d) => d case None => - throw QueryCompilationErrors.unableToLocateProtobuMessageError(messageName) } + throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) + } } private def parseFileDescriptor(descFilePath: String): Descriptors.FileDescriptor = { @@ -199,7 +200,7 @@ private[sql] object ProtobufUtils extends Logging { } val descriptorProto: DescriptorProtos.FileDescriptorProto = - fileDescriptorSet.getFile(fileDescriptorSet.getFileList.size() - 1) + fileDescriptorSet.getFileList.asScala.last var fileDescriptorList = List[Descriptors.FileDescriptor]() for (fd <- fileDescriptorSet.getFileList.asScala) { @@ -209,9 +210,8 @@ private[sql] object ProtobufUtils extends Logging { } } try { - val fileDescriptor: Descriptors.FileDescriptor = Descriptors.FileDescriptor.buildFrom( - descriptorProto, - fileDescriptorList.toArray) + val fileDescriptor: Descriptors.FileDescriptor = + Descriptors.FileDescriptor.buildFrom(descriptorProto, fileDescriptorList.toArray) if (fileDescriptor.getMessageTypes().isEmpty()) { throw QueryCompilationErrors.noProtobufMessageTypeReturnError(fileDescriptor.getName()) } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index 286cb36fb7096..0c1542c39bfc7 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -63,12 +63,14 @@ object SchemaConverters { case STRING => Some(StringType) case BYTE_STRING => Some(BinaryType) case ENUM => Some(StringType) - case MESSAGE if fd.getMessageType.getName == "Duration" && + case MESSAGE + if fd.getMessageType.getName == "Duration" && fd.getMessageType.getFields.size() == 2 && fd.getMessageType.getFields.get(0).getName.equals("seconds") && fd.getMessageType.getFields.get(1).getName.equals("nanos") => Some(DayTimeIntervalType.defaultConcreteType) - case MESSAGE if fd.getMessageType.getName == "Timestamp" && + case MESSAGE + if fd.getMessageType.getName == "Timestamp" && fd.getMessageType.getFields.size() == 2 && fd.getMessageType.getFields.get(0).getName.equals("seconds") && fd.getMessageType.getFields.get(1).getName.equals("nanos") => diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index faa3296c02aa4..182486e3c67b1 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.protobuf import com.google.protobuf.Descriptors.Descriptor import com.google.protobuf.DynamicMessage -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.NoopFilters +import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType import org.apache.spark.sql.protobuf.utils.ProtobufUtils import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StructType} @@ -163,15 +163,16 @@ class ProtobufSerdeSuite extends 
SharedSparkSession { fieldMatchType: MatchType, expectedCauseMessage: String, catalystSchema: StructType = CATALYST_STRUCT): Unit = { - val e = intercept[AnalysisException] { + val e = intercept[Exception] { serdeFactory.create(catalystSchema, protoSchema, fieldMatchType) } val expectMsg = serdeFactory match { case Deserializer => - s"Unable to convert ${protoSchema.getName} of Protobuf to SQL type ${catalystSchema.sql}." + s"[PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR] Unable to convert" + + s" ${protoSchema.getName} of Protobuf to SQL type ${toSQLType(catalystSchema)}." case Serializer => - s"Unable to convert SQL type ${catalystSchema.sql} to Protobuf type" + - s" ${protoSchema.getName}." + s"[UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE] Unable to convert SQL type" + + s" ${toSQLType(catalystSchema)} to Protobuf type ${protoSchema.getName}." } assert(e.getMessage === expectMsg) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index f14f7e3041d3e..9ce037e961924 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -23,6 +23,11 @@ ], "sqlState" : "42000" }, + "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR" : { + "message" : [ + "Error reading Protobuf descriptor file at path: " + ] + }, "CANNOT_INFER_DATE" : { "message" : [ "Cannot infer date in schema inference when LegacyTimeParserPolicy is \"LEGACY\". Legacy Date formatter does not support strict date format matching which is required to avoid inferring timestamps and other non-date entries to date." @@ -65,6 +70,11 @@ ], "sqlState" : "22005" }, + "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR" : { + "message" : [ + "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " + ] + }, "COLUMN_NOT_IN_GROUP_BY_CLAUSE" : { "message" : [ "The expression is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in `first()` (or `first_value()`) if you don't care which value you get." @@ -460,6 +470,11 @@ "Invalid bucket file: " ] }, + "INVALID_BYTE_STRING_ERROR" : { + "message" : [ + "Invalid ByteString format" + ] + }, "INVALID_COLUMN_OR_FIELD_DATA_TYPE" : { "message" : [ "Column or field is of type while it's required to be ." @@ -572,11 +587,21 @@ ], "sqlState" : "42000" }, + "NO_CATALYST_TYPE_IN_PROTOBUF_SCHEMA" : { + "message" : [ + "Cannot find in Protobuf schema" + ] + }, "NO_HANDLER_FOR_UDAF" : { "message" : [ "No handler for UDAF ''. Use sparkSession.udf.register(...) instead." ] }, + "NO_PROTOBUF_MESSAGE_TYPE_ERROR" : { + "message" : [ + "No MessageTypes returned, " + ] + }, "NO_UDF_INTERFACE_ERROR" : { "message" : [ "UDF class doesn't implement any UDF interface" @@ -644,6 +669,61 @@ ], "sqlState" : "42000" }, + "PROTOBUF_CLASS_LOAD_ERROR" : { + "message" : [ + "Could not load Protobuf class with name " + ] + }, + "PROTOBUF_DESCRIPTOR_ERROR" : { + "message" : [ + "Error parsing descriptor byte[] into Descriptor object Error: " + ] + }, + "PROTOBUF_DESCRIPTOR_PARSING_ERROR" : { + "message" : [ + "Error constructing FileDescriptor, Error: " + ] + }, + "PROTOBUF_FIELD_MISSING_ERROR" : { + "message" : [ + "Searching for in Protobuf schema at gave matches. 
Candidates: " + ] + }, + "PROTOBUF_FIELD_MISSING_IN_CATALYST_SCHEMA" : { + "message" : [ + "Found in Protobuf schema but there is no match in the SQL schema" + ] + }, + "PROTOBUF_FIELD_TYPE_MISMATCH" : { + "message" : [ + "Type mismatch encountered for field: " + ] + }, + "PROTOBUF_MESSAGE_TYPE_ERROR" : { + "message" : [ + " is not a Protobuf message type" + ] + }, + "PROTOBUF_RECURSION_ERROR" : { + "message" : [ + "Found recursive reference in Protobuf schema, which can not be processed by Spark: " + ] + }, + "PROTOBUF_TYPE_NOT_SUPPORT_ERROR" : { + "message" : [ + "Protobuf type not yet supported: ." + ] + }, + "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR" : { + "message" : [ + "Unable to convert of Protobuf to SQL type ." + ] + }, + "PROTOBUF_TYPE_TO_SQL_TYPE_ERROR" : { + "message" : [ + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." + ] + }, "RENAME_SRC_PATH_NOT_FOUND" : { "message" : [ "Failed to rename as was not found" @@ -698,6 +778,11 @@ ], "sqlState" : "22023" }, + "SQL_TYPE_TO_PROTOBUF_TYPE_ERROR" : { + "message" : [ + "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = )." + ] + }, "TABLE_OR_VIEW_ALREADY_EXISTS" : { "message" : [ "Cannot create table or view because it already exists.", @@ -730,6 +815,21 @@ "Unable to acquire bytes of memory, got " ] }, + "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE" : { + "message" : [ + "Unable to convert SQL type to Protobuf type ." + ] + }, + "UNABLE_TO_LOCATE_PROTOBUF_MESSAGE_ERROR" : { + "message" : [ + "Unable to locate Message in Descriptor" + ] + }, + "UNKNOWN_PROTOBUF_MESSAGE_TYPE" : { + "message" : [ + "Attempting to treat as a Message, but it was " + ] + }, "UNPIVOT_REQUIRES_ATTRIBUTES" : { "message" : [ "UNPIVOT requires all given expressions to be columns when no expressions are given. These are not columns: []." @@ -4275,105 +4375,5 @@ "message" : [ "Not enough memory to build and broadcast the table to all worker nodes. As a workaround, you can either disable broadcast by setting to -1 or increase the spark driver memory by setting to a higher value" ] - }, - "_LEGACY_ERROR_TEMP_2251" : { - "message" : [ - "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = ." - ] - }, - "_LEGACY_ERROR_TEMP_2252" : { - "message" : [ - "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = ." - ] - }, - "_LEGACY_ERROR_TEMP_2253" : { - "message" : [ - "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " - ] - }, - "_LEGACY_ERROR_TEMP_2254" : { - "message" : [ - "Unable to convert of Protobuf to SQL type ." - ] - }, - "_LEGACY_ERROR_TEMP_2255" : { - "message" : [ - "Unable to convert SQL type to Protobuf type ." - ] - }, - "_LEGACY_ERROR_TEMP_2256" : { - "message" : [ - "Protobuf type not yet supported: ." - ] - }, - "_LEGACY_ERROR_TEMP_2257" : { - "message" : [ - "Attempting to treat as a Message, but it was " - ] - }, - "_LEGACY_ERROR_TEMP_2258" : { - "message" : [ - "Cannot find in Protobuf schema" - ] - }, - "_LEGACY_ERROR_TEMP_2259" : { - "message" : [ - "Found in Protobuf schema but there is no match in the SQL schema" - ] - }, - "_LEGACY_ERROR_TEMP_2260" : { - "message" : [ - "Searching for in Protobuf schema at gave matches. 
Candidates: " - ] - }, - "_LEGACY_ERROR_TEMP_2261" : { - "message" : [ - "Unable to locate Message in Descriptor" - ] - }, - "_LEGACY_ERROR_TEMP_2262" : { - "message" : [ - "Error parsing descriptor byte[] into Descriptor object Error: " - ] - }, - "_LEGACY_ERROR_TEMP_2263" : { - "message" : [ - "Error reading Protobuf descriptor file at path: " - ] - }, - "_LEGACY_ERROR_TEMP_2264" : { - "message" : [ - "No MessageTypes returned, " - ] - }, - "_LEGACY_ERROR_TEMP_2265" : { - "message" : [ - "Error constructing FileDescriptor, Error: " - ] - }, - "_LEGACY_ERROR_TEMP_2266" : { - "message" : [ - "Found recursive reference in Protobuf schema, which can not be processed by Spark: " - ] - }, - "_LEGACY_ERROR_TEMP_2267" : { - "message" : [ - "Type mismatch encountered for field: " - ] - }, - "_LEGACY_ERROR_TEMP_2268" : { - "message" : [ - "Could not load Protobuf class with name " - ] - }, - "_LEGACY_ERROR_TEMP_2269" : { - "message" : [ - " is not a Protobuf message type" - ] - }, - "_LEGACY_ERROR_TEMP_2270" : { - "message" : [ - "Invalid ByteString format" - ] } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 7fc07e5d03ddd..14937c6149305 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3216,27 +3216,27 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { protobufColumn: String, sqlColumn: String, protobufType: String, - sqlType: String): Throwable = { + sqlType: DataType): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2251", + errorClass = "PROTOBUF_TYPE_TO_SQL_TYPE_ERROR", messageParameters = Map( "protobufColumn" -> protobufColumn, "sqlColumn" -> sqlColumn, "protobufType" -> protobufType, - "sqlType" -> sqlType)) + "sqlType" -> toSQLType(sqlType))) } def cannotConvertCatalystTypeToProtobufTypeError( sqlColumn: String, protobufColumn: String, - sqlType: String, + sqlType: DataType, protobufType: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2252", + errorClass = "SQL_TYPE_TO_PROTOBUF_TYPE_ERROR", messageParameters = Map( "sqlColumn" -> sqlColumn, "protobufColumn" -> protobufColumn, - "sqlType" -> sqlType, + "sqlType" -> toSQLType(sqlType), "protobufType" -> protobufType)) } @@ -3246,7 +3246,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { data: String, enumString: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2253", + errorClass = "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", messageParameters = Map( "sqlColumn" -> sqlColumn, "protobufColumn" -> protobufColumn, @@ -3254,41 +3254,41 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { "enumString" -> enumString)) } - def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { - new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2256", - messageParameters = Map("protobufType" -> protobufType)) - } - def cannotConvertProtobufTypeToCatalystTypeError( protobufType: String, - sqlType: String, + sqlType: DataType, e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2254", + errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", messageParameters = Map( "protobufType" -> protobufType, - "toType" -> sqlType), + "toType" -> toSQLType(sqlType)), cause = 
Some(e1.getCause)) } def cannotConvertSqlTypeToProtobufError( protobufType: String, - sqlType: String, + sqlType: DataType, e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2255", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", messageParameters = Map( "protobufType" -> protobufType, - "toType" -> sqlType), + "toType" -> toSQLType(sqlType)), cause = Some(e1.getCause)) } + def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { + new AnalysisException( + errorClass = "PROTOBUF_TYPE_NOT_SUPPORT_ERROR", + messageParameters = Map("protobufType" -> protobufType)) + } + def unknownProtobufMessageTypeError( descriptorName: String, containingType: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2257", + errorClass = "UNKNOWN_PROTOBUF_MESSAGE_TYPE", messageParameters = Map( "descriptorName" -> descriptorName, "containingType" -> containingType)) @@ -3296,13 +3296,13 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def cannotFindCatalystTypeInProtobufSchemaError(catalystFieldPath: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2258", + errorClass = "NO_CATALYST_TYPE_IN_PROTOBUF_SCHEMA", messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) } def cannotFindProtobufFieldInCatalystError(field: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2259", + errorClass = "PROTOBUF_FIELD_MISSING_IN_CATALYST_SCHEMA", messageParameters = Map("field" -> field)) } @@ -3312,7 +3312,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { matchSize: String, matches: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2260", + errorClass = "PROTOBUF_FIELD_MISSING_ERROR", messageParameters = Map( "field" -> field, "protobufSchema" -> protobufSchema, @@ -3320,54 +3320,54 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { "matches" -> matches)) } - def unableToLocateProtobuMessageError(messageName: String): Throwable = { + def unableToLocateProtobufMessageError(messageName: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2261", + errorClass = "UNABLE_TO_LOCATE_PROTOBUF_MESSAGE_ERROR", messageParameters = Map("messageName" -> messageName)) } def descrioptorParseError(e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2262", + errorClass = "PROTOBUF_DESCRIPTOR_ERROR", messageParameters = Map("errorMessage" -> e1.getMessage()), cause = Some(e1.getCause)) } def cannotFindDescriptorFileError(filePath: String, e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2263", + errorClass = "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR", messageParameters = Map("filePath" -> filePath), cause = Some(e1.getCause)) } def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2264", + errorClass = "NO_PROTOBUF_MESSAGE_TYPE_ERROR", messageParameters = Map("descriptorName" -> descriptorName)) } def failedParsingDescriptorError(e1: Throwable): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2265", + errorClass = "PROTOBUF_DESCRIPTOR_PARSING_ERROR", messageParameters = Map("errorMessage" -> e1.getMessage()), cause = Some(e1.getCause)) } def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2266", + 
errorClass = "PROTOBUF_RECURSION_ERROR", messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) } def protobufFieldTypeMismatchError(field: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2267", + errorClass = "PROTOBUF_FIELD_TYPE_MISMATCH", messageParameters = Map("field" -> field)) } def protobufClassLoadError(protobufClassName: String, errorMessage: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2268", + errorClass = "PROTOBUF_CLASS_LOAD_ERROR", messageParameters = Map( "protobufClassName" -> protobufClassName, "errorMessage" -> errorMessage)) @@ -3375,12 +3375,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def protobufMessageTypeError(protobufClassName: String): Throwable = { new AnalysisException( - errorClass = "_LEGACY_ERROR_TEMP_2269", + errorClass = "PROTOBUF_MESSAGE_TYPE_ERROR", messageParameters = Map("protobufClassName" -> protobufClassName)) } def invalidByteStringFormatError(): Throwable = { - new AnalysisException(errorClass = "_LEGACY_ERROR_TEMP_2270", messageParameters = Map.empty) + new AnalysisException(errorClass = "INVALID_BYTE_STRING_ERROR", messageParameters = Map.empty) } def malformedRecordsDetectedInRecordParsingError(e1: Throwable): Throwable = { From 68f87c1d80ac011d25f867a281a32c2e401bf449 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Mon, 24 Oct 2022 12:40:44 -0700 Subject: [PATCH 09/19] review chages rm includeMavenTypes and import own protos --- connector/protobuf/pom.xml | 2 -- .../test/resources/protobuf/duration.proto | 26 ++++++++++++++++++ .../resources/protobuf/functions_suite.desc | Bin 6569 -> 6352 bytes .../resources/protobuf/functions_suite.proto | 13 +++++---- .../test/resources/protobuf/timestamp.proto | 26 ++++++++++++++++++ 5 files changed, 59 insertions(+), 8 deletions(-) create mode 100644 connector/protobuf/src/test/resources/protobuf/duration.proto create mode 100644 connector/protobuf/src/test/resources/protobuf/timestamp.proto diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml index 14d790072d41c..b934c7f831a2b 100644 --- a/connector/protobuf/pom.xml +++ b/connector/protobuf/pom.xml @@ -123,8 +123,6 @@ com.google.protobuf:protoc:${protobuf.version} ${protobuf.version} - - direct src/test/resources/protobuf diff --git a/connector/protobuf/src/test/resources/protobuf/duration.proto b/connector/protobuf/src/test/resources/protobuf/duration.proto new file mode 100644 index 0000000000000..264a6ff7699aa --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/duration.proto @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +option java_outer_classname = "DurationProto"; + +message Duration { + int64 seconds = 1; + int32 nanos = 2; +} \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.desc b/connector/protobuf/src/test/resources/protobuf/functions_suite.desc index 86cff961434be6a6e16fc267fd28ca342f6af626..34df7409955452d59553b4bb4a8762f1045bc325 100644 GIT binary patch delta 205 zcmZ2!e8G^FYdRyB#6%uRMwQ8h+$s}I?c@ZU_(C#sQ;SOya|;5B@=NlQ*a|?DG1nAE zF7e5Hj1myFlg$`yW%-?WT}q1*OEUBGP}R=S=aQSO&E(1mG+cs73Zzeb^9CkEF`iT| zo)Vz>!65S|+ewP?J9DuVm*f^mFexxpvrxe0qLQMx|NApMfcg4AOD?7aN)JpB@w#)70D zjEswznB4=MRx!!BgI%HrHVtUEUU+Iwj(466P)7(za}pcaFk`OYj9hXMoAf8nmk_{k z^BvpzW*lzjbtx@MEXmBzgE`rdNGGR2^%3LZ{Blb)7cbN2a!Jn5%S%lz$uB~LNpVrK zehDxL^@~!A^Gl18fdMqxjM0??DNZCg5MCAEyq3{Wj4@#HeQ`xYJua5wlH39bCIv Date: Mon, 24 Oct 2022 17:18:29 -0700 Subject: [PATCH 10/19] review changes: nested import support, nit, build fix --- .../sql/protobuf/utils/ProtobufUtils.scala | 51 ++++++++--- .../sql/protobuf/utils/SchemaConverters.scala | 16 ++-- .../test/resources/protobuf/basicmessage.desc | Bin 0 -> 616 bytes .../resources/protobuf/basicmessage.proto | 39 +++++++++ .../resources/protobuf/catalyst_types.proto | 2 - .../test/resources/protobuf/duration.proto | 2 +- .../resources/protobuf/functions_suite.desc | Bin 6352 -> 6678 bytes .../resources/protobuf/functions_suite.proto | 24 ++---- .../test/resources/protobuf/nestedenum.proto | 28 +++++++ .../test/resources/protobuf/timestamp.proto | 2 +- .../ProtobufCatalystDataConversionSuite.scala | 2 +- .../sql/protobuf/ProtobufFunctionsSuite.scala | 36 ++++---- .../sql/protobuf/ProtobufSerdeSuite.scala | 79 ++++++++++++------ .../main/resources/error/error-classes.json | 5 ++ project/SparkBuild.scala | 2 +- .../sql/errors/QueryCompilationErrors.scala | 6 ++ 16 files changed, 209 insertions(+), 85 deletions(-) create mode 100644 connector/protobuf/src/test/resources/protobuf/basicmessage.desc create mode 100644 connector/protobuf/src/test/resources/protobuf/basicmessage.proto create mode 100644 connector/protobuf/src/test/resources/protobuf/nestedenum.proto diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 5d7c1f033ba3b..e797ee6df0373 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -23,6 +23,7 @@ import java.util.Locale import scala.collection.JavaConverters._ import com.google.protobuf.{DescriptorProtos, Descriptors, InvalidProtocolBufferException, Message} +import com.google.protobuf.DescriptorProtos.{FileDescriptorProto, FileDescriptorSet} import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} import org.apache.spark.internal.Logging @@ -198,20 +199,10 @@ private[sql] object ProtobufUtils extends Logging { case ex: IOException => throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } - - val descriptorProto: DescriptorProtos.FileDescriptorProto = - fileDescriptorSet.getFileList.asScala.last - - var fileDescriptorList = List[Descriptors.FileDescriptor]() - for (fd <- fileDescriptorSet.getFileList.asScala) { - if (descriptorProto.getName != fd.getName) { - fileDescriptorList = fileDescriptorList ++ - 
List(Descriptors.FileDescriptor.buildFrom(fd, new Array[Descriptors.FileDescriptor](0))) - } - } try { + val fileDescriptorProtoIndex = createDescriptorProtoIndex(fileDescriptorSet) val fileDescriptor: Descriptors.FileDescriptor = - Descriptors.FileDescriptor.buildFrom(descriptorProto, fileDescriptorList.toArray) + buildFileDescriptor(fileDescriptorSet.getFileList.asScala.last, fileDescriptorProtoIndex) if (fileDescriptor.getMessageTypes().isEmpty()) { throw QueryCompilationErrors.noProtobufMessageTypeReturnError(fileDescriptor.getName()) } @@ -222,6 +213,42 @@ private[sql] object ProtobufUtils extends Logging { } } + /** + * Recursively constructs file descriptors for all dependencies for given + * FileDescriptorProto and return. + * @param descriptorProto + * @param descriptorProtoIndex + * @return Descriptors.FileDescriptor + */ + private def buildFileDescriptor( + fileDescriptorProto: FileDescriptorProto, + fileDescriptorProtoIndex: Map[String, FileDescriptorProto]): Descriptors.FileDescriptor = { + var fileDescriptorList = List[Descriptors.FileDescriptor]() + for (dependencyName <- fileDescriptorProto.getDependencyList().asScala) { + if (!fileDescriptorProtoIndex.contains(dependencyName)) { + throw QueryCompilationErrors.protobufDescriptorDependencyError(dependencyName) + } + val dependencyProto: FileDescriptorProto = fileDescriptorProtoIndex.get(dependencyName).get + fileDescriptorList = fileDescriptorList ++ List( + buildFileDescriptor(dependencyProto, fileDescriptorProtoIndex)) + } + Descriptors.FileDescriptor.buildFrom(fileDescriptorProto, fileDescriptorList.toArray) + } + + /** + * Returns a map from descriptor proto name as found inside the descriptors to protos. + * @param fileDescriptorSet + * @return Map[String, FileDescriptorProto] + */ + private def createDescriptorProtoIndex( + fileDescriptorSet: FileDescriptorSet): Map[String, FileDescriptorProto] = { + var resultBuilder = Map[String, FileDescriptorProto]() + for (descriptorProto <- fileDescriptorSet.getFileList().asScala) { + resultBuilder = resultBuilder ++ Map(descriptorProto.getName() -> descriptorProto) + } + resultBuilder + } + /** * Convert a sequence of hierarchical field names (like `Seq(foo, bar)`) into a human-readable * string representing the field, like "field 'foo.bar'". 
If `names` is empty, the string diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala index 0c1542c39bfc7..6fcba3b891863 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -64,16 +64,16 @@ object SchemaConverters { case BYTE_STRING => Some(BinaryType) case ENUM => Some(StringType) case MESSAGE - if fd.getMessageType.getName == "Duration" && - fd.getMessageType.getFields.size() == 2 && - fd.getMessageType.getFields.get(0).getName.equals("seconds") && - fd.getMessageType.getFields.get(1).getName.equals("nanos") => + if (fd.getMessageType.getName == "Duration" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos")) => Some(DayTimeIntervalType.defaultConcreteType) case MESSAGE - if fd.getMessageType.getName == "Timestamp" && - fd.getMessageType.getFields.size() == 2 && - fd.getMessageType.getFields.get(0).getName.equals("seconds") && - fd.getMessageType.getFields.get(1).getName.equals("nanos") => + if (fd.getMessageType.getName == "Timestamp" && + fd.getMessageType.getFields.size() == 2 && + fd.getMessageType.getFields.get(0).getName.equals("seconds") && + fd.getMessageType.getFields.get(1).getName.equals("nanos")) => Some(TimestampType) case MESSAGE if fd.isRepeated && fd.getMessageType.getOptions.hasMapEntry => var keyType: DataType = NullType diff --git a/connector/protobuf/src/test/resources/protobuf/basicmessage.desc b/connector/protobuf/src/test/resources/protobuf/basicmessage.desc new file mode 100644 index 0000000000000000000000000000000000000000..40529ffec581a4ae6b4e197181d6e29b02c172ef GIT binary patch literal 616 zcmb7?-A=+V9L3!bg#RXvdP7W%AMwI?u^1pcz+@^H4$~2DNe2ZN$BOG_;;Z;XK7()e*e>tQ}{5zNh#0ijHboXU1cn1=z?Vnm#oP2nYw&MvKN=XCd!M7xmxfm zA0Q|veKCm;Gy^{z`-4d^8s86tJ7)(uU~BDmh{LgCpcd?FCG>Ab!N4)SmUpm;d6fG{ zwelEU_^bem2#=>r$~jrk{~Yh=*R{^SOt3u}_n_iX6U=zlHcpMp3KlDL1~$*LI9*KM zNK(+YDVzqjl=dVgbOKhK=G~4aDuSqDD5Y(n-_yWu87_f-PXS_+87rcMYN9F#16xX4 zXUr2ua!u3(VPYsH1snrJj3wIN6Mh{xq_Uom-gC;eUl+cOp_CL5f}N?>C$jc-LlDi* ZwfmRe-A!BA(kb!>c!#3Ot=Iig0;~YdRyBL_tx0Nq$mkn*K!AUfwB;T;d3UX newConsumer") { - val testFileDesc = testFile("protobuf/catalyst_types.desc").replace("file:/", "/") + val testFileDesc = testFile("catalyst_types.desc").replace("file:/", "/") val oldProducer = ProtobufUtils.buildDescriptor(testFileDesc, "oldProducer") val newConsumer = ProtobufUtils.buildDescriptor(testFileDesc, "newConsumer") @@ -498,7 +502,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri } test("Handle extra fields : newProducer -> oldConsumer") { - val testFileDesc = testFile("protobuf/catalyst_types.desc").replace("file:/", "/") + val testFileDesc = testFile("catalyst_types.desc").replace("file:/", "/") val newProducer = ProtobufUtils.buildDescriptor(testFileDesc, "newProducer") val oldConsumer = ProtobufUtils.buildDescriptor(testFileDesc, "oldConsumer") @@ -569,6 +573,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri } test("from_protobuf filter to_protobuf") { + val testFileDesc = testFile("basicmessage.desc").replace("file:/", "/") val basicMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "BasicMessage") val basicMessage = 
DynamicMessage @@ -587,19 +592,16 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Seri val df = Seq(basicMessage.toByteArray).toDF("value") - checkWithFileAndClassName("BasicMessage") { - case (name, descFilePathOpt) => - val resultFrom = df - .select(from_protobuf_wrapper($"value", name, descFilePathOpt) as 'sample) - .where("sample.string_value == \"slam\"") + val resultFrom = df + .select(from_protobuf_wrapper($"value", "BasicMessage", Some(testFileDesc)) as 'sample) + .where("sample.string_value == \"slam\"") - val resultToFrom = resultFrom - .select(to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'value) - .select(from_protobuf_wrapper($"value", name, descFilePathOpt) as 'sample) - .where("sample.string_value == \"slam\"") + val resultToFrom = resultFrom + .select(to_protobuf_wrapper($"sample", "BasicMessage", Some(testFileDesc)) as 'value) + .select(from_protobuf_wrapper($"value", "BasicMessage", Some(testFileDesc)) as 'sample) + .where("sample.string_value == \"slam\"") - assert(resultFrom.except(resultToFrom).isEmpty) - } + assert(resultFrom.except(resultToFrom).isEmpty) } test("Handle TimestampType between to_protobuf and from_protobuf") { diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index 182486e3c67b1..207df011b4e75 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.protobuf import com.google.protobuf.Descriptors.Descriptor import com.google.protobuf.DynamicMessage +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.NoopFilters import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType import org.apache.spark.sql.protobuf.utils.ProtobufUtils @@ -35,7 +36,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { import ProtoSerdeSuite._ import ProtoSerdeSuite.MatchType._ - val testFileDesc = testFile("protobuf/serde_suite.desc").replace("file:/", "/") + val testFileDesc = testFile("serde_suite.desc").replace("file:/", "/") private val javaClassNamePrefix = "org.apache.spark.sql.protobuf.protos.SerdeSuiteProtos$" test("Test basic conversion") { @@ -65,22 +66,24 @@ class ProtobufSerdeSuite extends SharedSparkSession { test("Fail to convert with field type mismatch") { val protoFile = ProtobufUtils.buildDescriptor(testFileDesc, "MissMatchTypeInRoot") - withFieldMatchType { fieldMatch => assertFailedConversionMessage( protoFile, Deserializer, fieldMatch, - "Cannot convert Protobuf field 'foo' to SQL field 'foo' because schema is incompatible " + - s"(protoType = org.apache.spark.sql.protobuf.MissMatchTypeInRoot.foo " + - s"LABEL_OPTIONAL LONG INT64, sqlType = ${CATALYST_STRUCT.head.dataType.sql})".stripMargin) + errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", + params = Map( + "protobufType" -> "MissMatchTypeInRoot", + "toType" -> toSQLType(CATALYST_STRUCT))) assertFailedConversionMessage( protoFile, Serializer, fieldMatch, - s"Cannot convert SQL field 'foo' to Protobuf field 'foo' because schema is incompatible " + - s"""(sqlType = ${CATALYST_STRUCT.head.dataType.sql}, protoType = LONG)""") + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInRoot", + "toType" -> toSQLType(CATALYST_STRUCT))) } } @@ -91,9 +94,22 @@ 
class ProtobufSerdeSuite extends SharedSparkSession { .add("foo", new StructType().add("bar", IntegerType, nullable = false)) // serialize fails whether or not 'bar' is nullable - val byNameMsg = "Cannot find field 'foo.bar' in Protobuf schema" - assertFailedConversionMessage(protoFile, Serializer, BY_NAME, byNameMsg) - assertFailedConversionMessage(protoFile, Serializer, BY_NAME, byNameMsg, nonnullCatalyst) + assertFailedConversionMessage( + protoFile, + Serializer, + BY_NAME, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "FieldMissingInProto", + "toType" -> toSQLType(CATALYST_STRUCT))) + + assertFailedConversionMessage(protoFile, + Serializer, + BY_NAME, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "FieldMissingInProto", + "toType" -> toSQLType(nonnullCatalyst))) } test("Fail to convert with deeply nested field type mismatch") { @@ -107,18 +123,21 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Deserializer, fieldMatch, - s"Cannot convert Protobuf field 'top.foo.bar' to SQL field 'top.foo.bar' because schema " + - s"is incompatible (protoType = org.apache.spark.sql.protobuf.protos.TypeMiss.bar " + - s"LABEL_OPTIONAL LONG INT64, sqlType = INT)", - catalyst) + catalyst, + errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", + params = Map( + "protobufType" -> "MissMatchTypeInDeepNested", + "toType" -> toSQLType(catalyst))) assertFailedConversionMessage( protoFile, Serializer, fieldMatch, - "Cannot convert SQL field 'top.foo.bar' to Protobuf field 'top.foo.bar' because schema " + - """is incompatible (sqlType = INT, protoType = LONG)""", - catalyst) + catalyst, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInDeepNested", + "toType" -> toSQLType(catalyst))) } } @@ -130,7 +149,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Serializer, BY_NAME, - "Found field 'boo' in Protobuf schema but there is no match in the SQL schema") + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "FieldMissingInSQLRoot", + "toType" -> toSQLType(CATALYST_STRUCT))) /* deserializing should work regardless of whether the extra field is missing in SQL Schema or not */ @@ -144,7 +166,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoNestedFile, Serializer, BY_NAME, - "Found field 'foo.baz' in Protobuf schema but there is no match in the SQL schema") + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + params = Map( + "protobufType" -> "FieldMissingInSQLNested", + "toType" -> toSQLType(CATALYST_STRUCT))) /* deserializing should work regardless of whether the extra field is missing in SQL Schema or not */ @@ -161,11 +186,14 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoSchema: Descriptor, serdeFactory: SerdeFactory[_], fieldMatchType: MatchType, - expectedCauseMessage: String, - catalystSchema: StructType = CATALYST_STRUCT): Unit = { - val e = intercept[Exception] { + catalystSchema: StructType = CATALYST_STRUCT, + errorClass: String, + params: Map[String, String]): Unit = { + + val e = intercept[AnalysisException] { serdeFactory.create(catalystSchema, protoSchema, fieldMatchType) } + val expectMsg = serdeFactory match { case Deserializer => s"[PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR] Unable to convert" + @@ -176,9 +204,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { } assert(e.getMessage === expectMsg) - if (e.getCause != null) { - 
assert(e.getCause.getMessage === expectedCauseMessage) - } + checkError( + exception = e, + errorClass = errorClass, + parameters = params) } def withFieldMatchType(f: MatchType => Unit): Unit = { diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 9ce037e961924..c605fa148a531 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -674,6 +674,11 @@ "Could not load Protobuf class with name " ] }, + "PROTOBUF_DEPENDENCY_ERROR" : { + "message" : [ + "Could not find dependency: " + ] + }, "PROTOBUF_DESCRIPTOR_ERROR" : { "message" : [ "Error parsing descriptor byte[] into Descriptor object Error: " diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index cc103e4ab00ac..04c15e98177b9 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -716,7 +716,7 @@ object SparkProtobuf { dependencyOverrides += "com.google.protobuf" % "protobuf-java" % protoVersion, - (Test / PB.protoSources) += (Test / sourceDirectory).value / "resources", + (Test / PB.protoSources) += (Test / sourceDirectory).value / "resources" / "protobuf", (Test / PB.targets) := Seq( PB.gens.java -> target.value / "generated-test-sources" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 14937c6149305..a6a1682671845 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3379,6 +3379,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("protobufClassName" -> protobufClassName)) } + def protobufDescriptorDependencyError(dependencyName: String): Throwable = { + new AnalysisException( + errorClass = "PROTOBUF_DEPENDENCY_ERROR", + messageParameters = Map("dependencyName" -> dependencyName)) + } + def invalidByteStringFormatError(): Throwable = { new AnalysisException(errorClass = "INVALID_BYTE_STRING_ERROR", messageParameters = Map.empty) } From 4e1080e0022d801de2fead9e7db1f428c7df5631 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Tue, 25 Oct 2022 16:48:24 -0700 Subject: [PATCH 11/19] review changes style changes, nit --- .../sql/protobuf/utils/ProtobufUtils.scala | 37 +++----- .../main/resources/error/error-classes.json | 6 +- .../sql/errors/QueryCompilationErrors.scala | 91 ++++++++++--------- 3 files changed, 63 insertions(+), 71 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index e797ee6df0373..6b012f4d88067 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -158,10 +158,9 @@ private[sql] object ProtobufUtils extends Logging { val protobufClass = try { Utils.classForName(protobufClassName) } catch { - case _: ClassNotFoundException => + case e: ClassNotFoundException => val hasDots = protobufClassName.contains(".") - throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, - if (hasDots) "" else ". 
Ensure the class name includes package prefix.") + throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, hasDots, e) } if (!classOf[Message].isAssignableFrom(protobufClass)) { @@ -200,7 +199,7 @@ private[sql] object ProtobufUtils extends Logging { throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } try { - val fileDescriptorProtoIndex = createDescriptorProtoIndex(fileDescriptorSet) + val fileDescriptorProtoIndex = createDescriptorProtoMap(fileDescriptorSet) val fileDescriptor: Descriptors.FileDescriptor = buildFileDescriptor(fileDescriptorSet.getFileList.asScala.last, fileDescriptorProtoIndex) if (fileDescriptor.getMessageTypes().isEmpty()) { @@ -216,37 +215,29 @@ private[sql] object ProtobufUtils extends Logging { /** * Recursively constructs file descriptors for all dependencies for given * FileDescriptorProto and return. - * @param descriptorProto - * @param descriptorProtoIndex - * @return Descriptors.FileDescriptor */ private def buildFileDescriptor( fileDescriptorProto: FileDescriptorProto, - fileDescriptorProtoIndex: Map[String, FileDescriptorProto]): Descriptors.FileDescriptor = { - var fileDescriptorList = List[Descriptors.FileDescriptor]() - for (dependencyName <- fileDescriptorProto.getDependencyList().asScala) { - if (!fileDescriptorProtoIndex.contains(dependencyName)) { - throw QueryCompilationErrors.protobufDescriptorDependencyError(dependencyName) + fileDescriptorProtoMap: Map[String, FileDescriptorProto]): Descriptors.FileDescriptor = { + val fileDescriptorList = fileDescriptorProto.getDependencyList().asScala.map { dependency => + fileDescriptorProtoMap.get(dependency) match { + case Some(dependencyProto) => + buildFileDescriptor(dependencyProto, fileDescriptorProtoMap) + case None => + throw QueryCompilationErrors.protobufDescriptorDependencyError(dependency) } - val dependencyProto: FileDescriptorProto = fileDescriptorProtoIndex.get(dependencyName).get - fileDescriptorList = fileDescriptorList ++ List( - buildFileDescriptor(dependencyProto, fileDescriptorProtoIndex)) } Descriptors.FileDescriptor.buildFrom(fileDescriptorProto, fileDescriptorList.toArray) } /** * Returns a map from descriptor proto name as found inside the descriptors to protos. 
- * @param fileDescriptorSet - * @return Map[String, FileDescriptorProto] */ - private def createDescriptorProtoIndex( + private def createDescriptorProtoMap( fileDescriptorSet: FileDescriptorSet): Map[String, FileDescriptorProto] = { - var resultBuilder = Map[String, FileDescriptorProto]() - for (descriptorProto <- fileDescriptorSet.getFileList().asScala) { - resultBuilder = resultBuilder ++ Map(descriptorProto.getName() -> descriptorProto) - } - resultBuilder + fileDescriptorSet.getFileList().asScala.map { descriptorProto => + descriptorProto.getName() -> descriptorProto + }.toMap[String, FileDescriptorProto] } /** diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index c605fa148a531..15c1ab22b9814 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -671,7 +671,7 @@ }, "PROTOBUF_CLASS_LOAD_ERROR" : { "message" : [ - "Could not load Protobuf class with name " + "Could not load Protobuf class with name " ] }, "PROTOBUF_DEPENDENCY_ERROR" : { @@ -681,12 +681,12 @@ }, "PROTOBUF_DESCRIPTOR_ERROR" : { "message" : [ - "Error parsing descriptor byte[] into Descriptor object Error: " + "Error parsing descriptor byte[] into Descriptor object" ] }, "PROTOBUF_DESCRIPTOR_PARSING_ERROR" : { "message" : [ - "Error constructing FileDescriptor, Error: " + "Error constructing FileDescriptor" ] }, "PROTOBUF_FIELD_MISSING_ERROR" : { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index a6a1682671845..66d80e3d382d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3213,38 +3213,38 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def cannotConvertProtobufTypeToSqlTypeError( - protobufColumn: String, - sqlColumn: String, - protobufType: String, - sqlType: DataType): Throwable = { + protobufColumn: String, + sqlColumn: String, + protobufType: String, + sqlType: DataType): Throwable = { new AnalysisException( errorClass = "PROTOBUF_TYPE_TO_SQL_TYPE_ERROR", messageParameters = Map( "protobufColumn" -> protobufColumn, - "sqlColumn" -> sqlColumn, + "sqlColumn" -> toSQLId(sqlColumn), "protobufType" -> protobufType, "sqlType" -> toSQLType(sqlType))) } def cannotConvertCatalystTypeToProtobufTypeError( - sqlColumn: String, - protobufColumn: String, - sqlType: DataType, - protobufType: String): Throwable = { + sqlColumn: String, + protobufColumn: String, + sqlType: DataType, + protobufType: String): Throwable = { new AnalysisException( errorClass = "SQL_TYPE_TO_PROTOBUF_TYPE_ERROR", messageParameters = Map( - "sqlColumn" -> sqlColumn, + "sqlColumn" -> toSQLId(sqlColumn), "protobufColumn" -> protobufColumn, "sqlType" -> toSQLType(sqlType), "protobufType" -> protobufType)) } def cannotConvertCatalystTypeToProtobufEnumTypeError( - sqlColumn: String, - protobufColumn: String, - data: String, - enumString: String): Throwable = { + sqlColumn: String, + protobufColumn: String, + data: String, + enumString: String): Throwable = { new AnalysisException( errorClass = "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", messageParameters = Map( @@ -3255,27 +3255,27 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def cannotConvertProtobufTypeToCatalystTypeError( - protobufType: String, - 
sqlType: DataType, - e1: Throwable): Throwable = { + protobufType: String, + sqlType: DataType, + cause: Throwable): Throwable = { new AnalysisException( errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), - cause = Some(e1.getCause)) + cause = Some(cause.getCause)) } def cannotConvertSqlTypeToProtobufError( - protobufType: String, - sqlType: DataType, - e1: Throwable): Throwable = { + protobufType: String, + sqlType: DataType, + cause: Throwable): Throwable = { new AnalysisException( errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), - cause = Some(e1.getCause)) + cause = Some(cause.getCause)) } def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { @@ -3285,8 +3285,8 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def unknownProtobufMessageTypeError( - descriptorName: String, - containingType: String): Throwable = { + descriptorName: String, + containingType: String): Throwable = { new AnalysisException( errorClass = "UNKNOWN_PROTOBUF_MESSAGE_TYPE", messageParameters = Map( @@ -3306,11 +3306,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("field" -> field)) } - def protobufFieldMatchError( - field: String, - protobufSchema: String, - matchSize: String, - matches: String): Throwable = { + def protobufFieldMatchError(field: String, + protobufSchema: String, + matchSize: String, + matches: String): Throwable = { new AnalysisException( errorClass = "PROTOBUF_FIELD_MISSING_ERROR", messageParameters = Map( @@ -3326,18 +3325,18 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("messageName" -> messageName)) } - def descrioptorParseError(e1: Throwable): Throwable = { + def descrioptorParseError(cause: Throwable): Throwable = { new AnalysisException( errorClass = "PROTOBUF_DESCRIPTOR_ERROR", - messageParameters = Map("errorMessage" -> e1.getMessage()), - cause = Some(e1.getCause)) + messageParameters = Map.empty(), + cause = Some(cause.getCause)) } - def cannotFindDescriptorFileError(filePath: String, e1: Throwable): Throwable = { + def cannotFindDescriptorFileError(filePath: String, cause: Throwable): Throwable = { new AnalysisException( errorClass = "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR", messageParameters = Map("filePath" -> filePath), - cause = Some(e1.getCause)) + cause = Some(cause.getCause)) } def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { @@ -3346,11 +3345,11 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("descriptorName" -> descriptorName)) } - def failedParsingDescriptorError(e1: Throwable): Throwable = { + def failedParsingDescriptorError(cause: Throwable): Throwable = { new AnalysisException( errorClass = "PROTOBUF_DESCRIPTOR_PARSING_ERROR", - messageParameters = Map("errorMessage" -> e1.getMessage()), - cause = Some(e1.getCause)) + messageParameters = Map.empty(), + cause = Some(cause.getCause)) } def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { @@ -3365,12 +3364,15 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map("field" -> field)) } - def protobufClassLoadError(protobufClassName: String, errorMessage: String): Throwable = { + def protobufClassLoadError( + protobufClassName: String, + hasDots: Boolean, + cause: 
Throwable): Throwable = { + val message = if (hasDots) "" else ". Ensure the class name includes package prefix." new AnalysisException( errorClass = "PROTOBUF_CLASS_LOAD_ERROR", - messageParameters = Map( - "protobufClassName" -> protobufClassName, - "errorMessage" -> errorMessage)) + messageParameters = Map("protobufClassName" -> protobufClassName, "message" -> message), + cause = Some(cause.getCause)) } def protobufMessageTypeError(protobufClassName: String): Throwable = { @@ -3389,11 +3391,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { new AnalysisException(errorClass = "INVALID_BYTE_STRING_ERROR", messageParameters = Map.empty) } - def malformedRecordsDetectedInRecordParsingError(e1: Throwable): Throwable = { + def malformedRecordsDetectedInRecordParsingError(cause: Throwable): Throwable = { new SparkException( errorClass = "_LEGACY_ERROR_TEMP_2177", - messageParameters = Map( - "failFastMode" -> FailFastMode.name), - cause = e1) + messageParameters = Map("failFastMode" -> FailFastMode.name), + cause = cause) } } From 6045ffee0b0ec8a5df85877c61abcf85009282b0 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Wed, 26 Oct 2022 16:35:31 -0700 Subject: [PATCH 12/19] review changes: search messages on all imports --- .../sql/protobuf/utils/ProtobufUtils.scala | 26 +++++++++++------- .../test/resources/protobuf/basicmessage.desc | Bin 616 -> 0 bytes .../sql/protobuf/ProtobufFunctionsSuite.scala | 9 ++---- 3 files changed, 18 insertions(+), 17 deletions(-) delete mode 100644 connector/protobuf/src/test/resources/protobuf/basicmessage.desc diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 6b012f4d88067..49d4acb1aed9c 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -175,19 +175,26 @@ private[sql] object ProtobufUtils extends Logging { .asInstanceOf[Descriptor] } + // TODO: Revisit to ensure that messageName is searched through all imports def buildDescriptor(descFilePath: String, messageName: String): Descriptor = { - val descriptor = parseFileDescriptor(descFilePath).getMessageTypes.asScala.find { desc => - desc.getName == messageName || desc.getFullName == messageName + val descriptorList = parseFileDescriptor(descFilePath).map(fileDescriptor => { + fileDescriptor.getMessageTypes.asScala.find { desc => + desc.getName == messageName || desc.getFullName == messageName + } + }).filter(f => !f.isEmpty) + + if (descriptorList.isEmpty) { + throw QueryCompilationErrors.noProtobufMessageTypeReturnError(messageName) } - descriptor match { + descriptorList.last match { case Some(d) => d case None => throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) } } - private def parseFileDescriptor(descFilePath: String): Descriptors.FileDescriptor = { + private def parseFileDescriptor(descFilePath: String): List[Descriptors.FileDescriptor] = { var fileDescriptorSet: DescriptorProtos.FileDescriptorSet = null try { val dscFile = new BufferedInputStream(new FileInputStream(descFilePath)) @@ -200,12 +207,11 @@ private[sql] object ProtobufUtils extends Logging { } try { val fileDescriptorProtoIndex = createDescriptorProtoMap(fileDescriptorSet) - val fileDescriptor: Descriptors.FileDescriptor = - buildFileDescriptor(fileDescriptorSet.getFileList.asScala.last, 
fileDescriptorProtoIndex) - if (fileDescriptor.getMessageTypes().isEmpty()) { - throw QueryCompilationErrors.noProtobufMessageTypeReturnError(fileDescriptor.getName()) - } - fileDescriptor + val fileDescriptorList: List[Descriptors.FileDescriptor] = + fileDescriptorSet.getFileList.asScala.map( fileDescriptorProto => + buildFileDescriptor(fileDescriptorProto, fileDescriptorProtoIndex) + ).toList + fileDescriptorList } catch { case e: Descriptors.DescriptorValidationException => throw QueryCompilationErrors.failedParsingDescriptorError(e) diff --git a/connector/protobuf/src/test/resources/protobuf/basicmessage.desc b/connector/protobuf/src/test/resources/protobuf/basicmessage.desc deleted file mode 100644 index 40529ffec581a4ae6b4e197181d6e29b02c172ef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 616 zcmb7?-A=+V9L3!bg#RXvdP7W%AMwI?u^1pcz+@^H4$~2DNe2ZN$BOG_;;Z;XK7()e*e>tQ}{5zNh#0ijHboXU1cn1=z?Vnm#oP2nYw&MvKN=XCd!M7xmxfm zA0Q|veKCm;Gy^{z`-4d^8s86tJ7)(uU~BDmh{LgCpcd?FCG>Ab!N4)SmUpm;d6fG{ zwelEU_^bem2#=>r$~jrk{~Yh=*R{^SOt3u}_n_iX6U=zlHcpMp3KlDL1~$*LI9*KM zNK(+YDVzqjl=dVgbOKhK=G~4aDuSqDD5Y(n-_yWu87_f-PXS_+87rcMYN9F#16xX4 zXUr2ua!u3(VPYsH1snrJj3wIN6Mh{xq_Uom-gC;eUl+cOp_CL5f}N?>C$jc-LlDi* ZwfmRe-A!BA(kb!>c!#3Ot=Iig Date: Thu, 27 Oct 2022 08:32:04 -0700 Subject: [PATCH 13/19] review changes: use prettyName --- .../apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala index a0b7c145926d0..9bdbd5f685cd1 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala @@ -70,7 +70,7 @@ private[protobuf] case class ProtobufDataToCatalyst( @transient private lazy val parseMode: ParseMode = { val mode = protobufOptions.parseMode if (mode != PermissiveMode && mode != FailFastMode) { - throw QueryCompilationErrors.parseModeUnsupportedError("from_protobuf", mode) + throw QueryCompilationErrors.parseModeUnsupportedError(prettyName, mode) } mode } @@ -94,7 +94,7 @@ private[protobuf] case class ProtobufDataToCatalyst( case FailFastMode => throw QueryCompilationErrors.malformedRecordsDetectedInRecordParsingError(e) case _ => - throw QueryCompilationErrors.parseModeUnsupportedError("from_protobuf", parseMode) + throw QueryCompilationErrors.parseModeUnsupportedError(prettyName, parseMode) } } From dd63be8537c3fa18e35f765b120820bacc5cc67c Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 28 Oct 2022 09:04:29 -0700 Subject: [PATCH 14/19] review changes: nit, option, runtime error --- .../sql/protobuf/ProtobufDataToCatalyst.scala | 4 +-- .../sql/protobuf/ProtobufDeserializer.scala | 2 +- .../sql/protobuf/ProtobufSerializer.scala | 4 +-- .../main/resources/error/error-classes.json | 5 ++++ .../sql/errors/QueryCompilationErrors.scala | 28 +++++++------------ .../sql/errors/QueryExecutionErrors.scala | 8 ++++++ 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala index 9bdbd5f685cd1..c0997b1bd0638 100644 --- 
a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala @@ -24,7 +24,7 @@ import com.google.protobuf.DynamicMessage import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, SpecificInternalRow, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode} -import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.protobuf.utils.{ProtobufOptions, ProtobufUtils, SchemaConverters} import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, StructType} @@ -92,7 +92,7 @@ private[protobuf] case class ProtobufDataToCatalyst( case PermissiveMode => nullResultRow case FailFastMode => - throw QueryCompilationErrors.malformedRecordsDetectedInRecordParsingError(e) + throw QueryExecutionErrors.malformedProtobufMessageDetectedInMessageParsingError(e) case _ => throw QueryCompilationErrors.parseModeUnsupportedError(prettyName, parseMode) } diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index 1c6dbf54ad08e..23ef0e21a6ddb 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -243,7 +243,7 @@ private[sql] class ProtobufDeserializer( case _ => throw QueryCompilationErrors.cannotConvertProtobufTypeToSqlTypeError( toFieldStr(protoPath), - toFieldStr(catalystPath), + catalystPath, s"${protoType} ${protoType.toProto.getLabel} ${protoType.getJavaType}" + s" ${protoType.getType}", catalystType) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala index b09db6b6b2c23..0f87c640b194b 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -104,7 +104,7 @@ private[sql] class ProtobufSerializer( val data = getter.getUTF8String(ordinal).toString if (!enumSymbols.contains(data)) { throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufEnumTypeError( - toFieldStr(catalystPath), + catalystPath, toFieldStr(protoPath), data, enumSymbols.mkString("\"", "\", \"", "\"")) @@ -218,7 +218,7 @@ private[sql] class ProtobufSerializer( case _ => throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufTypeError( - toFieldStr(catalystPath), + catalystPath, toFieldStr(protoPath), catalystType, s"${fieldDescriptor} ${fieldDescriptor.toProto.getLabel} ${fieldDescriptor.getJavaType}" + diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 571a11b39bff6..eab9a6f3ea1b7 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -594,6 +594,11 @@ } } }, + "MALFORMED_PROTOBUF_MESSAGE_ERROR" : { + "message" : [ + "Malformed Protobuf messages are detected in message deserialization. Parse Mode: . 
To process malformed protobuf message as null result, try setting the option 'mode' as 'PERMISSIVE'." + ] + }, "MISSING_STATIC_PARTITION_COLUMN" : { "message" : [ "Unknown static partition column: " diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 66d80e3d382d1..699084bad986d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -21,7 +21,6 @@ import scala.collection.mutable import org.apache.hadoop.fs.Path -import org.apache.spark.SparkException import org.apache.spark.SparkThrowableHelper import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.{FunctionIdentifier, QualifiedTableName, TableIdentifier} @@ -3214,7 +3213,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def cannotConvertProtobufTypeToSqlTypeError( protobufColumn: String, - sqlColumn: String, + sqlColumn: Seq[String], protobufType: String, sqlType: DataType): Throwable = { new AnalysisException( @@ -3227,7 +3226,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def cannotConvertCatalystTypeToProtobufTypeError( - sqlColumn: String, + sqlColumn: Seq[String], protobufColumn: String, sqlType: DataType, protobufType: String): Throwable = { @@ -3241,14 +3240,14 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def cannotConvertCatalystTypeToProtobufEnumTypeError( - sqlColumn: String, + sqlColumn: Seq[String], protobufColumn: String, data: String, enumString: String): Throwable = { new AnalysisException( errorClass = "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", messageParameters = Map( - "sqlColumn" -> sqlColumn, + "sqlColumn" -> toSQLId(sqlColumn), "protobufColumn" -> protobufColumn, "data" -> data, "enumString" -> enumString)) @@ -3263,7 +3262,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def cannotConvertSqlTypeToProtobufError( @@ -3275,7 +3274,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { @@ -3329,14 +3328,14 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { new AnalysisException( errorClass = "PROTOBUF_DESCRIPTOR_ERROR", messageParameters = Map.empty(), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def cannotFindDescriptorFileError(filePath: String, cause: Throwable): Throwable = { new AnalysisException( errorClass = "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR", messageParameters = Map("filePath" -> filePath), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { @@ -3349,7 +3348,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { new AnalysisException( errorClass = "PROTOBUF_DESCRIPTOR_PARSING_ERROR", messageParameters = Map.empty(), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def foundRecursionInProtobufSchema(fieldDescriptor: String): 
Throwable = { @@ -3372,7 +3371,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { new AnalysisException( errorClass = "PROTOBUF_CLASS_LOAD_ERROR", messageParameters = Map("protobufClassName" -> protobufClassName, "message" -> message), - cause = Some(cause.getCause)) + cause = Option(cause.getCause)) } def protobufMessageTypeError(protobufClassName: String): Throwable = { @@ -3390,11 +3389,4 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def invalidByteStringFormatError(): Throwable = { new AnalysisException(errorClass = "INVALID_BYTE_STRING_ERROR", messageParameters = Map.empty) } - - def malformedRecordsDetectedInRecordParsingError(cause: Throwable): Throwable = { - new SparkException( - errorClass = "_LEGACY_ERROR_TEMP_2177", - messageParameters = Map("failFastMode" -> FailFastMode.name), - cause = cause) - } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index ba78858debc01..d9a27a61ad82f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -2704,4 +2704,12 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase { errorClass = "UNSUPPORTED_EMPTY_LOCATION", messageParameters = Map.empty) } + + def malformedProtobufMessageDetectedInMessageParsingError(e: Throwable): Throwable = { + new SparkException( + errorClass = "MALFORMED_PROTOBUF_MESSAGE_ERROR", + messageParameters = Map( + "failFastMode" -> FailFastMode.name), + cause = e) + } } From e5140b045133226e5d98638518bee4a6f830b115 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Fri, 28 Oct 2022 13:43:18 -0700 Subject: [PATCH 15/19] error class name changes, more details to error message error class name changes, more details to error message --- .../sql/protobuf/ProtobufDeserializer.scala | 3 +- .../sql/protobuf/utils/ProtobufUtils.scala | 4 +- .../sql/protobuf/ProtobufSerdeSuite.scala | 20 ++-- .../main/resources/error/error-classes.json | 110 +++++++++--------- .../sql/errors/QueryCompilationErrors.scala | 54 +++++---- .../sql/errors/QueryExecutionErrors.scala | 2 +- 6 files changed, 99 insertions(+), 94 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala index 23ef0e21a6ddb..46366ba268b09 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -195,7 +195,8 @@ private[sql] class ProtobufDeserializer( (updater, ordinal, value) => val byte_array = value match { case s: ByteString => s.toByteArray - case _ => throw QueryCompilationErrors.invalidByteStringFormatError() + case unsupported => + throw QueryCompilationErrors.invalidByteStringFormatError(unsupported) } updater.set(ordinal, byte_array) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 49d4acb1aed9c..9e99c2f6a9045 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ 
b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -201,7 +201,7 @@ private[sql] object ProtobufUtils extends Logging { fileDescriptorSet = DescriptorProtos.FileDescriptorSet.parseFrom(dscFile) } catch { case ex: InvalidProtocolBufferException => - throw QueryCompilationErrors.descrioptorParseError(ex) + throw QueryCompilationErrors.descrioptorParseError(descFilePath, ex) case ex: IOException => throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) } @@ -214,7 +214,7 @@ private[sql] object ProtobufUtils extends Logging { fileDescriptorList } catch { case e: Descriptors.DescriptorValidationException => - throw QueryCompilationErrors.failedParsingDescriptorError(e) + throw QueryCompilationErrors.failedParsingDescriptorError(descFilePath, e) } } diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala index 207df011b4e75..840535654ed6a 100644 --- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -71,7 +71,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Deserializer, fieldMatch, - errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE", params = Map( "protobufType" -> "MissMatchTypeInRoot", "toType" -> toSQLType(CATALYST_STRUCT))) @@ -80,7 +80,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Serializer, fieldMatch, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "MissMatchTypeInRoot", "toType" -> toSQLType(CATALYST_STRUCT))) @@ -98,7 +98,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Serializer, BY_NAME, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "FieldMissingInProto", "toType" -> toSQLType(CATALYST_STRUCT))) @@ -106,7 +106,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { assertFailedConversionMessage(protoFile, Serializer, BY_NAME, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "FieldMissingInProto", "toType" -> toSQLType(nonnullCatalyst))) @@ -124,7 +124,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { Deserializer, fieldMatch, catalyst, - errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE", params = Map( "protobufType" -> "MissMatchTypeInDeepNested", "toType" -> toSQLType(catalyst))) @@ -134,7 +134,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { Serializer, fieldMatch, catalyst, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "MissMatchTypeInDeepNested", "toType" -> toSQLType(catalyst))) @@ -149,7 +149,7 @@ class ProtobufSerdeSuite extends SharedSparkSession { protoFile, Serializer, BY_NAME, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "FieldMissingInSQLRoot", "toType" -> toSQLType(CATALYST_STRUCT))) @@ -166,7 +166,7 @@ class 
ProtobufSerdeSuite extends SharedSparkSession { protoNestedFile, Serializer, BY_NAME, - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", params = Map( "protobufType" -> "FieldMissingInSQLNested", "toType" -> toSQLType(CATALYST_STRUCT))) @@ -196,10 +196,10 @@ class ProtobufSerdeSuite extends SharedSparkSession { val expectMsg = serdeFactory match { case Deserializer => - s"[PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR] Unable to convert" + + s"[CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE] Unable to convert" + s" ${protoSchema.getName} of Protobuf to SQL type ${toSQLType(catalystSchema)}." case Serializer => - s"[UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE] Unable to convert SQL type" + + s"[UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE] Unable to convert SQL type" + s" ${toSQLType(catalystSchema)} to Protobuf type ${protoSchema.getName}." } diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index eab9a6f3ea1b7..7df7946d28fde 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -17,29 +17,49 @@ ], "sqlState" : "22005" }, + "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR" : { + "message" : [ + "Error constructing FileDescriptor for " + ] + }, + "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE" : { + "message" : [ + "Unable to convert of Protobuf to SQL type ." + ] + }, + "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_FIELD_TYPE" : { + "message" : [ + "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = )." + ] + }, "CANNOT_DECODE_URL" : { "message" : [ "Cannot decode url : ." ], "sqlState" : "42000" }, - "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR" : { - "message" : [ - "Error reading Protobuf descriptor file at path: " - ] - }, "CANNOT_INFER_DATE" : { "message" : [ "Cannot infer date in schema inference when LegacyTimeParserPolicy is \"LEGACY\". Legacy Date formatter does not support strict date format matching which is required to avoid inferring timestamps and other non-date entries to date." ], "sqlState" : "22007" }, + "CANNOT_LOAD_PROTOBUF_CLASS" : { + "message" : [ + "Could not load Protobuf class with name " + ] + }, "CANNOT_PARSE_DECIMAL" : { "message" : [ "Cannot parse decimal" ], "sqlState" : "42000" }, + "CANNOT_PARSE_PROTOBUF_DESCRIPTOR" : { + "message" : [ + "Error parsing file descriptor byte[] into Descriptor object" + ] + }, "CANNOT_PARSE_TIMESTAMP" : { "message" : [ ". If necessary set to \"false\" to bypass this error." @@ -70,11 +90,6 @@ ], "sqlState" : "22005" }, - "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR" : { - "message" : [ - "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " - ] - }, "COLUMN_NOT_IN_GROUP_BY_CLAUSE" : { "message" : [ "The expression is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in `first()` (or `first_value()`) if you don't care which value you get." @@ -527,9 +542,9 @@ "Invalid bucket file: " ] }, - "INVALID_BYTE_STRING_ERROR" : { + "INVALID_BYTE_STRING" : { "message" : [ - "Invalid ByteString format" + "The expected format is ByteString, but was ()." ] }, "INVALID_COLUMN_OR_FIELD_DATA_TYPE" : { @@ -576,6 +591,11 @@ " is an invalid property value, please use quotes, e.g. 
SET =" ] }, + "INVALID_PROTOBUF_MESSAGE_TYPE" : { + "message" : [ + " is not a Protobuf message type" + ] + }, "INVALID_SQL_SYNTAX" : { "message" : [ "Invalid SQL syntax: " @@ -594,7 +614,7 @@ } } }, - "MALFORMED_PROTOBUF_MESSAGE_ERROR" : { + "MALFORMED_PROTOBUF_MESSAGE" : { "message" : [ "Malformed Protobuf messages are detected in message deserialization. Parse Mode: . To process malformed protobuf message as null result, try setting the option 'mode' as 'PERMISSIVE'." ] @@ -649,22 +669,22 @@ ], "sqlState" : "42000" }, - "NO_CATALYST_TYPE_IN_PROTOBUF_SCHEMA" : { - "message" : [ - "Cannot find in Protobuf schema" - ] - }, "NO_HANDLER_FOR_UDAF" : { "message" : [ "No handler for UDAF ''. Use sparkSession.udf.register(...) instead." ] }, - "NO_PROTOBUF_MESSAGE_TYPE_ERROR" : { + "NO_PROTOBUF_MESSAGE_TYPE" : { "message" : [ "No MessageTypes returned, " ] }, - "NO_UDF_INTERFACE_ERROR" : { + "NO_SQL_TYPE_IN_PROTOBUF_SCHEMA" : { + "message" : [ + "Cannot find in Protobuf schema" + ] + }, + "NO_UDF_INTERFACE" : { "message" : [ "UDF class doesn't implement any UDF interface" ] @@ -736,32 +756,22 @@ ], "sqlState" : "42000" }, - "PROTOBUF_CLASS_LOAD_ERROR" : { - "message" : [ - "Could not load Protobuf class with name " - ] - }, - "PROTOBUF_DEPENDENCY_ERROR" : { + "PROTOBUF_DEPENDENCY_NOT_FOUND" : { "message" : [ "Could not find dependency: " ] }, - "PROTOBUF_DESCRIPTOR_ERROR" : { - "message" : [ - "Error parsing descriptor byte[] into Descriptor object" - ] - }, - "PROTOBUF_DESCRIPTOR_PARSING_ERROR" : { + "PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND" : { "message" : [ - "Error constructing FileDescriptor" + "Error reading Protobuf descriptor file at path: " ] }, - "PROTOBUF_FIELD_MISSING_ERROR" : { + "PROTOBUF_FIELD_MISSING" : { "message" : [ "Searching for in Protobuf schema at gave matches. Candidates: " ] }, - "PROTOBUF_FIELD_MISSING_IN_CATALYST_SCHEMA" : { + "PROTOBUF_FIELD_MISSING_IN_SQL_SCHEMA" : { "message" : [ "Found in Protobuf schema but there is no match in the SQL schema" ] @@ -771,29 +781,24 @@ "Type mismatch encountered for field: " ] }, - "PROTOBUF_MESSAGE_TYPE_ERROR" : { + "PROTOBUF_FIELD_TYPE_TO_SQL_TYPE_ERROR" : { "message" : [ - " is not a Protobuf message type" + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." ] }, - "PROTOBUF_RECURSION_ERROR" : { + "PROTOBUF_MESSAGE_NOT_FOUND" : { "message" : [ - "Found recursive reference in Protobuf schema, which can not be processed by Spark: " + "Unable to locate Message in Descriptor" ] }, - "PROTOBUF_TYPE_NOT_SUPPORT_ERROR" : { + "PROTOBUF_TYPE_NOT_SUPPORT" : { "message" : [ "Protobuf type not yet supported: ." ] }, - "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR" : { - "message" : [ - "Unable to convert of Protobuf to SQL type ." - ] - }, - "PROTOBUF_TYPE_TO_SQL_TYPE_ERROR" : { + "RECURSIVE_PROTOBUF_SCHEMA" : { "message" : [ - "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." + "Found recursive reference in Protobuf schema, which can not be processed by Spark: " ] }, "RENAME_SRC_PATH_NOT_FOUND" : { @@ -850,9 +855,9 @@ ], "sqlState" : "22023" }, - "SQL_TYPE_TO_PROTOBUF_TYPE_ERROR" : { + "SQL_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR" : { "message" : [ - "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = )." 
+ "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " ] }, "TABLE_OR_VIEW_ALREADY_EXISTS" : { @@ -887,16 +892,11 @@ "Unable to acquire bytes of memory, got " ] }, - "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE" : { + "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE" : { "message" : [ "Unable to convert SQL type to Protobuf type ." ] }, - "UNABLE_TO_LOCATE_PROTOBUF_MESSAGE_ERROR" : { - "message" : [ - "Unable to locate Message in Descriptor" - ] - }, "UNKNOWN_PROTOBUF_MESSAGE_TYPE" : { "message" : [ "Attempting to treat as a Message, but it was " diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 699084bad986d..9b80057181b54 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3005,7 +3005,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def udfClassDoesNotImplementAnyUDFInterfaceError(className: String): Throwable = { new AnalysisException( - errorClass = "NO_UDF_INTERFACE_ERROR", + errorClass = "NO_UDF_INTERFACE", messageParameters = Map("className" -> className)) } @@ -3217,7 +3217,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { protobufType: String, sqlType: DataType): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_TYPE_TO_SQL_TYPE_ERROR", + errorClass = "PROTOBUF_FIELD_TYPE_TO_SQL_TYPE_ERROR", messageParameters = Map( "protobufColumn" -> protobufColumn, "sqlColumn" -> toSQLId(sqlColumn), @@ -3231,7 +3231,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { sqlType: DataType, protobufType: String): Throwable = { new AnalysisException( - errorClass = "SQL_TYPE_TO_PROTOBUF_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_FIELD_TYPE", messageParameters = Map( "sqlColumn" -> toSQLId(sqlColumn), "protobufColumn" -> protobufColumn, @@ -3245,7 +3245,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { data: String, enumString: String): Throwable = { new AnalysisException( - errorClass = "CATALYST_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", + errorClass = "SQL_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", messageParameters = Map( "sqlColumn" -> toSQLId(sqlColumn), "protobufColumn" -> protobufColumn, @@ -3258,7 +3258,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { sqlType: DataType, cause: Throwable): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_TYPE_TO_CATALYST_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE", messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), @@ -3270,7 +3270,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { sqlType: DataType, cause: Throwable): Throwable = { new AnalysisException( - errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_TYPE", + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", messageParameters = Map( "protobufType" -> protobufType, "toType" -> toSQLType(sqlType)), @@ -3279,7 +3279,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def protobufTypeUnsupportedYetError(protobufType: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_TYPE_NOT_SUPPORT_ERROR", + errorClass = "PROTOBUF_TYPE_NOT_SUPPORT", messageParameters = Map("protobufType" -> protobufType)) } @@ -3295,13 +3295,13 @@ 
private[sql] object QueryCompilationErrors extends QueryErrorsBase { def cannotFindCatalystTypeInProtobufSchemaError(catalystFieldPath: String): Throwable = { new AnalysisException( - errorClass = "NO_CATALYST_TYPE_IN_PROTOBUF_SCHEMA", + errorClass = "NO_SQL_TYPE_IN_PROTOBUF_SCHEMA", messageParameters = Map("catalystFieldPath" -> catalystFieldPath)) } def cannotFindProtobufFieldInCatalystError(field: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_FIELD_MISSING_IN_CATALYST_SCHEMA", + errorClass = "PROTOBUF_FIELD_MISSING_IN_SQL_SCHEMA", messageParameters = Map("field" -> field)) } @@ -3310,7 +3310,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { matchSize: String, matches: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_FIELD_MISSING_ERROR", + errorClass = "PROTOBUF_FIELD_MISSING", messageParameters = Map( "field" -> field, "protobufSchema" -> protobufSchema, @@ -3320,40 +3320,40 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def unableToLocateProtobufMessageError(messageName: String): Throwable = { new AnalysisException( - errorClass = "UNABLE_TO_LOCATE_PROTOBUF_MESSAGE_ERROR", + errorClass = "PROTOBUF_MESSAGE_NOT_FOUND", messageParameters = Map("messageName" -> messageName)) } - def descrioptorParseError(cause: Throwable): Throwable = { + def descrioptorParseError(descFilePath: String, cause: Throwable): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_DESCRIPTOR_ERROR", - messageParameters = Map.empty(), + errorClass = "CANNOT_PARSE_PROTOBUF_DESCRIPTOR", + messageParameters = Map.empty("descFilePath" -> descFilePath), cause = Option(cause.getCause)) } def cannotFindDescriptorFileError(filePath: String, cause: Throwable): Throwable = { new AnalysisException( - errorClass = "CANNOT_FIND_PROTOBUF_DESCRIPTOR_FILE_ERROR", + errorClass = "PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND", messageParameters = Map("filePath" -> filePath), cause = Option(cause.getCause)) } def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { new AnalysisException( - errorClass = "NO_PROTOBUF_MESSAGE_TYPE_ERROR", + errorClass = "NO_PROTOBUF_MESSAGE_TYPE", messageParameters = Map("descriptorName" -> descriptorName)) } - def failedParsingDescriptorError(cause: Throwable): Throwable = { + def failedParsingDescriptorError(descFilePath: String, cause: Throwable): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_DESCRIPTOR_PARSING_ERROR", - messageParameters = Map.empty(), + errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR", + messageParameters = Map.empty("descFilePath" -> descFilePath), cause = Option(cause.getCause)) } def foundRecursionInProtobufSchema(fieldDescriptor: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_RECURSION_ERROR", + errorClass = "RECURSIVE_PROTOBUF_SCHEMA", messageParameters = Map("fieldDescriptor" -> fieldDescriptor)) } @@ -3369,24 +3369,28 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { cause: Throwable): Throwable = { val message = if (hasDots) "" else ". Ensure the class name includes package prefix." 
new AnalysisException( - errorClass = "PROTOBUF_CLASS_LOAD_ERROR", + errorClass = "CANNOT_LOAD_PROTOBUF_CLASS", messageParameters = Map("protobufClassName" -> protobufClassName, "message" -> message), cause = Option(cause.getCause)) } def protobufMessageTypeError(protobufClassName: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_MESSAGE_TYPE_ERROR", + errorClass = "INVALID_PROTOBUF_MESSAGE_TYPE", messageParameters = Map("protobufClassName" -> protobufClassName)) } def protobufDescriptorDependencyError(dependencyName: String): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_DEPENDENCY_ERROR", + errorClass = "PROTOBUF_DEPENDENCY_NOT_FOUND", messageParameters = Map("dependencyName" -> dependencyName)) } - def invalidByteStringFormatError(): Throwable = { - new AnalysisException(errorClass = "INVALID_BYTE_STRING_ERROR", messageParameters = Map.empty) + def invalidByteStringFormatError(unsupported: Any): Throwable = { + new AnalysisException( + errorClass = "INVALID_BYTE_STRING", + messageParameters = Map( + "unsupported" -> unsupported.toString, + "class" -> unsupported.getClass.toString)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index d9a27a61ad82f..0afeee97e9227 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -2707,7 +2707,7 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase { def malformedProtobufMessageDetectedInMessageParsingError(e: Throwable): Throwable = { new SparkException( - errorClass = "MALFORMED_PROTOBUF_MESSAGE_ERROR", + errorClass = "MALFORMED_PROTOBUF_MESSAGE", messageParameters = Map( "failFastMode" -> FailFastMode.name), cause = e) From be02c9e28d0268067557b469fc8e3cecb9428d96 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Sat, 29 Oct 2022 12:43:00 -0700 Subject: [PATCH 16/19] NO_UDF_INTERFACE_ERROR to NO_UDF_INTERFACE --- .../apache/spark/sql/errors/QueryCompilationErrorsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala index 8086a0e97ec2d..bed647ef49fcd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryCompilationErrorsSuite.scala @@ -227,7 +227,7 @@ class QueryCompilationErrorsSuite parameters = Map[String, String]()) } - test("NO_UDF_INTERFACE_ERROR: java udf class does not implement any udf interface") { + test("NO_UDF_INTERFACE: java udf class does not implement any udf interface") { val className = "org.apache.spark.sql.errors.MyCastToString" val e = intercept[AnalysisException]( spark.udf.registerJava( @@ -237,7 +237,7 @@ class QueryCompilationErrorsSuite ) checkError( exception = e, - errorClass = "NO_UDF_INTERFACE_ERROR", + errorClass = "NO_UDF_INTERFACE", parameters = Map("className" -> className)) } From d8cce824672b8ec118cdda03ee44089f6f4e5681 Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Mon, 31 Oct 2022 22:49:57 -0700 Subject: [PATCH 17/19] review changes scala style, find, parseFileDescriptorSet --- .../sql/protobuf/utils/ProtobufUtils.scala | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 
deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 9e99c2f6a9045..7a1904290150b 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -175,26 +175,25 @@ private[sql] object ProtobufUtils extends Logging { .asInstanceOf[Descriptor] } - // TODO: Revisit to ensure that messageName is searched through all imports def buildDescriptor(descFilePath: String, messageName: String): Descriptor = { - val descriptorList = parseFileDescriptor(descFilePath).map(fileDescriptor => { - fileDescriptor.getMessageTypes.asScala.find { desc => + val fileDescriptor = parseFileDescriptorSet(descFilePath) + .find(!_.getMessageTypes.asScala.find(desc => desc.getName == messageName || desc.getFullName == messageName - } - }).filter(f => !f.isEmpty) - - if (descriptorList.isEmpty) { - throw QueryCompilationErrors.noProtobufMessageTypeReturnError(messageName) - } + ).isEmpty) - descriptorList.last match { - case Some(d) => d - case None => - throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) + fileDescriptor match { + case Some(f) => + f.getMessageTypes.asScala.find { desc => + desc.getName == messageName || desc.getFullName == messageName + } match { + case Some(d) => d + case None => throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) + } + case None => throw QueryCompilationErrors.noProtobufMessageTypeReturnError(messageName) } } - private def parseFileDescriptor(descFilePath: String): List[Descriptors.FileDescriptor] = { + private def parseFileDescriptorSet(descFilePath: String): List[Descriptors.FileDescriptor] = { var fileDescriptorSet: DescriptorProtos.FileDescriptorSet = null try { val dscFile = new BufferedInputStream(new FileInputStream(descFilePath)) From 48bcb5c994952ee8c4d4a7f2617f8226c4da61ac Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Tue, 1 Nov 2022 10:19:06 -0700 Subject: [PATCH 18/19] review changes buildDescriptor suggested changes --- .../sql/protobuf/utils/ProtobufUtils.scala | 21 ++++++++----------- .../main/resources/error/error-classes.json | 5 ----- .../sql/errors/QueryCompilationErrors.scala | 6 ------ 3 files changed, 9 insertions(+), 23 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index 7a1904290150b..ad217b41db51d 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -176,20 +176,17 @@ private[sql] object ProtobufUtils extends Logging { } def buildDescriptor(descFilePath: String, messageName: String): Descriptor = { - val fileDescriptor = parseFileDescriptorSet(descFilePath) - .find(!_.getMessageTypes.asScala.find(desc => - desc.getName == messageName || desc.getFullName == messageName - ).isEmpty) - - fileDescriptor match { - case Some(f) => - f.getMessageTypes.asScala.find { desc => + // Find the first message descriptor that matches the name. 
+ val descriptorOpt = parseFileDescriptorSet(descFilePath) + .flatMap { fileDesc => + fileDesc.getMessageTypes.asScala.find { desc => desc.getName == messageName || desc.getFullName == messageName - } match { - case Some(d) => d - case None => throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) } - case None => throw QueryCompilationErrors.noProtobufMessageTypeReturnError(messageName) + }.headOption + + descriptorOpt match { + case Some(d) => d + case None => throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) } } diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 06db85166bb6c..2ce7dbc3c6024 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -656,11 +656,6 @@ "No handler for UDAF ''. Use sparkSession.udf.register(...) instead." ] }, - "NO_PROTOBUF_MESSAGE_TYPE" : { - "message" : [ - "No MessageTypes returned, " - ] - }, "NO_SQL_TYPE_IN_PROTOBUF_SCHEMA" : { "message" : [ "Cannot find in Protobuf schema" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index c9c58dfa54dbe..eda01fd52bc74 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3340,12 +3340,6 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { cause = Option(cause.getCause)) } - def noProtobufMessageTypeReturnError(descriptorName: String): Throwable = { - new AnalysisException( - errorClass = "NO_PROTOBUF_MESSAGE_TYPE", - messageParameters = Map("descriptorName" -> descriptorName)) - } - def failedParsingDescriptorError(descFilePath: String, cause: Throwable): Throwable = { new AnalysisException( errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR", From 87918a13dfbfc8868c694c5bf6145129c1b6882a Mon Sep 17 00:00:00 2001 From: SandishKumarHN Date: Wed, 2 Nov 2022 17:12:14 -0700 Subject: [PATCH 19/19] review changes, error classes --- .../sql/protobuf/utils/ProtobufUtils.scala | 3 +-- .../main/resources/error/error-classes.json | 22 +++++++++---------- .../sql/errors/QueryCompilationErrors.scala | 8 +++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala index ad217b41db51d..4bd59ddce6cee 100644 --- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -159,8 +159,7 @@ private[sql] object ProtobufUtils extends Logging { Utils.classForName(protobufClassName) } catch { case e: ClassNotFoundException => - val hasDots = protobufClassName.contains(".") - throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, hasDots, e) + throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, e) } if (!classOf[Message].isAssignableFrom(protobufClass)) { diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 2ce7dbc3c6024..337266891236b 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ 
-22,11 +22,21 @@ "Error constructing FileDescriptor for " ] }, + "CANNOT_CONVERT_PROTOBUF_FIELD_TYPE_TO_SQL_TYPE" : { + "message" : [ + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." + ] + }, "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE" : { "message" : [ "Unable to convert of Protobuf to SQL type ." ] }, + "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_ENUM_TYPE" : { + "message" : [ + "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " + ] + }, "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_FIELD_TYPE" : { "message" : [ "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = )." @@ -46,7 +56,7 @@ }, "CANNOT_LOAD_PROTOBUF_CLASS" : { "message" : [ - "Could not load Protobuf class with name " + "Could not load Protobuf class with name . Ensure the class name includes package prefix." ] }, "CANNOT_PARSE_DECIMAL" : { @@ -758,11 +768,6 @@ "Type mismatch encountered for field: " ] }, - "PROTOBUF_FIELD_TYPE_TO_SQL_TYPE_ERROR" : { - "message" : [ - "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." - ] - }, "PROTOBUF_MESSAGE_NOT_FOUND" : { "message" : [ "Unable to locate Message in Descriptor" @@ -832,11 +837,6 @@ ], "sqlState" : "22023" }, - "SQL_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR" : { - "message" : [ - "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM " - ] - }, "TABLE_OR_VIEW_ALREADY_EXISTS" : { "message" : [ "Cannot create table or view because it already exists.", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index eda01fd52bc74..26583f3f51eeb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -3219,7 +3219,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { protobufType: String, sqlType: DataType): Throwable = { new AnalysisException( - errorClass = "PROTOBUF_FIELD_TYPE_TO_SQL_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_PROTOBUF_FIELD_TYPE_TO_SQL_TYPE", messageParameters = Map( "protobufColumn" -> protobufColumn, "sqlColumn" -> toSQLId(sqlColumn), @@ -3247,7 +3247,7 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { data: String, enumString: String): Throwable = { new AnalysisException( - errorClass = "SQL_TYPE_TO_PROTOBUF_ENUM_TYPE_ERROR", + errorClass = "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_ENUM_TYPE", messageParameters = Map( "sqlColumn" -> toSQLId(sqlColumn), "protobufColumn" -> protobufColumn, @@ -3361,12 +3361,10 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { def protobufClassLoadError( protobufClassName: String, - hasDots: Boolean, cause: Throwable): Throwable = { - val message = if (hasDots) "" else ". Ensure the class name includes package prefix." new AnalysisException( errorClass = "CANNOT_LOAD_PROTOBUF_CLASS", - messageParameters = Map("protobufClassName" -> protobufClassName, "message" -> message), + messageParameters = Map("protobufClassName" -> protobufClassName), cause = Option(cause.getCause)) }
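
A minimal illustrative sketch (not part of the patch series above): it shows how a test could exercise one of the renamed error classes, PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND, which cannotFindDescriptorFileError raises when the descriptor file cannot be read, following the checkError(...) pattern already used in ProtobufSerdeSuite. The suite name and the descriptor path are made up for illustration; the error class and the "filePath" parameter name are taken from the patches above.

package org.apache.spark.sql.protobuf

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.protobuf.utils.ProtobufUtils
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical example suite; the descriptor path below intentionally does not exist.
class ProtobufErrorClassExampleSuite extends SharedSparkSession {
  test("missing descriptor file surfaces PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND") {
    val badPath = "/tmp/does-not-exist.desc"
    // Reading a nonexistent descriptor file fails with an IOException, which
    // ProtobufUtils converts into an AnalysisException with the new error class.
    val e = intercept[AnalysisException] {
      ProtobufUtils.buildDescriptor(badPath, "BasicMessage")
    }
    checkError(
      exception = e,
      errorClass = "PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND",
      parameters = Map("filePath" -> badPath))
  }
}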