From c78e517874562fed5a1cf1fbabe268de1a36ece2 Mon Sep 17 00:00:00 2001
From: Takeshi Yamamuro
Date: Fri, 18 Sep 2020 15:36:19 +0900
Subject: [PATCH] Fix: compute a Union's output once and carry it as an
 explicit unionOutput field

---
 .../sql/catalyst/analysis/ResolveUnion.scala  | 29 +++++++++++++--
 .../sql/catalyst/analysis/TypeCoercion.scala  |  3 +-
 .../sql/catalyst/optimizer/Optimizer.scala    | 22 ++++++++---
 .../sql/catalyst/optimizer/subquery.scala     |  5 +--
 .../catalyst/plans/logical/LogicalPlan.scala  | 15 ++++----
 .../plans/logical/basicLogicalOperators.scala | 37 +++++++++++++------
 .../analysis/DecimalPrecisionSuite.scala      |  2 +-
 .../catalyst/analysis/ResolveUnionSuite.scala |  9 +++--
 .../scala/org/apache/spark/sql/Dataset.scala  |  2 +-
 .../spark/sql/execution/SparkStrategies.scala |  2 +-
 .../execution/basicPhysicalOperators.scala    | 18 ++-------
 .../sql/execution/SparkPlannerSuite.scala     |  2 +-
 12 files changed, 90 insertions(+), 56 deletions(-)
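The substance of the patch: a Union's output schema is computed once, by `ResolveUnion.makeUnionOutput`, and stored on the node, instead of being re-derived by `Union.output` and `UnionExec.output` on every call. Per column position, the merged attribute keeps the first child's name, is nullable if any child's attribute is nullable, and takes the children's merged data type. A standalone sketch of that fold (the simplified `Attr` type and `mergeType` placeholder are illustrative stand-ins for Spark's `Attribute` and `StructType.merge`, not the real API):

```scala
// Standalone sketch of the per-column merge that makeUnionOutput performs.
case class Attr(name: String, dataType: String, nullable: Boolean)

def mergeType(a: String, b: String): String =
  if (a == b) a else sys.error(s"cannot merge $a with $b") // the real rule widens types

def makeUnionOutput(children: Seq[Seq[Attr]]): Seq[Attr] =
  children.transpose.map { attrs =>
    val first = attrs.head
    Attr(
      name = first.name,                           // keep the first child's name
      dataType = attrs.map(_.dataType).reduce(mergeType),
      nullable = attrs.exists(_.nullable))         // nullable if any child is
  }

// The second child's `id` is nullable, so the union's `id` becomes nullable too.
val left  = Seq(Attr("id", "int", nullable = false), Attr("v", "double", nullable = false))
val right = Seq(Attr("id", "int", nullable = true),  Attr("v", "double", nullable = false))
assert(makeUnionOutput(Seq(left, right)) ==
  Seq(Attr("id", "int", nullable = true), Attr("v", "double", nullable = false)))
```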
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
index 693a5a4e75443..efbd7101d6915 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnion.scala
@@ -18,17 +18,33 @@ package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Literal, NamedExpression}
 import org.apache.spark.sql.catalyst.optimizer.CombineUnions
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.SchemaUtils
 
 /**
  * Resolves different children of Union to a common set of columns.
  */
 object ResolveUnion extends Rule[LogicalPlan] {
+
+  private[catalyst] def makeUnionOutput(children: Seq[LogicalPlan]): Seq[Attribute] = {
+    children.map(_.output).transpose.map { attrs =>
+      val firstAttr = attrs.head
+      val nullable = attrs.exists(_.nullable)
+      val newDt = attrs.map(_.dataType).reduce(StructType.merge)
+      if (firstAttr.dataType == newDt) {
+        firstAttr.withNullability(nullable)
+      } else {
+        AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)(
+          NamedExpression.newExprId, firstAttr.qualifier)
+      }
+    }
+  }
+
   private def unionTwoSides(
       left: LogicalPlan,
       right: LogicalPlan,
@@ -68,7 +84,8 @@ object ResolveUnion extends Rule[LogicalPlan] {
     } else {
       left
     }
-    Union(leftChild, rightChild)
+    val unionOutput = makeUnionOutput(Seq(leftChild, rightChild))
+    Union(leftChild, rightChild, unionOutput)
   }
 
   // Check column name duplication
@@ -88,13 +105,17 @@ object ResolveUnion extends Rule[LogicalPlan] {
   }
 
   def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
-    case e if !e.childrenResolved => e
+    case p if !p.childrenResolved => p
 
-    case Union(children, byName, allowMissingCol) if byName =>
+    case Union(children, byName, allowMissingCol, _) if byName =>
       val union = children.reduceLeft { (left, right) =>
         checkColumnNames(left, right)
         unionTwoSides(left, right, allowMissingCol)
       }
       CombineUnions(union)
+
+    case u @ Union(children, _, _, unionOutput)
+        if u.allChildrenCompatible && unionOutput.isEmpty =>
+      u.copy(unionOutput = makeUnionOutput(children))
   }
 }
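On the user-facing side, the rule above is what backs `Dataset.unionByName`. A usage sketch (assumes a running spark-shell, where `spark` and its implicits are in scope):

```scala
import spark.implicits._

val df1 = Seq((1, "a")).toDF("id", "name")
val df2 = Seq(("b", 2)).toDF("name", "id")

// unionTwoSides inserts a Project so df2's columns line up with df1's by name.
df1.unionByName(df2).show()

// With allowMissingColumns = true, a column absent on one side is filled with
// null literals (the Alias(Literal(null, ...)) seen in ResolveUnionSuite below).
val df3 = Seq(3).toDF("id")
df1.unionByName(df3, allowMissingColumns = true).show()
```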
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
index deaa49bf423b1..f71e2c5a3c505 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
@@ -359,7 +359,8 @@ object TypeCoercion {
           s -> Nil
         } else {
           val attrMapping = s.children.head.output.zip(newChildren.head.output)
-          s.copy(children = newChildren) -> attrMapping
+          val newOutput = ResolveUnion.makeUnionOutput(newChildren)
+          s.copy(children = newChildren, unionOutput = newOutput) -> attrMapping
         }
       }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index f27c5a26741d4..507381699642f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -592,7 +592,8 @@ object PushProjectionThroughUnion extends Rule[LogicalPlan] with PredicateHelper
         val rewrites = buildRewrites(u.children.head, child)
         Project(projectList.map(pushToRight(_, rewrites)), child)
       }
-      u.copy(children = newFirstChild +: newOtherChildren)
+      val newChildren = newFirstChild +: newOtherChildren
+      u.copy(children = newChildren, unionOutput = ResolveUnion.makeUnionOutput(newChildren))
     } else {
       p
     }
@@ -967,17 +968,20 @@ object CombineUnions extends Rule[LogicalPlan] {
     // rules (by position and by name) could cause incorrect results.
     while (stack.nonEmpty) {
       stack.pop() match {
-        case Distinct(Union(children, byName, allowMissingCol))
+        case Distinct(Union(children, byName, allowMissingCol, _))
             if flattenDistinct && byName == topByName && allowMissingCol == topAllowMissingCol =>
           stack.pushAll(children.reverse)
-        case Union(children, byName, allowMissingCol)
+        case Union(children, byName, allowMissingCol, _)
             if byName == topByName && allowMissingCol == topAllowMissingCol =>
           stack.pushAll(children.reverse)
         case child =>
          flattened += child
      }
    }
-    union.copy(children = flattened.toSeq)
+    union.copy(
+      children = flattened,
+      unionOutput = ResolveUnion.makeUnionOutput(flattened)
+    )
   }
 }
@@ -1689,7 +1693,8 @@ object RewriteExceptAll extends Rule[LogicalPlan] {
     val newColumnRight = Alias(Literal(-1L), "vcol")()
     val modifiedLeftPlan = Project(Seq(newColumnLeft) ++ left.output, left)
     val modifiedRightPlan = Project(Seq(newColumnRight) ++ right.output, right)
-    val unionPlan = Union(modifiedLeftPlan, modifiedRightPlan)
+    val unionOutput = ResolveUnion.makeUnionOutput(Seq(modifiedLeftPlan, modifiedRightPlan))
+    val unionPlan = Union(modifiedLeftPlan, modifiedRightPlan, output = unionOutput)
     val aggSumCol =
       Alias(AggregateExpression(Sum(unionPlan.output.head.toAttribute), Complete, false), "sum")()
     val aggOutputColumns = left.output ++ Seq(aggSumCol)
@@ -1753,7 +1758,12 @@ object RewriteIntersectAll extends Rule[LogicalPlan] {
     val leftPlanWithAddedVirtualCols = Project(Seq(trueVcol1, nullVcol2) ++ left.output, left)
     val rightPlanWithAddedVirtualCols = Project(Seq(nullVcol1, trueVcol2) ++ right.output, right)
 
-    val unionPlan = Union(leftPlanWithAddedVirtualCols, rightPlanWithAddedVirtualCols)
+    val unionOutput = ResolveUnion.makeUnionOutput(
+      Seq(leftPlanWithAddedVirtualCols, rightPlanWithAddedVirtualCols))
+    val unionPlan = Union(
+      leftPlanWithAddedVirtualCols,
+      rightPlanWithAddedVirtualCols,
+      output = unionOutput)
 
     // Expressions to compute count and minimum of both the counts.
     val vCol1AggrExpr =
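The `CombineUnions` hunk above keeps the existing stack-based flattening and only adds the recomputed `unionOutput`. The traversal itself collapses nested Unions into one wide Union, as in this toy model (`Leaf`/`Un` are illustrative stand-ins, not Spark's plan classes):

```scala
import scala.collection.mutable

sealed trait Node
case class Leaf(name: String) extends Node
case class Un(children: Seq[Node]) extends Node

def flatten(root: Un): Un = {
  val stack = mutable.Stack[Node](root)
  val flattened = mutable.ArrayBuffer.empty[Node]
  while (stack.nonEmpty) {
    stack.pop() match {
      case Un(children) => stack.pushAll(children.reverse) // leftmost child pops first
      case leaf => flattened += leaf
    }
  }
  Un(flattened.toSeq)
}

// Un(Un(a, b), c) flattens to Un(a, b, c), preserving left-to-right order.
assert(flatten(Un(Seq(Un(Seq(Leaf("a"), Leaf("b"))), Leaf("c")))) ==
  Un(Seq(Leaf("a"), Leaf("b"), Leaf("c"))))
```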
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
index 9173b2d8032b5..a168dcd7a83f5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala
@@ -463,10 +463,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] {
         sys.error(s"Unexpected operator in scalar subquery: $lp")
     }
 
-    val resultMap = evalPlan(plan).mapValues { _.transform {
-      case a: Alias => a.newInstance() // Assigns a new `ExprId`
-      }
-    }
+    val resultMap = evalPlan(plan)
 
     // By convention, the scalar subquery result is the leftmost field.
     resultMap.get(plan.output.head.exprId) match {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
index cefc97a95d223..eee9fb7cff5a6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -218,19 +218,18 @@ object LogicalPlanIntegrity {
 
   /**
    * Since some logical plans (e.g., `Union`) can build `AttributeReference`s in their `output`,
-   * this method checks if the same `ExprId` refers to a semantically-equal attribute
-   * in a plan output.
+   * this method checks if the same `ExprId` refers to attributes having the same data type
+   * in plan output.
    */
   def hasUniqueExprIdsForOutput(plan: LogicalPlan): Boolean = {
-    val allOutputAttrs = plan.collect { case p if canGetOutputAttrs(p) =>
+    val exprIds = plan.collect { case p if canGetOutputAttrs(p) =>
       // NOTE: we still need to filter resolved expressions here because the output of
       // some resolved logical plans can have unresolved references,
       // e.g., outer references in `ExistenceJoin`.
-      p.output.filter(_.resolved).map(_.canonicalized.asInstanceOf[Attribute])
-    }
-    val groupedAttrsByExprId = allOutputAttrs
-      .flatten.groupBy(_.exprId).values.map(_.distinct)
-    groupedAttrsByExprId.forall(_.length == 1)
+      p.output.filter(_.resolved).map { a => (a.exprId, a.dataType) }
+    }.flatten
+    val groupedDataTypesByExprId = exprIds.groupBy(_._1).values.map(_.distinct)
+    groupedDataTypesByExprId.forall(_.length == 1)
   }
 
   /**
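The relaxed integrity check above no longer compares canonicalized attributes; it only demands that one `ExprId` never maps to two different data types across a plan's outputs. A standalone sketch of the predicate, with `(Long, String)` pairs standing in for `(ExprId, DataType)`:

```scala
// A plan is consistent when no exprId is associated with two distinct data types.
def hasUniqueExprIdsForOutput(pairs: Seq[(Long, String)]): Boolean =
  pairs.groupBy(_._1).values.forall(_.map(_._2).distinct.length == 1)

assert(hasUniqueExprIdsForOutput(Seq(1L -> "int", 1L -> "int", 2L -> "string")))
assert(!hasUniqueExprIdsForOutput(Seq(1L -> "int", 1L -> "long"))) // same id, two types
```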
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index 01f6f96437d39..b301b9e37b649 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -213,8 +213,13 @@ case class Except(
 
 /** Factory for constructing new `Union` nodes. */
 object Union {
+
+  def apply(left: LogicalPlan, right: LogicalPlan, output: Seq[Attribute]): Union = {
+    Union(left :: right :: Nil, unionOutput = output)
+  }
+
   def apply(left: LogicalPlan, right: LogicalPlan): Union = {
-    Union (left :: right :: Nil)
+    Union(left :: right :: Nil)
   }
 }
 
@@ -229,7 +234,8 @@ object Union {
 case class Union(
     children: Seq[LogicalPlan],
     byName: Boolean = false,
-    allowMissingCol: Boolean = false) extends LogicalPlan {
+    allowMissingCol: Boolean = false,
+    unionOutput: Seq[Attribute] = Seq.empty) extends LogicalPlan {
   assert(!allowMissingCol || byName, "`allowMissingCol` can be true only if `byName` is true.")
 
   override def maxRows: Option[Long] = {
@@ -256,8 +262,7 @@ case class Union(
     AttributeSet.fromAttributeSets(children.map(_.outputSet)).size
   }
 
-  // updating nullability to make all the children consistent
-  override def output: Seq[Attribute] = {
+  private def makeUnionOutput(): Seq[Attribute] = {
     children.map(_.output).transpose.map { attrs =>
       val firstAttr = attrs.head
       val nullable = attrs.exists(_.nullable)
@@ -271,17 +276,27 @@ case class Union(
     }
   }
 
-  override lazy val resolved: Boolean = {
+  // updating nullability to make all the children consistent
+  override def output: Seq[Attribute] = {
+    assert(unionOutput.nonEmpty, "Union should have at least a single column")
+    unionOutput
+  }
+
+  def allChildrenCompatible: Boolean = {
     // allChildrenCompatible needs to be evaluated after childrenResolved
-    def allChildrenCompatible: Boolean =
-      children.tail.forall( child =>
-        // compare the attribute number with the first child
-        child.output.length == children.head.output.length &&
+    childrenResolved && children.tail.forall { child =>
+      // compare the attribute number with the first child
+      child.output.length == children.head.output.length &&
         // compare the data types with the first child
         child.output.zip(children.head.output).forall { case (l, r) =>
           l.dataType.sameType(r.dataType)
-        })
-    children.length > 1 && !(byName || allowMissingCol) && childrenResolved && allChildrenCompatible
+        }
+    }
+  }
+
+  override lazy val resolved: Boolean = {
+    children.length > 1 && !(byName || allowMissingCol) && allChildrenCompatible &&
+      unionOutput.nonEmpty
   }
 
   /**
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index d5991ff10ce6c..2351523f8f41c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
@@ -73,7 +73,7 @@ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter {
       Union(Project(Seq(Alias(left, "l")()), relation),
         Project(Seq(Alias(right, "r")()), relation))
     val (l, r) = analyzer.execute(plan).collect {
-      case Union(Seq(child1, child2), _, _) => (child1.output.head, child2.output.head)
+      case Union(Seq(child1, child2), _, _, _) => (child1.output.head, child2.output.head)
     }.head
     assert(l.dataType === expectedType)
     assert(r.dataType === expectedType)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnionSuite.scala
index 5c7ad0067a456..42bd4012b0a63 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnionSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ResolveUnionSuite.scala
@@ -16,6 +16,8 @@
  */
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
@@ -51,7 +53,8 @@ class ResolveUnionSuite extends AnalysisTest {
     val analyzed1 = analyzer.execute(union1)
     val projected1 =
       Project(Seq(table2.output(3), table2.output(0), table2.output(1), table2.output(2)), table2)
-    val expected1 = Union(table1 :: projected1 :: Nil)
+    val expectedOutput = Seq('i.int, 'u.decimal, 'b.byte, 'd.double)
+    val expected1 = Union(table1 :: projected1 :: Nil, unionOutput = expectedOutput)
     comparePlans(analyzed1, expected1)
 
     // Allow missing column
@@ -60,7 +63,7 @@ class ResolveUnionSuite extends AnalysisTest {
     val nullAttr1 = Alias(Literal(null, ByteType), "b")()
     val projected2 =
       Project(Seq(table2.output(3), table2.output(0), nullAttr1, table2.output(2)), table3)
-    val expected2 = Union(table1 :: projected2 :: Nil)
+    val expected2 = Union(table1 :: projected2 :: Nil, unionOutput = expectedOutput)
     comparePlans(analyzed2, expected2)
 
     // Allow missing column + Allow missing column
@@ -69,7 +72,7 @@ class ResolveUnionSuite extends AnalysisTest {
     val nullAttr2 = Alias(Literal(null, DoubleType), "d")()
     val projected3 =
       Project(Seq(table2.output(3), table2.output(0), nullAttr1, nullAttr2), table4)
-    val expected3 = Union(table1 :: projected2 :: projected3 :: Nil)
+    val expected3 = Union(table1 :: projected2 :: projected3 :: Nil, unionOutput = expectedOutput)
     comparePlans(analyzed3, expected3)
   }
 }
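The basicLogicalOperators change above also tightens the contract: `Union.resolved` now additionally requires a non-empty `unionOutput`, so an analyzer rule (the new case in `ResolveUnion`) must fill it in before the node is usable. A minimal model of that contract:

```scala
// Sketch only: a Union-like node stays unresolved until a rule fills its output.
case class SimpleUnion(childrenResolved: Boolean, unionOutput: Seq[String]) {
  def resolved: Boolean = childrenResolved && unionOutput.nonEmpty
}

val before = SimpleUnion(childrenResolved = true, unionOutput = Nil)
val after  = before.copy(unionOutput = Seq("id", "name")) // what ResolveUnion's new case does
assert(!before.resolved && after.resolved)
```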
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 87b9aea80c823..0c4b5dc5a73cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -227,7 +227,7 @@ class Dataset[T] private[sql](
     val plan = queryExecution.analyzed match {
       case c: Command =>
         LocalRelation(c.output, withAction("command", queryExecution)(_.executeCollect()))
-      case u @ Union(children, _, _) if children.forall(_.isInstanceOf[Command]) =>
+      case u @ Union(children, _, _, _) if children.forall(_.isInstanceOf[Command]) =>
         LocalRelation(u.output, withAction("command", queryExecution)(_.executeCollect()))
       case _ =>
         queryExecution.analyzed
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
index ba3d83714c302..abea4a7a8eaff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala
@@ -692,7 +692,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case logical.GlobalLimit(IntegerLiteral(limit), child) =>
         execution.GlobalLimitExec(limit, planLater(child)) :: Nil
       case union: logical.Union =>
-        execution.UnionExec(union.children.map(planLater)) :: Nil
+        execution.UnionExec(union.children.map(planLater), union.unionOutput) :: Nil
       case g @ logical.Generate(generator, _, outer, _, _, child) =>
         execution.GenerateExec(
           generator, g.requiredChildOutput, outer,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala
index 1f70fde3f7654..740564f120e1a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala
@@ -634,21 +634,9 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range)
  * If we change how this is implemented physically, we'd need to update
  * [[org.apache.spark.sql.catalyst.plans.logical.Union.maxRowsPerPartition]].
  */
-case class UnionExec(children: Seq[SparkPlan]) extends SparkPlan {
-  // updating nullability to make all the children consistent
-  override def output: Seq[Attribute] = {
-    children.map(_.output).transpose.map { attrs =>
-      val firstAttr = attrs.head
-      val nullable = attrs.exists(_.nullable)
-      val newDt = attrs.map(_.dataType).reduce(StructType.merge)
-      if (firstAttr.dataType == newDt) {
-        firstAttr.withNullability(nullable)
-      } else {
-        AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)(
-          firstAttr.exprId, firstAttr.qualifier)
-      }
-    }
-  }
+case class UnionExec(children: Seq[SparkPlan], unionOutput: Seq[Attribute]) extends SparkPlan {
+
+  override def output: Seq[Attribute] = unionOutput
 
   protected override def doExecute(): RDD[InternalRow] =
     sparkContext.union(children.map(_.execute()))
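On the physical side, `UnionExec` now simply echoes the attributes decided on the logical plan instead of re-deriving them. To see the operator in a plan (continuing the spark-shell sketch from earlier, with `spark` and its implicits in scope):

```scala
val q = Seq(1).toDF("id").union(Seq(2).toDF("id"))
q.explain() // the printed physical plan contains a Union (UnionExec) node
```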
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlannerSuite.scala
index b4cb7e3fce3cf..4ba25db374147 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlannerSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlannerSuite.scala
@@ -39,7 +39,7 @@ class SparkPlannerSuite extends SharedSparkSession {
           planLater(child) :: planLater(NeverPlanned) :: Nil
         case u: Union =>
           planned += 1
-          UnionExec(u.children.map(planLater)) :: planLater(NeverPlanned) :: Nil
+          UnionExec(u.children.map(planLater), u.unionOutput) :: planLater(NeverPlanned) :: Nil
         case LocalRelation(output, data, _) =>
           planned += 1
           LocalTableScanExec(output, data) :: planLater(NeverPlanned) :: Nil
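Taken together, the user-visible semantics stay the same; for example, unioning a non-nullable column with a nullable one still yields a nullable column, now computed once by `makeUnionOutput` rather than on every `output` call. A quick end-to-end check (again assuming a spark-shell session named `spark`; schemas are illustrative):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

val schemaA = StructType(Seq(StructField("id", IntegerType, nullable = false)))
val schemaB = StructType(Seq(StructField("id", IntegerType, nullable = true)))
val dfA = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(1))), schemaA)
val dfB = spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(null))), schemaB)

// nullable = attrs.exists(_.nullable): the merged column is nullable.
assert(dfA.union(dfB).schema.head.nullable)
```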