Commit 759dda6

review

maropu committed Sep 2, 2020
1 parent 9d46a60 commit 759dda6
Showing 5 changed files with 331 additions and 186 deletions.
@@ -124,6 +124,127 @@ object AnalysisContext {
   }
 }
 
+object Analyzer {
+
+  /**
+   * Rewrites a given `plan` recursively, based on rewrite mappings from old plans to new
+   * ones, and updates all the related attribute references in the `plan` accordingly.
+   *
+   * @param plan the plan to rewrite
+   * @param rewritePlanMap mappings from old plans to their replacements within the given `plan`
+   * @return the rewritten plan, along with the attribute mapping (old to new) for the
+   *         root node of the given `plan`
+   */
+  def rewritePlan(plan: LogicalPlan, rewritePlanMap: Map[LogicalPlan, LogicalPlan])
+    : (LogicalPlan, Seq[(Attribute, Attribute)]) = {
+    if (plan.resolved) {
+      val attrMapping = new mutable.ArrayBuffer[(Attribute, Attribute)]()
+      val newChildren = plan.children.map { child =>
+        // Rewrite each child plan recursively until we find a node to
+        // replace or reach a leaf node.
+        val (newChild, childAttrMapping) = rewritePlan(child, rewritePlanMap)
+        attrMapping ++= childAttrMapping.filter { case (oldAttr, _) =>
+          // `attrMapping` is not only used to replace the attributes of the current `plan`,
+          // but also to be propagated to the parent plans of the current `plan`. Therefore,
+          // the `oldAttr` must be part of either `plan.references` (so that it can be used to
+          // replace attributes of the current `plan`) or `plan.outputSet` (so that it can be
+          // used by those parent plans).
+          (plan.outputSet ++ plan.references).contains(oldAttr)
+        }
+        newChild
+      }
+
+      val newPlan = if (rewritePlanMap.contains(plan)) {
+        rewritePlanMap(plan).withNewChildren(newChildren)
+      } else {
+        plan.withNewChildren(newChildren)
+      }
+
+      assert(!attrMapping.groupBy(_._1.exprId)
+        .exists(_._2.map(_._2.exprId).distinct.length > 1),
+        "Found duplicate rewrite attributes")
+
+      val attributeRewrites = AttributeMap(attrMapping)
+      // Use attrMapping from the children plans to rewrite their parent node.
+      // Note that we shouldn't rewrite a node using attrMapping from its sibling nodes.
+      val p = newPlan.transformExpressions {
+        case a: Attribute =>
+          updateAttr(a, attributeRewrites)
+        case s: SubqueryExpression =>
+          s.withNewPlan(updateOuterReferencesInSubquery(s.plan, attributeRewrites))
+      }
+      attrMapping ++= plan.output.zip(p.output)
+        .filter { case (a1, a2) => a1.exprId != a2.exprId }
+      p -> attrMapping
+    } else {
+      // Just pass unresolved nodes through.
+      plan.mapChildren {
+        rewritePlan(_, rewritePlanMap)._1
+      } -> Nil
+    }
+  }
+
+  private def updateAttr(attr: Attribute, attrMap: AttributeMap[Attribute]): Attribute = {
+    val exprId = attrMap.getOrElse(attr, attr).exprId
+    attr.withExprId(exprId)
+  }
+
+  /**
+   * The outer plan may have old references and the function below updates the
+   * outer references to refer to the new attributes.
+   *
+   * For example (SQL):
+   * {{{
+   *   SELECT * FROM t1
+   *   INTERSECT
+   *   SELECT * FROM t1
+   *   WHERE EXISTS (SELECT 1
+   *                 FROM t2
+   *                 WHERE t1.c1 = t2.c1)
+   * }}}
+   * Plan before the resolveReference rule:
+   *    'Intersect
+   *    :- Project [c1#245, c2#246]
+   *    :  +- SubqueryAlias t1
+   *    :     +- Relation[c1#245,c2#246] parquet
+   *    +- 'Project [*]
+   *       +- Filter exists#257 [c1#245]
+   *          :  +- Project [1 AS 1#258]
+   *          :     +- Filter (outer(c1#245) = c1#251)
+   *          :        +- SubqueryAlias t2
+   *          :           +- Relation[c1#251,c2#252] parquet
+   *          +- SubqueryAlias t1
+   *             +- Relation[c1#245,c2#246] parquet
+   * Plan after the resolveReference rule:
+   *    Intersect
+   *    :- Project [c1#245, c2#246]
+   *    :  +- SubqueryAlias t1
+   *    :     +- Relation[c1#245,c2#246] parquet
+   *    +- Project [c1#259, c2#260]
+   *       +- Filter exists#257 [c1#259]
+   *          :  +- Project [1 AS 1#258]
+   *          :     +- Filter (outer(c1#259) = c1#251) => Updated
+   *          :        +- SubqueryAlias t2
+   *          :           +- Relation[c1#251,c2#252] parquet
+   *          +- SubqueryAlias t1
+   *             +- Relation[c1#259,c2#260] parquet => Outer plan's attributes are rewritten.
+   */
+  private def updateOuterReferencesInSubquery(
+      plan: LogicalPlan,
+      attrMap: AttributeMap[Attribute]): LogicalPlan = {
+    AnalysisHelper.allowInvokingTransformsInAnalyzer {
+      plan transformDown { case currentFragment =>
+        currentFragment transformExpressions {
+          case OuterReference(a: Attribute) =>
+            OuterReference(updateAttr(a, attrMap))
+          case s: SubqueryExpression =>
+            s.withNewPlan(updateOuterReferencesInSubquery(s.plan, attrMap))
+        }
+      }
+    }
+  }
+}
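Note: as intuition for the contract above, here is a minimal, self-contained sketch of the
same pattern in plain Scala. It uses no Spark types, and every name in it (Attr, Node, Leaf,
Select, rewrite) is invented for illustration: when a subtree is replaced and its output gets
fresh ids, each ancestor patches its own references with the mapping returned from below.

// Toy plan model: attributes are (name, id) pairs; a Select outputs what it references.
case class Attr(name: String, id: Int)
sealed trait Node { def output: Seq[Attr] }
case class Leaf(output: Seq[Attr]) extends Node
case class Select(refs: Seq[Attr], child: Node) extends Node {
  def output: Seq[Attr] = refs
}

// Rewrite bottom-up: return the new node plus a map from old attribute ids to
// new attributes, so parents can patch the references they inherited from below.
def rewrite(n: Node, rewriteMap: Map[Node, Node]): (Node, Map[Int, Attr]) = n match {
  case _ if rewriteMap.contains(n) =>
    val replaced = rewriteMap(n)
    replaced -> n.output.map(_.id).zip(replaced.output).toMap
  case l: Leaf =>
    l -> Map.empty[Int, Attr]
  case Select(refs, child) =>
    val (newChild, mapping) = rewrite(child, rewriteMap)
    Select(refs.map(a => mapping.getOrElse(a.id, a)), newChild) -> mapping
}

val t = Leaf(Seq(Attr("c1", 1)))
val plan = Select(Seq(Attr("c1", 1)), t)
rewrite(plan, Map(t -> Leaf(Seq(Attr("c1", 2)))))
// => (Select(List(Attr(c1,2)), Leaf(List(Attr(c1,2)))), Map(1 -> Attr(c1,2)))

The real rewritePlan does the same walk over ExprIds, but additionally restricts the
propagated pairs to attributes the parent can see (its references and outputSet) and appends
mappings for its own changed output before returning.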

 /**
  * Provides a logical query plan analyzer, which translates [[UnresolvedAttribute]]s and
  * [[UnresolvedRelation]]s into fully typed objects using information in a [[SessionCatalog]].
@@ -137,7 +258,7 @@ class Analyzer(
   private val v1SessionCatalog: SessionCatalog = catalogManager.v1SessionCatalog
 
   override protected def isPlanIntegral(plan: LogicalPlan): Boolean = {
-    !Utils.isTesting || LogicalPlanIntegrity.hasUniqueExprIdsForAttributes(plan)
+    !Utils.isTesting || LogicalPlanIntegrity.checkIfExprIdsAreGloballyUnique(plan)
   }
 
   override def isView(nameParts: Seq[String]): Boolean = v1SessionCatalog.isView(nameParts)
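Note: hasUniqueExprIdsForAttributes is replaced by checkIfExprIdsAreGloballyUnique here and
in the Optimizer below. For intuition only, the invariant can be approximated as follows;
this is a hand-written sketch, not LogicalPlanIntegrity's actual implementation, and the
helper below and its tuple inputs are invented:

// Illustrative approximation: an expression id may occur many times in a plan,
// but always for the same (name, type) attribute; two distinct attributes must
// never share an id.
def exprIdsGloballyUnique(attrs: Seq[(Long, (String, String))]): Boolean =
  attrs.groupBy(_._1).values.forall(_.map(_._2).distinct.size == 1)

exprIdsGloballyUnique(Seq(1L -> ("c1", "int"), 1L -> ("c1", "int")))     // true
exprIdsGloballyUnique(Seq(1L -> ("c1", "int"), 2L -> ("c2", "string")))  // true
exprIdsGloballyUnique(Seq(1L -> ("c1", "int"), 1L -> ("c2", "string")))  // false: id 1 reused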
@@ -1256,109 +1377,7 @@ class Analyzer(
       if (conflictPlans.isEmpty) {
         right
       } else {
-        rewritePlan(right, conflictPlans.toMap)._1
-      }
-    }
-
-    private def rewritePlan(plan: LogicalPlan, conflictPlanMap: Map[LogicalPlan, LogicalPlan])
-      : (LogicalPlan, Seq[(Attribute, Attribute)]) = {
-      if (conflictPlanMap.contains(plan)) {
-        // If the plan conflicts with the left one, we'd just replace it with
-        // the new plan and collect the rewrite attributes for the parent node.
-        val newRelation = conflictPlanMap(plan)
-        newRelation -> plan.output.zip(newRelation.output)
-      } else {
-        val attrMapping = new mutable.ArrayBuffer[(Attribute, Attribute)]()
-        val newPlan = plan.mapChildren { child =>
-          // If not, we'd rewrite the child plan recursively until we find the
-          // conflict node or reach a leaf node.
-          val (newChild, childAttrMapping) = rewritePlan(child, conflictPlanMap)
-          attrMapping ++= childAttrMapping.filter { case (oldAttr, _) =>
-            // `attrMapping` is not only used to replace the attributes of the current `plan`,
-            // but also to be propagated to the parent plans of the current `plan`. Therefore,
-            // the `oldAttr` must be part of either `plan.references` (so that it can be used to
-            // replace attributes of the current `plan`) or `plan.outputSet` (so that it can be
-            // used by those parent plans).
-            (plan.outputSet ++ plan.references).contains(oldAttr)
-          }
-          newChild
-        }
-
-        if (attrMapping.isEmpty) {
-          newPlan -> attrMapping.toSeq
-        } else {
-          assert(!attrMapping.groupBy(_._1.exprId)
-            .exists(_._2.map(_._2.exprId).distinct.length > 1),
-            "Found duplicate rewrite attributes")
-          val attributeRewrites = AttributeMap(attrMapping.toSeq)
-          // Using attrMapping from the children plans to rewrite their parent node.
-          // Note that we shouldn't rewrite a node using attrMapping from its sibling nodes.
-          newPlan.transformExpressions {
-            case a: Attribute =>
-              dedupAttr(a, attributeRewrites)
-            case s: SubqueryExpression =>
-              s.withNewPlan(dedupOuterReferencesInSubquery(s.plan, attributeRewrites))
-          } -> attrMapping.toSeq
-        }
-      }
-    }
-
-    private def dedupAttr(attr: Attribute, attrMap: AttributeMap[Attribute]): Attribute = {
-      val exprId = attrMap.getOrElse(attr, attr).exprId
-      attr.withExprId(exprId)
-    }
-
-    /**
-     * The outer plan may have been de-duplicated and the function below updates the
-     * outer references to refer to the de-duplicated attributes.
-     *
-     * For example (SQL):
-     * {{{
-     *   SELECT * FROM t1
-     *   INTERSECT
-     *   SELECT * FROM t1
-     *   WHERE EXISTS (SELECT 1
-     *                 FROM t2
-     *                 WHERE t1.c1 = t2.c1)
-     * }}}
-     * Plan before the resolveReference rule:
-     *    'Intersect
-     *    :- Project [c1#245, c2#246]
-     *    :  +- SubqueryAlias t1
-     *    :     +- Relation[c1#245,c2#246] parquet
-     *    +- 'Project [*]
-     *       +- Filter exists#257 [c1#245]
-     *          :  +- Project [1 AS 1#258]
-     *          :     +- Filter (outer(c1#245) = c1#251)
-     *          :        +- SubqueryAlias t2
-     *          :           +- Relation[c1#251,c2#252] parquet
-     *          +- SubqueryAlias t1
-     *             +- Relation[c1#245,c2#246] parquet
-     * Plan after the resolveReference rule:
-     *    Intersect
-     *    :- Project [c1#245, c2#246]
-     *    :  +- SubqueryAlias t1
-     *    :     +- Relation[c1#245,c2#246] parquet
-     *    +- Project [c1#259, c2#260]
-     *       +- Filter exists#257 [c1#259]
-     *          :  +- Project [1 AS 1#258]
-     *          :     +- Filter (outer(c1#259) = c1#251) => Updated
-     *          :        +- SubqueryAlias t2
-     *          :           +- Relation[c1#251,c2#252] parquet
-     *          +- SubqueryAlias t1
-     *             +- Relation[c1#259,c2#260] parquet => Outer plan's attributes are de-duplicated.
-     */
-    private def dedupOuterReferencesInSubquery(
-        plan: LogicalPlan,
-        attrMap: AttributeMap[Attribute]): LogicalPlan = {
-      plan transformDown { case currentFragment =>
-        currentFragment transformExpressions {
-          case OuterReference(a: Attribute) =>
-            OuterReference(dedupAttr(a, attrMap))
-          case s: SubqueryExpression =>
-            s.withNewPlan(dedupOuterReferencesInSubquery(s.plan, attrMap))
-        }
+        Analyzer.rewritePlan(right, conflictPlans.toMap)._1
       }
     }

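Note: dedupRight, the caller shown above, resolves expression-id conflicts that arise when
the same relation appears on both sides of a join or set operation, as in the INTERSECT
example in the scaladoc earlier. One way to observe it from spark-shell (assuming a running
SparkSession named spark):

val df = spark.range(2).selectExpr("id AS c1")
// Both sides initially carry the same c1 exprId; the analyzer rewrites the right-hand
// side with fresh ids, so the analyzed plan shows two distinct c1#<n> attributes.
df.intersect(df).explain(true)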
@@ -45,8 +45,8 @@ abstract class Optimizer(catalogManager: CatalogManager)
   // - only host special expressions in supported operators
   override protected def isPlanIntegral(plan: LogicalPlan): Boolean = {
     !Utils.isTesting || (plan.resolved &&
-      plan.find(PlanHelper.specialExpressionsInUnsupportedOperator(_).nonEmpty).isEmpty) &&
-      LogicalPlanIntegrity.hasUniqueExprIdsForAttributes(plan)
+      plan.find(PlanHelper.specialExpressionsInUnsupportedOperator(_).nonEmpty).isEmpty &&
+      LogicalPlanIntegrity.checkIfExprIdsAreGloballyUnique(plan))
   }
 
   override protected val excludedOnceBatches: Set[String] =
@@ -1580,23 +1580,36 @@ object ReplaceDistinctWithAggregate extends Rule[LogicalPlan] {
  * Replaces logical [[Deduplicate]] operator with an [[Aggregate]] operator.
  */
 object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] {
-  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    case Deduplicate(keys, child) if !child.isStreaming =>
-      val keyExprIds = keys.map(_.exprId)
-      val aggCols = child.output.map { attr =>
-        if (keyExprIds.contains(attr.exprId)) {
-          attr
-        } else {
-          Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId)
-        }
-      }
-      // SPARK-22951: Physical aggregate operators distinguish global aggregations and grouping
-      // aggregations by checking the number of grouping keys. The key difference here is that a
-      // global aggregation always returns at least one row even if there are no input rows. Here
-      // we append a literal when the grouping key list is empty so that the resulting aggregate
-      // operator is properly treated as a grouping aggregation.
-      val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys
-      Aggregate(nonemptyKeys, aggCols, child)
+  def apply(plan: LogicalPlan): LogicalPlan = {
+    val rewritePlanMap = mutable.ArrayBuffer[(LogicalPlan, LogicalPlan)]()
+    val newPlan = plan transform {
+      case Deduplicate(keys, child) if !child.isStreaming =>
+        val keyExprIds = keys.map(_.exprId)
+        val aggCols = child.output.map { attr =>
+          if (keyExprIds.contains(attr.exprId)) {
+            attr -> attr
+          } else {
+            val alias = Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId)
+            alias -> alias.newInstance()
+          }
+        }.unzip
+        // SPARK-22951: Physical aggregate operators distinguish global aggregations and grouping
+        // aggregations by checking the number of grouping keys. The key difference here is that a
+        // global aggregation always returns at least one row even if there are no input rows. Here
+        // we append a literal when the grouping key list is empty so that the resulting aggregate
+        // operator is properly treated as a grouping aggregation.
+        val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys
+        val newAgg = Aggregate(nonemptyKeys, aggCols._1, child)
+        rewritePlanMap += newAgg -> Aggregate(nonemptyKeys, aggCols._2, child)
+        newAgg
+    }
+
+    if (rewritePlanMap.nonEmpty) {
+      assert(!plan.fastEquals(newPlan))
+      Analyzer.rewritePlan(newPlan, rewritePlanMap.toMap)._1
+    } else {
+      plan
+    }
+  }
 }
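Note: the SPARK-22951 comment is why a Literal(1) dummy key is appended: with an empty
grouping list the Aggregate would be a global aggregation, which emits one row even on empty
input, while deduplicating an empty input must emit nothing. A quick spark-shell check
(assuming a running SparkSession named spark):

spark.range(0).selectExpr("count(*)").count() // 1: a global aggregation emits a row on empty input
spark.range(0).groupBy("id").count().count()  // 0: a grouping aggregation emits none
spark.range(0).dropDuplicates("id").count()   // 0: so Deduplicate must map to a grouping aggregation

The alias -> alias.newInstance() pairs feed Analyzer.rewritePlan: the Aggregate initially
placed in the plan reuses the old expression ids; rewritePlan then swaps in the fresh-id
replacement and propagates the new ids to any parent operators.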
