preserve spark properties on async subquery tasks (#579)
## Upstream SPARK-XXXXX ticket and PR link (if not applicable, explain)
https://issues.apache.org/jira/browse/SPARK-27744
apache#24625
## What changes were proposed in this pull request?
Preserve the caller's Spark local properties when `SubqueryExec` runs its relation future on the shared subquery thread pool. A new `Futures.withLocalProperties` helper captures the submitting thread's local properties and restores them around the asynchronous body; the SQL execution id is then read from the local properties on the worker thread when posting driver metric updates, while SQL configs continue to be propagated via `SQLExecution.withSQLConfPropagated`.
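
Local properties on `SparkContext` are backed by an inheritable thread-local, so a thread in `SubqueryExec`'s cached pool only sees the snapshot it inherited when it was created; properties set or changed later on the driver thread never reach it. A minimal sketch of that failure mode, assuming an illustrative single-thread pool and property key (neither is part of this patch):

```scala
import java.util.concurrent.Executors

import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutorService, Future}
import scala.concurrent.duration._

import org.apache.spark.SparkContext

object LocalPropertyLossSketch {
  def demo(sc: SparkContext): Unit = {
    implicit val ec: ExecutionContextExecutorService =
      ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(1))
    // Force the pool thread to exist before the property is set.
    Await.result(Future(()), 10.seconds)

    sc.setLocalProperty("my.key", "my.value")
    val seen = Await.result(Future(sc.getLocalProperty("my.key")), 10.seconds)
    // Likely null: the pool thread inherited the properties that existed when it
    // was created and never sees the later update from the caller thread.
    println(s"pool thread saw: $seen")
    ec.shutdown()
  }
}
```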

## How was this patch tested?
A new test in `SubquerySuite` ("SPARK-27744: Subquery execution preserves spark local properties") runs more subqueries than `SubqueryExec`'s pool has threads while a local property holds one value, then changes the value and asserts that a subsequent subquery, executed on a reused pool thread, observes the updated value.
onursatici authored and bulldozer-bot[bot] committed Jun 20, 2019
1 parent d210c0a commit 5454b49
Showing 3 changed files with 74 additions and 10 deletions.
41 changes: 41 additions & 0 deletions core/src/main/scala/org/apache/spark/util/Futures.scala
@@ -0,0 +1,41 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.util

import scala.concurrent.{ExecutionContext, Future}

import org.apache.spark.SparkContext


private[spark] object Futures {
  def withLocalProperties[T](
      sc: SparkContext)(
      body: => T)(
      implicit executor: ExecutionContext): Future[T] = {
    val properties = sc.getLocalProperties
    Future {
      val originalProperties = sc.getLocalProperties
      try {
        sc.setLocalProperties(properties)
        body
      } finally {
        sc.setLocalProperties(originalProperties)
      }
    }
  }
}
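
For reference, a hypothetical caller of the new helper might look like the sketch below; the pool, the property key, and the job are illustrative only, not part of the commit:

```scala
import java.util.concurrent.Executors

import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutorService}
import scala.concurrent.duration._

import org.apache.spark.SparkContext
import org.apache.spark.util.Futures

object FuturesUsageSketch {
  def demo(sc: SparkContext): Unit = {
    implicit val ec: ExecutionContextExecutorService =
      ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(4))

    sc.setLocalProperty("callSite.short", "my-subquery")
    val count = Futures.withLocalProperties(sc) {
      // Runs on a pool thread, but with the caller's local properties installed,
      // so jobs submitted here keep the caller's call site, job group, execution id, etc.
      sc.parallelize(1 to 100).count()
    }
    println(Await.result(count, 1.minute))
    ec.shutdown()
  }
}
```

Restoring the original properties in the `finally` block keeps a reused pool thread from leaking one caller's properties into the next task it runs.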
18 changes: 9 additions & 9 deletions sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.types.LongType
-import org.apache.spark.util.ThreadUtils
+import org.apache.spark.util.{Futures, ThreadUtils}
 import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler}
 
 /** Physical plan for Project. */
@@ -674,12 +674,8 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode {
 
   @transient
   private lazy val relationFuture: Future[Array[InternalRow]] = {
-    // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here.
-    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
-    Future {
-      // This will run in another thread. Set the execution id so that we can connect these jobs
-      // with the correct execution.
-      SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
+    Futures.withLocalProperties(sparkContext) {
+      SQLExecution.withSQLConfPropagated(sqlContext.sparkSession) {
         val beforeCollect = System.nanoTime()
         // Note that we use .executeCollect() because we don't want to convert data to Scala types
         val rows: Array[InternalRow] = child.executeCollect()
@@ -688,7 +684,10 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode {
         val dataSize = rows.map(_.asInstanceOf[UnsafeRow].getSizeInBytes.toLong).sum
         longMetric("dataSize") += dataSize
 
-        SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq)
+        SQLMetrics.postDriverMetricUpdates(
+          sparkContext,
+          sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY),
+          metrics.values.toSeq)
         rows
       }
     }(SubqueryExec.executionContext)
@@ -708,6 +707,7 @@ case class SubqueryExec(name: String, child: SparkPlan) extends UnaryExecNode {
 }
 
 object SubqueryExec {
+  private[spark] val THREADS = 16
   private[execution] val executionContext = ExecutionContext.fromExecutorService(
-    ThreadUtils.newDaemonCachedThreadPool("subquery", 16))
+    ThreadUtils.newDaemonCachedThreadPool("subquery", THREADS))
 }
25 changes: 24 additions & 1 deletion sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -19,8 +19,11 @@ package org.apache.spark.sql
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, SubqueryExpression, UnsafeRow}
 import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, Sort}
+import org.apache.spark.sql.execution.{SparkPlan, SubqueryExec}
 import org.apache.spark.sql.test.SharedSQLContext
 
 class SubquerySuite extends QueryTest with SharedSQLContext {
@@ -1316,4 +1319,24 @@ class SubquerySuite extends QueryTest with SharedSQLContext {
       checkAnswer(df3, Seq(Row("a", 2, "a"), Row("a", 2, "b")))
     }
   }
+
+  test("SPARK-27744: Subquery execution preserves spark local properties") {
+    case class LocalPropertiesOperator(key: String, value: String) extends SparkPlan {
+      override protected def doExecute(): RDD[InternalRow] = {
+        assert(spark.sparkContext.getLocalProperty(key) == value)
+        sparkContext.parallelize(Seq(new UnsafeRow()))
+
+      }
+      override def output: Seq[Attribute] = Seq()
+      override def producedAttributes: AttributeSet = outputSet
+      override def children: Seq[SparkPlan] = Nil
+    }
+
+    spark.sparkContext.setLocalProperty("a", "1")
+    for (i <- 0 to SubqueryExec.THREADS) {
+      SubqueryExec("test", LocalPropertiesOperator("a", "1")).executeCollect()
+    }
+    spark.sparkContext.setLocalProperty("a", "2")
+    SubqueryExec("test", LocalPropertiesOperator("a", "2")).executeCollect()
+  }
 }
