[SPARK-24063][SS] Control maximum epoch backlog for ContinuousExecution #21392

Status: Closed · wants to merge 3 commits
SQLConf.scala
@@ -1271,6 +1271,14 @@ object SQLConf {
    .intConf
    .createWithDefault(Int.MaxValue)

  val MAX_EPOCH_BACKLOG = buildConf("spark.sql.streaming.continuous.maxEpochBacklog")
    .internal()
    .doc("The maximum number of epochs to keep queued while waiting for late epochs. " +
      "If the size of the queue exceeds this value, the stream is stopped with an error " +
      "indicating that too many epochs have stacked up.")
    .intConf
    .createWithDefault(10000)

  object Deprecated {
    val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
  }
@@ -1641,6 +1649,8 @@ class SQLConf extends Serializable with Logging {
  def partitionOverwriteMode: PartitionOverwriteMode.Value =
    PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE))

  def maxEpochBacklog: Int = getConf(MAX_EPOCH_BACKLOG)

  /** ********************** SQLConf functionality methods ************ */

  /** Set Spark SQL configuration properties. */
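For context, here is a minimal sketch of how the new option could be set on a continuous query. It assumes a Spark build that includes this patch; since the conf is internal, it is set like any other spark.sql.* property. The rate-to-console pipeline, the app name, and the value 100 are illustrative only.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object MaxEpochBacklogSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("max-epoch-backlog-sketch")
      // Fail the stream once 100 fully committed epochs are queued behind a
      // straggler, instead of the default 10000.
      .config("spark.sql.streaming.continuous.maxEpochBacklog", "100")
      .getOrCreate()

    val query = spark.readStream
      .format("rate")
      .load()
      .writeStream
      .format("console")
      .trigger(Trigger.Continuous("1 second"))
      .start()

    query.awaitTermination()
  }
}
```

Because EpochCoordinator reads the value from the session's SQLConf when the query starts, the setting has to be in place before start() is called.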
ContinuousExecution.scala
@@ -17,8 +17,10 @@

package org.apache.spark.sql.execution.streaming.continuous

import java.lang.Thread.UncaughtExceptionHandler
import java.util.UUID
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicReference
import java.util.function.UnaryOperator

import scala.collection.JavaConverters._
@@ -233,9 +235,15 @@ class ContinuousExecution(
              }
              false
            } else if (isActive) {
              val maxBacklogExceeded = epochEndpoint.askSync[Boolean](CheckIfMaxBacklogIsExceeded)
              if (maxBacklogExceeded) {
                throw new IllegalStateException(
                  "Size of the epochs queue has exceeded maximum allowed epoch backlog.")
@yanlin-Lynn commented on May 23, 2018:
Throwing an exception will make epochUpdateThread stop working, but won't the application keep running?
I think it's better to block and wait for the old epochs to be committed.

Contributor replied:
Agreed that the code as written won't shut down the stream. But I think it does make sense to kill the stream rather than waiting for old epochs. If we end up with a large backlog, it's almost surely because some partition isn't making any progress, so I wouldn't expect Spark to ever be able to catch up.

              } else {
                currentBatchId = epochEndpoint.askSync[Long](IncrementAndGetEpoch)
                logInfo(s"New epoch $currentBatchId is starting.")
                true
              }
            } else {
              false
            }
@@ -248,7 +256,12 @@
      }
    }, s"epoch update thread for $prettyIdString")

    val throwableReference: AtomicReference[Throwable] = new AtomicReference[Throwable]()
    try {
      epochUpdateThread.setUncaughtExceptionHandler(new UncaughtExceptionHandler {
        override def uncaughtException(thread: Thread, throwable: Throwable): Unit =
          throwableReference.set(throwable)
      })
      epochUpdateThread.setDaemon(true)
      epochUpdateThread.start()
@@ -268,6 +281,11 @@
      epochUpdateThread.interrupt()
      epochUpdateThread.join()

      val throwable: Throwable = throwableReference.get()
      if (throwable != null && throwable.isInstanceOf[IllegalStateException]) {
        throw throwable.asInstanceOf[IllegalStateException]
      }

      stopSources()
      sparkSession.sparkContext.cancelJobGroup(runId.toString)
    }
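The error-propagation pattern used above can be exercised in isolation. The sketch below is not Spark code; it only mirrors the mechanism: a daemon worker's uncaught exception is captured in an AtomicReference and rethrown by the owning thread after join(), which is what lets the finally block in runContinuous surface the backlog error.

```scala
import java.lang.Thread.UncaughtExceptionHandler
import java.util.concurrent.atomic.AtomicReference

object RethrowWorkerFailureSketch {
  def main(args: Array[String]): Unit = {
    val failure = new AtomicReference[Throwable]()

    val worker = new Thread(new Runnable {
      // Stands in for the epoch update thread hitting the backlog limit.
      override def run(): Unit =
        throw new IllegalStateException(
          "Size of the epochs queue has exceeded maximum allowed epoch backlog.")
    }, "epoch-update-thread-sketch")

    // Capture the worker's failure instead of letting it die silently.
    worker.setUncaughtExceptionHandler(new UncaughtExceptionHandler {
      override def uncaughtException(t: Thread, e: Throwable): Unit = failure.set(e)
    })
    worker.setDaemon(true)
    worker.start()
    worker.join()

    // Surface the failure on the owning thread, as the finally block above does.
    val captured = failure.get()
    if (captured != null) throw captured
  }
}
```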
EpochCoordinator.scala
@@ -45,6 +45,11 @@ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage
 */
private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage

/**
 * Returns a boolean indicating whether the size of the epochs queue has exceeded the
 * maximum epoch backlog.
 */
private[sql] case object CheckIfMaxBacklogIsExceeded extends EpochCoordinatorMessage
Contributor commented:
I'm not sure we need to make a side-channel in the RPC handler for this. I'd try to just make the query fail when the condition is reached in the first place.

@spaced4ndy (author) replied on May 23, 2018:
Do you mean make the query fail right from EpochCoordinator? If so, I wanted to do that, but didn't figure out how to terminate the query with an exception.
EpochCoordinator takes query: ContinuousExecution as a parameter, but I don't see a suitable method on query; the closest I found is stop(), I guess.
Or am I looking in a completely wrong direction? Please give a hint.

Contributor replied:
I think we'd probably want to add some method like private[streaming] stopWithException(e) to ContinuousExecution.

@spaced4ndy (author) replied on May 24, 2018:
Okay, I thought about something like this but wasn't sure if it's fine to do in the scope of this change. Thanks.
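To make the suggestion concrete, here is a toy, self-contained sketch of the stopWithException idea. None of these classes or methods exist in Spark or in this PR; they only model the control flow the reviewer describes, where the coordinator fails the query directly instead of exposing a CheckIfMaxBacklogIsExceeded RPC to be polled.

```scala
import java.util.concurrent.atomic.AtomicReference

// Toy stand-ins for ContinuousExecution and EpochCoordinator; not Spark API.
class ToyContinuousExecution {
  private val failure = new AtomicReference[Throwable]()
  @volatile private var active = true

  // Analogue of the suggested private[streaming] stopWithException(e).
  def stopWithException(e: Throwable): Unit = {
    failure.set(e)
    active = false
  }

  def run(): Unit = {
    while (active) { Thread.sleep(10) } // stand-in for the query loop
    val e = failure.get()
    if (e != null) throw e // the query terminates with the recorded error
  }
}

class ToyEpochCoordinator(query: ToyContinuousExecution, maxEpochBacklog: Int) {
  private var backlog = 0

  def epochStuckBehindStraggler(): Unit = {
    backlog += 1
    if (backlog > maxEpochBacklog) {
      // Fail the query directly rather than waiting to be polled over RPC.
      query.stopWithException(new IllegalStateException(
        "Size of the epochs queue has exceeded maximum allowed epoch backlog."))
    }
  }
}

object StopWithExceptionSketch {
  def main(args: Array[String]): Unit = {
    val query = new ToyContinuousExecution
    val coordinator = new ToyEpochCoordinator(query, maxEpochBacklog = 3)
    new Thread(new Runnable {
      override def run(): Unit = (1 to 5).foreach(_ => coordinator.epochStuckBehindStraggler())
    }).start()
    query.run() // throws IllegalStateException once the limit is exceeded
  }
}
```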


// Init messages
/**
 * Set the reader and writer partition counts. Tasks may not be started until the coordinator

@@ -123,6 +128,9 @@ private[continuous] class EpochCoordinator(
    override val rpcEnv: RpcEnv)
  extends ThreadSafeRpcEndpoint with Logging {

  private val maxEpochBacklog = session.sqlContext.conf.maxEpochBacklog

  private var maxEpochBacklogExceeded: Boolean = false
  private var queryWritesStopped: Boolean = false

  private var numReaderPartitions: Int = _

@@ -153,9 +161,13 @@
    // If not, add the epoch being currently processed to epochs waiting to be committed,
    // otherwise commit it.
    if (lastCommittedEpoch != epoch - 1) {
      if (epochsWaitingToBeCommitted.size == maxEpochBacklog) {
        maxEpochBacklogExceeded = true
      } else {
        logDebug(s"Epoch $epoch has received commits from all partitions " +
          s"and is waiting for epoch ${epoch - 1} to be committed first.")
        epochsWaitingToBeCommitted.add(epoch)
@yanlin-Lynn commented on May 23, 2018:
Once maxEpochBacklogExceeded is set to true, can it never be set back to false again?

Author replied:
Based on what I discussed with Jose, the stream should be killed if the backlog exceeds the value of a certain config option, so yes: why set it back to false later? At least that's how I see it.

      }
    } else {
      commitEpoch(epoch, thisEpochCommits)
      lastCommittedEpoch = epoch
@@ -246,5 +258,8 @@ private[continuous] class EpochCoordinator(
    case StopContinuousExecutionWrites =>
      queryWritesStopped = true
      context.reply(())

    case CheckIfMaxBacklogIsExceeded =>
      context.reply(maxEpochBacklogExceeded)
  }
}
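Finally, to make the coordinator-side policy easier to reason about, here is a small self-contained sketch of the bookkeeping. It is a toy model, not the real EpochCoordinator: epochs whose commits arrive out of order are parked in a queue, the queue is drained once the missing epoch lands, and the exceeded flag is raised when the queue has already reached the configured cap.

```scala
import scala.collection.mutable

// Toy model of the backlog policy discussed in this PR; names are illustrative.
class BacklogTracker(maxEpochBacklog: Int) {
  private var lastCommittedEpoch = 0L
  private val waiting = mutable.SortedSet.empty[Long]
  var maxEpochBacklogExceeded = false

  // Called when every partition has committed `epoch`.
  def onEpochFullyCommitted(epoch: Long): Unit = {
    if (lastCommittedEpoch != epoch - 1) {
      if (waiting.size == maxEpochBacklog) {
        maxEpochBacklogExceeded = true   // signal that the query should fail
      } else {
        waiting += epoch                 // park it until earlier epochs land
      }
    } else {
      lastCommittedEpoch = epoch
      // Drain any queued epochs that are now contiguous with the committed one.
      while (waiting.nonEmpty && waiting.head == lastCommittedEpoch + 1) {
        lastCommittedEpoch = waiting.head
        waiting -= waiting.head
      }
    }
  }
}

object BacklogTrackerDemo {
  def main(args: Array[String]): Unit = {
    val tracker = new BacklogTracker(maxEpochBacklog = 2)
    // Epoch 1 is stuck; epochs 2, 3 and 4 complete and pile up behind it.
    Seq(2L, 3L, 4L).foreach(tracker.onEpochFullyCommitted)
    println(tracker.maxEpochBacklogExceeded) // true: two queued, the third hit the cap
  }
}
```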