
Commit 79fcd7c

SCHJonathan authored and sryza committed
sandy
1 parent cf668c0 commit 79fcd7c

4 files changed (+71 −87 lines)


sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala

Lines changed: 16 additions & 6 deletions
@@ -95,8 +95,10 @@ trait FlowExecution {
   /** Context about this pipeline update. */
   def updateContext: PipelineUpdateContext

-  implicit val executionContext: ExecutionContext =
+  /** The thread execution context for the current [[FlowExecution]]. */
+  implicit val executionContext: ExecutionContext = {
     ExecutionContext.fromExecutor(FlowExecution.threadPool)
+  }

   /**
    * Stops execution of this [[FlowExecution]]. If you override this, please be sure to
@@ -107,8 +109,15 @@ trait FlowExecution {
     stopped.set(true)
   }

+  /** Returns an optional exception that occurred during execution, if any. */
   def exception: Option[Throwable] = _future.flatMap(_.value).flatMap(_.failed.toOption)

+  /**
+   * Executes this PhysicalFlow synchronously to perform its intended update.
+   * This method should be overridden by subclasses to provide the actual execution logic.
+   *
+   * @return a Future that completes when the execution is finished or stopped.
+   */
   def executeInternal(): Future[Unit]

   /**
@@ -129,10 +138,7 @@ trait FlowExecution {
     executeInternal()
       .transform {
         case Success(_) => Success(ExecutionResult.FINISHED)
-        // Add origin to exceptions raised while executing a flow i.e. inside the `Future`
-        // created by the `executeInternal` method.
-        case Failure(e) =>
-          Failure(e)
+        case Failure(e) => Failure(e)
       }
       .map(_ => ExecutionResult.FINISHED)
       .recover {
@@ -155,8 +161,10 @@ trait FlowExecution {
 }

 object FlowExecution {
-  private val threadPool: ThreadPoolExecutor =
+  /** A thread pool used to execute [[FlowExecution]]s. */
+  private val threadPool: ThreadPoolExecutor = {
     ThreadUtils.newDaemonCachedThreadPool("FlowExecution")
+  }
 }

 /** A [[FlowExecution]] that processes data statefully using Structured Streaming. */
@@ -190,6 +198,7 @@ trait StreamingFlowExecution extends FlowExecution with Logging {
   }
 }

+/** A [[StreamingFlowExecution]] that writes a streaming DataFrame to a DLT [[Table]]. */
 class StreamingTableWrite(
     val identifier: TableIdentifier,
     val flow: ResolvedFlow,
@@ -217,6 +226,7 @@ class StreamingTableWrite(
   }
 }

+/** A [[FlowExecution]] that writes a batch DataFrame to a DLT [[Table]]. */
 class BatchFlowExecution(
     val identifier: TableIdentifier,
     val flow: ResolvedFlow,
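
The doc comments added above describe a small pattern: every flow execution shares one daemon thread pool, exposes it through an implicit ExecutionContext, and defers its real work to an executeInternal() future whose failure can be inspected afterwards. The following is a minimal, self-contained sketch of that pattern, not code from this commit; DemoFlowExecution and the plain JDK Executors call are illustrative stand-ins for Spark's FlowExecution and ThreadUtils.newDaemonCachedThreadPool.

import java.util.concurrent.{Executors, ThreadFactory}
import scala.concurrent.{ExecutionContext, Future}

object DemoFlowExecution {
  // Shared pool of daemon threads, in the spirit of FlowExecution.threadPool.
  private val threadPool = Executors.newCachedThreadPool(new ThreadFactory {
    override def newThread(r: Runnable): Thread = {
      val t = new Thread(r, "DemoFlowExecution")
      t.setDaemon(true)
      t
    }
  })

  val sharedContext: ExecutionContext = ExecutionContext.fromExecutor(threadPool)
}

trait DemoFlowExecution {
  // Futures created by executeInternal() run on the shared pool.
  implicit val executionContext: ExecutionContext = DemoFlowExecution.sharedContext

  // Subclasses supply the actual work; the future completes when execution finishes or stops.
  def executeInternal(): Future[Unit]

  // Surfaces a failure from a completed future, if any (mirrors the documented exception accessor).
  def exceptionOf(future: Future[Unit]): Option[Throwable] =
    future.value.flatMap(_.failed.toOption)
}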

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/GraphExecution.scala

Lines changed: 8 additions & 53 deletions
@@ -18,12 +18,10 @@
 package org.apache.spark.sql.pipelines.graph
 import java.util.concurrent.{ConcurrentHashMap, TimeoutException}

-import scala.annotation.unused
 import scala.concurrent.ExecutionContext
 import scala.jdk.CollectionConverters._
 import scala.util.{Failure, Success}

-import org.apache.spark.SparkException
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.internal.SQLConf
@@ -70,29 +68,18 @@ abstract class GraphExecution(
     triggerFor = streamTrigger
   )

-  val SERIAL_PLANNING = "SERIAL"
-
   // Listeners to process events and metrics.
   private val batchListener = new BatchListener()
   private val streamListener = new StreamListener(env, graphForExecution)

-  /**
-   * Run the given planning function `f` for each flow in `flows`.
-   */
-  protected def startPlanning(flows: Seq[ResolvedFlow])(
-      f: (ResolvedFlow, String) => Unit
-  ): Unit = {
-    flows.foreach(f(_, SERIAL_PLANNING))
-  }
-
   /**
    * Plans the logical [[ResolvedFlow]] into a [[FlowExecution]] and then starts executing it.
    * Implementation note: Thread safe
    *
    * @return None if the flow planner decided that there is no actual update required here.
    *         Otherwise returns the corresponding physical flow.
    */
-  def startFlow(flow: ResolvedFlow): Option[FlowExecution] = {
+  def planAndStartFlow(flow: ResolvedFlow): Option[FlowExecution] = {
     try {
       val physicalFlow = flowPlanner.plan(
         flow = graphForExecution.resolvedFlow(flow.identifier)
@@ -249,9 +236,12 @@ object GraphExecution extends Logging {

   // Set of states after checking the exception for flow execution retryability analysis.
   sealed trait FlowExecutionAction
+  /** Indicates that the flow execution should be retried. */
   case object RetryFlowExecution extends FlowExecutionAction
+  /** Indicates that the flow execution should be stopped with a specific reason. */
   case class StopFlowExecution(reason: FlowExecutionStopReason) extends FlowExecutionAction

+  /** Represents the reason why a flow execution should be stopped. */
   sealed trait FlowExecutionStopReason {
     def cause: Throwable
     def flowDisplayName: String
@@ -261,28 +251,10 @@ object GraphExecution extends Logging {
     def warnInsteadOfError: Boolean = false
   }

-  @unused
-  case class ReanalyzeFlowSchema(originalCause: Throwable, flowDisplayName: String)
-    extends FlowExecutionStopReason {
-    override lazy val updateTerminationReason: UpdateTerminationReason = {
-      UpdateSchemaChange(flowDisplayName, Option(originalCause))
-    }
-    // Schema change can be automatically retried to handle
-    override val warnInsteadOfError: Boolean = true
-    override lazy val failureMessage: String = {
-      s"Flow '$flowDisplayName' has encountered a schema change during execution and " +
-      s"terminated. A new update using the new schema will be automatically started."
-    }
-    // Override the cause to make it more friendly for tracking purpose
-    override lazy val cause: Throwable = {
-      new SparkException(
-        errorClass = "FLOW_SCHEMA_CHANGED",
-        messageParameters = Map("flowName" -> flowDisplayName),
-        cause = originalCause
-      )
-    }
-  }
-
+  /**
+   * Represents the [[FlowExecution]] should be stopped due to it failed with some retryable errors
+   * and has exhausted all the retry attempts.
+   */
   private case class MaxRetryExceeded(
       cause: Throwable,
       flowDisplayName: String,
@@ -297,23 +269,6 @@ object GraphExecution extends Logging {
     }
   }

-  @unused
-  case class NonRetryableException(cause: Throwable, flowDisplayName: String)
-    extends FlowExecutionStopReason {
-    override lazy val updateTerminationReason: UpdateTerminationReason = {
-      QueryExecutionFailure(
-        flowName = flowDisplayName,
-        // Set maxRetries to 0 to not mention maxRetries in the error message.
-        maxRetries = 0,
-        cause = Option(cause)
-      )
-    }
-    override lazy val failureMessage: String = {
-      s"Flow '$flowDisplayName' has FAILED due to a non-retryable exception and will not be " +
-      s"restarted in this update."
-    }
-  }
-
   /**
    * Analyze the exception thrown by flow execution and figure out if we should retry the execution,
    * or we need to reanalyze the flow entirely to resolve issues like schema changes.
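
The new one-line docs on RetryFlowExecution and StopFlowExecution summarize the retryability analysis: each exception either earns another attempt or produces a stop reason such as MaxRetryExceeded once the retry budget is spent. Below is a rough, self-contained sketch of that decision shape, assuming a trivial retryability predicate; RetryAnalysisSketch, decide, and the plain String reason are illustrative and differ from the real FlowExecutionStopReason hierarchy.

object RetryAnalysisSketch {
  sealed trait FlowExecutionAction
  case object RetryFlowExecution extends FlowExecutionAction
  final case class StopFlowExecution(reason: String, cause: Throwable) extends FlowExecutionAction

  // Retry transient failures until the budget is exhausted; otherwise stop with a reason.
  def decide(cause: Throwable, attemptsSoFar: Int, maxRetries: Int): FlowExecutionAction = {
    val retryable = !cause.isInstanceOf[InterruptedException] // stand-in for a real classification
    if (retryable && attemptsSoFar < maxRetries) RetryFlowExecution
    else StopFlowExecution(s"gave up after $attemptsSoFar attempt(s)", cause)
  }
}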

sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/TriggeredGraphExecution.scala

Lines changed: 46 additions & 28 deletions
@@ -31,19 +31,16 @@ import org.apache.spark.sql.pipelines.util.ExponentialBackoffStrategy
 import org.apache.spark.sql.streaming.Trigger
 import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}

-sealed trait StreamState
-
-object StreamState {
-  case object QUEUED extends StreamState
-  case object RUNNING extends StreamState
-  case object EXCLUDED extends StreamState
-  case object IDLE extends StreamState
-  case object SKIPPED extends StreamState
-  case object TERMINATED_WITH_ERROR extends StreamState
-  case object CANCELED extends StreamState
-  case object SUCCESSFUL extends StreamState
-}
-
+/**
+ * Executes all of the flows in the given graph in topological order. Each flow processes
+ * all available data before downstream flows are triggered.
+ *
+ * @param graphForExecution the graph to execute.
+ * @param env the context in which the graph is executed.
+ * @param onCompletion a callback to execute after all streams are done. The boolean
+ *                     argument is true if the execution was successful.
+ * @param clock a clock used to determine the time of execution.
+ */
 class TriggeredGraphExecution(
     graphForExecution: DataflowGraph,
     env: PipelineUpdateContext,
@@ -219,16 +216,12 @@ class TriggeredGraphExecution(
       flowsToStart.append(graphForExecution.resolvedFlow(flowIdentifier))
     }

-    val (batchFlowsToStart, otherFlowsToStart) = flowsToStart.partition { f =>
-      graphForExecution.resolvedFlow(f.identifier).isInstanceOf[CompleteFlow]
-    }
-
-    def startFlowWithPlanningMode(flow: ResolvedFlow, mode: String): Unit = {
+    def startFlow(flow: ResolvedFlow): Unit = {
       val flowIdentifier = flow.identifier
-      logInfo(s"Starting flow ${flow.identifier} in $mode mode")
+      logInfo(s"Starting flow ${flow.identifier}")
       env.flowProgressEventLogger.recordPlanningForBatchFlow(flow)
       try {
-        val flowStarted = startFlow(flow)
+        val flowStarted = planAndStartFlow(flow)
         if (flowStarted.nonEmpty) {
           pipelineState.put(flowIdentifier, StreamState.RUNNING)
           logInfo(s"Flow $flowIdentifier started.")
@@ -250,16 +243,12 @@ class TriggeredGraphExecution(
       }
     }

-    // start non-batch flows serially because the configs will be attached to the pipeline's spark
-    // session (source dataframe's spark session)
-    otherFlowsToStart.foreach(startFlowWithPlanningMode(_, SERIAL_PLANNING))
-
-    // only start MV flows in parallel if enabled
-    startPlanning(batchFlowsToStart.toSeq) { (flow, mode) =>
-      startFlowWithPlanningMode(flow, mode)
-    }
+    // start each flow serially
+    flowsToStart.foreach(startFlow)

     try {
+      // Put thread to sleep for the configured polling interval to avoid busy-waiting
+      // and holding one CPU core.
       Thread.sleep(pipelineConf.streamStatePollingInterval * 1000)
     } catch {
       case _: InterruptedException => return
@@ -453,6 +442,35 @@ case class TriggeredFailureInfo(

 object TriggeredGraphExecution {

+  // All possible states of a data stream for a flow
+  sealed trait StreamState
+  object StreamState {
+    // Stream is waiting on its parent tables to successfully finish processing
+    // data to start running, in triggered execution
+    case object QUEUED extends StreamState
+
+    // Stream is processing data
+    case object RUNNING extends StreamState
+
+    // Stream excluded if it's not selected in the partial graph update API call.
+    case object EXCLUDED extends StreamState
+
+    // Stream will not be rerun because it is a ONCE flow.
+    case object IDLE extends StreamState
+
+    // Stream will not be run due to parent tables not finishing successfully in triggered execution
+    case object SKIPPED extends StreamState
+
+    // Stream has been stopped with a fatal error
+    case object TERMINATED_WITH_ERROR extends StreamState
+
+    // Stream stopped before completion in triggered execution
+    case object CANCELED extends StreamState
+
+    // Stream successfully processed all available data in triggered execution
+    case object SUCCESSFUL extends StreamState
+  }
+
   /**
    * List of terminal states which we don't consider as failures.
    *
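
The class doc and the relocated StreamState values describe triggered execution: flows run in topological order, a flow only starts once all of its parents finished successfully, a failing flow terminates with an error, and its downstream flows are skipped. A condensed sketch of that bookkeeping follows, using only three of the states; TriggeredSketch, run, and the String flow names are illustrative, not the real DataflowGraph API.

object TriggeredSketch {
  sealed trait StreamState
  case object SUCCESSFUL extends StreamState
  case object TERMINATED_WITH_ERROR extends StreamState
  case object SKIPPED extends StreamState

  // Walks flows in topological order: a flow runs only if all of its parents succeeded;
  // a failed flow is TERMINATED_WITH_ERROR and everything downstream of it is SKIPPED.
  def run(
      topologicalOrder: Seq[String],
      parents: Map[String, Set[String]],
      execute: String => Boolean): Map[String, StreamState] = {
    topologicalOrder.foldLeft(Map.empty[String, StreamState]) { (states, flow) =>
      val parentsSucceeded =
        parents.getOrElse(flow, Set.empty[String]).forall(p => states.get(p).contains(SUCCESSFUL))
      val next =
        if (!parentsSucceeded) SKIPPED
        else if (execute(flow)) SUCCESSFUL
        else TERMINATED_WITH_ERROR
      states.updated(flow, next)
    }
  }
}

// Example: "b" depends on "a"; if "a" fails, "b" is skipped:
// TriggeredSketch.run(Seq("a", "b"), Map("b" -> Set("a")), execute = _ != "a")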

sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/TriggeredGraphExecutionSuite.scala

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ import org.apache.spark.sql.classic.{DataFrame, Dataset}
 import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, TableCatalog}
 import org.apache.spark.sql.execution.streaming.MemoryStream
 import org.apache.spark.sql.pipelines.common.{FlowStatus, RunState}
+import org.apache.spark.sql.pipelines.graph.TriggeredGraphExecution.StreamState
 import org.apache.spark.sql.pipelines.logging.EventLevel
 import org.apache.spark.sql.pipelines.utils.{ExecutionTest, TestGraphRegistrationContext}
 import org.apache.spark.sql.types.{IntegerType, StringType, StructType}