
Commit 138587e

herman committed
1 parent 638ec7b commit 138587e

9 files changed, +120 -126 lines changed

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/PipelineExecutionHolder.scala

Lines changed: 0 additions & 88 deletions
This file was deleted.

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/pipelines/PipelinesHandler.scala

Lines changed: 6 additions & 15 deletions
@@ -33,16 +33,7 @@ import org.apache.spark.sql.connect.service.SessionHolder
 import org.apache.spark.sql.pipelines.Language.Python
 import org.apache.spark.sql.pipelines.QueryOriginType
 import org.apache.spark.sql.pipelines.common.RunState.{CANCELED, FAILED}
-import org.apache.spark.sql.pipelines.graph.{
-  FlowAnalysis,
-  GraphIdentifierManager,
-  IdentifierHelper,
-  QueryContext,
-  QueryOrigin,
-  Table,
-  TemporaryView,
-  UnresolvedFlow
-}
+import org.apache.spark.sql.pipelines.graph.{FlowAnalysis, GraphIdentifierManager, IdentifierHelper, PipelineUpdateContextImpl, QueryContext, QueryOrigin, Table, TemporaryView, UnresolvedFlow}
 import org.apache.spark.sql.pipelines.logging.{PipelineEvent, RunProgress}
 import org.apache.spark.sql.types.StructType

@@ -341,11 +332,11 @@ private[connect] object PipelinesHandler extends Logging {
         )
       }
     }
-    PipelineExecutionHolder.executePipeline(
-      dataflowGraphId,
-      graphElementRegistry.toDataflowGraph,
-      eventCallback
-    )
+    val pipelineUpdateContext = new PipelineUpdateContextImpl(
+      graphElementRegistry.toDataflowGraph, eventCallback)
+    sessionHolder.cachePipelineExecution(dataflowGraphId, pipelineUpdateContext)
+    pipelineUpdateContext.pipelineExecution.runPipeline()
+
     // Rethrow any exceptions that caused the pipeline run to fail so that the exception is
     // propagated back to the SC client / CLI.
     runFailureEvent.foreach { event =>
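Net effect of this hunk: instead of delegating to the now-deleted process-global PipelineExecutionHolder, the handler builds a PipelineUpdateContextImpl, registers it with the owning session, and starts the run, so the execution's lifetime is bounded by the session's. An annotated restatement of the new path follows; it is a sketch only, assuming the surrounding handler scope shown above (sessionHolder, dataflowGraphId, graphElementRegistry, eventCallback).

    // Sketch; all four free names come from the enclosing handler scope above.
    val pipelineUpdateContext = new PipelineUpdateContextImpl(
      graphElementRegistry.toDataflowGraph, // resolve the graph registered for this run
      eventCallback)                        // forwards progress events back to the client
    // Registering with the session ties cleanup to SessionHolder.close(); it
    // throws if an execution for this graph ID is already cached.
    sessionHolder.cachePipelineExecution(dataflowGraphId, pipelineUpdateContext)
    pipelineUpdateContext.pipelineExecution.runPipeline()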

sql/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SessionHolder.scala

Lines changed: 58 additions & 0 deletions
@@ -41,6 +41,7 @@ import org.apache.spark.sql.connect.ml.MLCache
 import org.apache.spark.sql.connect.planner.PythonStreamingQueryListener
 import org.apache.spark.sql.connect.planner.StreamingForeachBatchHelper
 import org.apache.spark.sql.connect.service.SessionHolder.{ERROR_CACHE_SIZE, ERROR_CACHE_TIMEOUT_SEC}
+import org.apache.spark.sql.pipelines.graph.PipelineUpdateContext
 import org.apache.spark.sql.streaming.StreamingQueryListener
 import org.apache.spark.util.{SystemClock, Utils}

@@ -119,6 +120,11 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio
   private lazy val listenerCache: ConcurrentMap[String, StreamingQueryListener] =
     new ConcurrentHashMap()

+  // Mapping from graphId to the pipeline update context. This is used to manage the lifecycle of
+  // pipeline executions.
+  private lazy val pipelineExecutions =
+    new ConcurrentHashMap[String, PipelineUpdateContext]()
+
   // Handles Python process clean up for streaming queries. Initialized on first use in a query.
   private[connect] lazy val streamingForeachBatchRunnerCleanerCache =
     new StreamingForeachBatchHelper.CleanerCache(this)

@@ -311,6 +317,8 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio
     SparkConnectService.streamingSessionManager.cleanupRunningQueries(this, blocking = true)
     streamingForeachBatchRunnerCleanerCache.cleanUpAll() // Clean up any streaming workers.
     removeAllListeners() // removes all listener and stop python listener processes if necessary.
+    // Stops all pipeline execution and clears the pipeline execution cache
+    removeAllPipelineExecutions()

     // if there is a server side listener, clean up related resources
     if (streamingServersideListenerHolder.isServerSideListenerRegistered) {

@@ -426,6 +434,56 @@ case class SessionHolder(userId: String, sessionId: String, session: SparkSessio
     listenerCache.keySet().asScala.toSeq
   }

+  /**
+   * Caches the pipeline execution context for a given graph ID.
+   * @param graphId The id of the graph being executed.
+   * @param pipelineUpdateContext The context for the pipeline execution.
+   */
+  private[connect] def cachePipelineExecution(
+      graphId: String,
+      pipelineUpdateContext: PipelineUpdateContext): Unit = {
+    pipelineExecutions.compute(
+      graphId,
+      (_, existing) => {
+        if (Option(existing).isDefined) {
+          throw new IllegalStateException(
+            s"Pipeline execution for graph ID $graphId already exists. " +
+              s"Stop the existing execution before starting a new one."
+          )
+        }
+        pipelineUpdateContext
+      }
+    )
+  }
+
+  /** Stops the pipeline execution and removes it from the cache. */
+  private def removeCachedPipelineExecution(graphId: String): Unit = {
+    pipelineExecutions.compute(graphId, (_, context) => {
+      if (context.pipelineExecution.executionStarted) {
+        context.pipelineExecution.stopPipeline()
+      }
+      // Remove the execution.
+      null
+    })
+  }
+
+  /** Stops all pipeline executions and clears the pipeline execution cache. */
+  def removeAllPipelineExecutions(): Unit = {
+    pipelineExecutions.forEach((graphId, _) => {
+      removeCachedPipelineExecution(graphId)
+    })
+    pipelineExecutions.clear()
+  }
+
+  /**
+   * Returns [[PipelineUpdateContext]] cached for the given graphId. If it is not found, return
+   * None.
+   */
+  private[connect] def getPipelineExecution(graphId: String): Option[PipelineUpdateContext] = {
+    Option(pipelineExecutions.get(graphId))
+  }
+
   /**
    * An accumulator for Python executors.
    *
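The new cache leans on ConcurrentHashMap.compute, which runs its remapping function atomically per key: registration is insert-or-throw, and returning null from the function removes the mapping, so stop-and-remove cannot race with a concurrent registration for the same graph ID. Below is a minimal, self-contained sketch of the same pattern with a toy execution type (hypothetical names, not Spark's; unlike the committed removeCachedPipelineExecution, the sketch also null-checks the existing value before touching it).

    import java.util.concurrent.ConcurrentHashMap

    // Toy stand-in for PipelineUpdateContext / PipelineExecution.
    final class Execution(val graphId: String) {
      @volatile var started = false
      def stop(): Unit = started = false
    }

    object ExecutionCacheDemo extends App {
      private val executions = new ConcurrentHashMap[String, Execution]()

      // compute() runs atomically per key, so two concurrent callers cannot
      // both register an execution for the same graphId.
      def cache(graphId: String, exec: Execution): Unit =
        executions.compute(graphId, (_, existing) => {
          if (existing != null) {
            throw new IllegalStateException(
              s"Execution for graph ID $graphId already exists.")
          }
          exec
        })

      // Returning null from compute() removes the mapping, so stop-and-remove
      // is atomic with respect to concurrent cache() calls on the same key.
      def remove(graphId: String): Unit =
        executions.compute(graphId, (_, existing) => {
          if (existing != null && existing.started) existing.stop()
          null
        })

      cache("g1", new Execution("g1"))
      remove("g1")
      assert(executions.isEmpty)
    }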

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala

Lines changed: 15 additions & 15 deletions
@@ -59,7 +59,7 @@ class PythonPipelineSuite
     with EventVerificationTestHelpers {

   def buildGraph(pythonText: String): DataflowGraph = {
-    val indentedPythonText = pythonText.linesIterator.map("  " + _).mkString("\n")
+    val indentedPythonText = pythonText.linesIterator.map(" " + _).mkString("\n")
     val pythonCode =
       s"""
         |from pyspark.sql import SparkSession

@@ -72,24 +72,24 @@ class PythonPipelineSuite
         |  graph_element_registration_context,
         |)
         |
-        |try:
-        |  spark = SparkSession.builder \\
-        |    .remote("sc://localhost:$serverPort") \\
-        |    .config("spark.connect.grpc.channel.timeout", "5s") \\
-        |    .create()
+        |spark = SparkSession.builder \\
+        |  .remote("sc://localhost:$serverPort") \\
+        |  .config("spark.connect.grpc.channel.timeout", "5s") \\
+        |  .create()
         |
-        |  dataflow_graph_id = create_dataflow_graph(
-        |    spark,
-        |    default_catalog=None,
-        |    default_database=None,
-        |    sql_conf={},
-        |  )
+        |dataflow_graph_id = create_dataflow_graph(
+        |  spark,
+        |  default_catalog=None,
+        |  default_database=None,
+        |  sql_conf={},
+        |)
         |
-        |  registry = SparkConnectGraphElementRegistry(spark, dataflow_graph_id)
-        |  with graph_element_registration_context(registry):
-        |    $indentedPythonText
+        |registry = SparkConnectGraphElementRegistry(spark, dataflow_graph_id)
+        |with graph_element_registration_context(registry):
+        |$indentedPythonText
         |""".stripMargin

+    logInfo(s"Running code: $pythonCode")
     val (exitCode, output) = executePythonCode(pythonCode)

     if (exitCode != 0) {
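The buildGraph change drops the try: wrapper around the generated Python, so the embedded user code now sits at module scope and the re-indentation pad shrinks accordingly (the page scrape collapsed the exact whitespace, so the pad widths shown above are a best-effort reconstruction). A minimal runnable sketch of the indentation helper, with an assumed four-space pad:

    object IndentDemo extends App {
      // Re-indent user code so it nests under the generated `with` block;
      // the pad width is an assumption, not taken from the commit.
      def indentForWithBlock(python: String, pad: String = "    "): String =
        python.linesIterator.map(pad + _).mkString("\n")

      assert(indentForWithBlock("a = 1\nb = 2") == "    a = 1\n    b = 2")
    }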

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/SparkDeclarativePipelinesServerSuite.scala

Lines changed: 0 additions & 2 deletions
@@ -17,8 +17,6 @@

 package org.apache.spark.sql.connect.pipelines

-import scala.concurrent.duration.DurationInt
-
 import org.apache.spark.connect.proto
 import org.apache.spark.connect.proto.{DatasetType, Expression, PipelineCommand, Relation, UnresolvedTableValuedFunction}
 import org.apache.spark.connect.proto.PipelineCommand.{DefineDataset, DefineFlow}

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/pipelines/SparkDeclarativePipelinesServerTest.scala

Lines changed: 3 additions & 1 deletion
@@ -20,12 +20,14 @@ package org.apache.spark.sql.connect.pipelines
 import org.apache.spark.connect.{proto => sc}
 import org.apache.spark.sql.connect.{SparkConnectServerTest, SparkConnectTestUtils}
 import org.apache.spark.sql.connect.planner.SparkConnectPlanner
+import org.apache.spark.sql.connect.service.{SessionKey, SparkConnectService}
 import org.apache.spark.sql.pipelines.utils.PipelineTest

 class SparkDeclarativePipelinesServerTest extends SparkConnectServerTest {

   override def afterEach(): Unit = {
-    PipelineExecutionHolder.stopAllPipelineExecutions()
+    SparkConnectService.sessionManager.getIsolatedSessionIfPresent(
+      SessionKey(defaultUserId, defaultSessionId)).foreach(_.removeAllPipelineExecutions())
     DataflowGraphRegistry.dropAllDataflowGraphs()
     PipelineTest.cleanupMetastore(spark)
     super.afterEach()

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectSessionHolderSuite.scala

Lines changed: 17 additions & 0 deletions
@@ -37,6 +37,8 @@ import org.apache.spark.sql.connect.common.InvalidPlanInput
 import org.apache.spark.sql.connect.config.Connect
 import org.apache.spark.sql.connect.planner.{PythonStreamingQueryListener, SparkConnectPlanner, StreamingForeachBatchHelper}
 import org.apache.spark.sql.connect.planner.StreamingForeachBatchHelper.RunnerCleaner
+import org.apache.spark.sql.pipelines.graph.{DataflowGraph, PipelineUpdateContextImpl}
+import org.apache.spark.sql.pipelines.logging.PipelineEvent
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.ArrayImplicits._

@@ -422,4 +424,19 @@ class SparkConnectSessionHolderSuite extends SharedSparkSession {
     }
     assert(ex.getMessage.contains("already exists"))
   }
+
+  test("Pipeline execution cache") {
+    val sessionHolder = SparkConnectTestUtils.createDummySessionHolder(spark)
+    val graphId = "test_graph"
+    val pipelineUpdateContext = new PipelineUpdateContextImpl(
+      new DataflowGraph(Seq(), Seq(), Seq()),
+      (_: PipelineEvent) => None
+    )
+    sessionHolder.cachePipelineExecution(graphId, pipelineUpdateContext)
+    assert(
+      sessionHolder.getPipelineExecution(graphId).nonEmpty, "pipeline execution was not cached")
+    sessionHolder.removeAllPipelineExecutions()
+    assert(
+      sessionHolder.getPipelineExecution(graphId).isEmpty, "pipeline execution was not removed")
+  }
 }

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/SparkConnectSessionManagerSuite.scala

Lines changed: 19 additions & 1 deletion
@@ -23,6 +23,8 @@ import org.scalatest.BeforeAndAfterEach
 import org.scalatest.time.SpanSugar._

 import org.apache.spark.SparkSQLException
+import org.apache.spark.sql.pipelines.graph.{DataflowGraph, PipelineUpdateContextImpl}
+import org.apache.spark.sql.pipelines.logging.PipelineEvent
 import org.apache.spark.sql.test.SharedSparkSession

 class SparkConnectSessionManagerSuite extends SharedSparkSession with BeforeAndAfterEach {

@@ -136,7 +138,6 @@ class SparkConnectSessionManagerSuite extends SharedSparkSession with BeforeAndA
   test("SessionHolder is recorded with status closed after close") {
     val key = SessionKey("user", UUID.randomUUID().toString)
     val sessionHolder = SparkConnectService.sessionManager.getOrCreateIsolatedSession(key, None)
-
     val activeSessionInfo = SparkConnectService.sessionManager.listActiveSessions.find(
       _.sessionId == sessionHolder.sessionId)
     assert(activeSessionInfo.isDefined)

@@ -152,4 +153,21 @@ class SparkConnectSessionManagerSuite extends SharedSparkSession with BeforeAndA
     assert(closedSessionInfo.get.status == SessionStatus.Closed)
     assert(closedSessionInfo.get.closedTimeMs.isDefined)
   }
+
+
+  test("Pipeline execution cache is cleared when the session holder is closed") {
+    val key = SessionKey("user", UUID.randomUUID().toString)
+    val sessionHolder = SparkConnectService.sessionManager.getOrCreateIsolatedSession(key, None)
+    val graphId = "test_graph"
+    val pipelineUpdateContext = new PipelineUpdateContextImpl(
+      new DataflowGraph(Seq(), Seq(), Seq()),
+      (_: PipelineEvent) => None
+    )
+    sessionHolder.cachePipelineExecution(graphId, pipelineUpdateContext)
+    assert(
+      sessionHolder.getPipelineExecution(graphId).nonEmpty, "pipeline execution was not cached")
+    sessionHolder.close()
+    assert(
+      sessionHolder.getPipelineExecution(graphId).isEmpty, "pipeline execution was not removed")
+  }
 }

sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/TriggeredGraphExecutionSuite.scala

Lines changed: 2 additions & 4 deletions
@@ -606,7 +606,7 @@ class TriggeredGraphExecutionSuite extends ExecutionTest {

     val graph = pipelineDef.toDataflowGraph
     val updateContext = TestPipelineUpdateContext(spark, graph)
-    updateContext.pipelineExecution.runPipeline()
+    updateContext.pipelineExecution.startPipeline()

     val graphExecution = updateContext.pipelineExecution.graphExecution.get

@@ -1033,9 +1033,7 @@ class TriggeredGraphExecutionSuite extends ExecutionTest {
     }.toDataflowGraph

     val updateContext = TestPipelineUpdateContext(spark = spark, unresolvedGraph = graph)
-    intercept[UnresolvedPipelineException] {
-      updateContext.pipelineExecution.runPipeline()
-    }
+    updateContext.pipelineExecution.runPipeline()

     assertFlowProgressEvent(
       updateContext.eventBuffer,
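Both hunks track a behavior change in PipelineExecution: the test that inspects graphExecution mid-run now calls startPipeline() (apparently a start that does not await completion), and the unresolved-graph test no longer intercepts UnresolvedPipelineException, because runPipeline() now reports the failure through the event buffer that assertFlowProgressEvent checks. A toy, self-contained sketch of that report-as-events contract (hypothetical names, not Spark's API):

    import scala.collection.mutable

    object EventBufferDemo extends App {
      final case class FlowEvent(flow: String, state: String, error: Option[String])

      final class EventBuffer {
        private val events = mutable.ArrayBuffer.empty[FlowEvent]
        def add(e: FlowEvent): Unit = synchronized { events += e }
        def snapshot: Seq[FlowEvent] = synchronized { events.toSeq }
      }

      // Run the body; on failure, record a FAILED event instead of propagating,
      // so callers assert on the buffer rather than intercepting an exception.
      def runPipeline(buffer: EventBuffer)(body: => Unit): Unit =
        try body
        catch {
          case e: Exception =>
            buffer.add(FlowEvent("pipeline", "FAILED", Some(e.getMessage)))
        }

      val buf = new EventBuffer
      runPipeline(buf) { throw new IllegalStateException("unresolved flow") }
      assert(buf.snapshot.exists(_.state == "FAILED"))
    }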
