apache · eason-yuchen-liu · Mar 17, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 19, 2026
diff --git a/.../main/scala/org/apache/spark/sql/execution/streaming/sources/LowLatencyMemoryStream.scala b/.../main/scala/org/apache/spark/sql/execution/streaming/sources/LowLatencyMemoryStream.scala
@@ -76,16 +76,19 @@ class LowLatencyMemoryStream[A: Encoder](
   private val records =
     Seq.fill(numPartitions)(new ListBuffer[(UnsafeRow, Long)]())
 
+  @GuardedBy("this")
+  private var numRecords: Long = 0L
+
   private val recordEndpoint = new LowLatencyMemoryStreamEndpoint(records, this)
   @volatile private var endpointRef: RpcEndpointRef = _
 
   override def addData(data: IterableOnce[A]): Offset = synchronized {
     // Distribute data evenly among partition lists.
     val timestamp = clock.getTimeMillis()
-    data.iterator.to(Seq).zipWithIndex.map {
-      case (item, index) =>
-        val partitionId = index % numPartitions
-        records(partitionId) += ((toRow(item).copy().asInstanceOf[UnsafeRow], timestamp))
+    data.iterator.foreach { item =>
+      val partitionId: Int = (numRecords % numPartitions).toInt
+      records(partitionId) += ((toRow(item).copy().asInstanceOf[UnsafeRow], timestamp))
+      numRecords += 1
     }
 
     // The new target offset is the offset where all records in all partitions have been processed.
@@ -102,6 +105,7 @@ class LowLatencyMemoryStream[A: Encoder](
     val timestamp = clock.getTimeMillis()
     data.iterator.foreach { item =>
       records(partitionId) += ((toRow(item).copy().asInstanceOf[UnsafeRow], timestamp))
+      numRecords += 1
     }
 
     // The new target offset is the offset where all records in all partitions have been processed.

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamRealTimeModeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamRealTimeModeSuite.scala
@@ -24,10 +24,11 @@ import scala.concurrent.duration.Duration
 
 import org.scalatest.concurrent.PatienceConfiguration.Timeout
 
-import org.apache.spark.{SparkIllegalArgumentException, SparkIllegalStateException}
+import org.apache.spark.{SparkIllegalArgumentException, SparkIllegalStateException, TaskContext}
 import org.apache.spark.sql.execution.streaming.{LowLatencyMemoryStream, RealTimeTrigger}
 import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
 import org.apache.spark.sql.execution.streaming.sources.ContinuousMemorySink
+import org.apache.spark.sql.functions.udf
 import org.apache.spark.sql.internal.SQLConf
 
 class StreamRealTimeModeSuite extends StreamRealTimeModeSuiteBase {
@@ -204,6 +205,26 @@ class StreamRealTimeModeSuite extends StreamRealTimeModeSuiteBase {
       }
     )
   }
+
+  test("LowLatencyMemoryStream load balance among all partitions") {
+    val numPartitions = 3
+    val inputData = LowLatencyMemoryStream[Int](numPartitions)
+
+    val getPartitionId = udf(() => TaskContext.getPartitionId())
+
+    val mapped = inputData.toDS().select($"value", getPartitionId()).as[(Int, Int)]
+
+    testStream(mapped, OutputMode.Update, Map.empty, new ContinuousMemorySink())(
+      StartStream(),
+      // Round-robin continues across separate AddData calls: item i goes to partition (i-1) % 3
+      AddData(inputData, 1, 2, 3),
+      AddData(inputData, 4),
+      AddData(inputData, 5),
+      AddData(inputData, 6),
+      CheckAnswerWithTimeout(10000, (1, 0), (2, 1), (3, 2), (4, 0), (5, 1), (6, 2)),
+      StopStream
+    )
+  }
 }
 
 class StreamRealTimeModeWithManualClockSuite extends StreamRealTimeModeManualClockSuiteBase {