Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,13 @@ message PrepareCheckpointRequest{
bool estimationOnly = 2;
}

// Selects which consumer(s) receive the result of a statistics query.
enum StatisticsUpdateTarget {
// Zero value: the result is forwarded to both the UI and persistence.
BOTH_UI_AND_PERSISTENCE = 0;
// The result is forwarded to the UI only.
UI_ONLY = 1;
// The result is forwarded to persistence only.
PERSISTENCE_ONLY = 2;
}

// Request asking the controller to query worker statistics.
message QueryStatisticsRequest{
// Workers the query is restricted to.
// NOTE(review): timer-triggered callers pass an empty list; presumably that means "all workers" - confirm.
repeated core.ActorVirtualIdentity filterByWorkers = 1;
// Which consumer(s) the collected statistics are forwarded to.
StatisticsUpdateTarget updateTarget = 2;
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ case class ExecutionStateUpdate(state: WorkflowAggregatedState) extends ClientEv

case class ExecutionStatsUpdate(operatorMetrics: Map[String, OperatorMetrics]) extends ClientEvent

/** Client event carrying operator metrics that are meant to be persisted.
  *
  * @param operatorMetrics metrics keyed by a String
  *                        (NOTE(review): presumably the operator id - confirm)
  */
case class RuntimeStatisticsPersist(operatorMetrics: Map[String, OperatorMetrics])
extends ClientEvent

case class ReportCurrentProcessingTuple(
operatorID: String,
tuple: Array[(Tuple, ActorVirtualIdentity)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,15 @@ object ControllerConfig {
/** Default controller configuration: both timer intervals are read from
  * `ApplicationConfig`; no state-restore or fault-tolerance configuration is set.
  */
def default: ControllerConfig =
ControllerConfig(
statusUpdateIntervalMs = Option(ApplicationConfig.getStatusUpdateIntervalInMs),
runtimeStatisticsIntervalMs = Option(ApplicationConfig.getRuntimeStatisticsIntervalInMs),
stateRestoreConfOpt = None,
faultToleranceConfOpt = None
)
}

/** Settings for a controller.
  *
  * @param statusUpdateIntervalMs      delay, in milliseconds, between status-update statistics
  *                                    queries; no status-update timer is scheduled when empty
  * @param runtimeStatisticsIntervalMs delay, in milliseconds, between runtime-statistics
  *                                    queries; no such timer is scheduled when empty
  * @param stateRestoreConfOpt         optional state-restore configuration
  * @param faultToleranceConfOpt       optional fault-tolerance configuration
  */
final case class ControllerConfig(
statusUpdateIntervalMs: Option[Long],
runtimeStatisticsIntervalMs: Option[Long],
stateRestoreConfOpt: Option[StateRestoreConfig],
faultToleranceConfOpt: Option[FaultToleranceConfig]
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ import org.apache.pekko.actor.Cancellable
import org.apache.texera.amber.engine.architecture.common.AkkaActorService
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
QueryStatisticsRequest
QueryStatisticsRequest,
StatisticsUpdateTarget
}
import org.apache.texera.amber.engine.architecture.rpc.controllerservice.ControllerServiceGrpc.METHOD_CONTROLLER_INITIATE_QUERY_STATISTICS
import org.apache.texera.amber.engine.common.rpc.AsyncRPCClient.ControlInvocation
Expand All @@ -36,28 +37,61 @@ class ControllerTimerService(
akkaActorService: AkkaActorService
) {
var statusUpdateAskHandle: Option[Cancellable] = None
var runtimeStatisticsAskHandle: Option[Cancellable] = None

def enableStatusUpdate(): Unit = {
if (controllerConfig.statusUpdateIntervalMs.nonEmpty && statusUpdateAskHandle.isEmpty) {
statusUpdateAskHandle = Option(
private def enableTimer(
intervalMs: Option[Long],
updateTarget: StatisticsUpdateTarget,
handleOpt: Option[Cancellable]
): Option[Cancellable] = {
if (intervalMs.nonEmpty && handleOpt.isEmpty) {
Option(
akkaActorService.sendToSelfWithFixedDelay(
0.milliseconds,
FiniteDuration.apply(controllerConfig.statusUpdateIntervalMs.get, MILLISECONDS),
FiniteDuration.apply(intervalMs.get, MILLISECONDS),
ControlInvocation(
METHOD_CONTROLLER_INITIATE_QUERY_STATISTICS,
QueryStatisticsRequest(Seq.empty),
QueryStatisticsRequest(Seq.empty, updateTarget),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since there are now two separate timers, each sending its own QueryStatisticsRequest, more requests will be sent than before. What are the implications of this? For example, will QueryStatistics requests be sent to each worker more frequently? It would be good if you could comment on this in your PR description.

AsyncRPCContext(SELF, SELF),
0
)
)
)
} else {
handleOpt
}
}

def disableStatusUpdate(): Unit = {
if (statusUpdateAskHandle.nonEmpty) {
statusUpdateAskHandle.get.cancel()
statusUpdateAskHandle = Option.empty
/** Cancels the timer behind `handleOpt`, if there is one.
  *
  * @param handleOpt handle of the currently scheduled timer, or empty when none is scheduled
  * @return always an empty Option, so callers can assign the result back to their handle field
  */
private def disableTimer(handleOpt: Option[Cancellable]): Option[Cancellable] = {
  // foreach is a no-op on an empty Option, which replaces the nonEmpty/.get pair.
  handleOpt.foreach(_.cancel())
  None
}

/** Ensures the UI-only statistics timer is running.
  *
  * Delegates to `enableTimer` with the status-update interval from the controller
  * config and stores whatever handle it returns.
  */
def enableStatusUpdate(): Unit = {
  val refreshedHandle = enableTimer(
    intervalMs = controllerConfig.statusUpdateIntervalMs,
    updateTarget = StatisticsUpdateTarget.UI_ONLY,
    handleOpt = statusUpdateAskHandle
  )
  statusUpdateAskHandle = refreshedHandle
}

/** Ensures the persistence-only statistics timer is running.
  *
  * Delegates to `enableTimer` with the runtime-statistics interval from the controller
  * config and stores whatever handle it returns.
  */
def enableRuntimeStatisticsCollection(): Unit = {
  val refreshedHandle = enableTimer(
    intervalMs = controllerConfig.runtimeStatisticsIntervalMs,
    updateTarget = StatisticsUpdateTarget.PERSISTENCE_ONLY,
    handleOpt = runtimeStatisticsAskHandle
  )
  runtimeStatisticsAskHandle = refreshedHandle
}

/** Cancels the status-update timer, if one is scheduled, and clears its stored handle. */
def disableStatusUpdate(): Unit = {
statusUpdateAskHandle = disableTimer(statusUpdateAskHandle)
}

/** Cancels the runtime-statistics timer, if one is scheduled, and clears its stored handle. */
def disableRuntimeStatisticsCollection(): Unit = {
runtimeStatisticsAskHandle = disableTimer(runtimeStatisticsAskHandle)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ import org.apache.texera.amber.core.virtualidentity.ActorVirtualIdentity
import org.apache.texera.amber.engine.architecture.controller.{
ControllerAsyncRPCHandlerInitializer,
ExecutionStateUpdate,
ExecutionStatsUpdate
ExecutionStatsUpdate,
RuntimeStatisticsPersist
}
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
Expand All @@ -47,6 +48,7 @@ trait PauseHandler {

override def pauseWorkflow(request: EmptyRequest, ctx: AsyncRPCContext): Future[EmptyReturn] = {
cp.controllerTimerService.disableStatusUpdate() // to be enabled in resume
cp.controllerTimerService.disableRuntimeStatisticsCollection() // to be enabled in resume
Future
.collect(
cp.workflowExecution.getRunningRegionExecutions
Expand Down Expand Up @@ -81,12 +83,10 @@ trait PauseHandler {
.toSeq
)
.map { _ =>
// update frontend workflow status
sendToClient(
ExecutionStatsUpdate(
cp.workflowExecution.getAllRegionExecutionsStats
)
)
// update frontend workflow status and persist statistics
val stats = cp.workflowExecution.getAllRegionExecutionsStats
sendToClient(ExecutionStatsUpdate(stats))
sendToClient(RuntimeStatisticsPersist(stats))
sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState))
logger.info(s"workflow paused")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ import org.apache.texera.amber.engine.architecture.controller.{
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
PortCompletedRequest,
QueryStatisticsRequest
QueryStatisticsRequest,
StatisticsUpdateTarget
}
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn
import org.apache.texera.amber.engine.common.virtualidentity.util.CONTROLLER
Expand All @@ -50,7 +51,13 @@ trait PortCompletedHandler {
ctx: AsyncRPCContext
): Future[EmptyReturn] = {
controllerInterface
.controllerInitiateQueryStatistics(QueryStatisticsRequest(scala.Seq(ctx.sender)), CONTROLLER)
.controllerInitiateQueryStatistics(
QueryStatisticsRequest(
scala.Seq(ctx.sender),
StatisticsUpdateTarget.BOTH_UI_AND_PERSISTENCE
),
CONTROLLER
)
.map { _ =>
val globalPortId = GlobalPortIdentity(
VirtualIdentityUtils.getPhysicalOpId(ctx.sender),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ import com.twitter.util.Future
import org.apache.texera.amber.core.virtualidentity.PhysicalOpIdentity
import org.apache.texera.amber.engine.architecture.controller.{
ControllerAsyncRPCHandlerInitializer,
ExecutionStatsUpdate
ExecutionStatsUpdate,
RuntimeStatisticsPersist
}
import org.apache.texera.amber.engine.architecture.deploysemantics.layer.WorkerExecution
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
EmptyRequest,
QueryStatisticsRequest
QueryStatisticsRequest,
StatisticsUpdateTarget
}
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAggregatedState.COMPLETED
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.{
Expand Down Expand Up @@ -133,9 +135,21 @@ trait QueryWorkerStatisticsHandler {
case (wExec, resp, timestamp) =>
wExec.update(timestamp, resp.metrics.workerState, resp.metrics.workerStatistics)
}
sendToClient(
ExecutionStatsUpdate(cp.workflowExecution.getAllRegionExecutionsStats)
)
// Send appropriate event(s) based on updateTarget
val stats = cp.workflowExecution.getAllRegionExecutionsStats
msg.updateTarget match {
case StatisticsUpdateTarget.UI_ONLY =>
// Timer-triggered: update UI only
sendToClient(ExecutionStatsUpdate(stats))
case StatisticsUpdateTarget.PERSISTENCE_ONLY =>
// Timer-triggered: persist statistics only
sendToClient(RuntimeStatisticsPersist(stats))
case StatisticsUpdateTarget.BOTH_UI_AND_PERSISTENCE |
StatisticsUpdateTarget.Unrecognized(_) =>
// Event-triggered or default: update UI and persist statistics (original behavior)
sendToClient(ExecutionStatsUpdate(stats))
sendToClient(RuntimeStatisticsPersist(stats))
}
// Release the global query lock if it was set
if (globalQueryStatsOngoing) {
globalQueryStatsOngoing = false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ package org.apache.texera.amber.engine.architecture.controller.promisehandlers
import com.twitter.util.Future
import org.apache.texera.amber.engine.architecture.controller.{
ControllerAsyncRPCHandlerInitializer,
ExecutionStatsUpdate
ExecutionStatsUpdate,
RuntimeStatisticsPersist
}
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
Expand Down Expand Up @@ -57,14 +58,14 @@ trait ResumeHandler {
.toSeq
)
.map { _ =>
// update frontend status
sendToClient(
ExecutionStatsUpdate(
cp.workflowExecution.getAllRegionExecutionsStats
)
)
// update frontend status and persist statistics
val stats = cp.workflowExecution.getAllRegionExecutionsStats
sendToClient(ExecutionStatsUpdate(stats))
sendToClient(RuntimeStatisticsPersist(stats))
cp.controllerTimerService
.enableStatusUpdate() //re-enabled it since it is disabled in pause
cp.controllerTimerService
.enableRuntimeStatisticsCollection() //re-enabled it since it is disabled in pause
EmptyReturn()
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ trait StartWorkflowHandler {
.coordinateRegionExecutors(cp.actorService)
.map(_ => {
cp.controllerTimerService.enableStatusUpdate()
cp.controllerTimerService.enableRuntimeStatisticsCollection()
StartWorkflowResponse(RUNNING)
})
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ import org.apache.texera.amber.engine.architecture.controller.{
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
EmptyRequest,
QueryStatisticsRequest
QueryStatisticsRequest,
StatisticsUpdateTarget
}
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.EmptyReturn
import org.apache.texera.amber.engine.common.virtualidentity.util.SELF
Expand All @@ -52,7 +53,7 @@ trait WorkerExecutionCompletedHandler {
// and the user sees the last update before completion
val statsRequest =
controllerInterface.controllerInitiateQueryStatistics(
QueryStatisticsRequest(Seq(ctx.sender)),
QueryStatisticsRequest(Seq(ctx.sender), StatisticsUpdateTarget.BOTH_UI_AND_PERSISTENCE),
mkContext(SELF)
)

Expand All @@ -64,6 +65,7 @@ trait WorkerExecutionCompletedHandler {
// after query result come back: send completed event, cleanup ,and kill workflow
sendToClient(ExecutionStateUpdate(cp.workflowExecution.getState))
cp.controllerTimerService.disableStatusUpdate()
cp.controllerTimerService.disableRuntimeStatisticsCollection()
}
})
EmptyReturn()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ package org.apache.texera.amber.engine.architecture.controller.promisehandlers
import com.twitter.util.Future
import org.apache.texera.amber.engine.architecture.controller.{
ControllerAsyncRPCHandlerInitializer,
ExecutionStatsUpdate
ExecutionStatsUpdate,
RuntimeStatisticsPersist
}
import org.apache.texera.amber.engine.architecture.rpc.controlcommands.{
AsyncRPCContext,
Expand Down Expand Up @@ -50,11 +51,9 @@ trait WorkerStateUpdatedHandler {
.foreach(operatorExecution =>
operatorExecution.getWorkerExecution(ctx.sender).update(System.nanoTime(), msg.state)
)
sendToClient(
ExecutionStatsUpdate(
cp.workflowExecution.getAllRegionExecutionsStats
)
)
val stats = cp.workflowExecution.getAllRegionExecutionsStats
sendToClient(ExecutionStatsUpdate(stats))
sendToClient(RuntimeStatisticsPersist(stats))
EmptyReturn()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import org.apache.texera.amber.engine.architecture.controller.execution.{
import org.apache.texera.amber.engine.architecture.controller.{
ControllerConfig,
ExecutionStatsUpdate,
RuntimeStatisticsPersist,
WorkerAssignmentUpdate
}
import org.apache.texera.amber.engine.architecture.rpc.controlcommands._
Expand Down Expand Up @@ -275,9 +276,9 @@ class RegionExecutionCoordinator(
val resourceConfig = region.resourceConfig.get
val regionExecution = workflowExecution.getRegionExecution(region.id)

asyncRPCClient.sendToClient(
ExecutionStatsUpdate(workflowExecution.getAllRegionExecutionsStats)
)
val stats = workflowExecution.getAllRegionExecutionsStats
asyncRPCClient.sendToClient(ExecutionStatsUpdate(stats))
asyncRPCClient.sendToClient(RuntimeStatisticsPersist(stats))
asyncRPCClient.sendToClient(
WorkerAssignmentUpdate(
operatorsToRun
Expand Down Expand Up @@ -489,11 +490,9 @@ class RegionExecutionCoordinator(
region: Region,
isDependeePhase: Boolean
): Future[Seq[Unit]] = {
asyncRPCClient.sendToClient(
ExecutionStatsUpdate(
workflowExecution.getAllRegionExecutionsStats
)
)
val stats = workflowExecution.getAllRegionExecutionsStats
asyncRPCClient.sendToClient(ExecutionStatsUpdate(stats))
asyncRPCClient.sendToClient(RuntimeStatisticsPersist(stats))
val allStarterOperators = region.getStarterOperators
val starterOpsForThisPhase =
if (isDependeePhase) allStarterOperators.filter(_.dependeeInputs.nonEmpty)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,14 @@ import org.apache.texera.amber.core.tuple.Tuple
import org.apache.texera.amber.core.workflow.WorkflowContext
import org.apache.texera.amber.core.workflowruntimestate.FatalErrorType.EXECUTION_FAILURE
import org.apache.texera.amber.core.workflowruntimestate.WorkflowFatalError
import org.apache.texera.amber.engine.architecture.controller._
import org.apache.texera.amber.engine.architecture.controller.{
ExecutionStateUpdate,
ExecutionStatsUpdate,
FatalError,
RuntimeStatisticsPersist,
WorkerAssignmentUpdate,
WorkflowRecoveryStatus
}
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAggregatedState
import org.apache.texera.amber.engine.architecture.rpc.controlreturns.WorkflowAggregatedState.{
COMPLETED,
Expand Down Expand Up @@ -175,12 +182,20 @@ class ExecutionStatsService(
}

private[this] def registerCallbackOnWorkflowStatsUpdate(): Unit = {
// Register callback for UI updates (UI state store update only, no persistence)
addSubscription(
client
.registerCallback[ExecutionStatsUpdate]((evt: ExecutionStatsUpdate) => {
stateStore.statsStore.updateState { statsStore =>
statsStore.withOperatorInfo(evt.operatorMetrics)
}
})
)

// Register callback for statistics persistence (persistence only, no UI update)
addSubscription(
client
.registerCallback[RuntimeStatisticsPersist]((evt: RuntimeStatisticsPersist) => {
metricsPersistThread.execute(() => {
storeRuntimeStatistics(computeStatsDiff(evt.operatorMetrics))
lastPersistedMetrics = evt.operatorMetrics
Expand Down
3 changes: 3 additions & 0 deletions common/config/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ constants {

status-update-interval = 500
status-update-interval = ${?CONSTANTS_STATUS_UPDATE_INTERVAL}

runtime-statistics-interval = 2000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be more explicit that this config is about persistence. Please add it in the name.

runtime-statistics-interval = ${?CONSTANTS_RUNTIME_STATISTICS_INTERVAL}
}

flow-control {
Expand Down
Loading
Loading