Skip to content

Commit cd3d6f4

Browse files
committed
Prototype of a rate-limiter intended to favor workflows getting history over polling for new workflows.
This is to address an explosion of GetWorkflowExecutionHistory requests in one of our internal domains. "Explosion" to the tune of: normally a couple hundred per second, but during this issue we saw up to ~100,000/s.

A larger description will come after I get some more sleep, but the quick and dirty summary is:
- they had many "live" workflows
- they started to build up a decision-schedule queue
- slowing them down
- overloading caches, causing a lot of un-cached decisions
- ... leading to a lot of history iterators in new workflows looping, trying to load history, and getting ratelimited...
- ... causing more to loop and try to load history...
- ... slowing things down further and making it worse.

Decision tasks were regularly taking >10 minutes, just trying to load history.

So this is an attempt to prevent that from happening. It's not yet complete; it just contains the limiter I'm planning, and tests.
1 parent e5063a1 commit cd3d6f4

8 files changed

+673
-35
lines changed

internal/internal_coroutines_test.go

+21-21
Original file line numberDiff line numberDiff line change
@@ -272,13 +272,13 @@ func TestBlockingSelect(t *testing.T) {
272272
Go(ctx, func(ctx Context) {
273273
history = append(history, "add-one")
274274
c1.Send(ctx, "one")
275-
history = append(history, "add-one-done")
275+
history = append(history, "add-one-stopped")
276276

277277
})
278278
Go(ctx, func(ctx Context) {
279279
history = append(history, "add-two")
280280
c2.Send(ctx, "two")
281-
history = append(history, "add-two-done")
281+
history = append(history, "add-two-stopped")
282282
})
283283

284284
s := NewSelector(ctx)
@@ -298,21 +298,21 @@ func TestBlockingSelect(t *testing.T) {
298298
s.Select(ctx)
299299
history = append(history, "select2")
300300
s.Select(ctx)
301-
history = append(history, "done")
301+
history = append(history, "stopped")
302302
})
303303
require.NoError(t, d.ExecuteUntilAllBlocked())
304304
require.True(t, d.IsDone(), strings.Join(history, "\n"))
305305

306306
expected := []string{
307307
"select1",
308308
"add-one",
309-
"add-one-done",
309+
"add-one-stopped",
310310
"add-two",
311311
"c1-one",
312312
"select2",
313313
"c2-two",
314-
"done",
315-
"add-two-done",
314+
"stopped",
315+
"add-two-stopped",
316316
}
317317
require.EqualValues(t, expected, history)
318318
}
@@ -339,7 +339,7 @@ func TestBlockingSelectAsyncSend(t *testing.T) {
339339
history = append(history, fmt.Sprintf("select-%v", ii))
340340
s.Select(ctx)
341341
}
342-
history = append(history, "done")
342+
history = append(history, "stopped")
343343
})
344344
require.NoError(t, d.ExecuteUntilAllBlocked())
345345
require.True(t, d.IsDone(), strings.Join(history, "\n"))
@@ -354,7 +354,7 @@ func TestBlockingSelectAsyncSend(t *testing.T) {
354354
"select-2",
355355
"add-2",
356356
"c1-2",
357-
"done",
357+
"stopped",
358358
}
359359
require.EqualValues(t, expected, history)
360360
}
@@ -429,7 +429,7 @@ func TestBlockingSelectAsyncSend2(t *testing.T) {
429429
c1.SendAsync("s1")
430430
history = append(history, "select-1")
431431
s.Select(ctx)
432-
history = append(history, "done")
432+
history = append(history, "stopped")
433433
})
434434
require.NoError(t, d.ExecuteUntilAllBlocked())
435435
require.True(t, d.IsDone(), strings.Join(history, "\n"))
@@ -441,7 +441,7 @@ func TestBlockingSelectAsyncSend2(t *testing.T) {
441441
"send-s1",
442442
"select-1",
443443
"c1-s1",
444-
"done",
444+
"stopped",
445445
}
446446
require.EqualValues(t, expected, history)
447447
}
@@ -469,7 +469,7 @@ func TestSendSelect(t *testing.T) {
469469
s.Select(ctx)
470470
history = append(history, "select2")
471471
s.Select(ctx)
472-
history = append(history, "done")
472+
history = append(history, "stopped")
473473
})
474474
require.NoError(t, d.ExecuteUntilAllBlocked())
475475
require.True(t, d.IsDone())
@@ -481,7 +481,7 @@ func TestSendSelect(t *testing.T) {
481481
"send2",
482482
"select2",
483483
"send1",
484-
"done",
484+
"stopped",
485485
"c1-one",
486486
}
487487
require.EqualValues(t, expected, history)
@@ -511,7 +511,7 @@ func TestSendSelectWithAsyncReceive(t *testing.T) {
511511
s.Select(ctx)
512512
history = append(history, "select2")
513513
s.Select(ctx)
514-
history = append(history, "done")
514+
history = append(history, "stopped")
515515
})
516516
require.NoError(t, d.ExecuteUntilAllBlocked())
517517
require.True(t, d.IsDone(), strings.Join(history, "\n"))
@@ -523,7 +523,7 @@ func TestSendSelectWithAsyncReceive(t *testing.T) {
523523
"send2",
524524
"select2",
525525
"send1",
526-
"done",
526+
"stopped",
527527
"c1-one",
528528
}
529529
require.EqualValues(t, expected, history)
@@ -533,7 +533,7 @@ func TestChannelClose(t *testing.T) {
533533
var history []string
534534
d, _ := newDispatcher(createRootTestContext(t), func(ctx Context) {
535535
jobs := NewBufferedChannel(ctx, 5)
536-
done := NewNamedChannel(ctx, "done")
536+
done := NewNamedChannel(ctx, "stopped")
537537

538538
GoNamed(ctx, "receiver", func(ctx Context) {
539539
for {
@@ -555,7 +555,7 @@ func TestChannelClose(t *testing.T) {
555555
jobs.Close()
556556
history = append(history, "sent all jobs")
557557
done.Receive(ctx, nil)
558-
history = append(history, "done")
558+
history = append(history, "stopped")
559559

560560
})
561561
require.EqualValues(t, 0, len(history))
@@ -571,7 +571,7 @@ func TestChannelClose(t *testing.T) {
571571
"received job 2",
572572
"received job 3",
573573
"received all jobs",
574-
"done",
574+
"stopped",
575575
}
576576
require.EqualValues(t, expected, history)
577577
}
@@ -987,7 +987,7 @@ func TestSelectFuture(t *testing.T) {
987987
s.Select(ctx)
988988
history = append(history, "select2")
989989
s.Select(ctx)
990-
history = append(history, "done")
990+
history = append(history, "stopped")
991991
})
992992
require.NoError(t, d.ExecuteUntilAllBlocked())
993993
require.True(t, d.IsDone())
@@ -999,7 +999,7 @@ func TestSelectFuture(t *testing.T) {
999999
"c1-one",
10001000
"select2",
10011001
"c2-two",
1002-
"done",
1002+
"stopped",
10031003
}
10041004
require.EqualValues(t, expected, history)
10051005
}
@@ -1036,7 +1036,7 @@ func TestSelectDecodeFuture(t *testing.T) {
10361036
s.Select(ctx)
10371037
history = append(history, "select2")
10381038
s.Select(ctx)
1039-
history = append(history, "done")
1039+
history = append(history, "stopped")
10401040
})
10411041
require.NoError(t, d.ExecuteUntilAllBlocked())
10421042
require.True(t, d.IsDone())
@@ -1048,7 +1048,7 @@ func TestSelectDecodeFuture(t *testing.T) {
10481048
"c1-one",
10491049
"select2",
10501050
"c2-two",
1051-
"done",
1051+
"stopped",
10521052
}
10531053
require.EqualValues(t, expected, history)
10541054
}

internal/internal_task_handlers_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -526,13 +526,13 @@ func (t *TaskHandlersTestSuite) TestWorkflowTask_QueryWorkflow_2() {
526526
task = createQueryTask(testEvents[0:8], 8, "HelloWorld_Workflow", queryType)
527527
taskHandler = newWorkflowTaskHandler(testDomain, params, nil, t.registry)
528528
response, _ = taskHandler.ProcessWorkflowTask(&workflowTask{task: task}, nil)
529-
t.verifyQueryResult(response, "done")
529+
t.verifyQueryResult(response, "stopped")
530530

531531
// query after second decision task with extra events
532532
task = createQueryTask(testEvents[0:9], 9, "HelloWorld_Workflow", queryType)
533533
taskHandler = newWorkflowTaskHandler(testDomain, params, nil, t.registry)
534534
response, _ = taskHandler.ProcessWorkflowTask(&workflowTask{task: task}, nil)
535-
t.verifyQueryResult(response, "done")
535+
t.verifyQueryResult(response, "stopped")
536536

537537
task = createQueryTask(testEvents[0:9], 9, "HelloWorld_Workflow", "invalid-query-type")
538538
taskHandler = newWorkflowTaskHandler(testDomain, params, nil, t.registry)

internal/internal_worker_base.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"go.uber.org/cadence/internal/common/backoff"
3838
"go.uber.org/cadence/internal/common/metrics"
3939
"go.uber.org/cadence/internal/common/util"
40+
"go.uber.org/cadence/internal/pahlimiter"
4041
"go.uber.org/zap"
4142
"go.uber.org/zap/zapcore"
4243
"golang.org/x/time/rate"
@@ -137,6 +138,8 @@ type (
137138
pollerAutoScaler *pollerAutoScaler
138139
taskQueueCh chan interface{}
139140
sessionTokenBucket *sessionTokenBucket
141+
142+
pollAndHistoryLimiter pahlimiter.PollAndHistoryLimiter
140143
}
141144

142145
polledTask struct {
@@ -288,8 +291,9 @@ func (bw *baseWorker) pollTask() {
288291
}
289292
}
290293

291-
bw.retrier.Throttle()
294+
bw.retrier.Throttle() // sleeps if retry policy determines it should sleep after failures
292295
if bw.pollLimiter == nil || bw.pollLimiter.Wait(bw.limiterContext) == nil {
296+
// TODO: block here on a shared semaphore with history-loading?
293297
task, err = bw.options.taskWorker.PollTask()
294298
if err != nil && enableVerboseLogging {
295299
bw.logger.Debug("Failed to poll for task.", zap.Error(err))

internal/internal_worker_interfaces_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ const (
4141
signalCh = "signal-chan"
4242

4343
startingQueryValue = ""
44-
finishedQueryValue = "done"
44+
finishedQueryValue = "stopped"
4545
queryErr = "error handling query"
4646
)
4747

internal/internal_workers_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ func (s *WorkersTestSuite) TestQueryTask_WorkflowCacheEvicted() {
390390
if err := ExecuteActivity(ctx, activityFn).Get(ctx, nil); err != nil {
391391
return err
392392
}
393-
queryResult = "done"
393+
queryResult = "stopped"
394394
return nil
395395
}
396396

internal/internal_workflow_testsuite_test.go

+9-9
Original file line numberDiff line numberDiff line change
@@ -1707,7 +1707,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_WorkflowUnknownName() {
17071707

17081708
func (s *WorkflowTestSuiteUnitTest) Test_QueryWorkflow() {
17091709
queryType := "state"
1710-
stateWaitSignal, stateWaitActivity, stateDone := "wait for signal", "wait for activity", "done"
1710+
stateWaitSignal, stateWaitActivity, stateDone := "wait for signal", "wait for activity", "stopped"
17111711
workflowFn := func(ctx Context) error {
17121712
var state string
17131713
err := SetQueryHandler(ctx, queryType, func(queryInput string) (string, error) {
@@ -2199,7 +2199,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_Channel() {
21992199
processedCount++
22002200
runningCount++
22012201
Go(ctx, func(ctx Context) {
2202-
doneCh.SendAsync("done")
2202+
doneCh.SendAsync("stopped")
22032203
runningCount--
22042204
})
22052205
}
@@ -2230,7 +2230,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_ContextMisuse() {
22302230

22312231
Go(ctx, func(shouldUseThisCtx Context) {
22322232
Sleep(ctx, time.Hour)
2233-
ch.Send(ctx, "done")
2233+
ch.Send(ctx, "stopped")
22342234
})
22352235

22362236
var done string
@@ -2294,7 +2294,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_ActivityRetry() {
22942294
if info.Attempt < 2 {
22952295
return "", NewCustomError("bad-luck")
22962296
}
2297-
return "retry-done", nil
2297+
return "retry-stopped", nil
22982298
}
22992299

23002300
workflowFn := func(ctx Context) (string, error) {
@@ -2339,7 +2339,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_ActivityRetry() {
23392339
s.NoError(env.GetWorkflowError())
23402340
var result string
23412341
s.NoError(env.GetWorkflowResult(&result))
2342-
s.Equal("retry-done", result)
2342+
s.Equal("retry-stopped", result)
23432343
s.Equal(1, attempt1Count)
23442344
s.Equal(3, attempt2Count)
23452345
}
@@ -2416,7 +2416,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_LocalActivityRetry() {
24162416
if info.Attempt < 2 {
24172417
return "", NewCustomError("bad-luck")
24182418
}
2419-
return "retry-done", nil
2419+
return "retry-stopped", nil
24202420
}
24212421

24222422
workflowFn := func(ctx Context) (string, error) {
@@ -2454,7 +2454,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_LocalActivityRetry() {
24542454
s.NoError(env.GetWorkflowError())
24552455
var result string
24562456
s.NoError(env.GetWorkflowResult(&result))
2457-
s.Equal("retry-done", result)
2457+
s.Equal("retry-stopped", result)
24582458
s.Equal(1, nonretriableCount)
24592459
s.Equal(3, retriableCount)
24602460
}
@@ -2544,7 +2544,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_ChildWorkflowRetry() {
25442544
if info.Attempt < 2 {
25452545
return "", NewCustomError("bad-luck")
25462546
}
2547-
return "retry-done", nil
2547+
return "retry-stopped", nil
25482548
}
25492549

25502550
workflowFn := func(ctx Context) (string, error) {
@@ -2578,7 +2578,7 @@ func (s *WorkflowTestSuiteUnitTest) Test_ChildWorkflowRetry() {
25782578
s.NoError(env.GetWorkflowError())
25792579
var result string
25802580
s.NoError(env.GetWorkflowResult(&result))
2581-
s.Equal("retry-done", result)
2581+
s.Equal("retry-stopped", result)
25822582
}
25832583

25842584
func (s *WorkflowTestSuiteUnitTest) Test_SignalChildWorkflowRetry() {

0 commit comments

Comments
 (0)