Skip to content

Commit b0e9358

Browse files
sjuddConvex, Inc.
authored and
Convex, Inc.
committed
Rollforward add a rate limiter for node actions (#23906)
Rollforward: add a rate limiter for node actions. Now with the correct nomad env variables: adds the missing max_concurrent_node_actions var to backend.nomad. This reverts commit 3c9da1c7caa47fb3476a42b31dc004a6d7e71378. GitOrigin-RevId: 2872d0b1a8b90db1cd63b537438872a03eebb4e9
1 parent d082908 commit b0e9358

File tree

4 files changed

+175
-47
lines changed

4 files changed

+175
-47
lines changed

crates/application/src/application_function_runner/metrics.rs

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
use common::types::UdfType;
1+
use common::types::{
2+
ModuleEnvironment,
3+
UdfType,
4+
};
25
use metrics::{
36
log_counter,
47
log_counter_with_tags,
@@ -9,6 +12,7 @@ use metrics::{
912
register_convex_counter,
1013
register_convex_gauge,
1114
register_convex_histogram,
15+
MetricTag,
1216
StatusTimer,
1317
STATUS_LABEL,
1418
};
@@ -71,42 +75,63 @@ register_convex_gauge!(
7175
APPLICATION_FUNCTION_RUNNER_OUTSTANDING_TOTAL,
7276
"The number of currently outstanding functions of a given type. Includes both running and \
7377
waiting functions",
74-
&["udf_type", "state"]
78+
&["udf_type", "state", "env_type"]
7579
);
76-
pub fn log_outstanding_functions(total: usize, udf_type: UdfType, state: OutstandingFunctionState) {
80+
pub fn log_outstanding_functions(
81+
total: usize,
82+
env: ModuleEnvironment,
83+
udf_type: UdfType,
84+
state: OutstandingFunctionState,
85+
) {
7786
let state_tag = metric_tag_const(match state {
7887
OutstandingFunctionState::Running => "state:running",
7988
OutstandingFunctionState::Waiting => "state:waiting",
8089
});
8190
log_gauge_with_tags(
8291
&APPLICATION_FUNCTION_RUNNER_OUTSTANDING_TOTAL,
8392
total as f64,
84-
vec![udf_type.metric_tag(), state_tag],
93+
vec![udf_type.metric_tag(), state_tag, env.metric_tag()],
8594
)
8695
}
8796

8897
register_convex_histogram!(
8998
APPLICATION_FUNCTION_RUNNER_TOTAL_SECONDS,
9099
"The total time it took to execute a function. This includes wait time and run time. The \
91100
metric is also logged for isolate client code path so we can compare apples to apples.",
92-
&[STATUS_LABEL[0], "udf_type"]
101+
&[STATUS_LABEL[0], "udf_type", "env_type"]
93102
);
94-
pub fn function_total_timer(udf_type: UdfType) -> StatusTimer {
103+
pub fn function_total_timer(env: ModuleEnvironment, udf_type: UdfType) -> StatusTimer {
95104
let mut timer = StatusTimer::new(&APPLICATION_FUNCTION_RUNNER_TOTAL_SECONDS);
96105
timer.add_tag(udf_type.metric_tag());
106+
timer.add_tag(env.metric_tag());
97107
timer
98108
}
99109

110+
trait ModuleEnvironmentExt {
111+
fn metric_tag(&self) -> MetricTag;
112+
}
113+
114+
impl ModuleEnvironmentExt for ModuleEnvironment {
115+
fn metric_tag(&self) -> MetricTag {
116+
let value = match self {
117+
ModuleEnvironment::Isolate => "env_type:isolate",
118+
ModuleEnvironment::Node => "env_type:node",
119+
ModuleEnvironment::Invalid => "env_type:invalid",
120+
};
121+
metric_tag_const(value)
122+
}
123+
}
124+
100125
register_convex_counter!(
101126
APPLICATION_FUNCTION_RUNNER_WAIT_TIMEOUT_TOTAL,
102127
"Total number with running a function has timed out due to instance concurrency limits.",
103-
&["udf_type"],
128+
&["udf_type", "env_type"],
104129
);
105-
pub fn log_function_wait_timeout(udf_type: UdfType) {
130+
pub fn log_function_wait_timeout(env: ModuleEnvironment, udf_type: UdfType) {
106131
log_counter_with_tags(
107132
&APPLICATION_FUNCTION_RUNNER_WAIT_TIMEOUT_TOTAL,
108133
1,
109-
vec![udf_type.metric_tag()],
134+
vec![udf_type.metric_tag(), env.metric_tag()],
110135
);
111136
}
112137

crates/application/src/application_function_runner/mod.rs

Lines changed: 64 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@ use common::{
1919
identity::InertIdentity,
2020
knobs::{
2121
APPLICATION_FUNCTION_RUNNER_SEMAPHORE_TIMEOUT,
22-
APPLICATION_MAX_CONCURRENT_ACTIONS,
2322
APPLICATION_MAX_CONCURRENT_MUTATIONS,
23+
APPLICATION_MAX_CONCURRENT_NODE_ACTIONS,
2424
APPLICATION_MAX_CONCURRENT_QUERIES,
25+
APPLICATION_MAX_CONCURRENT_V8_ACTIONS,
2526
ISOLATE_MAX_USER_HEAP_SIZE,
2627
UDF_EXECUTOR_OCC_INITIAL_BACKOFF,
2728
UDF_EXECUTOR_OCC_MAX_BACKOFF,
@@ -225,16 +226,19 @@ impl<RT: Runtime> FunctionRouter<RT> {
225226
database,
226227
system_env_vars,
227228
query_limiter: Arc::new(Limiter::new(
229+
ModuleEnvironment::Isolate,
228230
UdfType::Query,
229231
*APPLICATION_MAX_CONCURRENT_QUERIES,
230232
)),
231233
mutation_limiter: Arc::new(Limiter::new(
234+
ModuleEnvironment::Isolate,
232235
UdfType::Mutation,
233236
*APPLICATION_MAX_CONCURRENT_MUTATIONS,
234237
)),
235238
action_limiter: Arc::new(Limiter::new(
239+
ModuleEnvironment::Isolate,
236240
UdfType::Action,
237-
*APPLICATION_MAX_CONCURRENT_ACTIONS,
241+
*APPLICATION_MAX_CONCURRENT_V8_ACTIONS,
238242
)),
239243
}
240244
}
@@ -255,7 +259,8 @@ impl<RT: Runtime> FunctionRouter<RT> {
255259
context: ExecutionContext,
256260
) -> anyhow::Result<(Transaction<RT>, FunctionOutcome)> {
257261
anyhow::ensure!(udf_type == UdfType::Query || udf_type == UdfType::Mutation);
258-
let timer = function_total_timer(udf_type);
262+
// All queries and mutations are run in the isolate environment.
263+
let timer = function_total_timer(ModuleEnvironment::Isolate, udf_type);
259264
let (tx, outcome) = self
260265
.function_runner_execute(tx, path_and_args, udf_type, journal, context, None)
261266
.await?;
@@ -271,7 +276,6 @@ impl<RT: Runtime> FunctionRouter<RT> {
271276
log_line_sender: mpsc::UnboundedSender<LogLine>,
272277
context: ExecutionContext,
273278
) -> anyhow::Result<ActionOutcome> {
274-
let timer = function_total_timer(UdfType::Action);
275279
let (_, outcome) = self
276280
.function_runner_execute(
277281
tx,
@@ -289,11 +293,10 @@ impl<RT: Runtime> FunctionRouter<RT> {
289293
outcome
290294
)
291295
};
292-
timer.finish();
293296
Ok(outcome)
294297
}
295298

296-
// Execute using the function runner. Can be used for all Udf types including
299+
// Execute using the function runner. Can be used for v8 udfs other than http
297300
// actions.
298301
async fn function_runner_execute(
299302
&self,
@@ -316,17 +319,9 @@ impl<RT: Runtime> FunctionRouter<RT> {
316319
UdfType::Action => &self.action_limiter,
317320
UdfType::HttpAction => anyhow::bail!("Function runner does not support http actions"),
318321
};
319-
let mut request_guard = limiter.start();
320-
select_biased! {
321-
_ = request_guard.acquire_permit().fuse() => {},
322-
_ = self.rt.wait(*APPLICATION_FUNCTION_RUNNER_SEMAPHORE_TIMEOUT) => {
323-
log_function_wait_timeout(udf_type);
324-
anyhow::bail!(ErrorMetadata::overloaded(
325-
"TooManyConcurrentRequests",
326-
"Too many concurrent requests, backoff and try again.",
327-
));
328-
},
329-
}
322+
323+
let request_guard = limiter.acquire_permit_with_timeout(&self.rt).await?;
324+
330325
let timer = function_run_timer(udf_type);
331326
let (function_tx, outcome, usage_stats) = self
332327
.function_runner
@@ -384,6 +379,7 @@ impl<RT: Runtime> FunctionRouter<RT> {
384379
// and log gauges for the number of waiting and currently running functions.
385380
struct Limiter {
386381
udf_type: UdfType,
382+
env: ModuleEnvironment,
387383

388384
// Used to limit running functions.
389385
semaphore: Semaphore,
@@ -394,9 +390,10 @@ struct Limiter {
394390
}
395391

396392
impl Limiter {
397-
fn new(udf_type: UdfType, total_permits: usize) -> Self {
393+
fn new(env: ModuleEnvironment, udf_type: UdfType, total_permits: usize) -> Self {
398394
let limiter = Self {
399395
udf_type,
396+
env,
400397
semaphore: Semaphore::new(total_permits),
401398
total_permits,
402399
total_outstanding: AtomicUsize::new(0),
@@ -406,6 +403,24 @@ impl Limiter {
406403
limiter
407404
}
408405

406+
async fn acquire_permit_with_timeout<'a, RT: Runtime>(
407+
&'a self,
408+
rt: &'a RT,
409+
) -> anyhow::Result<RequestGuard<'a>> {
410+
let mut request_guard = self.start();
411+
select_biased! {
412+
_ = request_guard.acquire_permit().fuse() => {},
413+
_ = rt.wait(*APPLICATION_FUNCTION_RUNNER_SEMAPHORE_TIMEOUT) => {
414+
log_function_wait_timeout(self.env, self.udf_type);
415+
anyhow::bail!(ErrorMetadata::overloaded(
416+
"TooManyConcurrentRequests",
417+
"Too many concurrent requests, backoff and try again.",
418+
));
419+
},
420+
}
421+
Ok(request_guard)
422+
}
423+
409424
fn start(&self) -> RequestGuard {
410425
self.total_outstanding.fetch_add(1, Ordering::SeqCst);
411426
// Update the gauge to account for the newly waiting request.
@@ -423,8 +438,18 @@ impl Limiter {
423438
.total_outstanding
424439
.load(Ordering::SeqCst)
425440
.saturating_sub(running);
426-
log_outstanding_functions(running, self.udf_type, OutstandingFunctionState::Running);
427-
log_outstanding_functions(waiting, self.udf_type, OutstandingFunctionState::Waiting);
441+
log_outstanding_functions(
442+
running,
443+
self.env,
444+
self.udf_type,
445+
OutstandingFunctionState::Running,
446+
);
447+
log_outstanding_functions(
448+
waiting,
449+
self.env,
450+
self.udf_type,
451+
OutstandingFunctionState::Waiting,
452+
);
428453
}
429454
}
430455

@@ -463,6 +488,11 @@ impl<'a> Drop for RequestGuard<'a> {
463488
}
464489
}
465490

491+
/// Executes UDFs for backends.
492+
///
493+
/// This struct directly executes http and node actions. Queries, Mutations and
494+
/// v8 Actions are instead routed through the FunctionRouter and its
495+
/// FunctionRunner implementation.
466496
pub struct ApplicationFunctionRunner<RT: Runtime> {
467497
runtime: RT,
468498
pub(crate) database: Database<RT>,
@@ -480,6 +510,7 @@ pub struct ApplicationFunctionRunner<RT: Runtime> {
480510

481511
cache_manager: CacheManager<RT>,
482512
system_env_vars: BTreeMap<EnvVarName, EnvVarValue>,
513+
node_action_limiter: Limiter,
483514
}
484515

485516
impl<RT: Runtime> HeapSize for ApplicationFunctionRunner<RT> {
@@ -529,6 +560,11 @@ impl<RT: Runtime> ApplicationFunctionRunner<RT> {
529560
function_log,
530561
cache_manager,
531562
system_env_vars,
563+
node_action_limiter: Limiter::new(
564+
ModuleEnvironment::Node,
565+
UdfType::Action,
566+
*APPLICATION_MAX_CONCURRENT_NODE_ACTIONS,
567+
),
532568
}
533569
}
534570

@@ -1072,6 +1108,8 @@ impl<RT: Runtime> ApplicationFunctionRunner<RT> {
10721108
.await?
10731109
.context("Missing a valid module_version")?;
10741110
let (log_line_sender, log_line_receiver) = mpsc::unbounded();
1111+
1112+
let timer = function_total_timer(module_version.environment, UdfType::Action);
10751113
match module_version.environment {
10761114
ModuleEnvironment::Isolate => {
10771115
// TODO: This is the only use case of clone. We should get rid of clone,
@@ -1098,6 +1136,7 @@ impl<RT: Runtime> ApplicationFunctionRunner<RT> {
10981136
let memory_in_mb: u64 = (*ISOLATE_MAX_USER_HEAP_SIZE / (1 << 20))
10991137
.try_into()
11001138
.unwrap();
1139+
timer.finish();
11011140
Ok(ActionCompletion {
11021141
outcome,
11031142
execution_time: start.elapsed(),
@@ -1110,6 +1149,10 @@ impl<RT: Runtime> ApplicationFunctionRunner<RT> {
11101149
})
11111150
},
11121151
ModuleEnvironment::Node => {
1152+
let _request_guard = self
1153+
.node_action_limiter
1154+
.acquire_permit_with_timeout(&self.runtime)
1155+
.await?;
11131156
let mut source_maps = BTreeMap::new();
11141157
if let Some(source_map) = module_version.source_map.clone() {
11151158
source_maps.insert(name.module().clone(), source_map);
@@ -1204,6 +1247,7 @@ impl<RT: Runtime> ApplicationFunctionRunner<RT> {
12041247
syscall_trace: node_outcome.syscall_trace,
12051248
udf_server_version,
12061249
};
1250+
timer.finish();
12071251
let memory_in_mb = node_outcome.memory_used_in_mb;
12081252
Ok(ActionCompletion {
12091253
outcome,

crates/application/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,11 @@ use common::{
4848
execution_context::ExecutionContext,
4949
http::fetch::FetchClient,
5050
knobs::{
51+
APPLICATION_MAX_CONCURRENT_HTTP_ACTIONS,
5152
BACKEND_ISOLATE_ACTIVE_THREADS_PERCENT,
5253
MAX_JOBS_CANCEL_BATCH,
5354
SNAPSHOT_LIST_LIMIT,
5455
UDF_ISOLATE_MAX_EXEC_THREADS,
55-
V8_ACTION_MAX_ISOLATE_EXEC_THREADS,
5656
},
5757
log_lines::run_function_and_collect_log_lines,
5858
log_streaming::LogSender,
@@ -510,7 +510,7 @@ impl<RT: Runtime> Application<RT> {
510510
let actions_isolate = IsolateClient::new(
511511
runtime.clone(),
512512
actions_isolate_worker,
513-
*V8_ACTION_MAX_ISOLATE_EXEC_THREADS,
513+
*APPLICATION_MAX_CONCURRENT_HTTP_ACTIONS,
514514
true,
515515
instance_name.clone(),
516516
instance_secret,

0 commit comments

Comments
 (0)