@@ -228,11 +228,52 @@ pub trait Alloc {
228
228
}
229
229
230
230
pub mod test_utils {
231
+ use std:: time:: Duration ;
232
+
233
+ use hyperactor:: Actor ;
234
+ use hyperactor:: Handler ;
235
+ use hyperactor:: Instance ;
236
+ use hyperactor:: Named ;
231
237
use tokio:: sync:: broadcast:: Receiver ;
232
238
use tokio:: sync:: broadcast:: Sender ;
233
239
234
240
use super :: * ;
235
241
242
+ // This can't be defined under a `#[cfg(test)]` because there needs to
243
+ // be an entry in the spawnable actor registry in the executable
244
+ // 'hyperactor_mesh_test_bootstrap' for the `tests::process` actor
245
+ // mesh test suite.
246
+ #[ derive( Debug ) ]
247
+ #[ hyperactor:: export(
248
+ spawn = true ,
249
+ handlers = [
250
+ Wait
251
+ ] ,
252
+ ) ]
253
+ pub struct TestActor ;
254
+
255
+ #[ async_trait]
256
+ impl Actor for TestActor {
257
+ type Params = ( ) ;
258
+
259
+ async fn new ( _params : Self :: Params ) -> Result < Self , anyhow:: Error > {
260
+ Ok ( Self )
261
+ }
262
+ }
263
+
264
+ #[ derive( Debug , Serialize , Deserialize , Named , Clone ) ]
265
+ pub struct Wait ( ) ;
266
+
267
+ #[ async_trait]
268
+ impl Handler < Wait > for TestActor {
269
+ async fn handle ( & mut self , _: & Instance < Self > , Wait ( ) : Wait ) -> Result < ( ) , anyhow:: Error > {
270
+ loop {
271
+ #[ allow( clippy:: disallowed_methods) ]
272
+ tokio:: time:: sleep ( Duration :: from_secs ( 60 ) ) . await ;
273
+ }
274
+ }
275
+ }
276
+
236
277
/// Test wrapper around MockAlloc to allow us to block next() calls since
237
278
/// mockall doesn't support returning futures.
238
279
pub struct MockAllocWrapper {
@@ -302,12 +343,29 @@ pub mod test_utils {
302
343
303
344
#[ cfg( test) ]
304
345
pub ( crate ) mod testing {
346
+ use core:: panic;
305
347
use std:: collections:: HashMap ;
306
348
use std:: collections:: HashSet ;
307
-
349
+ use std:: time:: Duration ;
350
+
351
+ use hyperactor:: Mailbox ;
352
+ use hyperactor:: actor:: remote:: Remote ;
353
+ use hyperactor:: channel;
354
+ use hyperactor:: mailbox;
355
+ use hyperactor:: mailbox:: BoxedMailboxSender ;
356
+ use hyperactor:: mailbox:: DialMailboxRouter ;
357
+ use hyperactor:: mailbox:: IntoBoxedMailboxSender ;
358
+ use hyperactor:: mailbox:: MailboxServer ;
359
+ use hyperactor:: mailbox:: UndeliverableMailboxSender ;
360
+ use hyperactor:: proc:: Proc ;
361
+ use hyperactor:: reference:: Reference ;
308
362
use ndslice:: shape;
363
+ use tokio:: process:: Command ;
309
364
310
365
use super :: * ;
366
+ use crate :: alloc:: test_utils:: TestActor ;
367
+ use crate :: alloc:: test_utils:: Wait ;
368
+ use crate :: proc_mesh:: mesh_agent:: MeshAgentMessageClient ;
311
369
312
370
#[ macro_export]
313
371
macro_rules! alloc_test_suite {
@@ -367,4 +425,145 @@ pub(crate) mod testing {
367
425
assert ! ( alloc. next( ) . await . is_none( ) ) ;
368
426
assert_eq ! ( stopped, running) ;
369
427
}
428
+
429
+ async fn spawn_proc ( alloc : & ProcessAlloc ) -> ( DialMailboxRouter , Mailbox , Proc , ChannelAddr ) {
430
+ let ( router_channel_addr, router_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
431
+ . await
432
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
433
+ . unwrap ( ) ;
434
+ let router =
435
+ DialMailboxRouter :: new_with_default ( ( UndeliverableMailboxSender { } ) . into_boxed ( ) ) ;
436
+ router
437
+ . clone ( )
438
+ . serve ( router_rx, mailbox:: monitored_return_handle ( ) ) ;
439
+
440
+ let client_proc_id = ProcId ( WorldId ( format ! ( "test_{}" , alloc. world_id( ) . name( ) ) ) , 0 ) ;
441
+ let ( client_proc_addr, client_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
442
+ . await
443
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
444
+ . unwrap ( ) ;
445
+ let client_proc = Proc :: new (
446
+ client_proc_id. clone ( ) ,
447
+ BoxedMailboxSender :: new ( router. clone ( ) ) ,
448
+ ) ;
449
+ client_proc
450
+ . clone ( )
451
+ . serve ( client_rx, mailbox:: monitored_return_handle ( ) ) ;
452
+ router. bind ( client_proc_id. clone ( ) . into ( ) , client_proc_addr) ;
453
+ (
454
+ router,
455
+ client_proc. attach ( "test_proc" ) . unwrap ( ) ,
456
+ client_proc,
457
+ router_channel_addr,
458
+ )
459
+ }
460
+
461
+ async fn spawn_test_actor (
462
+ rank : usize ,
463
+ client_proc : & Proc ,
464
+ client : & Mailbox ,
465
+ router_channel_addr : ChannelAddr ,
466
+ mesh_agent : ActorRef < MeshAgent > ,
467
+ ) -> ActorRef < TestActor > {
468
+ let supervisor = client_proc. attach ( "supervisor" ) . unwrap ( ) ;
469
+ let ( supervison_port, _) = supervisor. open_port ( ) ;
470
+ let ( config_handle, _) = client. open_port ( ) ;
471
+ mesh_agent
472
+ . configure (
473
+ client,
474
+ rank,
475
+ router_channel_addr,
476
+ supervison_port. bind ( ) ,
477
+ HashMap :: new ( ) ,
478
+ config_handle. bind ( ) ,
479
+ )
480
+ . await
481
+ . unwrap ( ) ;
482
+ let remote = Remote :: collect ( ) ;
483
+ let actor_type = remote
484
+ . name_of :: < TestActor > ( )
485
+ . ok_or ( anyhow:: anyhow!( "actor not registered" ) )
486
+ . unwrap ( )
487
+ . to_string ( ) ;
488
+ let params = & ( ) ;
489
+ let ( completed_handle, mut completed_receiver) = mailbox:: open_port ( client) ;
490
+ // gspawn actor
491
+ mesh_agent
492
+ . gspawn (
493
+ client,
494
+ actor_type,
495
+ "Stuck" . to_string ( ) ,
496
+ bincode:: serialize ( params) . unwrap ( ) ,
497
+ completed_handle. bind ( ) ,
498
+ )
499
+ . await
500
+ . unwrap ( ) ;
501
+ let ( _, actor_id) = completed_receiver. recv ( ) . await . unwrap ( ) ;
502
+ ActorRef :: < TestActor > :: attest ( actor_id)
503
+ }
504
+
505
+ #[ timed_test:: async_timed_test( timeout_secs = 120 ) ]
506
+ async fn test_allocator_stuck_task ( ) {
507
+ // Override config.
508
+ // Use temporary config for this test
509
+ let config = hyperactor:: config:: global:: lock ( ) ;
510
+ let _guard = config. override_key (
511
+ hyperactor:: config:: PROCESS_EXIT_TIMEOUT ,
512
+ Duration :: from_secs ( 1 ) ,
513
+ ) ;
514
+
515
+ let mut command =
516
+ Command :: new ( buck_resources:: get ( "monarch/hyperactor_mesh/bootstrap" ) . unwrap ( ) ) ;
517
+ command. arg ( "300000" ) ;
518
+
519
+ let mut allocator = ProcessAllocator :: new ( command) ;
520
+ let mut alloc = allocator
521
+ . allocate ( AllocSpec {
522
+ shape : shape ! { replica = 1 } ,
523
+ constraints : Default :: default ( ) ,
524
+ } )
525
+ . await
526
+ . unwrap ( ) ;
527
+
528
+ // Get everything up into running state. We require that we get
529
+ let mut procs = HashMap :: new ( ) ;
530
+ let mut running = HashSet :: new ( ) ;
531
+ let mut actor_ref = None ;
532
+ let ( router, client, client_proc, router_addr) = spawn_proc ( & alloc) . await ;
533
+ while running. is_empty ( ) {
534
+ match alloc. next ( ) . await . unwrap ( ) {
535
+ ProcState :: Created { proc_id, coords } => {
536
+ procs. insert ( proc_id, coords) ;
537
+ }
538
+ ProcState :: Running {
539
+ proc_id,
540
+ mesh_agent,
541
+ addr,
542
+ } => {
543
+ router. bind ( Reference :: Proc ( proc_id. clone ( ) ) , addr. clone ( ) ) ;
544
+
545
+ assert ! ( procs. contains_key( & proc_id) ) ;
546
+ assert ! ( !running. contains( & proc_id) ) ;
547
+
548
+ actor_ref = Some (
549
+ spawn_test_actor ( 0 , & client_proc, & client, router_addr, mesh_agent) . await ,
550
+ ) ;
551
+ running. insert ( proc_id) ;
552
+ break ;
553
+ }
554
+ event => panic ! ( "unexpected event: {:?}" , event) ,
555
+ }
556
+ }
557
+ assert ! ( actor_ref. unwrap( ) . send( & client, Wait ( ) ) . is_ok( ) ) ;
558
+
559
+ // There is a stuck actor! We should get a watchdog failure.
560
+ alloc. stop ( ) . await . unwrap ( ) ;
561
+ let mut stopped = HashSet :: new ( ) ;
562
+ while let Some ( ProcState :: Stopped { proc_id, reason } ) = alloc. next ( ) . await {
563
+ assert_eq ! ( reason, ProcStopReason :: Watchdog ) ;
564
+ stopped. insert ( proc_id) ;
565
+ }
566
+ assert ! ( alloc. next( ) . await . is_none( ) ) ;
567
+ assert_eq ! ( stopped, running) ;
568
+ }
370
569
}
0 commit comments