@@ -235,11 +235,51 @@ pub trait Alloc {
235
235
}
236
236
237
237
pub mod test_utils {
238
+ use std:: time:: Duration ;
239
+
240
+ use hyperactor:: Actor ;
241
+ use hyperactor:: Handler ;
242
+ use hyperactor:: Instance ;
243
+ use hyperactor:: Named ;
238
244
use tokio:: sync:: broadcast:: Receiver ;
239
245
use tokio:: sync:: broadcast:: Sender ;
240
246
241
247
use super :: * ;
242
248
249
+ // This can't be defined under a `#[cfg(test)]` because there needs to
250
+ // be an entry in the spawnable actor registry in the executable
251
+ // 'hyperactor_mesh_test_bootstrap' for the `tests::process` actor
252
+ // mesh test suite.
253
+ #[ derive( Debug ) ]
254
+ #[ hyperactor:: export(
255
+ spawn = true ,
256
+ handlers = [
257
+ Wait
258
+ ] ,
259
+ ) ]
260
+ pub struct TestActor ;
261
+
262
+ #[ async_trait]
263
+ impl Actor for TestActor {
264
+ type Params = ( ) ;
265
+
266
+ async fn new ( _params : Self :: Params ) -> Result < Self , anyhow:: Error > {
267
+ Ok ( Self )
268
+ }
269
+ }
270
+
271
+ #[ derive( Debug , Serialize , Deserialize , Named , Clone ) ]
272
+ pub struct Wait ( ) ;
273
+
274
+ #[ async_trait]
275
+ impl Handler < Wait > for TestActor {
276
+ async fn handle ( & mut self , _: & Instance < Self > , Wait ( ) : Wait ) -> Result < ( ) , anyhow:: Error > {
277
+ loop {
278
+ std:: thread:: sleep ( Duration :: from_secs ( 60 ) ) ;
279
+ }
280
+ }
281
+ }
282
+
243
283
/// Test wrapper around MockAlloc to allow us to block next() calls since
244
284
/// mockall doesn't support returning futures.
245
285
pub struct MockAllocWrapper {
@@ -309,12 +349,29 @@ pub mod test_utils {
309
349
310
350
#[ cfg( test) ]
311
351
pub ( crate ) mod testing {
352
+ use core:: panic;
312
353
use std:: collections:: HashMap ;
313
354
use std:: collections:: HashSet ;
314
-
355
+ use std:: time:: Duration ;
356
+
357
+ use hyperactor:: Mailbox ;
358
+ use hyperactor:: actor:: remote:: Remote ;
359
+ use hyperactor:: channel;
360
+ use hyperactor:: mailbox;
361
+ use hyperactor:: mailbox:: BoxedMailboxSender ;
362
+ use hyperactor:: mailbox:: DialMailboxRouter ;
363
+ use hyperactor:: mailbox:: IntoBoxedMailboxSender ;
364
+ use hyperactor:: mailbox:: MailboxServer ;
365
+ use hyperactor:: mailbox:: UndeliverableMailboxSender ;
366
+ use hyperactor:: proc:: Proc ;
367
+ use hyperactor:: reference:: Reference ;
315
368
use ndslice:: shape;
369
+ use tokio:: process:: Command ;
316
370
317
371
use super :: * ;
372
+ use crate :: alloc:: test_utils:: TestActor ;
373
+ use crate :: alloc:: test_utils:: Wait ;
374
+ use crate :: proc_mesh:: mesh_agent:: MeshAgentMessageClient ;
318
375
319
376
#[ macro_export]
320
377
macro_rules! alloc_test_suite {
@@ -376,4 +433,150 @@ pub(crate) mod testing {
376
433
assert ! ( alloc. next( ) . await . is_none( ) ) ;
377
434
assert_eq ! ( stopped, running) ;
378
435
}
436
+
437
+ async fn spawn_proc ( alloc : & ProcessAlloc ) -> ( DialMailboxRouter , Mailbox , Proc , ChannelAddr ) {
438
+ let ( router_channel_addr, router_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
439
+ . await
440
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
441
+ . unwrap ( ) ;
442
+ let router =
443
+ DialMailboxRouter :: new_with_default ( ( UndeliverableMailboxSender { } ) . into_boxed ( ) ) ;
444
+ router
445
+ . clone ( )
446
+ . serve ( router_rx, mailbox:: monitored_return_handle ( ) ) ;
447
+
448
+ let client_proc_id = ProcId ( WorldId ( format ! ( "test_{}" , alloc. world_id( ) . name( ) ) ) , 0 ) ;
449
+ let ( client_proc_addr, client_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
450
+ . await
451
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
452
+ . unwrap ( ) ;
453
+ let client_proc = Proc :: new (
454
+ client_proc_id. clone ( ) ,
455
+ BoxedMailboxSender :: new ( router. clone ( ) ) ,
456
+ ) ;
457
+ client_proc
458
+ . clone ( )
459
+ . serve ( client_rx, mailbox:: monitored_return_handle ( ) ) ;
460
+ router. bind ( client_proc_id. clone ( ) . into ( ) , client_proc_addr) ;
461
+ (
462
+ router,
463
+ client_proc. attach ( "test_proc" ) . unwrap ( ) ,
464
+ client_proc,
465
+ router_channel_addr,
466
+ )
467
+ }
468
+
469
+ async fn spawn_test_actor (
470
+ rank : usize ,
471
+ client_proc : & Proc ,
472
+ client : & Mailbox ,
473
+ router_channel_addr : ChannelAddr ,
474
+ mesh_agent : ActorRef < MeshAgent > ,
475
+ ) -> ActorRef < TestActor > {
476
+ let supervisor = client_proc. attach ( "supervisor" ) . unwrap ( ) ;
477
+ let ( supervison_port, _) = supervisor. open_port ( ) ;
478
+ let ( config_handle, _) = client. open_port ( ) ;
479
+ mesh_agent
480
+ . configure (
481
+ client,
482
+ rank,
483
+ router_channel_addr,
484
+ supervison_port. bind ( ) ,
485
+ HashMap :: new ( ) ,
486
+ config_handle. bind ( ) ,
487
+ )
488
+ . await
489
+ . unwrap ( ) ;
490
+ let remote = Remote :: collect ( ) ;
491
+ let actor_type = remote
492
+ . name_of :: < TestActor > ( )
493
+ . ok_or ( anyhow:: anyhow!( "actor not registered" ) )
494
+ . unwrap ( )
495
+ . to_string ( ) ;
496
+ let params = & ( ) ;
497
+ let ( completed_handle, mut completed_receiver) = mailbox:: open_port ( client) ;
498
+ // gspawn actor
499
+ mesh_agent
500
+ . gspawn (
501
+ client,
502
+ actor_type,
503
+ "Stuck" . to_string ( ) ,
504
+ bincode:: serialize ( params) . unwrap ( ) ,
505
+ completed_handle. bind ( ) ,
506
+ )
507
+ . await
508
+ . unwrap ( ) ;
509
+ let ( _, actor_id) = completed_receiver. recv ( ) . await . unwrap ( ) ;
510
+ ActorRef :: < TestActor > :: attest ( actor_id)
511
+ }
512
+
513
+ /// In order to simulate stuckness, we have to do two things:
514
+ /// An actor that is blocked forever AND
515
+ /// a proc that does not time out when it is asked to wait for
516
+ /// a stuck actor.
517
+ #[ tokio:: test]
518
+ async fn test_allocator_stuck_task ( ) {
519
+ // Override config.
520
+ // Use temporary config for this test
521
+ let config = hyperactor:: config:: global:: lock ( ) ;
522
+ let _guard = config. override_key (
523
+ hyperactor:: config:: PROCESS_EXIT_TIMEOUT ,
524
+ Duration :: from_secs ( 1 ) ,
525
+ ) ;
526
+
527
+ let mut command =
528
+ Command :: new ( buck_resources:: get ( "monarch/hyperactor_mesh/bootstrap" ) . unwrap ( ) ) ;
529
+ command. env ( "ACTOR_EXIT_TIMEOUT_MS" , "300000" ) ;
530
+ let mut allocator = ProcessAllocator :: new ( command) ;
531
+ let mut alloc = allocator
532
+ . allocate ( AllocSpec {
533
+ shape : shape ! { replica = 1 } ,
534
+ constraints : Default :: default ( ) ,
535
+ } )
536
+ . await
537
+ . unwrap ( ) ;
538
+
539
+ // Get everything up into running state. We require that we get
540
+ let mut procs = HashMap :: new ( ) ;
541
+ let mut running = HashSet :: new ( ) ;
542
+ let mut actor_ref = None ;
543
+ let ( router, client, client_proc, router_addr) = spawn_proc ( & alloc) . await ;
544
+ while running. is_empty ( ) {
545
+ match alloc. next ( ) . await . unwrap ( ) {
546
+ ProcState :: Created {
547
+ proc_id, coords, ..
548
+ } => {
549
+ procs. insert ( proc_id, coords) ;
550
+ }
551
+ ProcState :: Running {
552
+ proc_id,
553
+ mesh_agent,
554
+ addr,
555
+ } => {
556
+ router. bind ( Reference :: Proc ( proc_id. clone ( ) ) , addr. clone ( ) ) ;
557
+
558
+ assert ! ( procs. contains_key( & proc_id) ) ;
559
+ assert ! ( !running. contains( & proc_id) ) ;
560
+
561
+ actor_ref = Some (
562
+ spawn_test_actor ( 0 , & client_proc, & client, router_addr, mesh_agent) . await ,
563
+ ) ;
564
+ running. insert ( proc_id) ;
565
+ break ;
566
+ }
567
+ event => panic ! ( "unexpected event: {:?}" , event) ,
568
+ }
569
+ }
570
+ assert ! ( actor_ref. unwrap( ) . send( & client, Wait ( ) ) . is_ok( ) ) ;
571
+
572
+ // There is a stuck actor! We should get a watchdog failure.
573
+ alloc. stop ( ) . await . unwrap ( ) ;
574
+ let mut stopped = HashSet :: new ( ) ;
575
+ while let Some ( ProcState :: Stopped { proc_id, reason } ) = alloc. next ( ) . await {
576
+ assert_eq ! ( reason, ProcStopReason :: Watchdog ) ;
577
+ stopped. insert ( proc_id) ;
578
+ }
579
+ assert ! ( alloc. next( ) . await . is_none( ) ) ;
580
+ assert_eq ! ( stopped, running) ;
581
+ }
379
582
}
0 commit comments