@@ -228,11 +228,51 @@ pub trait Alloc {
228
228
}
229
229
230
230
pub mod test_utils {
231
+ use std:: time:: Duration ;
232
+
233
+ use hyperactor:: Actor ;
234
+ use hyperactor:: Handler ;
235
+ use hyperactor:: Instance ;
236
+ use hyperactor:: Named ;
231
237
use tokio:: sync:: broadcast:: Receiver ;
232
238
use tokio:: sync:: broadcast:: Sender ;
233
239
234
240
use super :: * ;
235
241
242
+ // This can't be defined under a `#[cfg(test)]` because there needs to
243
+ // be an entry in the spawnable actor registry in the executable
244
+ // 'hyperactor_mesh_test_bootstrap' for the `tests::process` actor
245
+ // mesh test suite.
246
+ #[ derive( Debug ) ]
247
+ #[ hyperactor:: export(
248
+ spawn = true ,
249
+ handlers = [
250
+ Wait
251
+ ] ,
252
+ ) ]
253
+ pub struct TestActor ;
254
+
255
+ #[ async_trait]
256
+ impl Actor for TestActor {
257
+ type Params = ( ) ;
258
+
259
+ async fn new ( _params : Self :: Params ) -> Result < Self , anyhow:: Error > {
260
+ Ok ( Self )
261
+ }
262
+ }
263
+
264
+ #[ derive( Debug , Serialize , Deserialize , Named , Clone ) ]
265
+ pub struct Wait ( ) ;
266
+
267
+ #[ async_trait]
268
+ impl Handler < Wait > for TestActor {
269
+ async fn handle ( & mut self , _: & Instance < Self > , Wait ( ) : Wait ) -> Result < ( ) , anyhow:: Error > {
270
+ #[ allow( clippy:: disallowed_methods) ]
271
+ tokio:: time:: sleep ( Duration :: from_secs ( 60 ) ) . await ;
272
+ Ok ( ( ) )
273
+ }
274
+ }
275
+
236
276
/// Test wrapper around MockAlloc to allow us to block next() calls since
237
277
/// mockall doesn't support returning futures.
238
278
pub struct MockAllocWrapper {
@@ -302,12 +342,29 @@ pub mod test_utils {
302
342
303
343
#[ cfg( test) ]
304
344
pub ( crate ) mod testing {
345
+ use core:: panic;
305
346
use std:: collections:: HashMap ;
306
347
use std:: collections:: HashSet ;
307
-
348
+ use std:: time:: Duration ;
349
+
350
+ use hyperactor:: Mailbox ;
351
+ use hyperactor:: actor:: remote:: Remote ;
352
+ use hyperactor:: channel;
353
+ use hyperactor:: mailbox;
354
+ use hyperactor:: mailbox:: BoxedMailboxSender ;
355
+ use hyperactor:: mailbox:: DialMailboxRouter ;
356
+ use hyperactor:: mailbox:: IntoBoxedMailboxSender ;
357
+ use hyperactor:: mailbox:: MailboxServer ;
358
+ use hyperactor:: mailbox:: UndeliverableMailboxSender ;
359
+ use hyperactor:: proc:: Proc ;
360
+ use hyperactor:: reference:: Reference ;
308
361
use ndslice:: shape;
362
+ use tokio:: process:: Command ;
309
363
310
364
use super :: * ;
365
+ use crate :: alloc:: test_utils:: TestActor ;
366
+ use crate :: alloc:: test_utils:: Wait ;
367
+ use crate :: proc_mesh:: mesh_agent:: MeshAgentMessageClient ;
311
368
312
369
#[ macro_export]
313
370
macro_rules! alloc_test_suite {
@@ -367,4 +424,143 @@ pub(crate) mod testing {
367
424
assert ! ( alloc. next( ) . await . is_none( ) ) ;
368
425
assert_eq ! ( stopped, running) ;
369
426
}
427
+
428
+ async fn spawn_proc ( alloc : & ProcessAlloc ) -> ( DialMailboxRouter , Mailbox , Proc , ChannelAddr ) {
429
+ let ( router_channel_addr, router_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
430
+ . await
431
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
432
+ . unwrap ( ) ;
433
+ let router =
434
+ DialMailboxRouter :: new_with_default ( ( UndeliverableMailboxSender { } ) . into_boxed ( ) ) ;
435
+ router
436
+ . clone ( )
437
+ . serve ( router_rx, mailbox:: monitored_return_handle ( ) ) ;
438
+
439
+ let client_proc_id = ProcId ( WorldId ( format ! ( "test_{}" , alloc. world_id( ) . name( ) ) ) , 0 ) ;
440
+ let ( client_proc_addr, client_rx) = channel:: serve ( ChannelAddr :: any ( alloc. transport ( ) ) )
441
+ . await
442
+ . map_err ( |err| AllocatorError :: Other ( err. into ( ) ) )
443
+ . unwrap ( ) ;
444
+ let client_proc = Proc :: new (
445
+ client_proc_id. clone ( ) ,
446
+ BoxedMailboxSender :: new ( router. clone ( ) ) ,
447
+ ) ;
448
+ client_proc
449
+ . clone ( )
450
+ . serve ( client_rx, mailbox:: monitored_return_handle ( ) ) ;
451
+ router. bind ( client_proc_id. clone ( ) . into ( ) , client_proc_addr) ;
452
+ (
453
+ router,
454
+ client_proc. attach ( "test_proc" ) . unwrap ( ) ,
455
+ client_proc,
456
+ router_channel_addr,
457
+ )
458
+ }
459
+
460
+ async fn spawn_test_actor (
461
+ rank : usize ,
462
+ client_proc : & Proc ,
463
+ client : & Mailbox ,
464
+ router_channel_addr : ChannelAddr ,
465
+ mesh_agent : ActorRef < MeshAgent > ,
466
+ ) -> ActorRef < TestActor > {
467
+ let supervisor = client_proc. attach ( "supervisor" ) . unwrap ( ) ;
468
+ let ( supervison_port, _) = supervisor. open_port ( ) ;
469
+ let ( config_handle, _) = client. open_port ( ) ;
470
+ mesh_agent
471
+ . configure (
472
+ client,
473
+ rank,
474
+ router_channel_addr,
475
+ supervison_port. bind ( ) ,
476
+ HashMap :: new ( ) ,
477
+ config_handle. bind ( ) ,
478
+ )
479
+ . await
480
+ . unwrap ( ) ;
481
+ let remote = Remote :: collect ( ) ;
482
+ let actor_type = remote
483
+ . name_of :: < TestActor > ( )
484
+ . ok_or ( anyhow:: anyhow!( "actor not registered" ) )
485
+ . unwrap ( )
486
+ . to_string ( ) ;
487
+ let params = & ( ) ;
488
+ let ( completed_handle, mut completed_receiver) = mailbox:: open_port ( client) ;
489
+ // gspawn actor
490
+ mesh_agent
491
+ . gspawn (
492
+ client,
493
+ actor_type,
494
+ "Stuck" . to_string ( ) ,
495
+ bincode:: serialize ( params) . unwrap ( ) ,
496
+ completed_handle. bind ( ) ,
497
+ )
498
+ . await
499
+ . unwrap ( ) ;
500
+ let ( _, actor_id) = completed_receiver. recv ( ) . await . unwrap ( ) ;
501
+ ActorRef :: < TestActor > :: attest ( actor_id)
502
+ }
503
+
504
+ #[ tokio:: test]
505
+ async fn test_allocator_stuck_task ( ) {
506
+ let mut allocator = ProcessAllocator :: new ( Command :: new (
507
+ buck_resources:: get ( "monarch/hyperactor_mesh/bootstrap" ) . unwrap ( ) ,
508
+ ) ) ;
509
+ // Override config.
510
+ // Use temporary config for this test
511
+ let config = hyperactor:: config:: global:: lock ( ) ;
512
+ let _guard = config. override_key (
513
+ hyperactor:: config:: PROCESS_EXIT_TIMEOUT ,
514
+ Duration :: from_secs ( 1 ) ,
515
+ ) ;
516
+
517
+ let mut alloc = allocator
518
+ . allocate ( AllocSpec {
519
+ shape : shape ! { replica = 1 } ,
520
+ constraints : Default :: default ( ) ,
521
+ } )
522
+ . await
523
+ . unwrap ( ) ;
524
+
525
+ // Get everything up into running state. We require that we get
526
+ let mut procs = HashMap :: new ( ) ;
527
+ let mut running = HashSet :: new ( ) ;
528
+ let mut actor_ref = None ;
529
+ let ( router, client, client_proc, router_addr) = spawn_proc ( & alloc) . await ;
530
+ while running. is_empty ( ) {
531
+ match alloc. next ( ) . await . unwrap ( ) {
532
+ ProcState :: Created { proc_id, coords } => {
533
+ procs. insert ( proc_id, coords) ;
534
+ }
535
+ ProcState :: Running {
536
+ proc_id,
537
+ mesh_agent,
538
+ addr,
539
+ } => {
540
+ router. bind ( Reference :: Proc ( proc_id. clone ( ) ) , addr. clone ( ) ) ;
541
+
542
+ assert ! ( procs. contains_key( & proc_id) ) ;
543
+ assert ! ( !running. contains( & proc_id) ) ;
544
+
545
+ actor_ref = Some (
546
+ spawn_test_actor ( 0 , & client_proc, & client, router_addr, mesh_agent) . await ,
547
+ ) ;
548
+ running. insert ( proc_id) ;
549
+ break ;
550
+ }
551
+ event => panic ! ( "unexpected event: {:?}" , event) ,
552
+ }
553
+ }
554
+ assert ! ( actor_ref. unwrap( ) . send( & client, Wait ( ) ) . is_ok( ) ) ;
555
+
556
+ // There is a stuck actor! We should get a watchdog failure.
557
+ alloc. stop ( ) . await . unwrap ( ) ;
558
+ let mut stopped = HashSet :: new ( ) ;
559
+ while let Some ( ProcState :: Stopped { proc_id, reason } ) = alloc. next ( ) . await {
560
+ assert_eq ! ( reason, ProcStopReason :: Watchdog ) ;
561
+ stopped. insert ( proc_id) ;
562
+ }
563
+ assert ! ( alloc. next( ) . await . is_none( ) ) ;
564
+ assert_eq ! ( stopped, running) ;
565
+ }
370
566
}
0 commit comments