2020#include "../../../util/pod/fd_pod_format.h"
2121#include "../../../waltz/resolv/fd_io_readline.h"
2222#include "../../shared/commands/monitor/helper.h"
23+ #include "../../../disco/metrics/fd_metrics.h"
2324#include "../../../discof/repair/fd_repair_tile.c"
2425
2526#include "gossip.h"
@@ -131,6 +132,7 @@ repair_topo( config_t * config ) {
131132 fd_topob_wksp ( topo , "snapin_manif" );
132133
133134 fd_topob_wksp ( topo , "slot_fseqs" ); /* fseqs for marked slots eg. turbine slot */
135+ fd_topob_wksp ( topo , "genesi_out" ); /* mock genesi_out for ipecho */
134136
135137 #define FOR (cnt ) for( ulong i=0UL; i<cnt; i++ )
136138
@@ -160,6 +162,8 @@ repair_topo( config_t * config ) {
160162
161163 /**/ fd_topob_link ( topo , "snapin_manif" , "snapin_manif" , 2UL , sizeof (fd_snapshot_manifest_t ), 1UL );
162164
165+ /**/ fd_topob_link ( topo , "genesi_out" , "genesi_out" , 2UL , 128 , 1UL );
166+
163167 FOR (net_tile_cnt ) fd_topos_net_rx_link ( topo , "net_repair ", i , config -> net .ingress_buffer_size );
164168 FOR (net_tile_cnt ) fd_topos_net_rx_link ( topo , "net_quic ", i , config -> net .ingress_buffer_size );
165169 FOR (net_tile_cnt ) fd_topos_net_rx_link ( topo , "net_shred ", i, config->net.ingress_buffer_size );
@@ -229,7 +233,6 @@ repair_topo( config_t * config ) {
229233 FOR (shred_tile_cnt ) fd_topob_tile_out ( topo , "shred" , i , "shred_out" , i );
230234 FOR (shred_tile_cnt ) fd_topob_tile_out ( topo , "shred ", i, " shred_net ", i );
231235 FOR (shred_tile_cnt ) fd_topob_tile_in ( topo , "shred" , i , "metric_in" , "ipecho_out" , 0UL , FD_TOPOB_RELIABLE , FD_TOPOB_POLLED );
232-
233236 FOR (shred_tile_cnt ) fd_topob_tile_in ( topo , "shred ", i, " metric_in ", " repair_shred ", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED );
234237
235238 /**/ fd_topob_tile_out( topo, " repair ", 0UL, " repair_net ", 0UL );
@@ -260,6 +263,7 @@ repair_topo( config_t * config ) {
260263 FOR (sign_tile_cnt - 1 ) fd_topob_tile_in ( topo , "repair" , 0UL , "metric_in" , "sign_repair" , i , FD_TOPOB_UNRELIABLE , FD_TOPOB_POLLED );
261264
262265 /**/ fd_topob_tile_in ( topo , "gossip" , 0UL , "metric_in" , "sign_gossip" , 0UL , FD_TOPOB_UNRELIABLE , FD_TOPOB_UNPOLLED );
266+ /**/ fd_topob_tile_in ( topo , "ipecho" , 0UL , "metric_in" , "genesi_out" , 0UL , FD_TOPOB_RELIABLE , FD_TOPOB_POLLED );
263267
264268 if ( 1 ) {
265269 fd_topob_wksp ( topo , "scap" );
@@ -283,8 +287,8 @@ repair_topo( config_t * config ) {
283287 FD_TEST ( fd_link_permit_no_producers ( topo , "quic_net" ) == quic_tile_cnt );
284288 FD_TEST ( fd_link_permit_no_producers ( topo , "poh_shred" ) == 1UL );
285289 FD_TEST ( fd_link_permit_no_producers ( topo , "send_out" ) == 1UL );
286-
287- FD_TEST ( fd_link_permit_no_consumers ( topo , "net_quic" ) == quic_tile_cnt );
290+ FD_TEST ( fd_link_permit_no_producers ( topo , "genesi_out" ) == 1UL );
291+ FD_TEST ( fd_link_permit_no_consumers ( topo , "net_quic" ) == net_tile_cnt );
288292
289293 config -> tiles .send .send_src_port = 0 ; /* disable send */
290294
@@ -311,21 +315,24 @@ repair_cmd_args( int * pargc,
311315
312316 if ( FD_UNLIKELY ( !* pargc ) )
313317 FD_LOG_ERR (( "\n \
314- usage: (1) repair --manifest-path <manifest_path> [--iptable-path <iptable_path>] \n \
315- (2) repair --metrics [--iptable-path <iptable_path>] \n\n \
318+ usage: (1) repair --manifest-path <manifest_path> [--iptable-path <iptable_path>] [--sorted] \n \
319+ (2) repair --metrics [--iptable-path <iptable_path>] [--sorted] \n\n \
316320 (3) repair --tree \n\n \
317321 Passing --manifest-path starts up profiler mode, which runs a reduced topology that tests catchup and repair performance. \n \
318322 Passing --metrics prints recent slot completion times and response latencies during a live run. These modes are exclusive. \n \
319323 Passing --tree prints the tree of the repair process. \n \
324+ --sorted: optional flag to print the slots in sorted order. Default prints in completion order.\n \
320325 --iptable-path: optional path to iptable file to map IP addresses to locations." ));
321326
322327 char const * manifest_path = fd_env_strip_cmdline_cstr ( pargc , pargv , "--manifest-path" , NULL , NULL );
328+ int sorted = fd_env_strip_cmdline_contains ( pargc , pargv , "--sorted" );
323329 if ( fd_env_strip_cmdline_contains ( pargc , pargv , "--metrics" ) ) {
324330 args -> repair .metrics_only = 1 ;
325331 if ( FD_UNLIKELY ( manifest_path ) ) FD_LOG_ERR (( "metrics mode does not support --manifest-path" ));
326332 } else if ( fd_env_strip_cmdline_contains ( pargc , pargv , "--tree" ) ) {
327333 args -> repair .forest_only = 1 ;
328334 if ( FD_UNLIKELY ( manifest_path ) ) FD_LOG_ERR (( "tree mode does not support --manifest-path" ));
335+ if ( FD_UNLIKELY ( sorted ) ) FD_LOG_ERR (( "tree mode does not support --sorted" ));
329336 } else {
330337 fd_cstr_fini ( fd_cstr_append_cstr_safe ( fd_cstr_init ( args -> repair .manifest_path ), manifest_path , sizeof (args -> repair .manifest_path )- 1UL ) );
331338 }
@@ -334,6 +341,9 @@ usage: (1) repair --manifest-path <manifest_path> [--iptable-path <iptable_path>
334341 if ( FD_LIKELY ( iptable_path ) ) {
335342 fd_cstr_fini ( fd_cstr_append_cstr_safe ( fd_cstr_init ( args -> repair .iptable_path ), iptable_path , sizeof (args -> repair .iptable_path )- 1UL ) );
336343 }
344+ if ( FD_LIKELY ( sorted ) ) {
345+ args -> repair .sorted = 1 ;
346+ }
337347}
338348
339349static char *
@@ -430,12 +440,18 @@ print_histogram_buckets( volatile ulong * metrics,
430440 printf ( " +---------------------+--------------------+---------------+\n" );
431441}
432442
443+ static fd_slot_metrics_t temp_slots [ FD_CATCHUP_METRICS_MAX ];
444+
433445static void
434- print_catchup_slots ( fd_wksp_t * repair_tile_wksp , ctx_t * repair_ctx , int verbose ) {
446+ print_catchup_slots ( fd_wksp_t * repair_tile_wksp , ctx_t * repair_ctx , int verbose , int sorted ) {
435447 fd_repair_metrics_t * catchup = repair_ctx -> slot_metrics ;
436448 ulong catchup_gaddr = fd_wksp_gaddr_fast ( repair_ctx -> wksp , catchup );
437449 fd_repair_metrics_t * catchup_table = (fd_repair_metrics_t * )fd_wksp_laddr ( repair_tile_wksp , catchup_gaddr );
438- fd_repair_metrics_print ( catchup_table , verbose );
450+ if ( FD_LIKELY ( sorted ) ) {
451+ fd_repair_metrics_print_sorted ( catchup_table , verbose , temp_slots );
452+ } else {
453+ fd_repair_metrics_print ( catchup_table , verbose );
454+ }
439455}
440456
441457static fd_location_info_t * location_table ;
@@ -452,7 +468,7 @@ sort_peers_by_latency( fd_policy_peer_t * active_table, fd_peer_dlist_t * peers_
452468 if ( FD_UNLIKELY ( i >= FD_ACTIVE_KEY_MAX ) ) break ;
453469 iter = fd_peer_dlist_iter_fwd_next ( iter , peers_dlist , peers_arr );
454470 }
455- FD_LOG_NOTICE (( "Fast peers cnt: %lu. Remainder is slow." , i )) ;
471+ ulong fast_cnt = i ;
456472 iter = fd_peer_dlist_iter_fwd_init ( peers_wlist , peers_arr );
457473 while ( !fd_peer_dlist_iter_done ( iter , peers_wlist , peers_arr ) ) {
458474 fd_peer_t * peer = fd_peer_dlist_iter_ele ( iter , peers_wlist , peers_arr );
@@ -461,6 +477,7 @@ sort_peers_by_latency( fd_policy_peer_t * active_table, fd_peer_dlist_t * peers_
461477 if ( FD_UNLIKELY ( i >= FD_ACTIVE_KEY_MAX ) ) break ;
462478 iter = fd_peer_dlist_iter_fwd_next ( iter , peers_wlist , peers_arr );
463479 }
480+ FD_LOG_NOTICE (( "Fast peers cnt: %lu. Slow peers cnt: %lu." , fast_cnt , i - fast_cnt ));
464481
465482 ulong peer_cnt = i ;
466483 for ( uint i = 0 ; i < peer_cnt - 1 ; i ++ ) {
@@ -511,7 +528,7 @@ print_peer_location_latency( fd_wksp_t * repair_tile_wksp, ctx_t * tile_ctx ) {
511528 char * geolocation = info ? info -> location : "Unknown" ;
512529 double peer_bps = (double )(active -> res_cnt * FD_SHRED_MIN_SZ ) / ((double )(active -> last_resp_ts - active -> first_resp_ts ) / 1e9 );
513530 double req_bps = (double )active -> req_cnt * 202 / ((double )(active -> last_req_ts - active -> first_req_ts ) / 1e9 );
514- printf ( "| %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %s\n" , FD_BASE58_ENC_32_ALLOCA ( & active -> key ), active -> req_cnt , req_bps , peer_bps , (double )active -> res_cnt / (double )active -> req_cnt , ((double )active -> total_lat / (double )active -> res_cnt ) / 1e6 , geolocation );
531+ printf ( "%-5u | %-46s | %-7lu | %-8.2f | %-8.2f | %-7.2f | %10.3fms | %s\n" , i , FD_BASE58_ENC_32_ALLOCA ( & active -> key ), active -> req_cnt , req_bps , peer_bps , (double )active -> res_cnt / (double )active -> req_cnt , ((double )active -> total_lat / (double )active -> res_cnt ) / 1e6 , geolocation );
515532 }
516533 }
517534 fflush ( stdout );
@@ -549,18 +566,15 @@ repair_cmd_fn_metrics_mode( args_t * args,
549566 config_t * config ) {
550567 FD_LOG_NOTICE (( "Attempting to join with running firedancer-dev instance..." ));
551568
552- fd_topo_t * topo = & config -> topo ;
553- ulong wksp_id = fd_topo_find_wksp ( topo , "repair" );
569+ fd_topo_t * topo = & config -> topo ;
570+ ulong wksp_id = fd_topo_find_wksp ( topo , "repair" );
554571 if ( FD_UNLIKELY ( wksp_id == ULONG_MAX ) ) FD_LOG_ERR (( "repair workspace not found" ));
555-
556572 fd_topo_wksp_t * repair_wksp = & topo -> workspaces [ wksp_id ];
557-
558- ulong tile_id = fd_topo_find_tile ( topo , "repair" , 0UL );
559- if ( FD_UNLIKELY ( tile_id == ULONG_MAX ) ) FD_LOG_ERR (( "repair tile not found" ));
560-
561573 fd_topo_join_workspace ( topo , repair_wksp , FD_SHMEM_JOIN_MODE_READ_ONLY );
562574
563575 /* Access the repair tile scratch memory where repair_tile_ctx is stored */
576+ ulong tile_id = fd_topo_find_tile ( topo , "repair" , 0UL );
577+ if ( FD_UNLIKELY ( tile_id == ULONG_MAX ) ) FD_LOG_ERR (( "repair tile not found" ));
564578 fd_topo_tile_t * tile = & topo -> tiles [ tile_id ];
565579 void * scratch = fd_topo_obj_laddr ( & config -> topo , tile -> tile_obj_id );
566580 if ( FD_UNLIKELY ( !scratch ) ) FD_LOG_ERR (( "Failed to access repair tile scratch memory" ));
@@ -570,16 +584,8 @@ repair_cmd_fn_metrics_mode( args_t * args,
570584
571585 /* catchup cmd owned memory */
572586 location_table = fd_location_table_join ( fd_location_table_new ( location_table_mem ) );
573-
574587 read_iptable ( args -> repair .iptable_path , location_table );
575588
576- if ( FD_UNLIKELY ( !args -> repair .metrics_only ) ) {
577- print_peer_location_latency ( repair_wksp -> wksp , repair_ctx );
578- print_catchup_slots ( repair_wksp -> wksp , repair_ctx , 0 );
579- printf ( "\nCatchup tool completed successfully.\n" );
580- return ;
581- }
582-
583589 // Add terminal setup here - same as monitor.c
584590 atexit ( restore_terminal );
585591 if ( FD_UNLIKELY ( 0 != tcgetattr ( STDIN_FILENO , & termios_backup ) ) ) {
@@ -603,7 +609,7 @@ repair_cmd_fn_metrics_mode( args_t * args,
603609 long now = fd_log_wallclock ();
604610 if ( FD_UNLIKELY ( now - last_print > 1e9L ) ) {
605611 last_print = now ;
606- print_catchup_slots ( repair_wksp -> wksp , repair_ctx , catchup_verbose );
612+ print_catchup_slots ( repair_wksp -> wksp , repair_ctx , catchup_verbose , args -> repair . sorted );
607613 printf ( "catchup slots | Use 'i' to toggle extra slot information" TEXT_NEWLINE );
608614 fflush ( stdout );
609615
@@ -692,6 +698,11 @@ repair_cmd_fn_profiler_mode( args_t * args,
692698 FD_TEST ( repair_tile_idx != ULONG_MAX );
693699 fd_topo_tile_t * repair_tile = & config -> topo .tiles [ repair_tile_idx ];
694700
701+ ulong wksp_id = fd_topo_find_wksp ( & config -> topo , "repair" );
702+ if ( FD_UNLIKELY ( wksp_id == ULONG_MAX ) ) FD_LOG_ERR (( "repair workspace not found" ));
703+ fd_topo_wksp_t * repair_wksp = & config -> topo .workspaces [ wksp_id ];
704+ fd_topo_join_workspace ( & config -> topo , repair_wksp , FD_SHMEM_JOIN_MODE_READ_WRITE );
705+
695706 ulong shred_tile_idx = fd_topo_find_tile ( & config -> topo , "shred" , 0UL );
696707 FD_TEST ( shred_tile_idx != ULONG_MAX );
697708 fd_topo_tile_t * shred_tile = & config -> topo .tiles [ shred_tile_idx ];
@@ -702,9 +713,27 @@ repair_cmd_fn_profiler_mode( args_t * args,
702713 volatile ulong * repair_metrics = fd_metrics_tile ( repair_tile -> metrics );
703714 FD_TEST ( repair_metrics );
704715
716+ /* Collect all net tiles and their repair_net link metrics */
717+ ulong net_tile_cnt = config -> layout .net_tile_count ;
718+ volatile ulong * * repair_net_links = aligned_alloc ( 8UL , net_tile_cnt * sizeof (volatile ulong * ) );
719+ FD_TEST ( repair_net_links );
720+
721+ for ( ulong i = 0UL ; i < net_tile_cnt ; i ++ ) {
722+ ulong tile_idx = fd_topo_find_tile ( & config -> topo , "net" , i );
723+ if ( FD_UNLIKELY ( tile_idx == ULONG_MAX ) ) FD_LOG_ERR (( "net tile %lu not found" , i ));
724+ fd_topo_tile_t * tile = & config -> topo .tiles [ tile_idx ];
725+
726+ ulong repair_net_in_idx = fd_topo_find_tile_in_link ( & config -> topo , tile , "repair_net" , 0UL );
727+ if ( FD_UNLIKELY ( repair_net_in_idx == ULONG_MAX ) ) {
728+ FD_LOG_ERR (( "repair_net link not found for net tile %lu" , i ));
729+ }
730+ repair_net_links [i ] = fd_metrics_link_in ( tile -> metrics , repair_net_in_idx );
731+ FD_TEST ( repair_net_links [i ] );
732+ }
733+
705734 FD_LOG_NOTICE (( "Repair profiler run" ));
706735
707- ulong shred_out_link_idx = fd_topo_find_link ( & config -> topo , "shred_out" , 0UL );
736+ ulong shred_out_link_idx = fd_topo_find_link ( & config -> topo , "shred_out" , 0UL );
708737 FD_TEST ( shred_out_link_idx != ULONG_MAX );
709738 fd_topo_link_t * shred_out_link = & config -> topo .links [ shred_out_link_idx ];
710739 FD_TEST ( shred_out_link );
@@ -742,6 +771,18 @@ repair_cmd_fn_profiler_mode( args_t * args,
742771 printf ( " | HighestWindow | %s |\n" , fmt_count ( buf2 , repair_metrics [ MIDX ( COUNTER , REPAIR , SENT_PKT_TYPES_NEEDED_HIGHEST_WINDOW ) ] ) );
743772 printf ( " | Index | %s |\n" , fmt_count ( buf2 , repair_metrics [ MIDX ( COUNTER , REPAIR , SENT_PKT_TYPES_NEEDED_WINDOW ) ] ) );
744773 printf ( " +---------------+--------------+\n" );
774+ printf ( " Send Pkt Rate: %s pps\n" , fmt_count ( buf2 , repair_metrics [ MIDX ( GAUGE , REPAIR , SEND_PKT_RATE ) ] ) );
775+
776+ /* Sum overrun across all net tiles connected to repair_net */
777+ ulong total_overrun = repair_net_links [0 ][ MIDX ( COUNTER , LINK , OVERRUN_POLLING_FRAG_COUNT ) ]; /* coarse double counting prevention */
778+ ulong total_consumed = 0UL ;
779+ for ( ulong i = 0UL ; i < net_tile_cnt ; i ++ ) {
780+ volatile ulong * ovar_net_metrics = repair_net_links [i ];
781+ total_overrun += ovar_net_metrics [ MIDX ( COUNTER , LINK , OVERRUN_READING_FRAG_COUNT ) ];
782+ total_consumed += ovar_net_metrics [ MIDX ( COUNTER , LINK , CONSUMED_COUNT ) ];
783+ }
784+ printf ( " Total overrun: %s\n" , fmt_count ( buf2 , total_overrun ) );
785+ printf ( " Net consumed: %s\n" , fmt_count ( buf2 , total_consumed ) );
745786
746787 print_histogram_buckets ( repair_metrics ,
747788 MIDX ( HISTOGRAM , REPAIR , RESPONSE_LATENCY ),
@@ -767,8 +808,20 @@ repair_cmd_fn_profiler_mode( args_t * args,
767808 fflush ( stdout );
768809 last_print = now ;
769810 }
811+
770812 if ( FD_UNLIKELY ( catchup_finished ) ) {
771- repair_cmd_fn_metrics_mode ( args , config );
813+ /* Access the repair tile scratch memory where repair_tile_ctx is stored */
814+ void * scratch = fd_topo_obj_laddr ( & config -> topo , repair_tile -> tile_obj_id );
815+ if ( FD_UNLIKELY ( !scratch ) ) FD_LOG_ERR (( "Failed to access repair tile scratch memory" ));
816+ FD_SCRATCH_ALLOC_INIT ( l , scratch );
817+ ctx_t * repair_ctx = FD_SCRATCH_ALLOC_APPEND ( l , alignof(ctx_t ), sizeof (ctx_t ) );
818+
819+ /* repair cmd owned memory */
820+ location_table = fd_location_table_join ( fd_location_table_new ( location_table_mem ) );
821+ read_iptable ( args -> repair .iptable_path , location_table );
822+ print_peer_location_latency ( repair_wksp -> wksp , repair_ctx );
823+ print_catchup_slots ( repair_wksp -> wksp , repair_ctx , 0 , 0 );
824+ printf ( "\nCatchup tool completed successfully.\n" );
772825 FD_LOG_ERR (("catchup finished. slot %lu" , turbine_slot0 ));
773826 }
774827 }
0 commit comments