From b22d12ccf7fd75293bff672d811969f43b64fe3a Mon Sep 17 00:00:00 2001 From: jherrera-jump Date: Mon, 23 Feb 2026 19:15:16 +0000 Subject: [PATCH] gui, topo: add allow_crash flag --- src/app/firedancer/topology.c | 38 ++++++++--------- src/app/shared/commands/run/run.c | 66 +++++++++++++++++++++++------- src/app/shared/commands/run/run.h | 6 +-- src/disco/gui/fd_gui.c | 44 +++++++++----------- src/disco/gui/fd_gui.h | 31 +++----------- src/disco/gui/fd_gui_tile.c | 47 ++++----------------- src/disco/topo/fd_topo.h | 1 + src/disco/topo/fd_topob.c | 2 + src/discof/replay/fd_replay_tile.c | 64 ++++++++++++++++++++++------- src/discof/replay/fd_replay_tile.h | 13 +++++- 10 files changed, 170 insertions(+), 142 deletions(-) diff --git a/src/app/firedancer/topology.c b/src/app/firedancer/topology.c index 7f2d3a300e3..c12da70e61a 100644 --- a/src/app/firedancer/topology.c +++ b/src/app/firedancer/topology.c @@ -1299,41 +1299,41 @@ fd_topo_initialize( config_t * config ) { fd_topob_wksp( topo, "gui" ); fd_topob_wksp( topo, "gui_replay" ); - /**/ fd_topob_tile( topo, "gui", "gui", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 ); + /**/ fd_topob_tile( topo, "gui", "gui", "metric_in", tile_to_cpu[ topo->tile_cnt ], 0, 1, 0 )->allow_crash = 1; /* Read banks */ /**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "gui", 0UL ) ], banks_obj, FD_SHMEM_JOIN_MODE_READ_ONLY ); /**/ fd_topob_tile_uses( topo, &topo->tiles[ fd_topo_find_tile( topo, "gui", 0UL ) ], banks_locks_obj, FD_SHMEM_JOIN_MODE_READ_WRITE ); /* release ownership of banks */ - /**/ fd_topob_link( topo, "gui_replay", "gui_replay", 128, 0UL,2UL ); /* burst==2 since a bank and its parent may be sent in one burst */ + /**/ fd_topob_link( topo, "gui_replay", "gui_replay", 128, 0UL, 1UL ); /**/ fd_topob_tile_out( topo, "gui", 0UL, "gui_replay", 0UL ); /**/ fd_topob_tile_in ( topo, "replay", 0UL, "metric_in", "gui_replay", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /* topo, tile_name, tile_kind_id, 
fseq_wksp, link_name, link_kind_id, reliable, polled */ - FOR(net_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "net_gossvf", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */ - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); - FOR(shred_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "shred_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "gossip_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "replay_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "genesi_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /* topo, tile_name, tile_kind_id, fseq_wksp, link_name, link_kind_id, reliable, polled */ + FOR(net_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "net_gossvf", i, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); /* No reliable consumers of networking fragments, may be dropped or overrun */ + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "repair_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); + FOR(shred_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "shred_out", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "gossip_net", 0UL, FD_TOPOB_UNRELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "gossip_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, 
"metric_in", "tower_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "replay_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "replay_epoch", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "genesi_out", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); if( leader_enabled ) { - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "pack_poh", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "pack_execle", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - FOR(execle_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "execle_poh", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "pack_poh", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "pack_execle", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + FOR(execle_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "execle_poh", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); } - FOR(execrp_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "execrp_replay", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + FOR(execrp_tile_cnt) fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "execrp_replay", i, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); if( FD_LIKELY( snapshots_enabled ) ) { - /**/ fd_topob_tile_in ( topo, "gui", 0UL, "metric_in", "snapct_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); - /**/ fd_topob_tile_in ( topo, "gui", 0UL, "metric_in", "snapin_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "snapct_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "snapin_gui", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); } if( FD_UNLIKELY( config->tiles.bundle.enabled ) ) { - /**/ fd_topob_tile_in( topo, "gui", 0UL, 
"metric_in", "bundle_status", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); + /**/ fd_topob_tile_in( topo, "gui", 0UL, "metric_in", "bundle_status", 0UL, FD_TOPOB_RELIABLE, FD_TOPOB_POLLED ); } } diff --git a/src/app/shared/commands/run/run.c b/src/app/shared/commands/run/run.c index f7fa782bfb6..f1dec17c4b1 100644 --- a/src/app/shared/commands/run/run.c +++ b/src/app/shared/commands/run/run.c @@ -63,9 +63,9 @@ run_cmd_perm( args_t * args, } struct pidns_clone_args { - config_t const * config; - int * pipefd; - int closefd; + config_t * config; + int * pipefd; + int closefd; }; extern char fd_log_private_path[ 1024 ]; /* empty string on start */ @@ -231,7 +231,7 @@ main_pid_namespace( void * _args ) { if( FD_UNLIKELY( close( args->closefd ) ) ) FD_LOG_ERR(( "close() failed (%i-%s)", errno, fd_io_strerror( errno ) )); } - config_t const * config = args->config; + config_t * config = args->config; fd_log_thread_set( "pidns" ); ulong pid = fd_sandbox_getpid(); /* Need to read /proc again.. we got a new PID from clone */ @@ -441,18 +441,43 @@ main_pid_namespace( void * _args ) { char * tile_name = child_names[ i ]; ulong tile_idx = child_idxs[ i ]; - ulong tile_id = config->topo.tiles[ tile_idx ].kind_id; + if( FD_UNLIKELY( tile_idx==ULONG_MAX ) ) FD_LOG_ERR(( "unreachable" )); + fd_topo_tile_t const * tile = &config->topo.tiles[ tile_idx ]; if( FD_UNLIKELY( !WIFEXITED( wstatus ) ) ) { - FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with signal %d (%s)", tile_name, tile_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) )); - fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 ); + if( FD_UNLIKELY( tile->allow_crash ) ) { + FD_LOG_ERR_NOEXIT(( "expendable tile %s:%lu crashed with signal %d (%s)", tile_name, tile->kind_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) )); + + /* We need to make all reliable links unreliable by + provisioning infinite credits to prevent the application + from stalling. 
*/ + for( ulong j=0UL; j<tile->in_cnt; j++ ) { + if( FD_UNLIKELY( !tile->in_link_poll[ j ] || !tile->in_link_reliable[ j ] ) ) continue; + + ulong fseq_id = tile->in_link_fseq_obj_id[ j ]; + fd_topo_wksp_t * wksp = &config->topo.workspaces[ config->topo.objs[ fseq_id ].wksp_id ]; + fd_topo_join_workspace( &config->topo, wksp, FD_SHMEM_JOIN_MODE_READ_WRITE, 0 ); + ulong * fseq = fd_fseq_join( fd_topo_obj_laddr( &config->topo, fseq_id ) ); + if( FD_UNLIKELY( !fseq ) ) { + FD_LOG_ERR_NOEXIT(( "failed to join fseq" )); + fd_sys_util_exit_group( 1 ); + } + fd_fseq_update( fseq, ULONG_MAX-1UL ); + fd_topo_leave_workspace( &config->topo, wksp ); + + fd_topo_link_t * in_link = &config->topo.links[ tile->in_link_id[ j ] ]; + FD_LOG_NOTICE(( "demoted reliable in-link %s(%lu)->%s to unreliable", in_link->name, in_link->kind_id, tile->name )); + } + } else { + FD_LOG_ERR_NOEXIT(( "tile %s:%lu crashed with signal %d (%s)", tile_name, tile->kind_id, WTERMSIG( wstatus ), fd_io_strsignal( WTERMSIG( wstatus ) ) )); + fd_sys_util_exit_group( WTERMSIG( wstatus ) ? WTERMSIG( wstatus ) : 1 ); + } } else { int exit_code = WEXITSTATUS( wstatus ); - if( FD_LIKELY( !exit_code && tile_idx!=ULONG_MAX && config->topo.tiles[ tile_idx ].allow_shutdown ) ) { - found = 1; - FD_LOG_INFO(( "tile %s:%lu exited gracefully with code %d", tile_name, tile_id, exit_code )); + if( FD_LIKELY( tile->allow_shutdown ) ) { + FD_LOG_INFO(( "tile %s:%lu exited gracefully with code %d", tile_name, tile->kind_id, exit_code )); } else { - FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited with code %d", tile_name, tile_id, exit_code )); + FD_LOG_ERR_NOEXIT(( "tile %s:%lu exited unexpectedly with code %d", tile_name, tile->kind_id, exit_code )); fd_sys_util_exit_group( exit_code ? 
exit_code : 1 ); } } @@ -465,9 +490,9 @@ main_pid_namespace( void * _args ) { } int -clone_firedancer( config_t const * config, - int close_fd, - int * out_pipe ) { +clone_firedancer( config_t * config, + int close_fd, + int * out_pipe ) { /* This pipe is here just so that the child process knows when the parent has died (it will get a HUP). */ int pipefd[2]; @@ -633,7 +658,18 @@ initialize_workspaces( config_t * config ) { } fd_topo_join_workspace( &config->topo, wksp, FD_SHMEM_JOIN_MODE_READ_WRITE, 0 ); fd_topo_wksp_new( &config->topo, wksp, CALLBACKS ); - fd_topo_leave_workspace( &config->topo, wksp ); + + /* Don't unmap fseq wksp for tile with allow_crash=1, since we need + to retain access in order to demote reliable links to unreliable. */ + int is_crashable_fseq_wksp = 0; + for( ulong j=0UL; j<config->topo.tile_cnt; j++ ){ + fd_topo_tile_t * tile = &config->topo.tiles[ j ]; + for( ulong k=0UL; k<tile->in_cnt; k++ ) { + if( FD_UNLIKELY( !tile->allow_crash || !tile->in_link_poll[ k ] || !tile->in_link_reliable[ k ] ) ) continue; + if( FD_UNLIKELY( i==config->topo.objs[ tile->in_link_fseq_obj_id[ k ] ].wksp_id ) ) is_crashable_fseq_wksp = 1; + } + } + if( FD_LIKELY( !is_crashable_fseq_wksp ) ) fd_topo_leave_workspace( &config->topo, wksp ); } if( FD_UNLIKELY( seteuid( uid ) ) ) FD_LOG_ERR(( "seteuid() failed (%i-%s)", errno, fd_io_strerror( errno ) )); diff --git a/src/app/shared/commands/run/run.h b/src/app/shared/commands/run/run.h index 853c594326e..41c99eb24cd 100644 --- a/src/app/shared/commands/run/run.h +++ b/src/app/shared/commands/run/run.h @@ -10,9 +10,9 @@ void * create_clone_stack( void ); int -clone_firedancer( config_t const * config, - int close_fd, - int * out_pipe ); +clone_firedancer( config_t * config, + int close_fd, + int * out_pipe ); void fdctl_check_configure( config_t const * config ); diff --git a/src/disco/gui/fd_gui.c b/src/disco/gui/fd_gui.c index 80a3704ce97..c3dd44cf65c 100644 --- a/src/disco/gui/fd_gui.c +++ b/src/disco/gui/fd_gui.c @@ -2830,28 
+2830,24 @@ fd_gui_handle_tower_update( fd_gui_t * gui, } void -fd_gui_handle_replay_update( fd_gui_t * gui, - fd_gui_slot_completed_t * slot_completed, - fd_hash_t const * block_hash, - ulong vote_slot, - ulong storage_slot, - ulong rooted_slot, - ulong identity_balance, - long now ) { +fd_gui_handle_replay_update( fd_gui_t * gui, + fd_replay_slot_completed_t const * slot_completed, + ulong vote_slot, + long now ) { (void)now; - if( FD_LIKELY( rooted_slot!=ULONG_MAX && gui->summary.slot_rooted!=rooted_slot ) ) { - fd_gui_handle_rooted_slot( gui, rooted_slot ); + if( FD_LIKELY( slot_completed->root_slot!=ULONG_MAX && gui->summary.slot_rooted!=slot_completed->root_slot ) ) { + fd_gui_handle_rooted_slot( gui, slot_completed->root_slot ); } - if( FD_LIKELY( gui->summary.slot_storage!=storage_slot ) ) { - gui->summary.slot_storage = storage_slot; + if( FD_LIKELY( gui->summary.slot_storage!=slot_completed->storage_slot ) ) { + gui->summary.slot_storage = slot_completed->storage_slot; fd_gui_printf_storage_slot( gui ); fd_http_server_ws_broadcast( gui->http ); } - if( FD_UNLIKELY( identity_balance!=ULONG_MAX && gui->summary.identity_account_balance!=identity_balance ) ) { - gui->summary.identity_account_balance = identity_balance; + if( FD_UNLIKELY( slot_completed->identity_balance!=ULONG_MAX && gui->summary.identity_account_balance!=slot_completed->identity_balance ) ) { + gui->summary.identity_account_balance = slot_completed->identity_balance; fd_gui_printf_identity_balance( gui ); fd_http_server_ws_broadcast( gui->http ); @@ -2873,12 +2869,12 @@ fd_gui_handle_replay_update( fd_gui_t * gui, if( FD_UNLIKELY( slot->mine ) ) { fd_gui_leader_slot_t * lslot = fd_gui_get_leader_slot( gui, slot->slot ); - if( FD_LIKELY( lslot ) ) fd_memcpy( lslot->block_hash.uc, block_hash->uc, sizeof(fd_hash_t) ); + if( FD_LIKELY( lslot ) ) fd_memcpy( lslot->block_hash.uc, slot_completed->block_hash.uc, sizeof(fd_hash_t) ); } - slot->completed_time = slot_completed->completed_time; + 
slot->completed_time = slot_completed->completion_time_nanos; slot->parent_slot = slot_completed->parent_slot; - slot->max_compute_units = fd_uint_if( slot_completed->max_compute_units==UINT_MAX, slot->max_compute_units, slot_completed->max_compute_units ); + slot->max_compute_units = fd_uint_if( slot_completed->cost_tracker.block_cost_limit==ULONG_MAX, slot->max_compute_units, (uint)slot_completed->cost_tracker.block_cost_limit ); if( FD_LIKELY( slot->levellevel = FD_GUI_SLOT_LEVEL_COMPLETED; } } - slot->total_txn_cnt = slot_completed->total_txn_cnt; - slot->vote_txn_cnt = slot_completed->vote_txn_cnt; - slot->failed_txn_cnt = slot_completed->failed_txn_cnt; - slot->nonvote_failed_txn_cnt = slot_completed->nonvote_failed_txn_cnt; + slot->total_txn_cnt = fd_uint_if( slot_completed->total_txn_cnt==ULONG_MAX, slot->total_txn_cnt, (uint)slot_completed->total_txn_cnt ); + slot->vote_txn_cnt = fd_uint_if( slot_completed->vote_txn_cnt==ULONG_MAX, slot->vote_txn_cnt, (uint)slot_completed->vote_txn_cnt ); + slot->failed_txn_cnt = fd_uint_if( slot_completed->failed_txn_cnt==ULONG_MAX, slot->failed_txn_cnt, (uint)slot_completed->failed_txn_cnt ); + slot->nonvote_failed_txn_cnt = fd_uint_if( slot_completed->nonvote_failed_txn_cnt==ULONG_MAX, slot->nonvote_failed_txn_cnt, (uint)slot_completed->nonvote_failed_txn_cnt ); slot->transaction_fee = slot_completed->transaction_fee; slot->priority_fee = slot_completed->priority_fee; slot->tips = slot_completed->tips; - slot->compute_units = slot_completed->compute_units; - slot->shred_cnt = slot_completed->shred_cnt; + slot->compute_units = fd_uint_if( slot_completed->cost_tracker.block_cost==ULONG_MAX, slot->compute_units, (uint)slot_completed->cost_tracker.block_cost ); + slot->shred_cnt = fd_uint_if( slot_completed->shred_cnt==ULONG_MAX, slot->shred_cnt, (uint)slot_completed->shred_cnt ); slot->vote_slot = vote_slot; try_publish_vote_status( gui, slot_completed->slot ); @@ -2935,7 +2931,7 @@ fd_gui_handle_replay_update( fd_gui_t * 
gui, fd_gui_slot_staged_shred_event_t * slot_complete_event = &gui->shreds.staged[ gui->shreds.staged_tail % FD_GUI_SHREDS_STAGING_SZ ]; gui->shreds.staged_tail++; slot_complete_event->event = FD_GUI_SLOT_SHRED_SHRED_SLOT_COMPLETE; - slot_complete_event->timestamp = slot_completed->completed_time; + slot_complete_event->timestamp = slot_completed->completion_time_nanos; slot_complete_event->shred_idx = USHORT_MAX; slot_complete_event->slot = slot->slot; diff --git a/src/disco/gui/fd_gui.h b/src/disco/gui/fd_gui.h index 7ff77910433..b2267fbe71a 100644 --- a/src/disco/gui/fd_gui.h +++ b/src/disco/gui/fd_gui.h @@ -11,6 +11,7 @@ #include "../../disco/bundle/fd_bundle_tile.h" #include "../../discof/restore/fd_snapct_tile.h" #include "../../discof/tower/fd_tower_tile.h" +#include "../../discof/replay/fd_replay_tile.h" #include "../../choreo/tower/fd_tower.h" #include "../../choreo/tower/fd_tower_serdes.h" #include "../../flamenco/leaders/fd_leaders.h" @@ -341,24 +342,6 @@ struct fd_gui_turbine_slot { typedef struct fd_gui_turbine_slot fd_gui_turbine_slot_t; -struct fd_gui_slot_completed { - ulong slot; - long completed_time; - ulong parent_slot; - uint max_compute_units; - uint total_txn_cnt; - uint vote_txn_cnt; - uint failed_txn_cnt; - uint nonvote_failed_txn_cnt; - ulong transaction_fee; - ulong priority_fee; - ulong tips; - uint compute_units; - uint shred_cnt; -}; - -typedef struct fd_gui_slot_completed fd_gui_slot_completed_t; - struct fd_gui_slot_staged_shred_event { long timestamp; ulong slot; @@ -923,14 +906,10 @@ fd_gui_handle_tower_update( fd_gui_t * gui, long now ); void -fd_gui_handle_replay_update( fd_gui_t * gui, - fd_gui_slot_completed_t * slot_completed, - fd_hash_t const * block_hash, - ulong vote_slot, - ulong storage_slot, - ulong root_slot, - ulong identity_balance, - long now ); +fd_gui_handle_replay_update( fd_gui_t * gui, + fd_replay_slot_completed_t const * slot_completed, + ulong vote_slot, + long now ); void fd_gui_handle_genesis_hash( fd_gui_t 
* gui, diff --git a/src/disco/gui/fd_gui_tile.c b/src/disco/gui/fd_gui_tile.c index 8bc4c754f49..44d2141173d 100644 --- a/src/disco/gui/fd_gui_tile.c +++ b/src/disco/gui/fd_gui_tile.c @@ -355,14 +355,13 @@ after_frag( fd_gui_ctx_t * ctx, case IN_KIND_REPLAY_OUT: { FD_TEST( ctx->is_full_client ); if( FD_UNLIKELY( sig==REPLAY_SIG_SLOT_COMPLETED ) ) { - fd_replay_slot_completed_t const * replay = (fd_replay_slot_completed_t const *)src; + fd_replay_slot_completed_t const * slot_completed = (fd_replay_slot_completed_t const *)src; - - /* bank should already have positive refcnt */ + /* This section is security-critical, since the GUI crashing + while owning a bank will cause the client to stall. */ fd_bank_t bank[1]; - FD_TEST( fd_banks_bank_query( bank, ctx->banks, replay->bank_idx ) ); + FD_TEST( fd_banks_bank_query( bank, ctx->banks, slot_completed->bank_idx ) ); FD_TEST( bank->data->refcnt!=0 ); - fd_vote_stakes_t * vote_stakes = fd_bank_vote_stakes_locking_modify( bank ); ulong vote_count = 0UL; @@ -383,44 +382,12 @@ after_frag( fd_gui_ctx_t * ctx, } fd_bank_vote_stakes_end_locking_modify( bank ); - fd_gui_slot_completed_t slot_completed; - if( FD_LIKELY( replay->parent_bank_idx!=ULONG_MAX ) ) { - fd_bank_t parent_bank[1]; - FD_TEST( fd_banks_bank_query( parent_bank, ctx->banks, replay->parent_bank_idx ) ); - - slot_completed.total_txn_cnt = (uint)(fd_bank_txn_count_get( bank ) - fd_bank_txn_count_get( parent_bank )); - slot_completed.vote_txn_cnt = slot_completed.total_txn_cnt - (uint)(fd_bank_nonvote_txn_count_get( bank ) - fd_bank_nonvote_txn_count_get( parent_bank )); - slot_completed.failed_txn_cnt = (uint)(fd_bank_failed_txn_count_get( bank ) - fd_bank_failed_txn_count_get( parent_bank )); - slot_completed.nonvote_failed_txn_cnt = (uint)(fd_bank_nonvote_failed_txn_count_get( bank ) - fd_bank_nonvote_failed_txn_count_get( parent_bank )); - - fd_stem_publish( stem, ctx->replay_out->idx, replay->parent_bank_idx, 0UL, 0UL, 0UL, 0UL, 0UL ); - } else { - 
slot_completed.total_txn_cnt = (uint)fd_bank_txn_count_get( bank ); - slot_completed.vote_txn_cnt = slot_completed.total_txn_cnt - (uint)fd_bank_nonvote_txn_count_get( bank ); - slot_completed.failed_txn_cnt = (uint)fd_bank_failed_txn_count_get( bank ); - slot_completed.nonvote_failed_txn_cnt = (uint)fd_bank_nonvote_failed_txn_count_get( bank ); - } + /* release bank ownership */ + fd_stem_publish( stem, ctx->replay_out->idx, slot_completed->bank_idx, 0UL, 0UL, 0UL, 0UL, 0UL ); - slot_completed.slot = fd_bank_slot_get( bank ); - slot_completed.completed_time = replay->completion_time_nanos; - slot_completed.parent_slot = fd_bank_parent_slot_get( bank ); - slot_completed.max_compute_units = fd_uint_if( replay->cost_tracker.block_cost_limit==0UL, UINT_MAX, (uint)replay->cost_tracker.block_cost_limit ); - slot_completed.transaction_fee = fd_bank_execution_fees_get( bank ); - slot_completed.transaction_fee = slot_completed.transaction_fee - (slot_completed.transaction_fee>>1); /* burn */ - slot_completed.priority_fee = fd_bank_priority_fees_get( bank ); - slot_completed.tips = fd_bank_tips_get( bank ); - slot_completed.compute_units = fd_uint_if( replay->cost_tracker.block_cost==0UL, UINT_MAX, (uint)replay->cost_tracker.block_cost ); - slot_completed.shred_cnt = (uint)fd_bank_shred_cnt_get( bank ); - - /* release shared ownership of bank and parent_bank */ - fd_stem_publish( stem, ctx->replay_out->idx, replay->bank_idx, 0UL, 0UL, 0UL, 0UL, 0UL ); - - /* update vote info */ fd_gui_peers_handle_vote_update( ctx->peers, ctx->peers->votes, vote_count, fd_clock_now( ctx->clock ), ctx->gui->summary.identity_key ); - /* update slot data */ - fd_gui_handle_replay_update( ctx->gui, &slot_completed, &replay->block_hash, ctx->peers->slot_voted, replay->storage_slot, replay->root_slot, replay->identity_balance, fd_clock_now( ctx->clock ) ); - + fd_gui_handle_replay_update( ctx->gui, slot_completed, ctx->peers->slot_voted, fd_clock_now( ctx->clock ) ); } else if( FD_UNLIKELY( 
sig==REPLAY_SIG_BECAME_LEADER ) ) { fd_became_leader_t * became_leader = (fd_became_leader_t *)src; fd_gui_became_leader( ctx->gui, became_leader->slot, became_leader->slot_start_ns, became_leader->slot_end_ns, became_leader->limits.slot_max_cost, became_leader->max_microblocks_in_slot ); diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index b9cf010f507..2e9a5a80e69 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -129,6 +129,7 @@ struct fd_topo_tile { ulong kind_id; /* The ID of this tile within its name. If there are n tile of a particular name, they have IDs [0, N). The pair (name, kind_id) uniquely identifies a tile, as does "id" on its own. */ int is_agave; /* If the tile needs to run in the Agave (Anza) address space or not. */ int allow_shutdown; /* If the tile is allowed to shutdown gracefully. If false, when the tile exits it will tear down the entire application. */ + int allow_crash; /* If the tile is allowed to crash. If false, when the tile crashes it will tear down the entire application. */ ulong cpu_idx; /* The CPU index to pin the tile on. A value of ULONG_MAX or more indicates the tile should be floating and not pinned to a core. 
*/ diff --git a/src/disco/topo/fd_topob.c b/src/disco/topo/fd_topob.c index 9f326803f76..b9c0813b395 100644 --- a/src/disco/topo/fd_topob.c +++ b/src/disco/topo/fd_topob.c @@ -169,6 +169,8 @@ fd_topob_tile( fd_topo_t * topo, tile->in_cnt = 0UL; tile->out_cnt = 0UL; tile->uses_obj_cnt = 0UL; + tile->allow_shutdown = 0; + tile->allow_crash = 0; fd_topo_obj_t * tile_obj = fd_topob_obj( topo, "tile", tile_wksp ); tile->tile_obj_id = tile_obj->id; diff --git a/src/discof/replay/fd_replay_tile.c b/src/discof/replay/fd_replay_tile.c index e1e6020e496..2eea5fa3c32 100644 --- a/src/discof/replay/fd_replay_tile.c +++ b/src/discof/replay/fd_replay_tile.c @@ -417,6 +417,7 @@ struct fd_replay_tile { consumer is enabled so it can increment the bank's refcnt before publishing the bank_idx to the gui. */ int gui_enabled; + ulong * replay_gui_fseq; int rpc_enabled; # if FD_HAS_FLATCC @@ -714,15 +715,19 @@ static void cost_tracker_snap( fd_bank_t * bank, fd_replay_slot_completed_t * slot_info ) { if( bank->data->cost_tracker_pool_idx!=fd_bank_cost_tracker_pool_idx_null( fd_bank_get_cost_tracker_pool( bank->data ) ) ) { fd_cost_tracker_t const * cost_tracker = fd_bank_cost_tracker_locking_query( bank ); - slot_info->cost_tracker.block_cost = cost_tracker->block_cost; - slot_info->cost_tracker.vote_cost = cost_tracker->vote_cost; - slot_info->cost_tracker.allocated_accounts_data_size = cost_tracker->allocated_accounts_data_size; - slot_info->cost_tracker.block_cost_limit = cost_tracker->block_cost_limit; - slot_info->cost_tracker.vote_cost_limit = cost_tracker->vote_cost_limit; - slot_info->cost_tracker.account_cost_limit = cost_tracker->account_cost_limit; + if( FD_UNLIKELY( cost_tracker->block_cost_limit==0UL ) ) { + memset( &slot_info->cost_tracker, (int)UINT_MAX, sizeof(slot_info->cost_tracker) ); + } else { + slot_info->cost_tracker.block_cost = cost_tracker->block_cost; + slot_info->cost_tracker.vote_cost = cost_tracker->vote_cost; + 
slot_info->cost_tracker.allocated_accounts_data_size = cost_tracker->allocated_accounts_data_size; + slot_info->cost_tracker.block_cost_limit = cost_tracker->block_cost_limit; + slot_info->cost_tracker.vote_cost_limit = cost_tracker->vote_cost_limit; + slot_info->cost_tracker.account_cost_limit = cost_tracker->account_cost_limit; + } fd_bank_cost_tracker_end_locking_query( bank ); } else { - memset( &slot_info->cost_tracker, 0, sizeof(slot_info->cost_tracker) ); + memset( &slot_info->cost_tracker, (int)UINT_MAX, sizeof(slot_info->cost_tracker) ); } } @@ -807,19 +812,38 @@ publish_slot_completed( fd_replay_tile_t * ctx, they are done using the bank. */ bank->data->refcnt++; /* tower_tile */ if( FD_LIKELY( ctx->rpc_enabled ) ) bank->data->refcnt++; /* rpc tile */ - if( FD_LIKELY( ctx->gui_enabled ) ) bank->data->refcnt++; /* gui tile */ + + /* If link credits are infinity, GUI has crashed. Technically there + is a race here between the gui tile crashing and the supervisor + process demoting the reliable link, but slot_complete messages + should be infrequent enough that this is not an issue. + + Ideally, we eliminate GUI's dependence on banks altogether. 
*/ + int gui_down = ctx->gui_enabled && fd_fseq_query( ctx->replay_gui_fseq )==(ULONG_MAX-1); + if( FD_LIKELY( ctx->gui_enabled && !gui_down ) ) bank->data->refcnt++; /* gui tile */ slot_info->bank_idx = bank->data->idx; FD_LOG_DEBUG(( "bank (idx=%lu, slot=%lu) refcnt incremented to %lu for tower, rpc, gui", bank->data->idx, slot, bank->data->refcnt )); slot_info->parent_bank_idx = ULONG_MAX; fd_bank_t parent_bank[1]; - if( FD_LIKELY( fd_banks_get_parent( parent_bank, ctx->banks, bank ) && ctx->gui_enabled ) ) { - parent_bank->data->refcnt++; - FD_LOG_DEBUG(( "bank (idx=%lu, slot=%lu) refcnt incremented to %lu for gui", parent_bank->data->idx, fd_bank_slot_get( parent_bank ), parent_bank->data->refcnt )); - slot_info->parent_bank_idx = parent_bank->data->idx; + if( FD_LIKELY( fd_banks_get_parent( parent_bank, ctx->banks, bank ) ) ) { + slot_info->total_txn_cnt = fd_bank_txn_count_get( bank ) - fd_bank_txn_count_get( parent_bank ); + slot_info->vote_txn_cnt = slot_info->total_txn_cnt - (fd_bank_nonvote_txn_count_get( bank ) - fd_bank_nonvote_txn_count_get( parent_bank )); + slot_info->failed_txn_cnt = fd_bank_failed_txn_count_get( bank ) - fd_bank_failed_txn_count_get( parent_bank ); + slot_info->nonvote_failed_txn_cnt = fd_bank_nonvote_failed_txn_count_get( bank ) - fd_bank_nonvote_failed_txn_count_get( parent_bank ); + } else { + slot_info->total_txn_cnt = ULONG_MAX; + slot_info->vote_txn_cnt = ULONG_MAX; + slot_info->failed_txn_cnt = ULONG_MAX; + slot_info->nonvote_failed_txn_cnt = ULONG_MAX; } slot_info->is_leader = is_leader; + slot_info->transaction_fee = fd_bank_execution_fees_get( bank ); + slot_info->transaction_fee -= (slot_info->transaction_fee>>1); /* burn */ + slot_info->priority_fee = fd_bank_priority_fees_get( bank ); + slot_info->tips = fd_bank_tips_get( bank ); + slot_info->shred_cnt = fd_bank_shred_cnt_get( bank ); FD_BASE58_ENCODE_32_BYTES( ctx->block_id_arr[ bank->data->idx ].latest_mr.uc, block_id_cstr ); FD_BASE58_ENCODE_32_BYTES( 
fd_bank_bank_hash_query( bank )->uc, bank_hash_cstr ); @@ -1035,7 +1059,7 @@ publish_root_advanced( fd_replay_tile_t * ctx, if( ctx->rpc_enabled ) { bank->data->refcnt++; - FD_LOG_DEBUG(( "bank (idx=%lu, slot=%lu) refcnt incremented to %lu for gui", bank->data->idx, fd_bank_slot_get( bank ), bank->data->refcnt )); + FD_LOG_DEBUG(( "bank (idx=%lu, slot=%lu) refcnt incremented to %lu for rpc", bank->data->idx, fd_bank_slot_get( bank ), bank->data->refcnt )); } /* Increment the reference count on the consensus root bank to account @@ -2901,7 +2925,19 @@ unprivileged_init( fd_topo_t * topo, *ctx->replay_out = out1( topo, tile, "replay_out" ); FD_TEST( ctx->replay_out->idx!=ULONG_MAX ); *ctx->exec_out = out1( topo, tile, "replay_execrp" ); FD_TEST( ctx->exec_out->idx!=ULONG_MAX ); - ctx->gui_enabled = fd_topo_find_tile( topo, "gui", 0UL )!=ULONG_MAX; + ulong gui_tile_idx = fd_topo_find_tile( topo, "gui", 0UL ); + ctx->gui_enabled = gui_tile_idx!=ULONG_MAX; + + if( ctx->gui_enabled ) { + ctx->replay_gui_fseq = NULL; + for( ulong i = 0UL; i < topo->tiles[ gui_tile_idx ].in_cnt; i++ ) { + if( strcmp( topo->links[ topo->tiles[ gui_tile_idx ].in_link_id[ i ] ].name, "replay_out" ) ) continue; + ulong fseq_id = topo->tiles[ gui_tile_idx ].in_link_fseq_obj_id[ i ]; + ctx->replay_gui_fseq = fd_fseq_join( fd_topo_obj_laddr( topo, fseq_id ) ); + } + FD_TEST( ctx->replay_gui_fseq ); + } + ctx->rpc_enabled = fd_topo_find_tile( topo, "rpc", 0UL )!=ULONG_MAX; if( FD_UNLIKELY( strcmp( "", tile->replay.solcap_capture ) ) ) { diff --git a/src/discof/replay/fd_replay_tile.h b/src/discof/replay/fd_replay_tile.h index e4ac1a37a22..bd88349d5b9 100644 --- a/src/discof/replay/fd_replay_tile.h +++ b/src/discof/replay/fd_replay_tile.h @@ -28,7 +28,7 @@ struct fd_replay_slot_completed { fd_hash_t bank_hash; /* bank hash of the slot received from replay */ fd_hash_t block_hash; /* last microblock header hash of slot received from replay */ - ulong transaction_count; + ulong transaction_count; /* 
since genesis */ struct { double initial; @@ -59,6 +59,17 @@ struct fd_replay_slot_completed { int is_leader; /* whether we were leader for this slot */ ulong identity_balance; + /* counts since slot start, default ULONG_MAX */ + ulong total_txn_cnt; + ulong vote_txn_cnt; + ulong failed_txn_cnt; + ulong nonvote_failed_txn_cnt; + + ulong transaction_fee; + ulong priority_fee; + ulong tips; + ulong shred_cnt; + struct { ulong block_cost; ulong vote_cost;