Skip to content

Commit 57a1d35

Browse files
committed
fix(validator): auto-reconnect BlockSync after Bittensor connection loss
When the Bittensor RPC connection drops, BlockSync stops emitting events and weights are never submitted. This adds automatic reconnection logic:
- Track disconnection state via BlockSyncEvent::Disconnected
- Attempt reconnection from the heartbeat tick, with at least 30s between attempts
- Create a new BittensorClient and BlockSync on reconnect
- Refresh the metagraph after a successful reconnection
1 parent 5077c75 commit 57a1d35

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed

bins/validator-node/src/main.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,11 +677,16 @@ async fn main() -> Result<()> {
677677

678678
let netuid = args.netuid;
679679
let version_key = args.version_key;
680+
let subtensor_endpoint = args.subtensor_endpoint.clone();
680681
let mut interval = tokio::time::interval(Duration::from_secs(60));
681682
let mut metrics_interval = tokio::time::interval(Duration::from_secs(5));
682683
let mut challenge_refresh_interval = tokio::time::interval(Duration::from_secs(60));
683684
let mut metagraph_refresh_interval = tokio::time::interval(Duration::from_secs(300)); // 5 minutes
684685

686+
// Track Bittensor connection state for reconnection
687+
let mut bittensor_disconnected = false;
688+
let mut last_reconnect_attempt = std::time::Instant::now();
689+
685690
// Store challenges in Arc<RwLock> for periodic refresh
686691
let cached_challenges: Arc<RwLock<Vec<ChallengeInfo>>> = Arc::new(RwLock::new(
687692
platform_client.list_challenges().await.unwrap_or_default(),
@@ -703,6 +708,17 @@ async fn main() -> Result<()> {
703708
None => std::future::pending().await,
704709
}
705710
} => {
711+
// Track disconnection state for reconnection logic
712+
match &event {
713+
BlockSyncEvent::Disconnected(_) => {
714+
bittensor_disconnected = true;
715+
}
716+
BlockSyncEvent::Reconnected | BlockSyncEvent::NewBlock { .. } => {
717+
bittensor_disconnected = false;
718+
}
719+
_ => {}
720+
}
721+
706722
handle_block_event(
707723
event,
708724
&platform_client,
@@ -717,6 +733,55 @@ async fn main() -> Result<()> {
717733

718734
_ = interval.tick() => {
719735
debug!("Heartbeat");
736+
737+
// Check if we need to attempt Bittensor reconnection
738+
if bittensor_disconnected && last_reconnect_attempt.elapsed() > Duration::from_secs(30) {
739+
last_reconnect_attempt = std::time::Instant::now();
740+
info!("Attempting Bittensor reconnection...");
741+
742+
// Try to reconnect by creating a new BlockSync
743+
match BittensorClient::new(&subtensor_endpoint).await {
744+
Ok(new_client) => {
745+
let mut sync = BlockSync::new(BlockSyncConfig {
746+
netuid,
747+
..Default::default()
748+
});
749+
750+
if let Some(new_rx) = sync.take_event_receiver() {
751+
let new_client = Arc::new(new_client);
752+
match sync.connect(new_client.clone()).await {
753+
Ok(()) => {
754+
// Spawn the sync task to keep it running
755+
tokio::spawn(async move {
756+
if let Err(e) = sync.start().await {
757+
error!("Block sync error after reconnect: {}", e);
758+
}
759+
});
760+
761+
info!("Bittensor reconnected successfully");
762+
block_rx = Some(new_rx);
763+
bittensor_disconnected = false;
764+
765+
// Also refresh metagraph with new client
766+
if let Some(ref st_client) = subtensor_client {
767+
if let Ok(mg) = sync_metagraph(&new_client, netuid).await {
768+
info!("Metagraph refreshed after reconnect: {} neurons", mg.n);
769+
let mut client = st_client.write();
770+
client.set_metagraph(mg);
771+
}
772+
}
773+
}
774+
Err(e) => {
775+
warn!("Failed to connect block sync: {}", e);
776+
}
777+
}
778+
}
779+
}
780+
Err(e) => {
781+
warn!("Bittensor reconnection failed: {} (will retry in 30s)", e);
782+
}
783+
}
784+
}
720785
}
721786

722787
_ = metrics_interval.tick() => {

0 commit comments

Comments
 (0)