Skip to content

Commit aec5fe7

Browse files
committed
fix: prevent weight divergence between bootstrap and non-bootstrap validators
Five fixes for P2P state consistency:
1. Remove the optimistic local write from propose_write (the proposer must not read uncommitted data).
2. Content-based state hashing (hash actual state content, not just entry counts).
3. Fix merge_from to update existing entries, not only insert missing ones.
4. Force a metagraph refresh before weight submission to avoid stale hotkey-to-UID mappings.
5. Post-bootstrap reconciliation via the improved hash and merge.
1 parent e47198f commit aec5fe7

File tree

4 files changed

+103
-35
lines changed

4 files changed

+103
-35
lines changed

bins/validator-node/src/challenge_storage.rs

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -105,19 +105,11 @@ impl StorageBackend for ChallengeStorageBackend {
105105
signature,
106106
});
107107

108-
// Write locally first so WASM can read-your-own-writes during sync
109-
let storage_key = build_challenge_storage_key(challenge_id, key);
110-
if let Err(e) = tokio::task::block_in_place(|| {
111-
tokio::runtime::Handle::current().block_on(self.storage.put(
112-
storage_key,
113-
value.to_vec(),
114-
DPutOptions::default(),
115-
))
116-
}) {
117-
tracing::warn!(error = %e, "Failed to write locally before P2P broadcast");
118-
}
108+
// DO NOT write locally before consensus - this causes state divergence.
109+
// The proposer's get_weights would read uncommitted data that other validators
110+
// don't have yet. All nodes (including proposer) write only after P2P consensus.
119111

120-
// Broadcast via P2P so other validators also apply the write
112+
// Broadcast via P2P so all validators apply the write after consensus
121113
tracing::debug!(
122114
proposal_id = %hex::encode(&proposal_id[..8]),
123115
challenge_id = %challenge_id,

bins/validator-node/src/main.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,23 @@ async fn main() -> Result<()> {
10251025
None => std::future::pending().await,
10261026
}
10271027
} => {
1028+
// Force metagraph refresh before weight submission to avoid stale hotkey->UID mappings
1029+
if matches!(event, BlockSyncEvent::CommitWindowOpen { .. }) {
1030+
if let Some(bittensor_client) = bittensor_client_for_metagraph.as_ref() {
1031+
match sync_metagraph(bittensor_client, netuid).await {
1032+
Ok(mg) => {
1033+
info!("Pre-commit metagraph refresh: {} neurons", mg.n);
1034+
update_validator_set_from_metagraph(&mg, &validator_set, &chain_state, &valid_voters, &state_root_consensus, &state_manager);
1035+
if let Some(sc) = subtensor_client.as_mut() {
1036+
sc.set_metagraph(mg);
1037+
}
1038+
}
1039+
Err(e) => {
1040+
warn!("Pre-commit metagraph refresh failed: {}. Using cached metagraph.", e);
1041+
}
1042+
}
1043+
}
1044+
}
10281045
handle_block_event(
10291046
event,
10301047
&subtensor,

crates/core/src/state.rs

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -298,28 +298,62 @@ impl ChainState {
298298
}
299299

300300
/// Update the state hash
301+
///
302+
/// Hashes actual content (not just counts) so divergent state is detectable.
301303
pub fn update_hash(&mut self) {
304+
#[derive(Serialize)]
305+
struct ChallengeHashEntry {
306+
id: String,
307+
emission_weight: u64, // f64 as bits for deterministic hashing
308+
mechanism_id: u8,
309+
is_active: bool,
310+
}
311+
302312
#[derive(Serialize)]
303313
struct HashInput<'a> {
304314
block_height: BlockHeight,
305315
sudo_key: &'a Hotkey,
306-
validator_count: usize,
307-
challenge_count: usize,
308-
wasm_challenge_count: usize,
309-
pending_jobs: usize,
310316
mutation_sequence: u64,
311-
route_count: usize,
317+
validators: Vec<String>,
318+
challenge_ids: Vec<String>,
319+
wasm_challenges: Vec<ChallengeHashEntry>,
320+
mechanism_configs: Vec<(u8, String)>,
312321
}
313322

323+
let mut validators: Vec<String> = self.validators.keys().map(|h| h.to_ss58()).collect();
324+
validators.sort();
325+
326+
let mut challenge_ids: Vec<String> =
327+
self.challenges.keys().map(|c| c.to_string()).collect();
328+
challenge_ids.sort();
329+
330+
let mut wasm_challenges: Vec<ChallengeHashEntry> = self
331+
.wasm_challenge_configs
332+
.iter()
333+
.map(|(id, cfg)| ChallengeHashEntry {
334+
id: id.to_string(),
335+
emission_weight: cfg.config.emission_weight.to_bits(),
336+
mechanism_id: cfg.config.mechanism_id,
337+
is_active: cfg.is_active,
338+
})
339+
.collect();
340+
wasm_challenges.sort_by(|a, b| a.id.cmp(&b.id));
341+
342+
let mut mechanism_configs: Vec<(u8, String)> = self
343+
.mechanism_configs
344+
.iter()
345+
.map(|(k, v)| (*k, format!("{:?}", v)))
346+
.collect();
347+
mechanism_configs.sort_by_key(|(k, _)| *k);
348+
314349
let input = HashInput {
315350
block_height: self.block_height,
316351
sudo_key: &self.sudo_key,
317-
validator_count: self.validators.len(),
318-
challenge_count: self.challenges.len(),
319-
wasm_challenge_count: self.wasm_challenge_configs.len(),
320-
pending_jobs: self.pending_jobs.len(),
321352
mutation_sequence: self.mutation_sequence,
322-
route_count: self.challenge_routes.len(),
353+
validators,
354+
challenge_ids,
355+
wasm_challenges,
356+
mechanism_configs,
323357
};
324358

325359
self.state_hash = hash_data(&input).unwrap_or([0u8; 32]);
@@ -640,18 +674,30 @@ impl ChainState {
640674
return false;
641675
}
642676

677+
// Merge WASM challenge configs: update existing entries too (not just insert)
678+
// so that sudo actions (SetEmission, SetMechanism) propagate correctly.
643679
for (id, config) in &peer_state.wasm_challenge_configs {
644-
self.wasm_challenge_configs
645-
.entry(*id)
646-
.or_insert_with(|| config.clone());
680+
self.wasm_challenge_configs.insert(*id, config.clone());
647681
}
648682

649683
for (id, routes) in &peer_state.challenge_routes {
650-
self.challenge_routes
651-
.entry(*id)
652-
.or_insert_with(|| routes.clone());
684+
self.challenge_routes.insert(*id, routes.clone());
685+
}
686+
687+
// Merge mechanism configs
688+
for (mid, mconfig) in &peer_state.mechanism_configs {
689+
self.mechanism_configs.insert(*mid, mconfig.clone());
690+
}
691+
692+
// Merge challenge weight allocations
693+
for (id, alloc) in &peer_state.challenge_weights {
694+
self.challenge_weights.insert(*id, alloc.clone());
653695
}
654696

697+
// Sync paused state from higher-sequence peer
698+
self.paused = peer_state.paused;
699+
self.pause_reason = peer_state.pause_reason.clone();
700+
655701
self.mutation_sequence = peer_state.mutation_sequence;
656702
self.update_hash();
657703
true

crates/p2p-consensus/src/state.rs

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -465,24 +465,37 @@ impl ChainState {
465465
pub fn update_hash(&mut self) {
466466
self.last_updated = chrono::Utc::now().timestamp_millis();
467467

468-
// Create a deterministic hash input
468+
// Hash actual content (not just counts) so divergent state is detectable
469469
#[derive(Serialize)]
470470
struct HashInput {
471471
sequence: SequenceNumber,
472472
epoch: u64,
473-
validator_count: usize,
474-
challenge_count: usize,
475-
pending_count: usize,
476473
netuid: u16,
474+
validators: Vec<(String, u64)>,
475+
challenges: Vec<String>,
476+
pending_ids: Vec<String>,
477477
}
478478

479+
let mut validators: Vec<(String, u64)> = self
480+
.validators
481+
.iter()
482+
.map(|(h, s)| (h.to_ss58(), *s))
483+
.collect();
484+
validators.sort_by(|a, b| a.0.cmp(&b.0));
485+
486+
let mut challenges: Vec<String> = self.challenges.keys().map(|c| c.to_string()).collect();
487+
challenges.sort();
488+
489+
let mut pending_ids: Vec<String> = self.pending_evaluations.keys().cloned().collect();
490+
pending_ids.sort();
491+
479492
let input = HashInput {
480493
sequence: self.sequence,
481494
epoch: self.epoch,
482-
validator_count: self.validators.len(),
483-
challenge_count: self.challenges.len(),
484-
pending_count: self.pending_evaluations.len(),
485495
netuid: self.netuid,
496+
validators,
497+
challenges,
498+
pending_ids,
486499
};
487500

488501
self.state_hash = hash_data(&input).unwrap_or([0u8; 32]);

0 commit comments

Comments (0)