diff --git a/src/Cargo.lock b/src/Cargo.lock index 693cdc3..7fbaabd 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -5033,7 +5033,7 @@ dependencies = [ [[package]] name = "simplex" -version = "0.5.0" +version = "0.6.0" dependencies = [ "adnl", "anyhow", diff --git a/src/adnl/src/overlay/broadcast.rs b/src/adnl/src/overlay/broadcast.rs index 2d5c752..0a8fb4e 100644 --- a/src/adnl/src/overlay/broadcast.rs +++ b/src/adnl/src/overlay/broadcast.rs @@ -1758,7 +1758,19 @@ impl BroadcastProtocol for BroadcastTwostepFecProtocol { ctx: &mut BroadcastRecvContext, bcast_id: &BroadcastId, ) -> Result<(Option, bool)> { - >::process_broadcast(bcast, ctx, bcast_id).await + let (info, mut resend) = + >::process_broadcast(bcast, ctx, bcast_id) + .await?; + if resend { + let Some(bcast) = ctx.overlay.owned_broadcasts.get(bcast_id) else { + return Ok((info, false)); + }; + let Some(transfer) = Self::unwrap_transfer(bcast.val()) else { + return Ok((info, false)); + }; + resend = ctx.peers.other() == &transfer.src_key_id; + } + Ok((info, resend)) } // Send side diff --git a/src/adnl/src/quic/mod.rs b/src/adnl/src/quic/mod.rs index 110d064..22cc717 100644 --- a/src/adnl/src/quic/mod.rs +++ b/src/adnl/src/quic/mod.rs @@ -6,6 +6,9 @@ * * This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND. 
*/ +mod rate_limiter; +mod stat; + use crate::{ common::{ add_unbound_object_to_map, add_unbound_object_to_map_with_update, spawn_cancelable, @@ -14,6 +17,9 @@ use crate::{ node::AdnlNode, transport::{Connections, SendQueue}, }; +pub use rate_limiter::QuicRateLimitConfig; +use rate_limiter::{ConnectionRateLimiters, RateLimiter}; +use stat::{extract_inner_tag, tl_tag_name, ConnSnapshot, MsgKind, MsgStats, TransportErrors}; use std::{ collections::{HashMap, HashSet}, fmt::{Debug, Formatter, Write}, @@ -25,7 +31,7 @@ use std::{ time::{Duration, Instant}, }; use ton_api::{ - deserialize_boxed, deserialize_boxed_with_suffix, serialize_boxed, + deserialize_boxed, serialize_boxed, ton::quic::{ answer::Answer as QuicAnswer, request::{Message as QuicMessage, Query as QuicQuery}, @@ -494,6 +500,8 @@ pub struct QuicNode { key_cmd_tx: tokio::sync::mpsc::UnboundedSender, /// Aggregate error counters (reset each stats dump interval). transport_errors: Arc, + /// Rate limiting configuration for inbound QUIC connections. + rate_limit_config: QuicRateLimitConfig, } impl QuicNode { @@ -552,6 +560,7 @@ impl QuicNode { cancellation_token: tokio_util::sync::CancellationToken, max_streams_per_connection: Option, runtime_handle: tokio::runtime::Handle, + rate_limit_config: Option, ) -> Arc { let max_streams_per_connection = max_streams_per_connection.unwrap_or(Self::DEFAULT_MAX_STREAMS_PER_CONNECTION); @@ -573,6 +582,7 @@ impl QuicNode { msg_stats: MsgStats::new(), key_cmd_tx, transport_errors: TransportErrors::new(), + rate_limit_config: rate_limit_config.unwrap_or_default(), }); // Spawn background task that processes key commands inside the Tokio runtime. 
let weak = Arc::downgrade(&transport); @@ -1184,6 +1194,15 @@ impl QuicNode { let per_key_configs = Arc::new(Mutex::new(HashMap::new())); let reconnect_tracker = Arc::new(Mutex::new(HashMap::new())); + let rl_config = self.rate_limit_config.clone(); + let conn_rate_limiters = + ConnectionRateLimiters::new(rl_config.per_ip_capacity, rl_config.per_ip_period); + let global_rate_limiter = if rl_config.global_capacity > 0 { + Some(RateLimiter::new(rl_config.global_capacity, rl_config.global_period)) + } else { + None + }; + Self::spawn_accept_loop( endpoint.clone(), local_key_names.clone(), @@ -1196,6 +1215,10 @@ impl QuicNode { self.msg_stats.clone(), per_key_configs.clone(), reconnect_tracker.clone(), + rl_config, + conn_rate_limiters, + global_rate_limiter, + self.transport_errors.clone(), ); let state = Arc::new(EndpointState { @@ -1341,32 +1364,17 @@ impl QuicNode { local {local_key_id} peer {peer_key_id}" ); + // Replace any existing inbound connection for this peer path and close the + // old one immediately (matches C++ QuicSender::on_connected_inner behaviour). 
let inbound_key = QuicInboundKey(local_key_id.clone(), peer_key_id.clone()); - let had_existing = { - let mut found_existing = false; - let result = - add_unbound_object_to_map_with_update(&inbound, inbound_key.clone(), |existing| { - if existing.is_some() { - found_existing = true; - // Keep existing entry; resolver task will handle replacement - Ok(None) - } else { - Ok(Some(conn.clone())) - } - }); - if let Err(e) = result { - log::warn!(target: TARGET, "Store QUIC inbound for {addr}: {e}"); - return; - } - found_existing - }; - if had_existing { - tokio::spawn(Self::resolve_duplicate_connection( - inbound.clone(), - conn.clone(), - inbound_key.clone(), - addr, - )); + let old_conn = inbound.remove(&inbound_key).map(|g| g.val().clone()); + let _ = add_unbound_object_to_map(&inbound, inbound_key.clone(), || Ok(conn.clone())); + if let Some(old) = old_conn { + log::info!( + target: TARGET, + "Replacing duplicate inbound from {addr} key {peer_key_id}, closing old" + ); + old.close(0u32.into(), b"duplicate replaced"); } let peers = AdnlPeers::with_keys(local_key_id, peer_key_id); @@ -1742,50 +1750,6 @@ impl QuicNode { } } - async fn resolve_duplicate_connection( - inbound: Arc, - new_conn: quinn::Connection, - key: QuicInboundKey, - addr: SocketAddr, - ) { - use rand::Rng; - let delay_ms = rand::thread_rng().gen_range(500..=2500); - tokio::time::sleep(Duration::from_millis(delay_ms)).await; - - let peer_key = &key.1; - let old_alive = - inbound.get(&key).map(|e| e.val().close_reason().is_none()).unwrap_or(false); - let new_alive = new_conn.close_reason().is_none(); - - if old_alive && new_alive { - // Don't close the old connection — it may be the remote peer's active - // outbound. Just replace it in the tracking map; the old connection's - // accept loops will continue until idle timeout. 
- if let Some(_old) = inbound.remove(&key) { - log::info!( - target: TARGET, - "Replacing old duplicate inbound from {addr} key {peer_key} \ - (both alive after {delay_ms}ms)" - ); - } - let nc = new_conn.clone(); - let _ = add_unbound_object_to_map_with_update(&inbound, key, |_| Ok(Some(nc.clone()))); - } else if new_alive { - inbound.remove(&key); - log::debug!( - target: TARGET, - "Old inbound from {addr} key {peer_key} already closed, keeping new" - ); - let nc = new_conn.clone(); - let _ = add_unbound_object_to_map_with_update(&inbound, key, |_| Ok(Some(nc.clone()))); - } else { - log::debug!( - target: TARGET, - "New inbound from {addr} key {peer_key} already closed, keeping old" - ); - } - } - /// Drain the send queue and exit. Spawned when `message()` has no live /// connection and must enqueue data for later delivery. The task establishes /// the connection, sends all queued messages, and terminates. @@ -2116,7 +2080,18 @@ impl QuicNode { msg_stats: Arc, per_key_configs: Arc>>>, reconnect_tracker: Arc>>, + rl_config: QuicRateLimitConfig, + mut conn_rate_limiters: ConnectionRateLimiters, + mut global_rate_limiter: Option, + transport_errors: Arc, ) { + log::info!( + target: TARGET, + "QUIC accept loop on {bind_addr}: Retry={} per_ip_capacity={} global_capacity={}", + if rl_config.stateless_retry { "on" } else { "off" }, + rl_config.per_ip_capacity, + rl_config.global_capacity, + ); tokio::spawn(async move { loop { log::trace!(target: TARGET, "Loop QUIC server on {bind_addr}"); @@ -2130,6 +2105,16 @@ impl QuicNode { log::info!(target: TARGET, "QUIC endpoint on {bind_addr} closed"); break; }; + let Some(incoming) = rate_limit( + incoming, + &rl_config, + &mut conn_rate_limiters, + &mut global_rate_limiter, + &transport_errors, + bind_addr, + ) else { + continue; + }; let addr = incoming.remote_address(); log::debug!(target: TARGET, "Accept in QUIC server on {bind_addr} from {addr}"); @@ -2436,7 +2421,7 @@ impl QuicNode { .ok(); } - let (sf, qt, cf, qf, dr) = 
transport.transport_errors.take(); + let (sf, qt, cf, qf, dr, rl_ip, rl_gl, retry) = transport.transport_errors.take(); Write::write_fmt( &mut dump, format_args!( @@ -2445,6 +2430,14 @@ impl QuicNode { ), ) .ok(); + Write::write_fmt( + &mut dump, + format_args!( + " rate_limit: per_ip_rejected={rl_ip} global_rejected={rl_gl} \ + retry_sent={retry}\n", + ), + ) + .ok(); Write::write_fmt(&mut dump, format_args!( " total: {total} connections, {} msg entries", msg_entries.len(), @@ -2456,419 +2449,50 @@ impl QuicNode { } } -/// Extract the "inner" TL constructor tag from message data. -/// -/// QUIC message payloads are typically wrapped in an overlay prefix -/// (`overlay.message` or `overlay.query`). The outer tag is not useful -/// for diagnostics. This function skips past the overlay wrapper and -/// returns the constructor tag of the actual inner payload. +/// Apply rate-limiting checks to an incoming QUIC connection. /// -/// `overlay.message` and `overlay.query` have a fixed layout: -/// constructor(4 bytes) + int256(32 bytes) = 36 bytes prefix. -/// `WithExtra` variants have a variable-length extra field, so we -/// fall back to `deserialize_boxed_with_suffix` for those. 
-fn extract_inner_tag(data: &[u8]) -> u32 { - if data.len() < 4 { - return 0; - } - let outer = u32::from_le_bytes([data[0], data[1], data[2], data[3]]); - // overlay.message / overlay.query: fixed 36-byte prefix (constructor + int256) - const FIXED_PREFIX: usize = 4 + 32; - match outer { - 0x75252420 | 0xccfd8443 => { - // overlay.message, overlay.query - if data.len() >= FIXED_PREFIX + 4 { - let s = &data[FIXED_PREFIX..]; - return u32::from_le_bytes([s[0], s[1], s[2], s[3]]); - } - outer - } - 0xa232233d | 0x94ffc3e9 => { - // overlay.messageWithExtra, overlay.queryWithExtra - if let Ok((_obj, suffix_offset)) = deserialize_boxed_with_suffix(data) { - if suffix_offset + 4 <= data.len() { - let s = &data[suffix_offset..]; - return u32::from_le_bytes([s[0], s[1], s[2], s[3]]); - } - } - outer - } - _ => outer, - } -} - -/// Map well-known TL constructor tags to short human-readable names for log output. -fn tl_tag_name(tag: u32) -> &'static str { - match tag { - 0x75252420 => "overlay.message", - 0xa232233d => "overlay.messageWithExtra", - 0xccfd8443 => "overlay.query", - 0x94ffc3e9 => "overlay.queryWithExtra", - 0xb15a2b6b => "overlay.broadcast", - 0xbad7c36a => "overlay.broadcastFec", - 0xf1881342 => "overlay.broadcastFecShort", - 0x46efae62 => "overlay.broadcastStream", - 0xf99fd63d => "overlay.broadcastTwostepFec", - 0x80b859b0 => "overlay.broadcastTwostepSimple", - 0x33534e24 => "overlay.unicast", - 0xd55c14ec => "overlay.fec.received", - 0x09d76914 => "overlay.fec.completed", - 0x48ee64ab => "overlay.getRandomPeers", - 0xa58e7ecc => "overlay.getRandomPeersV2", - 0x690cb481 => "overlay.ping", - 0x236758c4 => "catchain.blockUpdate", - 0x9283ce37 => "validatorSession.blockUpdate", - 0xbe7b573a => "consensus.simplex.certificate", - 0xc37ef4f3 => "consensus.simplex.vote", - 0x543fba6c => "consensus.simplex.requestCandidate", - _ => "unknown", - } -} - -/// Aggregate error counters for the QUIC transport layer (lock-free, reset on each stats dump). 
-struct TransportErrors { - /// send_via_stream / send_via_stream_nowait failures (stream open, write, finish errors) - send_failed: AtomicU64, - /// query timeouts (deadline exceeded waiting for response) - query_timeout: AtomicU64, - /// connection establishment failures (connect / handshake errors) - connect_failed: AtomicU64, - /// messages dropped because the per-peer send queue was full - queue_full: AtomicU64, - /// dead connections removed (by checker or on send failure) - dead_conn_removed: AtomicU64, -} - -impl TransportErrors { - fn new() -> Arc { - Arc::new(Self { - send_failed: AtomicU64::new(0), - query_timeout: AtomicU64::new(0), - connect_failed: AtomicU64::new(0), - queue_full: AtomicU64::new(0), - dead_conn_removed: AtomicU64::new(0), - }) - } - - /// Take current values and reset to zero; returns (send_failed, query_timeout, - /// connect_failed, queue_full, dead_conn_removed). - fn take(&self) -> (u64, u64, u64, u64, u64) { - ( - self.send_failed.swap(0, Ordering::Relaxed), - self.query_timeout.swap(0, Ordering::Relaxed), - self.connect_failed.swap(0, Ordering::Relaxed), - self.queue_full.swap(0, Ordering::Relaxed), - self.dead_conn_removed.swap(0, Ordering::Relaxed), - ) - } -} - -/// Snapshot of cumulative counters from a single connection, used to compute deltas. -#[derive(Clone, Copy)] -struct ConnSnapshot { - tx_bytes: u64, - tx_dgrams: u64, - rx_bytes: u64, - rx_dgrams: u64, - lost_pkts: u64, - /// When this connection was first observed by the stats dumper. 
- connected_since: Instant, -} - -impl ConnSnapshot { - fn new(s: &quinn::ConnectionStats, connected_since: Instant) -> Self { - Self { - tx_bytes: s.udp_tx.bytes, - tx_dgrams: s.udp_tx.datagrams, - rx_bytes: s.udp_rx.bytes, - rx_dgrams: s.udp_rx.datagrams, - lost_pkts: s.path.lost_packets, - connected_since, - } - } - - fn delta(&self, prev: &Self) -> Self { - Self { - tx_bytes: self.tx_bytes.saturating_sub(prev.tx_bytes), - tx_dgrams: self.tx_dgrams.saturating_sub(prev.tx_dgrams), - rx_bytes: self.rx_bytes.saturating_sub(prev.rx_bytes), - rx_dgrams: self.rx_dgrams.saturating_sub(prev.rx_dgrams), - lost_pkts: self.lost_pkts.saturating_sub(prev.lost_pkts), - connected_since: self.connected_since, - } - } - - fn uptime_str(&self) -> String { - let secs = self.connected_since.elapsed().as_secs(); - if secs < 60 { - format!("{secs}s") - } else if secs < 3600 { - format!("{}m{}s", secs / 60, secs % 60) - } else if secs < 36 * 3600 { - format!("{}h{}m", secs / 3600, (secs % 3600) / 60) - } else { - let days = secs / 86400; - let hours = (secs % 86400) / 3600; - let mins = (secs % 3600) / 60; - format!("{days}d{hours}h{mins}m") - } - } -} - -/// Per-TL-tag message counters (lock-free atomics, collected per dump interval). -struct MsgTagCounters { - count: AtomicU64, - bytes: AtomicU64, -} - -impl MsgTagCounters { - fn new() -> Self { - Self { count: AtomicU64::new(0), bytes: AtomicU64::new(0) } - } - - fn record(&self, size: usize) { - self.count.fetch_add(1, Ordering::Relaxed); - self.bytes.fetch_add(size as u64, Ordering::Relaxed); - } - - /// Take current values and reset to zero. - fn take(&self) -> (u64, u64) { - (self.count.swap(0, Ordering::Relaxed), self.bytes.swap(0, Ordering::Relaxed)) - } -} - -/// Classification of a QUIC message for telemetry. 
-#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -enum MsgKind { - Message, - Query, - Answer, - NoAnswer, -} - -impl MsgKind { - fn label(&self) -> &'static str { - match self { - MsgKind::Message => "msg ", - MsgKind::Query => "query ", - MsgKind::Answer => "ans ", - MsgKind::NoAnswer => "no_ans", - } - } -} - -/// Per-peer, per-TL-tag message statistics key. -#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] -struct MsgStatsKey { - addr: SocketAddr, - tag: u32, - is_outbound: bool, - kind: MsgKind, -} - -/// Tracks per-peer, per-message-kind statistics for QUIC traffic. -struct MsgStats { - counters: lockfree::map::Map, -} - -impl MsgStats { - fn new() -> Arc { - Arc::new(Self { counters: lockfree::map::Map::new() }) - } - - fn record(&self, tag: u32, size: usize, addr: SocketAddr, is_outbound: bool, kind: MsgKind) { - let key = MsgStatsKey { addr, tag, is_outbound, kind }; - if let Some(entry) = self.counters.get(&key) { - entry.val().record(size); - return; - } - let _ = add_unbound_object_to_map(&self.counters, key, || Ok(MsgTagCounters::new())); - if let Some(entry) = self.counters.get(&key) { - entry.val().record(size); - } - } - - /// Drain all counters and return entries sorted by peer then bytes desc. - /// Entries with zero activity since the last drain are removed - fn drain(&self) -> Vec<(MsgStatsKey, u64, u64)> { - let mut result = Vec::new(); - let mut stale = Vec::new(); - for entry in self.counters.iter() { - let (count, bytes) = entry.val().take(); - if count > 0 { - result.push((*entry.key(), count, bytes)); - } else { - stale.push(*entry.key()); - } - } - for key in stale { - self.counters.remove(&key); +/// Returns `Some(incoming)` if the connection is allowed to proceed, +/// or `None` if it was rejected (retry sent, refused, or ignored). 
+fn rate_limit( + incoming: quinn::Incoming, + config: &QuicRateLimitConfig, + conn_rate_limiters: &mut ConnectionRateLimiters, + global_rate_limiter: &mut Option, + transport_errors: &TransportErrors, + bind_addr: SocketAddr, +) -> Option { + let addr = incoming.remote_address(); + + // Layer 1: Stateless Retry — force address validation + if config.stateless_retry && !incoming.remote_address_validated() && incoming.may_retry() { + log::trace!(target: TARGET, "Sending QUIC Retry to unvalidated {addr} on {bind_addr}"); + transport_errors.retry_sent.fetch_add(1, Ordering::Relaxed); + if let Err(e) = incoming.retry() { + log::warn!(target: TARGET, "QUIC retry failed for {addr}: {e}"); + } + return None; + } + + // Layer 2: Per-IP rate limit + if !conn_rate_limiters.take_new_connection(addr.ip()) { + log::debug!(target: TARGET, "Per-IP rate limit for {} on {bind_addr}", addr.ip()); + transport_errors.rate_limited_per_ip.fetch_add(1, Ordering::Relaxed); + incoming.refuse(); + return None; + } + + // Periodic cleanup of stale per-IP entries + conn_rate_limiters.cleanup(); + + // Layer 3: Global rate limit + if let Some(ref mut gl) = global_rate_limiter { + if !gl.take() { + log::debug!(target: TARGET, "Global rate limit on {bind_addr}, refusing {addr}"); + transport_errors.rate_limited_global.fetch_add(1, Ordering::Relaxed); + incoming.refuse(); + return None; } - result.sort_by(|a, b| a.0.addr.cmp(&b.0.addr).then(b.2.cmp(&a.2))); - result - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // --- extract_inner_tag --- - - /// Helper: build an overlay.message (0x75252420) wrapping the given inner tag. - fn make_overlay_message(inner_tag: u32) -> Vec { - let mut buf = Vec::new(); - buf.extend_from_slice(&0x75252420u32.to_le_bytes()); // outer tag - buf.extend_from_slice(&[0u8; 32]); // overlay int256 - buf.extend_from_slice(&inner_tag.to_le_bytes()); // inner payload tag - buf - } - - /// Helper: build an overlay.query (0xccfd8443) wrapping the given inner tag. 
- fn make_overlay_query(inner_tag: u32) -> Vec { - let mut buf = Vec::new(); - buf.extend_from_slice(&0xccfd8443u32.to_le_bytes()); - buf.extend_from_slice(&[0u8; 32]); - buf.extend_from_slice(&inner_tag.to_le_bytes()); - buf - } - - #[test] - fn test_extract_inner_tag_empty() { - assert_eq!(extract_inner_tag(&[]), 0); - assert_eq!(extract_inner_tag(&[1, 2, 3]), 0); } - #[test] - fn test_extract_inner_tag_unknown_outer() { - let data = 0xDEADBEEFu32.to_le_bytes(); - assert_eq!(extract_inner_tag(&data), 0xDEADBEEF); - } - - #[test] - fn test_extract_inner_tag_overlay_message() { - let data = make_overlay_message(0x236758c4); // catchain.blockUpdate - assert_eq!(extract_inner_tag(&data), 0x236758c4); - } - - #[test] - fn test_extract_inner_tag_overlay_query() { - let data = make_overlay_query(0x48ee64ab); // overlay.getRandomPeers - assert_eq!(extract_inner_tag(&data), 0x48ee64ab); - } - - #[test] - fn test_extract_inner_tag_overlay_message_too_short() { - // outer tag + partial overlay id (not enough for inner tag) - let mut data = Vec::new(); - data.extend_from_slice(&0x75252420u32.to_le_bytes()); - data.extend_from_slice(&[0u8; 30]); // only 30 bytes, need 32 + 4 - assert_eq!(extract_inner_tag(&data), 0x75252420); // falls back to outer - } - - // --- MsgStats --- - - fn test_addr(port: u16) -> SocketAddr { - SocketAddr::from(([127, 0, 0, 1], port)) - } - - #[test] - fn test_msg_stats_record_and_drain() { - let stats = MsgStats::new(); - let addr = test_addr(1000); - - stats.record(0xAA, 100, addr, true, MsgKind::Message); - stats.record(0xAA, 200, addr, true, MsgKind::Message); - stats.record(0xBB, 50, addr, true, MsgKind::Query); - - let entries = stats.drain(); - assert_eq!(entries.len(), 2); - - // Sorted by addr (same), then bytes desc: AA(300) before BB(50) - assert_eq!(entries[0].0.tag, 0xAA); - assert_eq!(entries[0].1, 2); // count - assert_eq!(entries[0].2, 300); // bytes - - assert_eq!(entries[1].0.tag, 0xBB); - assert_eq!(entries[1].1, 1); - 
assert_eq!(entries[1].2, 50); - } - - #[test] - fn test_msg_stats_drain_sorts_by_addr_then_bytes() { - let stats = MsgStats::new(); - let addr_a = test_addr(1000); - let addr_b = test_addr(2000); - - stats.record(0xAA, 10, addr_b, true, MsgKind::Message); - stats.record(0xBB, 500, addr_a, true, MsgKind::Message); - stats.record(0xCC, 100, addr_a, true, MsgKind::Message); - - let entries = stats.drain(); - assert_eq!(entries.len(), 3); - - // addr_a (port 1000) first, sorted by bytes desc - assert_eq!(entries[0].0.addr, addr_a); - assert_eq!(entries[0].0.tag, 0xBB); // 500 bytes - assert_eq!(entries[1].0.addr, addr_a); - assert_eq!(entries[1].0.tag, 0xCC); // 100 bytes - - // addr_b (port 2000) last - assert_eq!(entries[2].0.addr, addr_b); - assert_eq!(entries[2].0.tag, 0xAA); - } - - #[test] - fn test_msg_stats_drain_resets_counters() { - let stats = MsgStats::new(); - let addr = test_addr(1000); - - stats.record(0xAA, 100, addr, true, MsgKind::Message); - let entries = stats.drain(); - assert_eq!(entries.len(), 1); - - // Second drain: no new activity, should return empty - let entries = stats.drain(); - assert!(entries.is_empty()); - } - - #[test] - fn test_msg_stats_drain_evicts_stale_keys() { - let stats = MsgStats::new(); - let addr = test_addr(1000); - - stats.record(0xAA, 100, addr, true, MsgKind::Message); - stats.record(0xBB, 50, addr, false, MsgKind::Message); - - // First drain: both active, counters reset - let _ = stats.drain(); - - // Only record on 0xAA - stats.record(0xAA, 200, addr, true, MsgKind::Message); - - // Second drain: 0xBB was idle → evicted - let _ = stats.drain(); - - // Record on 0xBB again — must re-insert (was evicted) - stats.record(0xBB, 30, addr, false, MsgKind::Message); - let entries = stats.drain(); - - // 0xAA was idle since last drain (evicted), only 0xBB with activity is returned - assert_eq!(entries.len(), 1); - assert_eq!(entries[0].0.tag, 0xBB); - assert_eq!(entries[0].2, 30); - } - - #[test] - fn 
test_msg_stats_distinguishes_direction_and_kind() { - let stats = MsgStats::new(); - let addr = test_addr(1000); - - stats.record(0xAA, 100, addr, true, MsgKind::Message); // outbound msg - stats.record(0xAA, 200, addr, false, MsgKind::Message); // inbound msg - stats.record(0xAA, 300, addr, true, MsgKind::Query); // outbound query - - let entries = stats.drain(); - assert_eq!(entries.len(), 3); - } + Some(incoming) } diff --git a/src/adnl/src/quic/rate_limiter.rs b/src/adnl/src/quic/rate_limiter.rs new file mode 100644 index 0000000..48b2f1e --- /dev/null +++ b/src/adnl/src/quic/rate_limiter.rs @@ -0,0 +1,253 @@ +use std::{ + collections::HashMap, + net::IpAddr, + sync::OnceLock, + time::{Duration, Instant}, +}; + +/// O(1) timestamp-based token bucket rate limiter. +/// +/// Matches the C++ `RateLimiter` from `adnl/utils.hpp` +pub(crate) struct RateLimiter { + period: f64, + emission_interval: f64, + ready_at: f64, + last_take_at: Instant, +} + +impl RateLimiter { + /// Create a new rate limiter. + /// + /// - `capacity`: burst size (how many calls succeed in rapid succession). + /// Must be >= 1. + /// - `period`: time in seconds between successive token emissions. + /// Must be > 0. + pub fn new(capacity: u32, period: f64) -> Self { + assert!(capacity >= 1, "RateLimiter capacity must be >= 1"); + assert!(period > 0.0, "RateLimiter period must be > 0"); + let emission_interval = (capacity - 1) as f64 * period; + Self { + period, + emission_interval, + ready_at: -emission_interval, + last_take_at: Instant::now(), + } + } + + /// Try to consume one token. Returns `true` if allowed, `false` if rate-limited. + pub fn take(&mut self) -> bool { + let now = now_secs(); + self.last_take_at = Instant::now(); + // Clamp: don't accumulate more than `capacity` tokens worth of credit. 
+ let min_ready_at = now - self.emission_interval; + if self.ready_at < min_ready_at { + self.ready_at = min_ready_at; + } + if self.ready_at > now { + return false; + } + self.ready_at += self.period; + true + } + + pub fn last_take_at(&self) -> Instant { + self.last_take_at + } +} + +/// Per-IP rate limiter map with periodic cleanup. +/// +/// Each unique IP gets its own `RateLimiter` instance, created on demand. +/// Stale entries (no activity for longer than `period` seconds) are evicted +/// every 10 seconds. +pub(crate) struct ConnectionRateLimiters { + capacity: u32, + period: f64, + limiters: HashMap, + cleanup_at: Option, +} + +impl ConnectionRateLimiters { + pub fn new(capacity: u32, period: f64) -> Self { + Self { capacity, period, limiters: HashMap::new(), cleanup_at: None } + } + + /// Check per-IP rate limit. Returns `true` if allowed. + /// If `capacity == 0`, rate limiting is disabled (always returns `true`). + pub fn take_new_connection(&mut self, ip: IpAddr) -> bool { + if self.capacity == 0 { + return true; + } + let limiter = + self.limiters.entry(ip).or_insert_with(|| RateLimiter::new(self.capacity, self.period)); + let allowed = limiter.take(); + self.schedule_cleanup(); + allowed + } + + /// Remove stale per-IP entries. Call periodically from the accept loop. + pub fn cleanup(&mut self) { + let Some(at) = self.cleanup_at else { return }; + if Instant::now() < at { + return; + } + let period = self.period; + self.limiters.retain(|_, limiter| limiter.last_take_at().elapsed().as_secs_f64() < period); + self.cleanup_at = if self.limiters.is_empty() { + None + } else { + Some(Instant::now() + Duration::from_secs(10)) + }; + } + + fn schedule_cleanup(&mut self) { + if self.cleanup_at.is_none() { + self.cleanup_at = Some(Instant::now() + Duration::from_secs(10)); + } + } +} + +/// Configuration for QUIC connection rate limiting. 
+#[derive(Clone, Debug)] +pub struct QuicRateLimitConfig { + /// Per-IP: max burst of new connections before throttling (0 = disabled). + pub per_ip_capacity: u32, + /// Per-IP: time in seconds to refill one token. + pub per_ip_period: f64, + /// Global: max burst of new connections across all IPs (0 = disabled). + pub global_capacity: u32, + /// Global: time in seconds to refill one token. + pub global_period: f64, + /// Whether to send QUIC Retry packets for unvalidated addresses. + pub stateless_retry: bool, +} + +impl Default for QuicRateLimitConfig { + fn default() -> Self { + Self { + per_ip_capacity: 10, + per_ip_period: 0.2, + global_capacity: 100_000, + global_period: 0.00001, + stateless_retry: true, + } + } +} + +impl QuicRateLimitConfig { + /// Config that disables all rate limiting (for tests). + pub fn disabled() -> Self { + Self { + per_ip_capacity: 0, + per_ip_period: 1.0, + global_capacity: 0, + global_period: 1.0, + stateless_retry: false, + } + } +} + +/// Monotonic timestamp in seconds, used for the token bucket math. +/// Using a module-level function so tests can potentially override via mock. +fn now_secs() -> f64 { + static EPOCH: OnceLock = OnceLock::new(); + let epoch = EPOCH.get_or_init(Instant::now); + epoch.elapsed().as_secs_f64() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::thread; + + /// Advance the process-level monotonic clock by sleeping. + /// For short durations this is reliable enough for unit tests. 
+ fn sleep_ms(ms: u64) { + thread::sleep(Duration::from_millis(ms)); + } + + #[test] + fn test_burst_and_refill() { + // capacity=3, period=0.05s (50ms) → emission_interval = 2*0.05 = 0.1s + let mut limiter = RateLimiter::new(3, 0.05); + + // Initial burst: 3 allowed + assert!(limiter.take(), "1st should succeed"); + assert!(limiter.take(), "2nd should succeed"); + assert!(limiter.take(), "3rd should succeed"); + assert!(!limiter.take(), "4th should fail (burst exhausted)"); + + // Wait one period → one token refills + sleep_ms(55); + assert!(limiter.take(), "should succeed after 1 period"); + assert!(!limiter.take(), "should fail again"); + + // Wait two periods → two tokens refill + sleep_ms(105); + assert!(limiter.take(), "1st after 2 periods"); + assert!(limiter.take(), "2nd after 2 periods"); + assert!(!limiter.take(), "3rd should fail"); + + // Wait long time → clamped to capacity (3) + sleep_ms(500); + assert!(limiter.take()); + assert!(limiter.take()); + assert!(limiter.take()); + assert!(!limiter.take()); + } + + #[test] + fn test_capacity_one() { + let mut limiter = RateLimiter::new(1, 0.05); + + assert!(limiter.take(), "first should succeed"); + assert!(!limiter.take(), "second should fail immediately"); + + sleep_ms(55); + assert!(limiter.take(), "should succeed after period"); + assert!(!limiter.take(), "should fail again"); + } + + #[test] + fn test_per_ip_isolation() { + let mut limiters = ConnectionRateLimiters::new(2, 0.05); + + let ip_a: IpAddr = "1.2.3.4".parse().unwrap(); + let ip_b: IpAddr = "5.6.7.8".parse().unwrap(); + + assert!(limiters.take_new_connection(ip_a)); + assert!(limiters.take_new_connection(ip_a)); + assert!(!limiters.take_new_connection(ip_a), "ip_a exhausted"); + + // ip_b is independent + assert!(limiters.take_new_connection(ip_b)); + assert!(limiters.take_new_connection(ip_b)); + assert!(!limiters.take_new_connection(ip_b), "ip_b exhausted"); + } + + #[test] + fn test_disabled_capacity_zero() { + let mut limiters = 
ConnectionRateLimiters::new(0, 1.0); + + let ip: IpAddr = "1.2.3.4".parse().unwrap(); + for _ in 0..1000 { + assert!(limiters.take_new_connection(ip)); + } + } + + #[test] + fn test_cleanup_stale() { + // period=0.05s, so entries older than 50ms are stale + let mut limiters = ConnectionRateLimiters::new(2, 0.05); + + let ip: IpAddr = "1.2.3.4".parse().unwrap(); + limiters.take_new_connection(ip); + + // Force cleanup_at to be in the past + limiters.cleanup_at = Some(Instant::now()); + + sleep_ms(60); + limiters.cleanup(); + assert!(limiters.limiters.is_empty(), "stale entry should be evicted"); + } +} diff --git a/src/adnl/src/quic/stat.rs b/src/adnl/src/quic/stat.rs new file mode 100644 index 0000000..840a6fe --- /dev/null +++ b/src/adnl/src/quic/stat.rs @@ -0,0 +1,447 @@ +use crate::common::add_unbound_object_to_map; +use std::{ + net::SocketAddr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Instant, +}; +use ton_api::deserialize_boxed_with_suffix; + +/// Extract the "inner" TL constructor tag from message data. +/// +/// QUIC message payloads are typically wrapped in an overlay prefix +/// (`overlay.message` or `overlay.query`). The outer tag is not useful +/// for diagnostics. This function skips past the overlay wrapper and +/// returns the constructor tag of the actual inner payload. +/// +/// `overlay.message` and `overlay.query` have a fixed layout: +/// constructor(4 bytes) + int256(32 bytes) = 36 bytes prefix. +/// `WithExtra` variants have a variable-length extra field, so we +/// fall back to `deserialize_boxed_with_suffix` for those. 
+pub(super) fn extract_inner_tag(data: &[u8]) -> u32 { + if data.len() < 4 { + return 0; + } + let outer = u32::from_le_bytes([data[0], data[1], data[2], data[3]]); + // overlay.message / overlay.query: fixed 36-byte prefix (constructor + int256) + const FIXED_PREFIX: usize = 4 + 32; + match outer { + 0x75252420 | 0xccfd8443 => { + // overlay.message, overlay.query + if data.len() >= FIXED_PREFIX + 4 { + let s = &data[FIXED_PREFIX..]; + return u32::from_le_bytes([s[0], s[1], s[2], s[3]]); + } + outer + } + 0xa232233d | 0x94ffc3e9 => { + // overlay.messageWithExtra, overlay.queryWithExtra + if let Ok((_obj, suffix_offset)) = deserialize_boxed_with_suffix(data) { + if suffix_offset + 4 <= data.len() { + let s = &data[suffix_offset..]; + return u32::from_le_bytes([s[0], s[1], s[2], s[3]]); + } + } + outer + } + _ => outer, + } +} + +/// Map well-known TL constructor tags to short human-readable names for log output. +pub(super) fn tl_tag_name(tag: u32) -> &'static str { + match tag { + 0x75252420 => "overlay.message", + 0xa232233d => "overlay.messageWithExtra", + 0xccfd8443 => "overlay.query", + 0x94ffc3e9 => "overlay.queryWithExtra", + 0xb15a2b6b => "overlay.broadcast", + 0xbad7c36a => "overlay.broadcastFec", + 0xf1881342 => "overlay.broadcastFecShort", + 0x46efae62 => "overlay.broadcastStream", + 0xf99fd63d => "overlay.broadcastTwostepFec", + 0x80b859b0 => "overlay.broadcastTwostepSimple", + 0x33534e24 => "overlay.unicast", + 0xd55c14ec => "overlay.fec.received", + 0x09d76914 => "overlay.fec.completed", + 0x48ee64ab => "overlay.getRandomPeers", + 0xa58e7ecc => "overlay.getRandomPeersV2", + 0x690cb481 => "overlay.ping", + 0x236758c4 => "catchain.blockUpdate", + 0x9283ce37 => "validatorSession.blockUpdate", + 0xbe7b573a => "consensus.simplex.certificate", + 0xc37ef4f3 => "consensus.simplex.vote", + 0x543fba6c => "consensus.simplex.requestCandidate", + _ => "unknown", + } +} + +/// Aggregate error counters for the QUIC transport layer (lock-free, reset on each stats 
dump). +pub(super) struct TransportErrors { + /// send_via_stream / send_via_stream_nowait failures (stream open, write, finish errors) + pub send_failed: AtomicU64, + /// query timeouts (deadline exceeded waiting for response) + pub query_timeout: AtomicU64, + /// connection establishment failures (connect / handshake errors) + pub connect_failed: AtomicU64, + /// messages dropped because the per-peer send queue was full + pub queue_full: AtomicU64, + /// dead connections removed (by checker or on send failure) + pub dead_conn_removed: AtomicU64, + /// connections rejected by per-IP rate limiter + pub rate_limited_per_ip: AtomicU64, + /// connections rejected by global rate limiter + pub rate_limited_global: AtomicU64, + /// stateless Retry packets sent + pub retry_sent: AtomicU64, +} + +impl TransportErrors { + pub fn new() -> Arc { + Arc::new(Self { + send_failed: AtomicU64::new(0), + query_timeout: AtomicU64::new(0), + connect_failed: AtomicU64::new(0), + queue_full: AtomicU64::new(0), + dead_conn_removed: AtomicU64::new(0), + rate_limited_per_ip: AtomicU64::new(0), + rate_limited_global: AtomicU64::new(0), + retry_sent: AtomicU64::new(0), + }) + } + + /// Take current values and reset to zero; returns (send_failed, query_timeout, + /// connect_failed, queue_full, dead_conn_removed, rate_limited_per_ip, + /// rate_limited_global, retry_sent). + pub fn take(&self) -> (u64, u64, u64, u64, u64, u64, u64, u64) { + ( + self.send_failed.swap(0, Ordering::Relaxed), + self.query_timeout.swap(0, Ordering::Relaxed), + self.connect_failed.swap(0, Ordering::Relaxed), + self.queue_full.swap(0, Ordering::Relaxed), + self.dead_conn_removed.swap(0, Ordering::Relaxed), + self.rate_limited_per_ip.swap(0, Ordering::Relaxed), + self.rate_limited_global.swap(0, Ordering::Relaxed), + self.retry_sent.swap(0, Ordering::Relaxed), + ) + } +} + +/// Snapshot of cumulative counters from a single connection, used to compute deltas. 
+#[derive(Clone, Copy)] +pub(super) struct ConnSnapshot { + pub tx_bytes: u64, + pub tx_dgrams: u64, + pub rx_bytes: u64, + pub rx_dgrams: u64, + pub lost_pkts: u64, + /// When this connection was first observed by the stats dumper. + pub connected_since: Instant, +} + +impl ConnSnapshot { + pub fn new(s: &quinn::ConnectionStats, connected_since: Instant) -> Self { + Self { + tx_bytes: s.udp_tx.bytes, + tx_dgrams: s.udp_tx.datagrams, + rx_bytes: s.udp_rx.bytes, + rx_dgrams: s.udp_rx.datagrams, + lost_pkts: s.path.lost_packets, + connected_since, + } + } + + pub fn delta(&self, prev: &Self) -> Self { + Self { + tx_bytes: self.tx_bytes.saturating_sub(prev.tx_bytes), + tx_dgrams: self.tx_dgrams.saturating_sub(prev.tx_dgrams), + rx_bytes: self.rx_bytes.saturating_sub(prev.rx_bytes), + rx_dgrams: self.rx_dgrams.saturating_sub(prev.rx_dgrams), + lost_pkts: self.lost_pkts.saturating_sub(prev.lost_pkts), + connected_since: self.connected_since, + } + } + + pub fn uptime_str(&self) -> String { + let secs = self.connected_since.elapsed().as_secs(); + if secs < 60 { + format!("{secs}s") + } else if secs < 3600 { + format!("{}m{}s", secs / 60, secs % 60) + } else if secs < 36 * 3600 { + format!("{}h{}m", secs / 3600, (secs % 3600) / 60) + } else { + let days = secs / 86400; + let hours = (secs % 86400) / 3600; + let mins = (secs % 3600) / 60; + format!("{days}d{hours}h{mins}m") + } + } +} + +/// Per-TL-tag message counters (lock-free atomics, collected per dump interval). +pub(super) struct MsgTagCounters { + count: AtomicU64, + bytes: AtomicU64, +} + +impl MsgTagCounters { + pub fn new() -> Self { + Self { count: AtomicU64::new(0), bytes: AtomicU64::new(0) } + } + + pub fn record(&self, size: usize) { + self.count.fetch_add(1, Ordering::Relaxed); + self.bytes.fetch_add(size as u64, Ordering::Relaxed); + } + + /// Take current values and reset to zero. 
+ pub fn take(&self) -> (u64, u64) { + (self.count.swap(0, Ordering::Relaxed), self.bytes.swap(0, Ordering::Relaxed)) + } +} + +/// Classification of a QUIC message for telemetry. +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(super) enum MsgKind { + Message, + Query, + Answer, + NoAnswer, +} + +impl MsgKind { + pub fn label(&self) -> &'static str { + match self { + MsgKind::Message => "msg ", + MsgKind::Query => "query ", + MsgKind::Answer => "ans ", + MsgKind::NoAnswer => "no_ans", + } + } +} + +/// Per-peer, per-TL-tag message statistics key. +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub(super) struct MsgStatsKey { + pub addr: SocketAddr, + pub tag: u32, + pub is_outbound: bool, + pub kind: MsgKind, +} + +/// Tracks per-peer, per-message-kind statistics for QUIC traffic. +pub(super) struct MsgStats { + counters: lockfree::map::Map, +} + +impl MsgStats { + pub fn new() -> Arc { + Arc::new(Self { counters: lockfree::map::Map::new() }) + } + + pub fn record( + &self, + tag: u32, + size: usize, + addr: SocketAddr, + is_outbound: bool, + kind: MsgKind, + ) { + let key = MsgStatsKey { addr, tag, is_outbound, kind }; + if let Some(entry) = self.counters.get(&key) { + entry.val().record(size); + return; + } + let _ = add_unbound_object_to_map(&self.counters, key, || Ok(MsgTagCounters::new())); + if let Some(entry) = self.counters.get(&key) { + entry.val().record(size); + } + } + + /// Drain all counters and return entries sorted by peer then bytes desc. 
+ /// Entries with zero activity since the last drain are removed + pub fn drain(&self) -> Vec<(MsgStatsKey, u64, u64)> { + let mut result = Vec::new(); + let mut stale = Vec::new(); + for entry in self.counters.iter() { + let (count, bytes) = entry.val().take(); + if count > 0 { + result.push((*entry.key(), count, bytes)); + } else { + stale.push(*entry.key()); + } + } + for key in stale { + self.counters.remove(&key); + } + result.sort_by(|a, b| a.0.addr.cmp(&b.0.addr).then(b.2.cmp(&a.2))); + result + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // --- extract_inner_tag --- + + /// Helper: build an overlay.message (0x75252420) wrapping the given inner tag. + fn make_overlay_message(inner_tag: u32) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&0x75252420u32.to_le_bytes()); // outer tag + buf.extend_from_slice(&[0u8; 32]); // overlay int256 + buf.extend_from_slice(&inner_tag.to_le_bytes()); // inner payload tag + buf + } + + /// Helper: build an overlay.query (0xccfd8443) wrapping the given inner tag. 
+ fn make_overlay_query(inner_tag: u32) -> Vec { + let mut buf = Vec::new(); + buf.extend_from_slice(&0xccfd8443u32.to_le_bytes()); + buf.extend_from_slice(&[0u8; 32]); + buf.extend_from_slice(&inner_tag.to_le_bytes()); + buf + } + + #[test] + fn test_extract_inner_tag_empty() { + assert_eq!(extract_inner_tag(&[]), 0); + assert_eq!(extract_inner_tag(&[1, 2, 3]), 0); + } + + #[test] + fn test_extract_inner_tag_unknown_outer() { + let data = 0xDEADBEEFu32.to_le_bytes(); + assert_eq!(extract_inner_tag(&data), 0xDEADBEEF); + } + + #[test] + fn test_extract_inner_tag_overlay_message() { + let data = make_overlay_message(0x236758c4); // catchain.blockUpdate + assert_eq!(extract_inner_tag(&data), 0x236758c4); + } + + #[test] + fn test_extract_inner_tag_overlay_query() { + let data = make_overlay_query(0x48ee64ab); // overlay.getRandomPeers + assert_eq!(extract_inner_tag(&data), 0x48ee64ab); + } + + #[test] + fn test_extract_inner_tag_overlay_message_too_short() { + // outer tag + partial overlay id (not enough for inner tag) + let mut data = Vec::new(); + data.extend_from_slice(&0x75252420u32.to_le_bytes()); + data.extend_from_slice(&[0u8; 30]); // only 30 bytes, need 32 + 4 + assert_eq!(extract_inner_tag(&data), 0x75252420); // falls back to outer + } + + // --- MsgStats --- + + fn test_addr(port: u16) -> SocketAddr { + SocketAddr::from(([127, 0, 0, 1], port)) + } + + #[test] + fn test_msg_stats_record_and_drain() { + let stats = MsgStats::new(); + let addr = test_addr(1000); + + stats.record(0xAA, 100, addr, true, MsgKind::Message); + stats.record(0xAA, 200, addr, true, MsgKind::Message); + stats.record(0xBB, 50, addr, true, MsgKind::Query); + + let entries = stats.drain(); + assert_eq!(entries.len(), 2); + + // Sorted by addr (same), then bytes desc: AA(300) before BB(50) + assert_eq!(entries[0].0.tag, 0xAA); + assert_eq!(entries[0].1, 2); // count + assert_eq!(entries[0].2, 300); // bytes + + assert_eq!(entries[1].0.tag, 0xBB); + assert_eq!(entries[1].1, 1); + 
assert_eq!(entries[1].2, 50); + } + + #[test] + fn test_msg_stats_drain_sorts_by_addr_then_bytes() { + let stats = MsgStats::new(); + let addr_a = test_addr(1000); + let addr_b = test_addr(2000); + + stats.record(0xAA, 10, addr_b, true, MsgKind::Message); + stats.record(0xBB, 500, addr_a, true, MsgKind::Message); + stats.record(0xCC, 100, addr_a, true, MsgKind::Message); + + let entries = stats.drain(); + assert_eq!(entries.len(), 3); + + // addr_a (port 1000) first, sorted by bytes desc + assert_eq!(entries[0].0.addr, addr_a); + assert_eq!(entries[0].0.tag, 0xBB); // 500 bytes + assert_eq!(entries[1].0.addr, addr_a); + assert_eq!(entries[1].0.tag, 0xCC); // 100 bytes + + // addr_b (port 2000) last + assert_eq!(entries[2].0.addr, addr_b); + assert_eq!(entries[2].0.tag, 0xAA); + } + + #[test] + fn test_msg_stats_drain_resets_counters() { + let stats = MsgStats::new(); + let addr = test_addr(1000); + + stats.record(0xAA, 100, addr, true, MsgKind::Message); + let entries = stats.drain(); + assert_eq!(entries.len(), 1); + + // Second drain: no new activity, should return empty + let entries = stats.drain(); + assert!(entries.is_empty()); + } + + #[test] + fn test_msg_stats_drain_evicts_stale_keys() { + let stats = MsgStats::new(); + let addr = test_addr(1000); + + stats.record(0xAA, 100, addr, true, MsgKind::Message); + stats.record(0xBB, 50, addr, false, MsgKind::Message); + + // First drain: both active, counters reset + let _ = stats.drain(); + + // Only record on 0xAA + stats.record(0xAA, 200, addr, true, MsgKind::Message); + + // Second drain: 0xBB was idle → evicted + let _ = stats.drain(); + + // Record on 0xBB again — must re-insert (was evicted) + stats.record(0xBB, 30, addr, false, MsgKind::Message); + let entries = stats.drain(); + + // 0xAA was idle since last drain (evicted), only 0xBB with activity is returned + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].0.tag, 0xBB); + assert_eq!(entries[0].2, 30); + } + + #[test] + fn 
test_msg_stats_distinguishes_direction_and_kind() { + let stats = MsgStats::new(); + let addr = test_addr(1000); + + stats.record(0xAA, 100, addr, true, MsgKind::Message); // outbound msg + stats.record(0xAA, 200, addr, false, MsgKind::Message); // inbound msg + stats.record(0xAA, 300, addr, true, MsgKind::Query); // outbound query + + let entries = stats.drain(); + assert_eq!(entries.len(), 3); + } +} diff --git a/src/adnl/tests/test_quic.rs b/src/adnl/tests/test_quic.rs index cab30a6..2eccf2f 100644 --- a/src/adnl/tests/test_quic.rs +++ b/src/adnl/tests/test_quic.rs @@ -10,7 +10,7 @@ use adnl::{ common::{AdnlPeers, QueryResult, Subscriber, Version}, node::{AdnlNode, IpAddress}, - DhtNode, OverlayNode, QuicNode, + DhtNode, OverlayNode, QuicNode, QuicRateLimitConfig, }; use std::{ collections::HashSet, @@ -158,6 +158,7 @@ fn test_quic_concurrent_accept() { server_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server.add_key(&server_key, &server_key_id, server_bind).unwrap(); @@ -186,8 +187,13 @@ fn test_quic_concurrent_accept() { let bind: SocketAddr = format!("127.0.0.1:{}", port + QuicNode::OFFSET_PORT).parse().unwrap(); let token = CancellationToken::new(); - let quic = - QuicNode::new(vec![sub], token.clone(), None, tokio::runtime::Handle::current()); + let quic = QuicNode::new( + vec![sub], + token.clone(), + None, + tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), + ); quic.add_key(&key, &key_id, bind).unwrap(); quic.add_peer_key(server_key_id.clone(), server_bind).unwrap(); server.add_peer_key(key_id.clone(), bind).unwrap(); @@ -282,12 +288,22 @@ fn test_quic_session() { let bind_a: SocketAddr = "127.0.0.1:5600".parse().unwrap(); let bind_b: SocketAddr = "127.0.0.1:5601".parse().unwrap(); - let quic_a = - QuicNode::new(vec![sub_a], token_a.clone(), None, tokio::runtime::Handle::current()); + let quic_a = QuicNode::new( + vec![sub_a], + token_a.clone(), + None, + 
tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), + ); quic_a.add_key(&key_bytes_a, &key_id_a, bind_a).unwrap(); - let quic_b = - QuicNode::new(vec![sub_b], token_b.clone(), None, tokio::runtime::Handle::current()); + let quic_b = QuicNode::new( + vec![sub_b], + token_b.clone(), + None, + tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), + ); quic_b.add_key(&key_bytes_b, &key_id_b, bind_b).unwrap(); // Register peer addresses @@ -366,6 +382,7 @@ fn test_quic_reconnect_after_server_restart() { client_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); client.add_key(&client_key, &client_key_id, client_bind).unwrap(); @@ -390,6 +407,7 @@ fn test_quic_reconnect_after_server_restart() { server_token1.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server1.add_key(&server_key, &server_key_id, server_bind).unwrap(); @@ -428,6 +446,7 @@ fn test_quic_reconnect_after_server_restart() { server_token2.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server2.add_key(&server_key, &server_key_id, server_bind).unwrap(); server2.add_peer_key(client_key_id.clone(), client_bind).unwrap(); @@ -528,8 +547,13 @@ fn test_quic_stream_limit() { let server_bind: SocketAddr = format!("127.0.0.1:{}", SERVER_PORT + QuicNode::OFFSET_PORT).parse().unwrap(); - let server = - QuicNode::new(vec![server_sub], server_token.clone(), Some(STREAM_LIMIT), tokio::runtime::Handle::current()); + let server = QuicNode::new( + vec![server_sub], + server_token.clone(), + Some(STREAM_LIMIT), + tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), + ); server.add_key(&server_key, &server_key_id, server_bind).unwrap(); // --- client (normal limits) --- @@ -548,7 +572,13 @@ fn test_quic_stream_limit() { let client_bind: SocketAddr = format!("127.0.0.1:{}", CLIENT_PORT + 
QuicNode::OFFSET_PORT).parse().unwrap(); - let client = QuicNode::new(vec![client_sub], client_token.clone(), None, tokio::runtime::Handle::current()); + let client = QuicNode::new( + vec![client_sub], + client_token.clone(), + None, + tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), + ); client.add_key(&client_key, &client_key_id, client_bind).unwrap(); // Register peers @@ -559,12 +589,12 @@ fn test_quic_stream_limit() { // Establish the connection with a ping/pong first let resp = tokio::time::timeout( Duration::from_secs(10), - client.query(make_ping_data(42), None, &peers, None), + client.query(make_ping_data(100500), None, &peers, None), ) .await .expect("initial query timed out") .expect("initial query failed"); - assert_eq!(parse_pong(resp.unwrap()), 42, "warmup pong mismatch"); + assert_eq!(parse_pong(resp.unwrap()), 100500, "warmup pong mismatch"); // --- fire NUM_MESSAGES concurrently --- let mut handles = Vec::with_capacity(NUM_MESSAGES); @@ -593,16 +623,14 @@ fn test_quic_stream_limit() { let observed_peak = peak.load(Ordering::SeqCst); println!( - "Stream limit test: limit={STREAM_LIMIT}, messages={NUM_MESSAGES}, peak_concurrent={observed_peak}" + "Stream limit test: limit={STREAM_LIMIT}, \ + messages={NUM_MESSAGES}, peak_concurrent={observed_peak}" ); assert!( observed_peak <= STREAM_LIMIT, "Peak concurrency {observed_peak} exceeded stream limit {STREAM_LIMIT}" ); - assert!( - observed_peak > 0, - "No messages were processed — test is broken" - ); + assert!(observed_peak > 0, "No messages were processed — test is broken"); // --- cleanup --- client.shutdown(); @@ -618,6 +646,14 @@ fn test_quic_stream_limit() { fn make_endpoint( adnl_port: u16, ) -> (Arc, [u8; Ed25519KeyOption::PVT_KEY_SIZE], Arc, SocketAddr, CancellationToken) +{ + make_endpoint_with_config(adnl_port, QuicRateLimitConfig::disabled()) +} + +fn make_endpoint_with_config( + adnl_port: u16, + rl_config: QuicRateLimitConfig, +) -> (Arc, [u8; 
Ed25519KeyOption::PVT_KEY_SIZE], Arc, SocketAddr, CancellationToken) { let key = ed25519_generate_private_key().unwrap().to_bytes(); let (_, cfg) = AdnlNodeConfig::from_ip_address_and_private_keys( @@ -632,7 +668,13 @@ fn make_endpoint( let (tx, _rx) = tokio::sync::mpsc::unbounded_channel(); let sub = Arc::new(TestSubscriber { key_id: key_id.clone(), msg_tx: tx }) as Arc; - let quic = QuicNode::new(vec![sub], token.clone(), None, tokio::runtime::Handle::current()); + let quic = QuicNode::new( + vec![sub], + token.clone(), + None, + tokio::runtime::Handle::current(), + Some(rl_config), + ); quic.add_key(&key, &key_id, bind).unwrap(); (quic, key, key_id, bind, token) } @@ -857,15 +899,9 @@ fn test_quic_duplicate_inbound_same_address() { let hex = hex::encode(server_key_id.data()); let sni = format!("{}.{}", &hex[..32], &hex[32..]); - // Open two connections from the same endpoint to the same server + // Open first connection and verify it works let conn1 = endpoint.connect(server_bind, &sni).unwrap().await.expect("raw conn1 handshake failed"); - let conn2 = - endpoint.connect(server_bind, &sni).unwrap().await.expect("raw conn2 handshake failed"); - - println!("Two raw connections established from same address to server"); - - // Send a ping via conn1 let ping_data = make_ping_wire(100); let (mut send1, mut recv1) = conn1.open_bi().await.unwrap(); send1.write_all(&ping_data).await.unwrap(); @@ -877,36 +913,27 @@ fn test_quic_duplicate_inbound_same_address() { assert_eq!(parse_pong_wire(&resp1), 100, "conn1 pong mismatch"); println!("conn1 ping/pong succeeded"); - // Wait for duplicate resolution - tokio::time::sleep(Duration::from_secs(3)).await; + // Open second connection with the same key — this replaces conn1 immediately + let conn2 = + endpoint.connect(server_bind, &sni).unwrap().await.expect("raw conn2 handshake failed"); + println!("conn2 established — conn1 should be closed by duplicate resolution"); - // At least one connection should still work (the 
survivor). - // Try conn2 first; if it was closed by resolution, fall back to conn1. - let mut success = false; - for (label, conn) in [("conn2", &conn2), ("conn1", &conn1)] { - if conn.close_reason().is_some() { - println!("{label} was closed by duplicate resolution"); - continue; - } - let (mut s, mut r) = match conn.open_bi().await { - Ok(pair) => pair, - Err(_) => continue, - }; - let ping = make_ping_wire(200); - if s.write_all(&ping).await.is_err() { - continue; - } - let _ = s.finish(); - if let Ok(Ok(resp)) = - tokio::time::timeout(Duration::from_secs(10), r.read_to_end(1 << 20)).await - { - assert_eq!(parse_pong_wire(&resp), 200, "{label} pong mismatch"); - println!("{label} survived duplicate resolution and answered query"); - success = true; - break; - } - } - assert!(success, "Neither connection survived duplicate resolution"); + // Give quinn a moment to propagate the close + tokio::time::sleep(Duration::from_millis(200)).await; + + assert!(conn1.close_reason().is_some(), "conn1 should be closed after conn2 replaced it"); + + // conn2 must work + let (mut s2, mut r2) = conn2.open_bi().await.unwrap(); + let ping2 = make_ping_wire(200); + s2.write_all(&ping2).await.unwrap(); + s2.finish().unwrap(); + let resp2 = tokio::time::timeout(Duration::from_secs(10), r2.read_to_end(1 << 20)) + .await + .expect("conn2 response timed out") + .expect("conn2 read failed"); + assert_eq!(parse_pong_wire(&resp2), 200, "conn2 pong mismatch"); + println!("conn2 survived duplicate resolution and answered query"); // --- cleanup --- conn1.close(0u32.into(), b"done"); @@ -1165,20 +1192,16 @@ fn test_quic_same_key_deduplication() { assert_eq!(parse_pong_wire(&resp2), 302); println!("conn2 ping/pong OK"); - // Wait past the duplicate-resolution window (max 2500ms + margin) - tokio::time::sleep(Duration::from_secs(4)).await; - - // The old connection (conn1) should have been closed by duplicate resolution. 
- // Check by trying to open a stream — if the connection was closed, this fails. - let conn1_alive = conn1.close_reason().is_none() && conn1.open_bi().await.is_ok(); - let conn2_alive = conn2.close_reason().is_none() && conn2.open_bi().await.is_ok(); + // conn1 is closed immediately when conn2 is accepted (same peer key). + // Give quinn a moment to propagate the close. + tokio::time::sleep(Duration::from_millis(200)).await; + let conn1_alive = conn1.close_reason().is_none(); + let conn2_alive = conn2.close_reason().is_none(); println!("After dedup: conn1_alive={conn1_alive}, conn2_alive={conn2_alive}"); - // Exactly one should have been closed (the old one). - // The new connection (conn2) must survive. - assert!(conn2_alive, "conn2 (newer) should survive deduplication"); assert!(!conn1_alive, "conn1 (older) should have been closed by deduplication"); + assert!(conn2_alive, "conn2 (newer) should survive deduplication"); println!("PASS: same-key duplicate was correctly deduplicated"); @@ -1450,7 +1473,7 @@ fn test_quic_reject_non_rpk_client() { let resp = tokio::time::timeout( Duration::from_secs(10), legit.query( - make_ping_data(42), + make_ping_data(100500), None, &AdnlPeers::with_keys(lk_id.clone(), server_key_id.clone()), None, @@ -1459,7 +1482,7 @@ fn test_quic_reject_non_rpk_client() { .await .expect("legit query timed out after rogue attempt") .expect("legit query failed"); - assert_eq!(parse_pong(resp.unwrap()), 42); + assert_eq!(parse_pong(resp.unwrap()), 100500); println!("Legitimate client works fine after rogue rejection"); // --- cleanup --- @@ -1584,8 +1607,13 @@ fn test_quic_connection_pool_exhaustion() { let (tx, _rx) = tokio::sync::mpsc::unbounded_channel(); let sub = Arc::new(TestSubscriber { key_id: key_id.clone(), msg_tx: tx }) as Arc; - let quic = - QuicNode::new(vec![sub], token.clone(), None, tokio::runtime::Handle::current()); + let quic = QuicNode::new( + vec![sub], + token.clone(), + None, + tokio::runtime::Handle::current(), + 
Some(QuicRateLimitConfig::disabled()), + ); quic.add_key(&key, &key_id, bind).unwrap(); quic.add_peer_key(server_key_id.clone(), server_bind).unwrap(); server.add_peer_key(key_id.clone(), bind).unwrap(); @@ -1694,6 +1722,7 @@ fn test_quic_message_burst_reconnect() { client_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); client.add_key(&client_key, &client_key_id, client_bind).unwrap(); @@ -1715,6 +1744,7 @@ fn test_quic_message_burst_reconnect() { srv_token1.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server1.add_key(&server_key, &server_key_id, server_bind).unwrap(); @@ -1760,6 +1790,7 @@ fn test_quic_message_burst_reconnect() { srv_token2.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server2.add_key(&server_key, &server_key_id, server_bind).unwrap(); server2.add_peer_key(client_key_id.clone(), client_bind).unwrap(); @@ -1833,6 +1864,7 @@ fn test_quic_single_sender_invariant() { client_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); client.add_key(&client_key, &client_key_id, client_bind).unwrap(); @@ -1853,6 +1885,7 @@ fn test_quic_single_sender_invariant() { srv_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); server.add_key(&server_key, &server_key_id, server_bind).unwrap(); @@ -2203,3 +2236,225 @@ fn test_no_quic_address_dht_distribution() { adnl2.stop().await; }); } + +// =========================================================================== +// Rate-limit integration tests +// =========================================================================== + +/// Create a raw quinn client endpoint on an ephemeral OS-assigned port. +/// Returns the endpoint and the SNI string for connecting to `server_key_id`. 
+fn make_raw_client_endpoint(server_key_id: &KeyId) -> (quinn::Endpoint, String) { + let key = ed25519_generate_private_key().unwrap().to_bytes(); + let client_config = build_raw_quinn_client(&key); + + let sock = socket2::Socket::new( + socket2::Domain::IPV4, + socket2::Type::DGRAM, + Some(socket2::Protocol::UDP), + ) + .unwrap(); + sock.set_reuse_address(true).unwrap(); + // Bind to 127.0.0.1:0 — OS assigns an ephemeral port + sock.bind(&"127.0.0.1:0".parse::().unwrap().into()).unwrap(); + sock.set_nonblocking(true).unwrap(); + let udp = std::net::UdpSocket::from(sock); + let runtime: Arc = Arc::new(quinn::TokioRuntime); + let mut endpoint = + quinn::Endpoint::new(quinn::EndpointConfig::default(), None, udp, runtime).unwrap(); + endpoint.set_default_client_config(client_config); + + let hex = hex::encode(server_key_id.data()); + let sni = format!("{}.{}", &hex[..32], &hex[32..]); + (endpoint, sni) +} + +/// Try to establish a QUIC connection with a timeout. +/// Returns Ok(connection) on success, Err on failure or timeout. +async fn try_connect( + endpoint: &quinn::Endpoint, + server_bind: SocketAddr, + sni: &str, + timeout: Duration, +) -> std::result::Result { + let connecting = endpoint.connect(server_bind, sni).map_err(|e| format!("connect: {e}"))?; + match tokio::time::timeout(timeout, connecting).await { + Ok(Ok(conn)) => Ok(conn), + Ok(Err(e)) => Err(format!("handshake: {e}")), + Err(_) => Err("timeout".into()), + } +} + +/// Per-IP rate limiter: server allows burst of 2 connections, then refuses. +/// Five rapid connection attempts from the same IP; first 2 succeed, rest fail. 
+#[test] +fn test_quic_rate_limit_per_ip() { + init_test_log(); + let rt = tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap(); + rt.block_on(async { + const SERVER_PORT: u16 = 8300; + const BURST: u32 = 2; + const TOTAL_ATTEMPTS: usize = 5; + const CONNECT_TIMEOUT: Duration = Duration::from_secs(3); + + let rl_config = QuicRateLimitConfig { + per_ip_capacity: BURST, + per_ip_period: 100.0, // very slow refill — no tokens come back during the test + global_capacity: 0, // global disabled + global_period: 1.0, + stateless_retry: false, + }; + let (server, _key, server_key_id, server_bind, server_token) = + make_endpoint_with_config(SERVER_PORT, rl_config); + + // wait a little to server spin-up + tokio::time::sleep(Duration::from_millis(50)).await; + + // 127.0.0.1 — same IP for per-IP limiting + let mut succeeded = 0u32; + let mut failed = 0u32; + let mut conns = Vec::new(); + for i in 0..TOTAL_ATTEMPTS { + let (ep, sni) = make_raw_client_endpoint(&server_key_id); + match try_connect(&ep, server_bind, &sni, CONNECT_TIMEOUT).await { + Ok(conn) => { + println!(" connection {i}: OK (stable_id={})", conn.stable_id()); + succeeded += 1; + conns.push(conn); + } + Err(e) => { + println!(" connection {i}: REJECTED ({e})"); + failed += 1; + } + } + } + + println!( + "Per-IP rate limit test: burst={BURST}, attempts={TOTAL_ATTEMPTS}, \ + succeeded={succeeded}, failed={failed}" + ); + assert_eq!(succeeded, BURST as u32, "expected exactly {BURST} connections to succeed"); + assert_eq!( + failed, + (TOTAL_ATTEMPTS - BURST as usize) as u32, + "expected {} connections to be rejected", + TOTAL_ATTEMPTS - BURST as usize, + ); + + drop(conns); + server.shutdown(); + server_token.cancel(); + }); +} + +/// Global rate limiter: server allows burst of 3 connections total, then refuses. 
+#[test] +fn test_quic_rate_limit_global() { + init_test_log(); + let rt = tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap(); + rt.block_on(async { + const SERVER_PORT: u16 = 8310; + const BURST: u32 = 3; + const TOTAL_ATTEMPTS: usize = 6; + const CONNECT_TIMEOUT: Duration = Duration::from_secs(3); + + let rl_config = QuicRateLimitConfig { + per_ip_capacity: 0, // per-IP disabled + per_ip_period: 1.0, + global_capacity: BURST, + global_period: 100.0, // very slow refill + stateless_retry: false, + }; + let (server, _key, server_key_id, server_bind, server_token) = + make_endpoint_with_config(SERVER_PORT, rl_config); + + // wait a little to server spin-up + tokio::time::sleep(Duration::from_millis(50)).await; + + let mut succeeded = 0u32; + let mut failed = 0u32; + let mut conns = Vec::new(); + for i in 0..TOTAL_ATTEMPTS { + let (ep, sni) = make_raw_client_endpoint(&server_key_id); + match try_connect(&ep, server_bind, &sni, CONNECT_TIMEOUT).await { + Ok(conn) => { + println!(" connection {i}: OK (stable_id={})", conn.stable_id()); + succeeded += 1; + conns.push(conn); + } + Err(e) => { + println!(" connection {i}: REJECTED ({e})"); + failed += 1; + } + } + } + + println!( + "Global rate limit test: burst={BURST}, attempts={TOTAL_ATTEMPTS}, \ + succeeded={succeeded}, failed={failed}" + ); + assert_eq!(succeeded, BURST as u32, "expected exactly {BURST} connections to succeed"); + assert_eq!( + failed, + (TOTAL_ATTEMPTS - BURST as usize) as u32, + "expected {} connections to be rejected", + TOTAL_ATTEMPTS - BURST as usize, + ); + + drop(conns); + server.shutdown(); + server_token.cancel(); + }); +} + +/// Stateless Retry: server requires address validation via Retry packets. +/// A normal client should still connect successfully this verifies retry +/// doesn't break connectivity. 
+#[test] +fn test_quic_stateless_retry() { + init_test_log(); + let rt = tokio::runtime::Builder::new_multi_thread().enable_all().build().unwrap(); + rt.block_on(async { + const SERVER_PORT: u16 = 8320; + const CONNECT_TIMEOUT: Duration = Duration::from_secs(5); + + let rl_config = QuicRateLimitConfig { + per_ip_capacity: 0, // rate-limiting disabled + per_ip_period: 1.0, + global_capacity: 0, + global_period: 1.0, + stateless_retry: true, // retry enabled + }; + let (server, _key, server_key_id, server_bind, server_token) = + make_endpoint_with_config(SERVER_PORT, rl_config); + + tokio::time::sleep(Duration::from_millis(50)).await; + + // Connect a raw client — quinn handles the Retry transparently + let (ep, sni) = make_raw_client_endpoint(&server_key_id); + let conn = try_connect(&ep, server_bind, &sni, CONNECT_TIMEOUT) + .await + .expect("connection with stateless retry should succeed"); + + // Verify the connection works by opening a stream and doing ping/pong + let (mut send, mut recv) = conn.open_bi().await.unwrap(); + let ping_data = make_ping_wire(100500); + send.write_all(&ping_data).await.unwrap(); + send.finish().unwrap(); + let response = + tokio::time::timeout(Duration::from_secs(5), recv.read_to_end(16 * 1024 * 1024)) + .await + .expect("read timed out") + .expect("read failed"); + let pong = parse_pong_wire(&response); + assert_eq!(pong, 100500, "ping/pong mismatch through stateless retry"); + + println!( + "Stateless retry test: connection succeeded, ping/pong OK, remote={}", + conn.remote_address() + ); + + drop(conn); + server.shutdown(); + server_token.cancel(); + }); +} diff --git a/src/block-json/src/deserialize.rs b/src/block-json/src/deserialize.rs index ff15b65..c869ad3 100644 --- a/src/block-json/src/deserialize.rs +++ b/src/block-json/src/deserialize.rs @@ -703,6 +703,12 @@ impl StateParser { candidate_resolve_rate_limit: p .get_num32("candidate_resolve_rate_limit") .unwrap_or(d.candidate_resolve_rate_limit), + min_block_interval_ms: p + 
.get_num32("min_block_interval_ms") + .unwrap_or(d.min_block_interval_ms), + no_empty_blocks_on_error_timeout_ms: p + .get_num32("no_empty_blocks_on_error_timeout_ms") + .unwrap_or(d.no_empty_blocks_on_error_timeout_ms), }, }) } diff --git a/src/block-json/src/serialize.rs b/src/block-json/src/serialize.rs index 51ed955..93f96fc 100644 --- a/src/block-json/src/serialize.rs +++ b/src/block-json/src/serialize.rs @@ -1097,6 +1097,12 @@ fn serialize_simplex_config(cfg: &SimplexConfig) -> Result { serialize_field(&mut map, "max_leader_window_desync", np.max_leader_window_desync); serialize_field(&mut map, "bad_signature_ban_duration_ms", np.bad_signature_ban_duration_ms); serialize_field(&mut map, "candidate_resolve_rate_limit", np.candidate_resolve_rate_limit); + serialize_field(&mut map, "min_block_interval_ms", np.min_block_interval_ms); + serialize_field( + &mut map, + "no_empty_blocks_on_error_timeout_ms", + np.no_empty_blocks_on_error_timeout_ms, + ); Ok(map.into()) } diff --git a/src/block-json/src/tests/test_deserialize.rs b/src/block-json/src/tests/test_deserialize.rs index 5a1065c..dc05a58 100644 --- a/src/block-json/src/tests/test_deserialize.rs +++ b/src/block-json/src/tests/test_deserialize.rs @@ -44,14 +44,18 @@ fn test_parse_zerostate_p30_use_quic_survives_into_v2_boc() { "slots_per_leader_window": 8, "target_rate_ms": 200, "first_block_timeout_ms": 500, - "max_leader_window_desync": 2 + "max_leader_window_desync": 2, + "min_block_interval_ms": 333, + "no_empty_blocks_on_error_timeout_ms": 22000 }, "shard": { "use_quic": 1, "slots_per_leader_window": 16, "target_rate_ms": 200, "first_block_timeout_ms": 500, - "max_leader_window_desync": 2 + "max_leader_window_desync": 2, + "min_block_interval_ms": 444, + "no_empty_blocks_on_error_timeout_ms": 23000 } }), ); @@ -70,6 +74,8 @@ fn test_parse_zerostate_p30_use_quic_survives_into_v2_boc() { assert_eq!(mc.noncritical_params.target_rate_ms, 200); assert_eq!(mc.noncritical_params.first_block_timeout_ms, 500); 
assert_eq!(mc.noncritical_params.max_leader_window_desync, 2); + assert_eq!(mc.noncritical_params.min_block_interval_ms, 333); + assert_eq!(mc.noncritical_params.no_empty_blocks_on_error_timeout_ms, 22_000); let shard = parsed_p30.shard.as_ref().expect("expected shard simplex config"); assert!(shard.use_quic); @@ -77,6 +83,8 @@ fn test_parse_zerostate_p30_use_quic_survives_into_v2_boc() { assert_eq!(shard.noncritical_params.target_rate_ms, 200); assert_eq!(shard.noncritical_params.first_block_timeout_ms, 500); assert_eq!(shard.noncritical_params.max_leader_window_desync, 2); + assert_eq!(shard.noncritical_params.min_block_interval_ms, 444); + assert_eq!(shard.noncritical_params.no_empty_blocks_on_error_timeout_ms, 23_000); let key = 30u32.write_to_bitstring().unwrap(); let p30_slice = config.config_params.get(key).unwrap().expect("expected raw p30 cell"); @@ -367,6 +375,8 @@ fn get_config_param30() -> NewConsensusConfigAll { target_rate_ms: 300, first_block_timeout_ms: 1000, max_leader_window_desync: 100, + min_block_interval_ms: 111, + no_empty_blocks_on_error_timeout_ms: 21_000, ..Default::default() }, ..Default::default() @@ -377,6 +387,8 @@ fn get_config_param30() -> NewConsensusConfigAll { target_rate_ms: 200, first_block_timeout_ms: 500, max_leader_window_desync: 50, + min_block_interval_ms: 222, + no_empty_blocks_on_error_timeout_ms: 22_000, ..Default::default() }, ..Default::default() diff --git a/src/block-json/src/tests/test_serialize.rs b/src/block-json/src/tests/test_serialize.rs index 200ffc6..0079624 100644 --- a/src/block-json/src/tests/test_serialize.rs +++ b/src/block-json/src/tests/test_serialize.rs @@ -1210,6 +1210,39 @@ fn test_block_proof_serialize_deserialize_roundtrip_ordinary() { assert_eq!(original_proof.write_to_bytes().unwrap(), parsed_proof.write_to_bytes().unwrap()); } +#[test] +fn test_serialize_new_consensus_config_all_includes_new_timing_fields() { + use ton_block::{NewConsensusConfigAll, NoncriticalParams, SimplexConfig}; + + let 
cfg = NewConsensusConfigAll { + mc: Some(SimplexConfig { + use_quic: true, + slots_per_leader_window: 4, + noncritical_params: NoncriticalParams { + min_block_interval_ms: 111, + no_empty_blocks_on_error_timeout_ms: 21_000, + ..Default::default() + }, + }), + shard: Some(SimplexConfig { + use_quic: false, + slots_per_leader_window: 8, + noncritical_params: NoncriticalParams { + min_block_interval_ms: 222, + no_empty_blocks_on_error_timeout_ms: 22_000, + ..Default::default() + }, + }), + }; + + let json = serialize_new_consensus_config_all(&cfg).unwrap(); + + assert_eq!(json["mc"]["min_block_interval_ms"], 111); + assert_eq!(json["mc"]["no_empty_blocks_on_error_timeout_ms"], 21_000); + assert_eq!(json["shard"]["min_block_interval_ms"], 222); + assert_eq!(json["shard"]["no_empty_blocks_on_error_timeout_ms"], 22_000); +} + #[test] fn test_db_serialize_block_proof_simplex() { use ton_block::{ diff --git a/src/block/src/config_params.rs b/src/block/src/config_params.rs index b292be9..efef405 100644 --- a/src/block/src/config_params.rs +++ b/src/block/src/config_params.rs @@ -3806,6 +3806,8 @@ pub struct NoncriticalParams { pub max_leader_window_desync: u32, // idx 10, uint32 pub bad_signature_ban_duration_ms: u32, // idx 11, duration pub candidate_resolve_rate_limit: u32, // idx 12, uint32 + pub min_block_interval_ms: u32, // idx 13, duration + pub no_empty_blocks_on_error_timeout_ms: u32, // idx 14, duration } impl Default for NoncriticalParams { @@ -3824,6 +3826,8 @@ impl Default for NoncriticalParams { max_leader_window_desync: 250, bad_signature_ban_duration_ms: 5_000, candidate_resolve_rate_limit: 10, + min_block_interval_ms: 0, + no_empty_blocks_on_error_timeout_ms: 15_000, } } } @@ -3845,6 +3849,8 @@ impl NoncriticalParams { 10 => self.max_leader_window_desync = value, 11 => self.bad_signature_ban_duration_ms = value, 12 => self.candidate_resolve_rate_limit = value, + 13 => self.min_block_interval_ms = value, + 14 => self.no_empty_blocks_on_error_timeout_ms = 
value, _ => {} } } @@ -3874,6 +3880,8 @@ impl NoncriticalParams { (10, self.max_leader_window_desync), (11, self.bad_signature_ban_duration_ms), (12, self.candidate_resolve_rate_limit), + (13, self.min_block_interval_ms), + (14, self.no_empty_blocks_on_error_timeout_ms), ]) } } @@ -3902,8 +3910,8 @@ impl Default for SimplexConfig { } } -/// Maximum noncritical param key defined in the C++ reference (candidate_resolve_rate_limit). -const NONCRITICAL_PARAMS_MAX_KEY: u8 = 12; +/// Maximum noncritical param key defined in the C++ reference. +const NONCRITICAL_PARAMS_MAX_KEY: u8 = 14; /// Always serializes as simplex_config_v2#22 (the current on-chain format). impl Serializable for SimplexConfig { diff --git a/src/block/src/tests/test_config_params.rs b/src/block/src/tests/test_config_params.rs index 4219ab8..fa7cf02 100644 --- a/src/block/src/tests/test_config_params.rs +++ b/src/block/src/tests/test_config_params.rs @@ -1050,6 +1050,8 @@ fn test_simplex_config_v2_round_trip() { first_block_timeout_ms: 1000, max_leader_window_desync: 250, candidate_resolve_rate_limit: 10, + min_block_interval_ms: 333, + no_empty_blocks_on_error_timeout_ms: 22_000, ..Default::default() }, ..Default::default() diff --git a/src/node/consensus-common/src/node_test_network.rs b/src/node/consensus-common/src/node_test_network.rs index 6d16f58..055fcf0 100644 --- a/src/node/consensus-common/src/node_test_network.rs +++ b/src/node/consensus-common/src/node_test_network.rs @@ -16,7 +16,9 @@ use crate::{ ConsensusOverlayManager, ConsensusOverlayManagerPtr, ConsensusOverlayPtr, OverlayTransportType, PrivateKey, PrivateOverlayShortId, PublicKeyHash, QueryResponseCallback, }; -use adnl::{node::AdnlNode, DhtNode, NetworkStack, OverlayNode, QuicNode, RldpNode}; +use adnl::{ + node::AdnlNode, DhtNode, NetworkStack, OverlayNode, QuicNode, QuicRateLimitConfig, RldpNode, +}; use futures; use lazy_static::lazy_static; use std::{ @@ -248,6 +250,7 @@ impl<'a> NodeTestNetwork<'a> { 
cancellation_token.clone(), None, tokio::runtime::Handle::current(), + Some(QuicRateLimitConfig::disabled()), ); overlay.set_quic(quic.clone()).unwrap(); Some(quic) diff --git a/src/node/simplex/CHANGELOG.md b/src/node/simplex/CHANGELOG.md index d84df76..2218377 100644 --- a/src/node/simplex/CHANGELOG.md +++ b/src/node/simplex/CHANGELOG.md @@ -4,10 +4,166 @@ All notable changes to the Simplex Consensus Protocol implementation will be doc ## [Unreleased] +## [0.6.0] - 2026-04-08 + +Major release: **finalized-driven delivery**, **C++ parity overhaul**, **legacy mode removal**, +and **production-grade stall diagnostics**. 71 commits since v0.5.0. + +**Milestones** +- Simplex switched to finalized-driven delivery (`on_block_finalized()`); the old sequential + `on_block_committed()` path is no longer used by Simplex. +- Legacy fallback/strict-parent mode removed; only C++-compatible three-vote behavior remains. +- Structured stall-diagnosis debug dump with health findings, per-slot phase tracking, + and per-peer activity snapshots. + ### Added -- Observability: peer-delivered candidate ingress counters + +#### Finalized-driven delivery +- **`handle_block_finalized()`**: delivers finalized blocks through `SessionListener::on_block_finalized()` + as soon as a `FinalCert` is observed and the block body is available. +- **`maybe_apply_finalized_state()`**: updates local finalized-head state (seqno, block ID) after delivery. +- **`finalized_pending_body`**: when a finalization certificate arrives before the candidate body, + the finalized entry is stored and later materialized when the body arrives. +- **Out-of-order finalized delivery** is the only mode; the old `SessionOptions::out_of_order_finalized_delivery` toggle removed. + +#### Session lifecycle +- **Separate session creation from start**: `SessionFactory::create_session()` returns a session handle; + `session.start(initial_block_seqno)` begins consensus processing with the expected first block seqno. 
+- **`simplex_config_v2` deserialization**: unified `SimplexConfig` (v1+v2), named + `NoncriticalParams` struct (15 fields), flat JSON. `SIMPLEX_USE_TESTING_CONSTANTS` removed. + +#### C++ parity features +- **Candidate chaining within leader windows**: candidates in a multi-slot leader window build + on the previous slot's candidate, matching C++ `pool.cpp` behavior. +- **Notarized-parent collation mode** with robust retry: collation selects notarized parents + as base, with fallback retry on state unavailability. +- **Stale leader window guards** for collation: prevents generation for outdated windows. +- **Bootstrap skip-cascade prevention**: timeouts are unarmed by default during bootstrap, + preventing spurious skip-vote storms on session startup. +- **Available-base propagation on leader window advancement**: FSM correctly propagates + the available base when leader windows advance due to skip certificates. +- **Empty candidate FSM-tip validation**: empty candidates are rejected unless the + referenced block matches the parent state's current normal tip. +- **Leader window desync margin**: `max_leader_window_desync` wired into `SimplexState`/`Receiver` + ingress checks to limit horizon of accepted slots. +- **Timing wake discipline and pacing parity**: `min_block_interval_ms` + wired through config → session → runtime pacing. `gen_utime_ms` threaded into candidates/chain-heads. + Validation waits for `parent.gen_utime_ms + min_block_interval`. + +#### Observability and diagnostics +- **Stall dump redesign**: `debug_dump()` rewritten with structured sections: + - **Conclusion** (stalled only): structured `HealthFinding`s explaining the stall reason. + - **Header**: shard, validators, local index, total/th66/th33/active weight. + - **Frontiers**: `first_non_finalized` / `first_non_progressed` with "unchanged for Xs" tracking. + - **Heads**: `finalized_head_seqno`, `finalized_head` block ID, `last_mc_applied` block ID, + `last_mc_finalized_seqno`. 
+ - **Milestone timestamps**: `last_finalization`, `last_notarization`, `last_final_cert`, + `last_notar_cert` — all with relative time and slot. + - **Statistics**: candidate funnel (`received/validated/notarized/finalized/other` with %), + traffic counters, typed vote breakdown (`notar/final/skip`), duplicate counters. + - **Collation**: grouped by windows with leader identity (validator index + full base64 + `pubkey_hash` + `adnl_id`), slot phase, timing, generated blocks info. + - **Validation inventory**: lifecycle buckets (`received`, `validated`, `notarized`, + `finalized` last 10s, `other` with total count), block IDs for correlation with collator, + per-block percentage of total candidates. + - **Peers**: per-validator dump with full base64 ADNL ID and pubkey hash, weight in %, + last activity/vote/cert/candidate times, typed vote/cert counts, candidate counts. + - **Health findings**: all applicable `HealthFindingKind` anomalies with severity and summary. +- `SessionObservability` struct tracking cursor-change timestamps, last notarization/finalization + times, last certificate times, and last MC applied block ID. +- `HealthFinding` / `HealthFindingKind` types for structured stall diagnosis in dump header. +- `SlotWaitPhase` / `SlotDiagnostic` / `WindowDiagnostic` types for per-slot and per-window + structured diagnostics from `SimplexState::collect_window_diagnostics()`. +- `ReceiverActivitySnapshot` / `SourceActivitySnapshot` for comprehensive per-peer network + activity statistics passed from `Receiver` to `SessionProcessor` via `on_activity()`. +- `CandidateTotals` helper for computing and formatting candidate lifecycle bucket percentages. +- Peer-delivered candidate ingress counters (`simplex_candidate_received_broadcast`, `simplex_candidate_received_query`) and a unified `simplex_collation_starts` counter, with derivative `/s` dump support for operator debugging. +- Info-level metrics dumps with execution time guards. 
+- Log throttling and validator isolation detection. +- Skip-dominance health false-positive reduction. +- `simplex_finalized_pending_body_count` gauge for finalized blocks waiting for body arrival. + +#### Networking +- Overlay response size increased to +1MB for `requestCandidate` (C++ parity). +- Two-step broadcast TL schema alignment for C++/Rust interop: `extra:bytes` in + `broadcastTwostepSimple`/`Fec`, `data_size`+`extra` in `broadcastTwostep.id`, + `consensus.broadcastExtra(slot)` propagated through send path. +- Standstill replay aligned with C++ parity: range tracking, sparse iteration, receiver + ingress bounds synced with finalized frontier. +- Standstill warning suppression for inactive sessions. +- Receiver ingress bounds synced with finalized frontier. + +### Changed +- **Finalized-driven delivery model**: Simplex delivers finalized blocks through + `on_block_finalized()`. The old `on_block_committed()` sequential path, `try_commit_finalized_chains()`, + `collect_gapless_commit_chain()`, `commit_single_block()`, and related proof-fetch/retry flow + are all removed. +- **Restart behavior is state restoration only**: `RestartRecommitStrategy` removed. Startup + restores finalized state and continues from there without historical replay callbacks. +- `ReceiverListener::on_activity()` now accepts `ReceiverActivitySnapshot` parameter + for per-peer statistics (breaking internal trait change). +- `SourceStats` in `receiver.rs` extended with typed vote/cert/candidate counters, + last-receive timestamps, and duplicate counters. +- Health status line format updated: `Session health [OK|STALLED]: shard=... slot_nf=... + slot_np=... finalized_head_seqno=...`. +- `check_all()` ordering: validated candidates are now processed before FSM timeouts, + preventing stale-slot validation when timeouts would advance the window first. 
+- Candidate resolver behavior aligned with C++ parity: DB-backed fallback on cache miss, + merged partial-response completion (body + notar). +- Certificate relay aligned with C++ `handle_saved_certificate` semantics. +- Parent gating aligned with C++ flow: `is_wait_for_parent_ready()` mirrors C++ + `pool.cpp::maybe_resolve_request()` exactly (finalized-boundary check, notarized-parent + hash match, skip-gap coverage). +- Progress cursor and repair flow aligned with C++ parity. + +### Fixed +- **Stall dump never fired**: `round_debug_at` was never scheduled as a `next_awake_time`, + so when consensus stalled and no FSM timeouts remained, `check_all()` never ran and the + stalled dump (at ERROR level) was unreachable. Fixed by adding + `self.set_next_awake_time(self.round_debug_at)` in `check_all()`. +- **`check_all()` ordering**: process validated candidates before FSM timeouts to prevent + stale-slot validation when timeouts advance the window. +- **Bootstrap skip cascade**: unarmed-by-default timeouts prevent spurious skip-vote + storms on session startup. +- **Available-base propagation** on leader window advancement: prevent collation + deadlock when base is not forwarded after skip certificates. +- **Empty-tip validation**: reject empty candidates whose referenced block does + not match the parent state's normal tip. +- **First-pending-candidate-wins guard**: prevent notarization race when multiple candidates + arrive for the same slot; remove `its_over` gate from `try_notar`. +- **Candidates stored as pending despite local skip vote**: candidates arriving after + the local validator has skip-voted are still stored for potential notarization. +- **Simplex C++ parity fixes**: MC validation ordering, empty block ownership, collation + gating, MC notification uses `BlockIdExt`, shard empty-block recovery, fixed-base + timeout schedule, genesis-parent validation, startup timeout gating, pending block + preservation, and vote-mix observability. 
+- **Gapless commit scheduler hardening**: monotonic seqno enforcement, resolver cache + deserialization-failure purge, missing-body-logged cleanup, DB payload lookup order fix, future-time warning. +- **Mid-session finalization parity**: candidate repair completion and DB persistence + aligned with C++. +- **Cursor ingress alignment** with C++ parity: progress cursor, repair flow, and + standstill replay all follow C++ progress-cursor semantics. +- **Replay integrity hardening**: deterministic replay order in recovery paths. + +### Removed +- **Legacy fallback and strict-parent mode**: `NotarizeFallback`, `SkipFallback` vote types, + `enable_fallback_protocol` option, `require_finalized_parent` strict mode, and all related + code paths removed. Only C++-compatible three-vote behavior (Notarize, Finalize, Skip) remains. +- **`get_approved_candidate` callback**: Rust no longer uses `SessionListener::get_approved_candidate` + and instead relies on the DB-backed repair path. +- **`SessionListener::get_committed_candidate`**: removed together with `CommittedBlockProof` and + the old missing-proof recovery path. +- **Sequential commit path**: `try_commit_finalized_chains()`, `collect_gapless_commit_chain()`, + `commit_single_block()`, `notify_block_committed()`, and related proof-fetch/retry flow removed. +- **`RestartRecommitStrategy`**: removed; restart is now state-restoration only. +- **`out_of_order_finalized_delivery` toggle**: removed (out-of-order is the only mode). +- **`DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION` dead mode**: removed. +- **Empty-parent metadata queue**: `PendingParentResolution` queue, `is_fully_resolved` metadata + flag, and `ParentAging` health check removed; parent resolution is now on-demand. +- **`requestCandidate2` / `candidateAndCert2`**: already removed in 0.5.0; all remaining + references cleaned up. 
## [0.5.0] - 2026-03-20 @@ -42,7 +198,7 @@ All notable changes to the Simplex Consensus Protocol implementation will be doc Rust/C++ interop for block candidate acceptance. Added regression test `test_candidate_id_to_sign_is_bare_candidate_id`. - MC fork prevention: reject masterchain candidates building on stale heads - (C++ `block-validator.cpp` commit `9aac62b8` parity). Track `last_accepted_mc_seqno` + (C++ `block-validator.cpp` parity). Track `last_accepted_mc_seqno` in `ValidatorGroupImpl` and fail validation when candidate parent seqno is behind. - Committed-proof ingestion hardening: verify `proof.block_id` matches requested `block_id` before ingesting downloaded block proofs for MC gap recovery. @@ -61,13 +217,13 @@ All notable changes to the Simplex Consensus Protocol implementation will be doc `ENABLE_REQUEST_CANDIDATE_V2` constant removed. `want_final` param removed from `request_candidate()`. All FinalCert recovery now uses committed-block proof download. -### Planned -- FinalCert proactive rebroadcast (C++ `cfd8850c` parity) -- MC fork prevention in validator (C++ `9aac62b8` parity) -- Adaptive skip timeout increase (C++ `3c0cae03` parity) -- Precollation parent tracking (lock parent at start of collation) -- Twostep broadcast via RLDP2 -- C++ interoperability testing with testnet +### Planned (resolved in 0.6.0) +- ~~FinalCert proactive rebroadcast~~ → resolved +- ~~MC fork prevention in validator~~ → resolved +- ~~Adaptive skip timeout increase~~ → resolved +- ~~Precollation parent tracking~~ → superseded by notarized-parent collation mode +- ~~Twostep broadcast via RLDP2~~ → resolved (two-step broadcast TL alignment) +- ~~C++ interoperability testing with testnet~~ → resolved (mixed 5+5 Rust/C++ validation) --- @@ -434,6 +590,7 @@ Major release focusing on candidate resolution, certificate system, and operatio | Version | Date | Tag | Description | |---------|------|-----|-------------| +| 0.6.0 | 2026-04-08 | `simplex-0.6.0` | Finalized-driven 
delivery, C++ parity overhaul, legacy mode removal, stall diagnostics | | 0.5.0 | 2026-03-20 | `simplex-0.5.0` | Committed-block proof recovery, restart gremlin fix, requestCandidate2 removal, parity docs update | | 0.4.0 | 2026-02-01 | `simplex-0.4.0` | Block signature types, C++ compatibility, restart resilience | | 0.3.0 | 2026-01-14 | `simplex-0.3.0` | Candidate resolver, certificates, operational stability | diff --git a/src/node/simplex/Cargo.toml b/src/node/simplex/Cargo.toml index 3ca3440..bdf4fef 100644 --- a/src/node/simplex/Cargo.toml +++ b/src/node/simplex/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 'simplex' -version = '0.5.0' +version = '0.6.0' edition = '2021' authors = ['RSquad'] description = 'Simplex consensus protocol implementation for TON blockchain' diff --git a/src/node/simplex/README.md b/src/node/simplex/README.md index e6c129f..82b6bc1 100644 --- a/src/node/simplex/README.md +++ b/src/node/simplex/README.md @@ -1,34 +1,48 @@ # Simplex Consensus Protocol -**Version**: 0.5.0 (March 20, 2026) | [Changelog](CHANGELOG.md) +**Version**: 0.6.0 (April 8, 2026) | [Changelog](CHANGELOG.md) Rust implementation of the Simplex consensus protocol for TON blockchain. -> **C++ Reference**: Primary tracking is `ton-blockchain/ton/tree/simplex` (main repo). -> Secondary: `DanShaders/ton/tree/alpenglow` (superseded). +> **C++ Reference**: Primary tracking is [ton-blockchain/ton](https://github.com/ton-blockchain/ton) (`testnet/validator/consensus/simplex`). +> +> **Protocol Spec**: [ton-blockchain/simplex-docs](https://github.com/ton-blockchain/simplex-docs) (`Simplex.md`). -> **Current semantics (Apr 2026):** +> **Current semantics (Apr 2026, v0.6.0):** > - Simplex is finalized-driven. > - Finalized blocks are delivered through `on_block_finalized()` and may arrive out of order. > - `on_block_committed()` remains part of the shared listener interface for legacy sequential acceptance, but Simplex must not use it. 
> - Missing-body handling uses `finalized_pending_body`: a finalized block can be known before its body arrives locally. +> - Historical Rust-only fallback/strict-parent mode has been removed; only the C++-compatible three-vote behavior is supported. +> - Session creation is separated from start: `create_session()` + `start(initial_block_seqno)`. +> - Restart recovery is state-restoration only — no historical replay callbacks. ## Overview -Simplex is a consensus protocol based on the Solana Alpenglow White Paper (May 2025 v1) with modifications for TON: +Simplex is a consensus protocol with TON-specific implementation choices: - **Conservative path only** (no fast finality/optimistic path) -- **Fault tolerance**: <1/3 Byzantine nodes (not 20% as in original Alpenglow) +- **Fault tolerance**: <1/3 Byzantine nodes - **Certificate threshold**: 2/3 stake weight - **No erasure coding**: Simple broadcast instead of Rotor shreds ### Key Design Decisions -1. **Simplified Alpenglow**: Focus on conservative path for reliability over speed +1. **Conservative consensus path**: Focus on reliability over speed 2. **Ed25519 signatures**: Individual signatures, no BLS aggregation 3. **Actor model**: Separate threads for consensus, callbacks, and network 4. 
**Task queues**: Cross-thread communication via closures +### Protocol Mapping (Simplex.md -> C++ -> Rust) + +| Simplex.md Concept | C++ Touchpoint | Rust Touchpoint | +|---|---|---| +| `tryNotar` | `consensus.cpp::try_notarize` | `simplex_state.rs::try_notar` | +| `tryFinal` | finalize gating in `consensus.cpp` | `simplex_state.rs::try_final` | +| `trySkipWindow` / timeout alarm | `consensus.cpp::alarm` | `simplex_state.rs::process_timeouts`, `simplex_state.rs::try_skip_window` | +| Certificate ingestion | `pool.cpp::handle_foreign_certificate` | `simplex_state.rs::set_notarize_certificate`, `set_finalize_certificate`, `set_skip_certificate` | +| Progress cursor / leader-window publish | `pool.cpp::maybe_publish_new_leader_windows` | `simplex_state.rs::advance_progress_cursor`, `advance_leader_window_on_progress_cursor` | + ### Relationship to Other Components ``` @@ -49,24 +63,34 @@ overlay / ADNL (lower level, network) ## Rust vs C++ reference: known differences -This crate targets wire-compatibility with the upstream **C++ Simplex** implementation (`origin/testnet@e40d0e36`, Feb 28, 2026). +This crate targets wire-compatibility with the upstream **C++ Simplex** implementation in [ton-blockchain/ton](https://github.com/ton-blockchain/ton) (`testnet/validator/consensus/simplex`). ### Protocol parity gaps (from C++ upstream) -- C++ proactively rebroadcasts FinalCerts (`cfd8850c`) — Rust standstill replay is less aggressive. **HIGH** +- Certificate persistence ordering — C++ persists certs before state transitions. **HIGH** +- Deterministic vote replay ordering — wait-for-store semantics. **MEDIUM** +- Anti-spam / DOS hardening — overlay-level antispam. **MEDIUM** +- External-aware collation pipeline — callback-driven external wait loop. **MEDIUM** +- Ghost-parent MC collation deadlock — notarized-but-unfinalized parent state unavailability. **HIGH** +- Speculative state resolver — in-process state computation for unfinalized parents. 
**MEDIUM** ### Implementation parity gaps -- Committed-parent validation gate — needs state-root caching / apply-block-to-state. -- Base selection should use "max available base" like C++ `SlotState::add_available_base` (audit needed). - C++ has `ImprovedStructureLZ4WithState` (BOC compression algo 2) — Rust only supports algos 0 and 1. - C++ has `StoreCellHint` for DB commit optimization during MerkleUpdate apply — Rust lacks equivalent. - C++ overlay manager can buffer messages for unknown overlays (disabled by default) — Rust lacks equivalent. ### Resolved (for reference) +- Finalized-driven delivery: Simplex delivers through `on_block_finalized()`, matching C++ out-of-order finalized model. +- Parent gating aligned with C++ flow: `is_wait_for_parent_ready()` mirrors `pool.cpp::maybe_resolve_request()`. +- Candidate chaining within leader windows matches C++ `pool.cpp`. +- Empty-candidate FSM-tip validation: reject unless referenced block matches parent normal tip. +- Available-base propagation on leader window advancement. +- Two-step broadcast TL schema alignment: all 10 nodes producing in mixed 5x5 test. +- Timing wake discipline and block-rate cap parity: `min_block_interval_ms` wired. - Candidate signature now signs bare `consensus.candidateId` directly, matching C++ testnet. Regression test: `test_candidate_id_to_sign_is_bare_candidate_id`. -- MC stale-head rejection implemented in `validator_group.rs` (`should_reject_stale_mc_candidate`), matching C++ `block-validator.cpp` commit `9aac62b8`. +- MC stale-head rejection implemented in `validator_group.rs` (`should_reject_stale_mc_candidate`), matching C++ `block-validator.cpp`. - Adaptive first-block timeout backoff after skip implemented in `simplex_state.rs` (`apply_adaptive_timeout_backoff`), matching C++ `consensus.cpp`. - Twostep FEC broadcast implemented in `consensus-common/adnl_overlay.rs` (`BroadcastTwostepSimple`), with C++-compatible signing. 
- QUIC transport supported via `SessionOptions::use_quic` and `OverlayTransportType::SimplexQuic`. Tested in `test_adnl_overlay_quic_delivery`. @@ -78,6 +102,9 @@ This crate targets wire-compatibility with the upstream **C++ Simplex** implemen - Shard `before_split` empty block rule - Restart support (DB persistence + startup recovery) - Certificate rebroadcast on restart +- FinalCert proactive rebroadcast (Rust local-creation broadcast behavior) +- Base selection uses progress cursor (`first_non_progressed_slot`) for leader-window advancement +- Committed-parent validation gate (superseded by finalized-driven delivery model) ## Architecture @@ -143,7 +170,7 @@ This crate targets wire-compatibility with the upstream **C++ Simplex** implemen │ SessionProcessor │ │ 1. Pull task from main queue │ │ 2. Process vote → update slot state, check thresholds │ -│ 3. Emit events (BlockNotarized, SafeToNotar, etc.) │ +│ 3. Emit events (BlockNotarized, BlockFinalized, SlotSkipped, etc.) │ │ 4. May broadcast new vote via Receiver │ └─────────────────────────────────────────────────────────────────────────────┘ │ @@ -174,24 +201,6 @@ Slots are grouped into **leader windows**. One leader is responsible for all slo | `NotarizeVote` | ✅ | Vote to notarize a block in a slot | | `FinalizeVote` | ✅ | Vote to finalize after notarization | | `SkipVote` | ✅ | Skip a slot (timeout or no valid block) | -| `NotarizeFallbackVote` | ❌ | Fallback notarization (internal only, see below) | -| `SkipFallbackVote` | ❌ | Fallback skip (internal only, see below) | - -#### Alpenglow vs TON Implementation Vote Differences - -The original Alpenglow White Paper specifies 5 vote types for consensus. 
However, the **TON C++ implementation uses only 3 vote types on the wire**: - -| Aspect | Alpenglow (Paper) | TON Implementation | -|--------|-------------------|-------------------| -| **Wire Vote Types** | 5 (Notarize, Finalize, Skip, NotarizeFallback, SkipFallback) | 3 (Notarize, Finalize, Skip) | -| **Fallback Votes** | Required for liveness under adversarial conditions | Internal FSM state only, not transmitted | -| **SafeToNotar Trigger** | Broadcasts `NotarizeFallbackVote` | Used internally for state tracking | -| **SafeToSkip Trigger** | Broadcasts `SkipFallbackVote` | Used internally for state tracking | -| **Simultaneous Voting** | N/A | C++ allows simultaneous Skip + Notarize per validator | - -**Configuration**: The `enable_fallback_protocol` option in `SimplexState` controls whether fallback votes are generated internally: -- `false` (default): C++ compatible, 3 vote types on wire -- `true`: Full Alpenglow algorithm with internal fallback tracking ### Certificates @@ -204,7 +213,7 @@ Certificates are implicit (derived from vote counts), not explicit on-wire objec ### Empty Blocks (TON-Specific Extension) -Empty blocks are a **finalization recovery** mechanism not in the original Alpenglow White Paper: +Empty blocks are a **finalization recovery** mechanism not in the original protocol paper: **Purpose**: When consensus gets ahead of finalization (no FinalizeCertificates), empty blocks let validators re-vote on the previous block to attempt getting a FinalizeCertificate. 
@@ -228,7 +237,7 @@ let validators re-vote on the previous block to attempt getting a FinalizeCertif | Threshold | Value | Purpose | |-----------|-------|---------| | 2/3 (66%) | `(total * 2 + 2) / 3` | Certificate formation | -| 1/3 (33%) | `(total + 2) / 3` | Safety conditions (SafeToNotar, SafeToSkip) | +| 1/3 (33%) | `(total + 2) / 3` | Helper quorum threshold | ### Consensus Loop @@ -282,7 +291,8 @@ node/simplex/ │ ├── test_restart.rs │ ├── test_session_description.rs │ ├── test_session_processor.rs -│ └── test_simplex_state.rs +│ ├── test_simplex_state.rs +│ └── test_slot_bounds.rs └── tests/ ├── test_collation.rs # Single-node collation integration test ├── test_consensus.rs # Multi-instance consensus integration tests @@ -301,7 +311,7 @@ Entry point for integration. See `lib.rs` documentation for detailed API referen | `SessionFactory` | Factory for creating sessions and overlay managers | | `SessionOptions` | Configuration options for sessions | | `ConsensusSession` | Base session interface (trait, from consensus-common) | -| `SimplexSession` | Simplex-specific session operations (extends `Session`) | +| `SimplexSession` | Simplex-specific session operations (extends `ConsensusSession`) | | `SessionListener` | Callback trait (from consensus-common) | | `SessionStats` | Session health metrics passed alongside validator callbacks | | `Receiver` | Network sender interface (trait) | @@ -310,15 +320,15 @@ Entry point for integration. 
See `lib.rs` documentation for detailed API referen **SimplexSession Trait** (for MC finalization notification): ```rust -pub trait SimplexSession: Session { - /// Notify session about masterchain finalization (for shard empty block decisions) - fn notify_mc_finalized(&self, mc_block_seqno: u32); +pub trait SimplexSession: ConsensusSession { + /// Notify session about accepted MC top (for shard empty block decisions) + fn notify_mc_finalized(&self, applied_top: BlockIdExt); } ``` -This separate trait allows simplex-specific operations without modifying the shared `Session` trait +This separate trait allows simplex-specific operations without modifying the shared `ConsensusSession` trait from validator-session. For shard chains, the higher layer (ValidatorManager) should call -`notify_mc_finalized()` when masterchain blocks are finalized to enable empty block generation. +`notify_mc_finalized()` with the accepted MC top `BlockIdExt` when masterchain blocks are finalized to enable empty block generation. 
### Session (`session.rs`) @@ -348,13 +358,17 @@ Single-threaded consensus algorithm (crate-private): - ✅ Precollation pipeline (`precollate_block()`, `remove_precollated_block()`) - ✅ Block finalization (`handle_block_finalized()`) - signature collection done - ✅ Validation flow (`on_block_broadcast()`, `check_validation()`) -- ✅ Debug dump (`debug_dump()`) - session and FSM state dump +- ✅ Debug dump (`debug_dump()`) - structured stall diagnosis with conclusion, frontiers, heads, statistics, collation, validation inventory, per-peer activity, health findings, and `finalized_pending_body` tracking - ✅ Empty block generation - `should_generate_empty_block()`, `CollationResult` enum, `GeneratedBlockDesc` - ✅ MC finalization callback - `SimplexSession::notify_mc_finalized()` posts to `set_mc_finalized_seqno()` - ✅ Missing block requests - `schedule_request_candidate()` → delayed action → `receiver.request_candidate()` -- ✅ Recursive parent resolution - `PendingParentResolution`, `update_resolution_cache_chain()`, `find_first_missing_parent()` -- ✅ Finalized-driven delivery - `handle_block_finalized()`, `maybe_emit_out_of_order_finalized()`, `maybe_apply_finalized_state()` +- ✅ Parent metadata resolution for empty/recovery helpers - `PendingParentResolution`, `update_resolution_cache_chain()`, `find_first_missing_parent()` +- ✅ Finalized-driven delivery - `handle_block_finalized()`, `maybe_apply_finalized_state()`, `finalized_pending_body` for deferred body materialization - ✅ Roundless listener model - round is not used for Simplex sequencing logic +- ✅ Separate session creation/start - `create_session()` + `start(initial_block_seqno)` +- ✅ Candidate chaining within leader windows (C++ parity) +- ✅ Leader window desync margin (`max_leader_window_desync`) for ingress filtering +- ✅ Block-rate cap timing parity (`min_block_interval_ms`) for validation pacing - ✅ Standstill coordination - calls `receiver.reschedule_standstill()` on finalization, `set_standstill_slots()` 
on finalization/skip - ✅ DB persistence - finalized blocks, candidate infos, notar certs, votes, pool state persisted to RocksDB - ✅ Startup recovery - bootstrap load, vote replay, receiver cache restore, finalized-boundary restoration @@ -399,7 +413,7 @@ Single-threaded consensus algorithm (crate-private): ### SimplexState (`simplex_state.rs`) Core consensus state machine (crate-private): -- Implements Alpenglow White Paper Algorithm 1 and 2 +- Implements the three-vote Simplex protocol used by C++ - Event-based output via `SimplexEvent` enum - Vote accounting with threshold detection - Leader window and slot management @@ -413,10 +427,10 @@ Core consensus state machine (crate-private): **API**: - `SimplexState::new(&SessionDescription)` - Create FSM - `on_candidate(&desc, candidate)` - Process incoming block -- `on_vote(&desc, validator_idx, vote)` - Process incoming vote +- `on_vote(&desc, validator_idx, vote, signature, raw_vote)` - Process incoming vote - `check_all(&desc)` - Process timeouts and pending actions - `pull_event()` - Get next output event -- `pending_event_count()` / `has_pending_events()` - Query event queue +- `has_pending_events()` - Query event queue (tests) - `get_available_parent(slot)` - Get parent block for collation - `has_available_parent(slot)` - Check if parent is available for collation - `get_tracked_slots_interval()` - Returns `(first_non_finalized_slot, current_window_end)` for standstill @@ -424,11 +438,6 @@ Core consensus state machine (crate-private): - `cleanup_slots(up_to_slot)` - Clean up old slots (called externally by SessionProcessor, respects first_non_finalized_slot) - `debug_dump(&desc, full_dump)` - Dump FSM state (compact or full) -**Options** (`SimplexStateOptions`): -- `enable_fallback_protocol` - Enable fallback votes (default: false, C++ compatible) -- `allow_skip_after_notarize` - Allow skip after notarize (default: true) -- `require_finalized_parent` - When true, parent must be finalized; when false (default, 
C++ mode), notarized parent OK - ### SessionDescription (`session_description.rs`) Session-level constants (crate-private): @@ -577,12 +586,10 @@ let nodes: Vec = validators.iter().map(|v| SessionNode { // 3. Create session let shard = ton_block::ShardIdent::masterchain(); // Or workchain shard -let initial_block_seqno = 1; // Expected seqno for first block let session = SessionFactory::create_session( &SessionOptions::default(), &session_id, &shard, - initial_block_seqno, // First block will have this seqno nodes, &local_private_key, "/path/to/db".into(), @@ -590,8 +597,12 @@ let session = SessionFactory::create_session( Arc::downgrade(&listener) as SessionListenerPtr, )?; -// 4. Session runs in background, callbacks via SessionListener -// 5. Stop when done +// 4. Start consensus processing with expected first block seqno +let initial_block_seqno = 1; +session.start(initial_block_seqno); + +// 5. Session runs in background, callbacks via SessionListener +// 6. Stop when done session.stop(); ``` @@ -614,49 +625,20 @@ impl SessionListener for MyListener { } fn on_block_committed(&self, source_info, root_hash, file_hash, data, signatures, approve_signatures, stats) { - unreachable!("Simplex must not call on_block_committed()"); + unreachable!("Simplex does not use on_block_committed(); finalized blocks arrive via on_block_finalized()"); } fn on_block_skipped(&self, round: u32) { - // DEPRECATED: Not called in production - skip events are handled internally. - // Test implementations should use unreachable!() here. 
+ unreachable!("Skip events are handled internally by Simplex"); } } ``` -### Creating a Receiver (for Testing) - -For unit testing the receiver component in isolation: - -```rust -use simplex::{SessionFactory, ReceiverListenerPtr}; - -let shard = ton_block::ShardIdent::masterchain(); -let max_candidate_size = 8 << 20; // 8 MB - -let receiver = SessionFactory::create_receiver( - session_id, - &shard, - max_candidate_size, - &nodes, - &local_private_key, - overlay_manager, - receiver_listener_weak, -)?; - -// Send votes and broadcasts -receiver.send_vote(vote); -receiver.send_block_broadcast(candidate); - -// Stop when done -receiver.stop(); -``` - ## Tests -**Total: 300 tests** (281 lib + 13 integration + 6 doc-tests) +**Total: 427 tests** (406 lib + 15 integration + 6 doc-tests) -**Integration tests**: 9 consensus + 1 collation + 1 validation + 2 restart +**Integration tests**: 12 consensus + 1 collation + 1 validation + 1 restart **Crypto tests include**: Threshold calculations, session signatures, candidate signatures, vote TL serialization, vote signing with session wrapper, and signature format tests (C++ TL library compatibility). @@ -670,11 +652,14 @@ Multi-instance consensus tests with in-process overlay. 
|------|-------------|--------| | `test_simplex_consensus_basic` | Basic consensus with 7 nodes, 100 rounds | ✅ | | `test_simplex_consensus_with_failures` | Consensus with simulated failures | ✅ | -| `test_simplex_consensus_finalcert_recovery` | FinalCert recovery and finalized delivery without the old proof callback path | ✅ | +| `test_simplex_consensus_finalcert_recovery` | FinalCert recovery and finalized delivery | ✅ | | `test_simplex_consensus_shard_with_mc_notifications` | MC finalization forwarding to shards | ✅ | | `test_simplex_consensus_adnl_overlay` | ADNL overlay-based consensus | ✅ | | `test_simplex_consensus_adnl_net_gremlin` | ADNL net gremlin (packet loss/delay simulation) | ✅ | -| `test_simplex_consensus_restart_gremlin` | Restart gremlin (stop/restart with DB persistence) | ✅ | +| `test_simplex_consensus_restart_gremlin` | Restart gremlin (stop/restart with DB persistence) | ⚠️ (temp ignore) | +| `test_simplex_consensus_candidate_chaining` | Candidate chaining within leader windows | ✅ | +| `test_simplex_consensus_candidate_chaining_with_lossy_overlay` | Candidate chaining with packet loss | ✅ | +| `test_simplex_start_gate` | Session start gate (create/start separation) | ✅ | | `test_collated_file_hash_consistency` | Collated file hash consistency checks | ✅ | | `test_empty_collated_data_hash` | Empty collated data hash computation | ✅ | @@ -707,7 +692,7 @@ Restart integration tests (public API only) validating DB-backed stop/restart re | Test | Description | |------|-------------| -| `test_single_session_restart_round_monotonicity_first_commit_after_finalized` | Restart after finalized boundary; resumed session keeps finalized state consistent without historical recommit | +| `test_single_session_restart_round_monotonicity_first_commit_after_finalized` | Restart after finalized boundary; resumed session keeps finalized state consistent via state restoration | **Running:** ```bash @@ -726,9 +711,10 @@ Crate-private unit tests with access to 
internal symbols. | `test_database.rs` | Simplex DB records + bootstrap roundtrips | | `test_receiver.rs` | Receiver behavior, standstill cache, certificate send/receive, candidate resolver flow | | `test_candidate_resolver.rs` | CandidateResolverCache unit tests (late-joiner repair) | -| `test_session_processor.rs` | SessionProcessor unit tests (manual clock, delayed actions, scheduling) | +| `test_session_processor.rs` | SessionProcessor unit tests (manual clock, delayed actions, scheduling, finalized delivery) | | `test_restart.rs` | Restart byte-level tests (crate-private) | | `test_simplex_state.rs` | FSM logic + invariants (included via `#[path]`) | +| `test_slot_bounds.rs` | Slot bounds validation | | `test_misbehavior.rs` | Misbehavior proofs and invariant checks | | `test_session_description.rs` | Validator indexing, thresholds, time control | @@ -744,13 +730,13 @@ Core tests for the consensus state machine. Located in a separate file but inclu via `#[path]` attribute in `simplex_state.rs` to access private struct fields. 
Tests cover: - **Basic FSM**: State creation, initialization, validation -- **Candidate handling**: First slot with genesis, pending blocks, parent resolution +- **Candidate handling**: First slot with genesis, pending blocks, parent readiness / empty-tip lookup - **Vote accounting**: Notarize/skip/finalize weights, conflict detection -- **Threshold triggers**: BlockNotarized (2/3), SafeToNotar (1/3), BlockFinalized (2/3) +- **Threshold triggers**: BlockNotarized (2/3), BlockFinalized (2/3), SlotSkipped (2/3) - **Certificate Creation**: Notarization/finalization/skip certificates at threshold, caching, events - **External Certificate Import**: `set_notarize_certificate()` updates vote accounting and flags -- **Parent Validation Modes**: `require_finalized_parent` flag tests for notarized vs finalized parent requirements and deadlock scenarios -- **Misbehavior detection**: Conflicting votes, too many fallback votes, invalid ranges +- **Parent validation**: notarized/finalized parent readiness for collation +- **Misbehavior detection**: conflicting votes and invalid ranges - **Corner cases**: Finalized slot handling, window cleanup, duplicate votes, multiple blocks per slot **Running:** @@ -791,8 +777,6 @@ TL schema messages from `tl/ton_api/tl/ton_api.tl`: | `consensus.simplex.candidateAndCert` | Candidate + notarization cert (query response) | | `consensus.simplex.requestCandidate` | Query for missing candidate (RPC) | -**Note**: Fallback votes (`NotarizeFallbackVote`, `SkipFallbackVote`) are **internal FSM state only** and have no TL representation on the wire. See "Alpenglow vs TON Implementation Vote Differences" above. - ### Signature Scheme All signatures are **session-scoped** to prevent cross-session replay: @@ -920,7 +904,6 @@ Health checks run every 20 seconds. 
Anomaly alerts use the `SIMPLEX_HEALTH` log | `progress_gap` | WARN/ERROR | `first_non_progressed - first_non_finalized > window_size` | Check network connectivity | | `zero_finalization_speed` | WARN (>15s) / ERROR (>60s) | No new finalized slots | Check validator activity, standstill | | `low_activity` | WARN (<66%) / ERROR (<33%) | Active weight below threshold | Check peer connectivity | -| `parent_aging` | WARN (>30s) / ERROR (>120s) | Oldest pending parent resolution age | Check candidate availability | **Log format** (single-line, grep-friendly): @@ -928,6 +911,59 @@ Health checks run every 20 seconds. Anomaly alerts use the `SIMPLEX_HEALTH` log SIMPLEX_HEALTH anomaly= session=<8-char-hex> = ... ``` +### Session Debug Dump + +Every 15–20 seconds the session produces a structured dump. Under normal operation the dump goes to DEBUG level (`dump [OK]`). When no finalizations occur for `ROUND_DEBUG_PERIOD` (15s), the dump fires at ERROR level (`dump [STALLED]`) with a stall conclusion. + +**Health status line** (INFO, always emitted): +``` +Session 882cc37b health [OK]: shard=-1:8000000000000000 slot_nf=s57 slot_np=s57 finalized_head_seqno=43 +``` + +**Stalled dump structure** (ERROR level): +``` +Session dump [STALLED]: + conclusion: + - : + shard= + header: + validators=N local=vNNN session_time=Xs slot_duration=Xs + total_weight=W th66=T th33=T active_weight=W (XX.X%) + frontiers: + first_non_finalized=sN (unchanged Xs) + first_non_progressed=sN (unchanged Xs) + last_finalization: seqno=N slot=sN, Xs ago + last_notarization: seqno=? slot=sN, Xs ago + last_final_cert: seqno=? slot=sN, Xs ago + last_notar_cert: seqno=? 
slot=sN, Xs ago + heads: + finalized_head_seqno=N + finalized_head=slot sN id=((shard, seqno, rh ..., fh ...)) + last_mc_applied=((shard, seqno, rh ..., fh ...)) + statistics: + candidates: received=N validated=N (%) notarized=N (%) finalized=N (%) other=N (%) + traffic: msgs_in=N msgs_out=N bcasts_in=N bcasts_out=N + votes_in: notar=N final=N skip=N + duplicates: votes=N broadcasts=N request_candidates_sent=N request_candidates_recv=N + collation: + window wN slots=[sN..sN] leader=vN pubkey_b64=... adnl_b64=... + sN phase= reason=... notar=N% final=N% skip=N% flags=[...] certs=[...] + validation: + received (N%): ... + validated (N%): ... + notarized (N%): ... + finalized (N%): ... (last 10s only) + other: omitted=N total_received=N + peers: + vN adnl_b64=... pubkey_b64=... weight=N (N%) last_activity=Xs ago ... + health_findings: + - [Warn|Error] : + standstill_diagnostic: ... +``` + +**`SlotWaitPhase` values** identify what the system is waiting for in each non-finalized slot: +`WaitingForCandidate`, `WaitingForParentBase`, `WaitingForNotarization`, `NotarizedWaitingForFinalization`, `TimeoutSkipped`, `Skipped`, `Finalized`. + ### Metrics Dump Format Periodic dumps output all registered metrics with current values, derivative speeds, and computed percentages. 
Example: @@ -940,10 +976,7 @@ simplex_votes_in_notarize 126 0.84/s ## References -- [Solana Alpenglow White Paper v1, May 2025](https://drive.google.com/file/d/1y_7ddr8oNOknTQYHzXeeMD2ProQ0WjMs/view) -- [Solana Alpenglow White Paper v1.1, July 2025](https://drive.google.com/file/d/1Rlr3PdHsBmPahOInP6-Pl0bMzdayltdV/view) - - **Note**: v1.1 is a documentation update only; core algorithms unchanged -- [TON C++ Implementation](https://github.com/ton-blockchain/ton/tree/testnet/validator/consensus) +- [TON C++ Implementation](https://github.com/ton-blockchain/ton) (`testnet/validator/consensus/simplex`) ## License diff --git a/src/node/simplex/src/lib.rs b/src/node/simplex/src/lib.rs index b8077a7..9d66105 100644 --- a/src/node/simplex/src/lib.rs +++ b/src/node/simplex/src/lib.rs @@ -8,15 +8,16 @@ */ //! # Simplex Consensus Protocol //! -//! This crate implements the Simplex consensus protocol for TON blockchain, -//! based on the Solana Alpenglow White Paper with modifications for TON. +//! This crate implements the Simplex consensus protocol for TON blockchain. //! -//! ## Key Differences from Original Alpenglow +//! ## Key Protocol Properties //! //! - **Conservative path only** (no fast finality/optimistic path) //! - **Fault tolerance**: <1/3 Byzantine nodes (not 20% as in original) //! - **Certificate threshold**: 2/3 stake weight //! - **No erasure coding**: Simple broadcast instead of Rotor shreds +//! - **Spec mapping**: protocol rules tracked in [ton-blockchain/simplex-docs](https://github.com/ton-blockchain/simplex-docs) +//! - **Finalized-driven semantics**: `on_block_finalized()` is the delivery path; `on_block_committed()` stays legacy-only //! //! ## Quick Start //! @@ -37,7 +38,6 @@ //! validator_nodes, //! &local_private_key, //! db_path, -//! db_suffix, //! overlay_manager, //! session_listener, // Weak //! 
)?; @@ -377,6 +377,10 @@ pub struct SessionOptions { /// Default: 1 second pub target_rate: Duration, + /// Minimum interval between a parent block's exact generation time and + /// validation / generation of the next non-empty block. + pub min_block_interval: Duration, + /// Timeout for first block in window /// Default: 3 seconds pub first_block_timeout: Duration, @@ -441,15 +445,10 @@ pub struct SessionOptions { pub health_stall_warning_secs: u64, pub health_stall_error_secs: u64, - /// Parent resolution aging thresholds (seconds). - /// Default: 30s (warn), 120s (error) - pub health_parent_aging_warning_secs: u64, - pub health_parent_aging_error_secs: u64, - // -- Noncritical params (from simplex_config_v2 HashmapE) -- // - // These fields are deserialized from on-chain config and passed through, but not yet - // consumed by the Rust session logic. + // Most of these fields are deserialized from on-chain config and passed through. + // `min_block_interval` is consumed directly by the Rust session timing logic. // TODO: replace `timeout_increase_factor` / `max_backoff_delay` with these two fields. // C++ consensus.cpp applies multiplier+cap on window skip (exponential backoff of @@ -480,16 +479,9 @@ pub struct SessionOptions { // 1-second sliding window with this limit per peer for requestCandidate. pub candidate_resolve_rate_limit: u32, - /// When true, collation for non-first slots in a leader window waits until - /// the parent slot is notarized (or finalized) before producing the next - /// candidate. This avoids broadcasting blocks that validators cannot yet - /// accept because C++ `WaitForParent` defers until the parent is notarized. - /// - /// When false, in-window candidate chaining uses the locally generated - /// parent immediately (optimistic pipelining). - /// - /// Default: true - pub require_notarized_parent_for_collation: bool, + // TODO: wire into empty-block error backoff. 
C++ block-producer.cpp suppresses + // empty blocks for this period after a failed normal collation. + pub no_empty_blocks_on_error_timeout: Duration, } impl Default for SessionOptions { @@ -500,6 +492,7 @@ impl Default for SessionOptions { max_backoff_delay: Duration::from_secs(100), slots_per_leader_window: 1, target_rate: Duration::from_secs(1), + min_block_interval: Duration::from_secs(0), first_block_timeout: Duration::from_secs(3), max_block_size: 4 << 20, // 4 MB max_collated_data_size: 4 << 20, // 4 MB @@ -515,8 +508,6 @@ impl Default for SessionOptions { health_alert_cooldown: Duration::from_secs(30), health_stall_warning_secs: 15, health_stall_error_secs: 60, - health_parent_aging_warning_secs: 30, - health_parent_aging_error_secs: 120, first_block_timeout_multiplier: 1.2, first_block_timeout_cap: Duration::from_secs(100), candidate_resolve_timeout: Duration::from_secs(1), @@ -527,7 +518,7 @@ impl Default for SessionOptions { max_leader_window_desync: 250, bad_signature_ban_duration: Duration::from_secs(5), candidate_resolve_rate_limit: 10, - require_notarized_parent_for_collation: true, + no_empty_blocks_on_error_timeout: Duration::from_secs(15), } } } @@ -582,14 +573,6 @@ impl SessionOptions { fail!("health_stall_error_secs must be >= health_stall_warning_secs") } - if self.health_parent_aging_warning_secs == 0 { - fail!("health_parent_aging_warning_secs must be > 0") - } - - if self.health_parent_aging_error_secs < self.health_parent_aging_warning_secs { - fail!("health_parent_aging_error_secs must be >= health_parent_aging_warning_secs") - } - // Noncritical params from on-chain config if !self.first_block_timeout_multiplier.is_finite() || self.first_block_timeout_multiplier < 1.0 diff --git a/src/node/simplex/src/misbehavior.rs b/src/node/simplex/src/misbehavior.rs index aa03df9..32201e2 100644 --- a/src/node/simplex/src/misbehavior.rs +++ b/src/node/simplex/src/misbehavior.rs @@ -6,9 +6,6 @@ * * This software is provided "AS IS", WITHOUT WARRANTY OF 
ANY KIND. */ -// TODO: Remove allow(dead_code) when misbehavior detection is integrated into simplex_state.rs -#![allow(dead_code)] - //! Misbehavior detection and proof collection for Simplex consensus. //! //! This module provides types for collecting cryptographic proofs of validator @@ -106,10 +103,6 @@ pub enum VoteDescriptor { Notarize(UInt256), /// Finalize vote with block hash Finalize(UInt256), - /// Notarize fallback vote with block hash - NotarizeFallback(UInt256), - /// Skip fallback vote (no hash) - SkipFallback, } impl VoteDescriptor { @@ -119,27 +112,6 @@ impl VoteDescriptor { Self::Skip => "skip".to_string(), Self::Notarize(hash) => format!("notarize:{}", &hash.to_hex_string()[..8]), Self::Finalize(hash) => format!("finalize:{}", &hash.to_hex_string()[..8]), - Self::NotarizeFallback(hash) => format!("notar-fb:{}", &hash.to_hex_string()[..8]), - Self::SkipFallback => "skip-fb".to_string(), - } - } - - /// Get the vote type name (without hash) - pub fn vote_type(&self) -> &'static str { - match self { - Self::Skip => "skip", - Self::Notarize(_) => "notarize", - Self::Finalize(_) => "finalize", - Self::NotarizeFallback(_) => "notar-fallback", - Self::SkipFallback => "skip-fallback", - } - } - - /// Get the hash if this vote references a block - pub fn hash(&self) -> Option<&UInt256> { - match self { - Self::Skip | Self::SkipFallback => None, - Self::Notarize(h) | Self::Finalize(h) | Self::NotarizeFallback(h) => Some(h), } } } @@ -190,21 +162,6 @@ pub enum ConflictReason { /// Validator sent both Finalize and Skip for the same slot. /// C++: `finalize_.has_value() && skip_.has_value()` FinalizeAfterSkip, - - /// Validator sent both Notarize and Skip for the same slot (no fallback context). - NotarizeAfterSkip, - - /// Validator sent Finalize after already sending NotarFallback. - FinalizeAfterNotarFallback, - - /// Validator sent Finalize after already sending SkipFallback. 
- FinalizeAfterSkipFallback, - - /// Validator sent NotarFallback after already sending Finalize. - NotarFallbackAfterFinalize, - - /// Validator sent SkipFallback after already sending Finalize. - SkipFallbackAfterFinalize, } /// Report of misbehavior with validator identity and slot context. @@ -280,18 +237,21 @@ impl VoteResult { /// Returns true if the vote was a duplicate #[inline] + #[cfg(test)] pub fn is_duplicate(&self) -> bool { matches!(self, Self::Duplicate) } /// Returns true if the vote was rejected due to misbehavior #[inline] + #[cfg(test)] pub fn is_misbehavior(&self) -> bool { matches!(self, Self::Misbehavior(_)) } /// Returns the misbehavior proof if this is a misbehavior result #[inline] + #[cfg(test)] pub fn misbehavior_proof(&self) -> Option<&MisbehaviorProof> { match self { Self::Misbehavior(proof) => Some(proof), @@ -303,6 +263,7 @@ impl VoteResult { /// /// Compatibility method for tests migrating from `Result<()>`. #[inline] + #[cfg(test)] pub fn is_ok(&self) -> bool { matches!(self, Self::Applied | Self::Duplicate) } @@ -311,6 +272,7 @@ impl VoteResult { /// /// Compatibility method for tests migrating from `Result<()>`. #[inline] + #[cfg(test)] pub fn is_err(&self) -> bool { matches!(self, Self::Misbehavior(_) | Self::Rejected(_)) } @@ -320,6 +282,7 @@ impl VoteResult { /// Accepts `Applied`, `Duplicate`, and `SlotAlreadyFinalized` as success. /// For use in tests only. Production code should match on variants. #[track_caller] + #[cfg(test)] pub fn unwrap(self) { match self { Self::Applied | Self::Duplicate | Self::SlotAlreadyFinalized => {} @@ -335,6 +298,7 @@ impl VoteResult { /// Accepts `Applied`, `Duplicate`, and `SlotAlreadyFinalized` as success. /// For use in tests only. Production code should match on variants. 
#[track_caller] + #[cfg(test)] pub fn expect(self, msg: &str) { match self { Self::Applied | Self::Duplicate | Self::SlotAlreadyFinalized => {} @@ -425,6 +389,7 @@ impl MisbehaviorProof { } /// Returns the validator index who misbehaved. + #[cfg(test)] pub fn validator_idx(&self) -> ValidatorIndex { match self { Self::ConflictingVotes { validator_idx, .. } => *validator_idx, @@ -433,6 +398,7 @@ impl MisbehaviorProof { } /// Returns a human-readable description of the misbehavior type. + #[cfg(test)] pub fn description(&self) -> &'static str { match self { Self::ConflictingVotes { .. } => "conflicting votes for same slot", @@ -441,6 +407,7 @@ impl MisbehaviorProof { } /// Returns the size of the proof data in bytes. + #[cfg(test)] pub fn size_bytes(&self) -> usize { match self { Self::ConflictingVotes { vote1, vote2, .. } => vote1.len() + vote2.len(), @@ -448,12 +415,6 @@ impl MisbehaviorProof { } } - /// Format a hash as the full hex string. - #[inline] - pub fn format_hash(hash: &UInt256) -> String { - hash.to_hex_string() - } - /// Format a hash as a short hex prefix (8 characters) for logging. #[inline] pub fn format_hash_short(hash: &UInt256) -> String { @@ -461,6 +422,7 @@ impl MisbehaviorProof { } /// Get the first hash for ConflictingVotes (returns None for other variants). + #[cfg(test)] pub fn hash1(&self) -> Option<&UInt256> { match self { Self::ConflictingVotes { hash1, .. } => Some(hash1), @@ -469,6 +431,7 @@ impl MisbehaviorProof { } /// Get the second hash for ConflictingVotes (returns None for other variants). + #[cfg(test)] pub fn hash2(&self) -> Option<&UInt256> { match self { Self::ConflictingVotes { hash2, .. } => Some(hash2), @@ -477,6 +440,7 @@ impl MisbehaviorProof { } /// Get the existing vote descriptor for ConflictingVoteTypes (returns None for other variants). + #[cfg(test)] pub fn existing_vote(&self) -> Option<&VoteDescriptor> { match self { Self::ConflictingVoteTypes { existing_vote, .. 
} => Some(existing_vote), @@ -485,6 +449,7 @@ impl MisbehaviorProof { } /// Get the new vote descriptor for ConflictingVoteTypes (returns None for other variants). + #[cfg(test)] pub fn new_vote(&self) -> Option<&VoteDescriptor> { match self { Self::ConflictingVoteTypes { new_vote, .. } => Some(new_vote), @@ -499,11 +464,6 @@ impl ConflictReason { match self { Self::NotarizeFinalizeHashMismatch => "notarize and finalize for different blocks", Self::FinalizeAfterSkip => "finalize after skip", - Self::NotarizeAfterSkip => "notarize after skip", - Self::FinalizeAfterNotarFallback => "finalize after notar-fallback", - Self::FinalizeAfterSkipFallback => "finalize after skip-fallback", - Self::NotarFallbackAfterFinalize => "notar-fallback after finalize", - Self::SkipFallbackAfterFinalize => "skip-fallback after finalize", } } } @@ -511,15 +471,25 @@ impl ConflictReason { impl Display for MisbehaviorProof { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { - Self::ConflictingVotes { slot, validator_idx, vote_type, hash1, hash2, .. } => { + Self::ConflictingVotes { + slot, + validator_idx, + vote_type, + hash1, + hash2, + vote1, + vote2, + } => { write!( f, - "conflicting {} votes from v{:03} at slot {}: {} vs {}", + "conflicting {} votes from v{:03} at slot {}: {} vs {} (raw={}+{} bytes)", vote_type, validator_idx.value(), slot.value(), Self::format_hash_short(hash1), - Self::format_hash_short(hash2) + Self::format_hash_short(hash2), + vote1.len(), + vote2.len() ) } Self::ConflictingVoteTypes { @@ -527,17 +497,20 @@ impl Display for MisbehaviorProof { validator_idx, existing_vote, new_vote, + vote1, + vote2, reason, - .. 
} => { write!( f, - "{} from v{:03} at slot {}: existing={}, new={}", + "{} from v{:03} at slot {}: existing={}, new={} (raw={}+{} bytes)", reason, validator_idx.value(), slot.value(), existing_vote, - new_vote + new_vote, + vote1.len(), + vote2.len() ) } } diff --git a/src/node/simplex/src/receiver.rs b/src/node/simplex/src/receiver.rs index a2d857f..ff54d58 100644 --- a/src/node/simplex/src/receiver.rs +++ b/src/node/simplex/src/receiver.rs @@ -223,7 +223,7 @@ pub(crate) struct StandstillTriggerNotification { Receiver trait and type aliases These are crate-internal - not exposed in public API. - Moved here from lib.rs for encapsulation (CODE-2). + Moved here from lib.rs for encapsulation. */ /// Shared health counters between receiver and session processor. @@ -474,7 +474,13 @@ pub(crate) trait ReceiverListener: Send + Sync { /// Periodic activity update from receiver /// - active_weight: sum of weights for validators with recent activity /// - last_activity: last receive time per validator (None if never received) - fn on_activity(&self, active_weight: ValidatorWeight, last_activity: Vec>); + /// - snapshot: full per-source activity snapshot for dump diagnostics + fn on_activity( + &self, + active_weight: ValidatorWeight, + last_activity: Vec>, + snapshot: ReceiverActivitySnapshot, + ); /// Standstill alarm fired and a fresh replay snapshot was built. /// @@ -490,7 +496,7 @@ pub(crate) trait ReceiverListener: Send + Sync { /// /// This achieves parity with C++ `CandidateResolver::try_load_candidate_data_from_db()`. 
/// - /// Reference: Alpenglow-Implementation-Plan.md Section 7.14a + /// Reference: Simplex implementation plan Section 7.14a fn on_candidate_query_fallback( &self, slot: SlotIndex, @@ -723,7 +729,7 @@ impl ReceiverThreads { if let Err(panic_payload) = result { log::error!( - "FATAL PANIC (PANIC-1): caught panic in {}: payload=\"{}\"; forcing receiver stop", + "FATAL PANIC: caught panic in {}: payload=\"{}\"; forcing receiver stop", thread::current().name().unwrap_or(""), crate::utils::panic_payload_to_string(panic_payload.as_ref()) ); @@ -839,6 +845,26 @@ struct SourceStats { last_recv_time: Option, /// Last send time last_send_time: Option, + // Typed vote counters + votes_in_notarize: u64, + votes_in_finalize: u64, + votes_in_skip: u64, + // Typed cert counters + certs_in_notar: u64, + certs_in_final: u64, + certs_in_skip: u64, + // Candidate counters + candidates_received: u64, + candidate_requests_sent: u64, + candidate_requests_received: u64, + // Typed last-receive timestamps + last_vote_recv_time: Option, + last_notar_cert_recv_time: Option, + last_final_cert_recv_time: Option, + last_candidate_recv_time: Option, + // Duplicate counters + duplicate_votes: u64, + duplicate_broadcasts: u64, } impl SourceStats { @@ -859,10 +885,64 @@ impl SourceStats { out_broadcasts: 0, last_recv_time: None, last_send_time: None, + votes_in_notarize: 0, + votes_in_finalize: 0, + votes_in_skip: 0, + certs_in_notar: 0, + certs_in_final: 0, + certs_in_skip: 0, + candidates_received: 0, + candidate_requests_sent: 0, + candidate_requests_received: 0, + last_vote_recv_time: None, + last_notar_cert_recv_time: None, + last_final_cert_recv_time: None, + last_candidate_recv_time: None, + duplicate_votes: 0, + duplicate_broadcasts: 0, } } } +/// Snapshot of per-source activity for the session dump. +/// +/// Passed from receiver thread to session processor via `on_activity()`. 
+#[derive(Clone, Debug)] +pub(crate) struct SourceActivitySnapshot { + pub source_idx: u32, + pub weight: ValidatorWeight, + pub adnl_id_base64: String, + pub in_messages: u64, + pub out_messages: u64, + pub in_broadcasts: u64, + pub out_broadcasts: u64, + pub last_recv_time: Option, + pub last_send_time: Option, + pub votes_in_notarize: u64, + pub votes_in_finalize: u64, + pub votes_in_skip: u64, + pub certs_in_notar: u64, + pub certs_in_final: u64, + pub certs_in_skip: u64, + pub candidates_received: u64, + pub candidate_requests_sent: u64, + pub candidate_requests_received: u64, + pub last_vote_recv_time: Option, + pub last_notar_cert_recv_time: Option, + pub last_final_cert_recv_time: Option, + pub last_candidate_recv_time: Option, + pub duplicate_votes: u64, + pub duplicate_broadcasts: u64, +} + +/// Aggregate snapshot of receiver activity for session dump. +#[derive(Clone, Debug)] +pub(crate) struct ReceiverActivitySnapshot { + pub active_weight: ValidatorWeight, + pub last_activity: Vec>, + pub sources: Vec, +} + /* Deduplication key */ @@ -1084,10 +1164,23 @@ impl ReceiverImpl { source_idx, slot ); + if let Some(stats) = self.sources.get_mut(source_idx as usize) { + stats.duplicate_votes += 1; + } return; } slot_dedup.insert(dedup_key, true); + if let Some(stats) = self.sources.get_mut(source_idx as usize) { + let now = SystemTime::now(); + stats.last_vote_recv_time = Some(now); + match vote.vote() { + UnsignedVote::Consensus_Simplex_NotarizeVote(_) => stats.votes_in_notarize += 1, + UnsignedVote::Consensus_Simplex_FinalizeVote(_) => stats.votes_in_finalize += 1, + UnsignedVote::Consensus_Simplex_SkipVote(_) => stats.votes_in_skip += 1, + } + } + // Forward to listener with raw bytes for misbehavior proof storage if let Some(listener) = self.listener.upgrade() { listener.on_vote(source_idx, vote, raw_vote); @@ -1161,6 +1254,24 @@ impl ReceiverImpl { return; } + if let Some(stats) = self.sources.get_mut(source_idx as usize) { + let now = SystemTime::now(); + 
match kind { + "notarize" => { + stats.certs_in_notar += 1; + stats.last_notar_cert_recv_time = Some(now); + } + "finalize" => { + stats.certs_in_final += 1; + stats.last_final_cert_recv_time = Some(now); + } + "skip" => { + stats.certs_in_skip += 1; + } + _ => {} + } + } + // Forward to listener for verification and application // SessionProcessor will verify the certificate signatures and update SimplexState if let Some(listener) = self.listener.upgrade() { @@ -1351,6 +1462,11 @@ impl ReceiverImpl { // Reference: C++ CandidateResolver caches candidates on CandidateReceived event self.resolver_cache.cache_candidate(slot_idx, candidate_hash.clone(), candidate_bytes); + if let Some(stats) = self.sources.get_mut(source_idx as usize) { + stats.candidates_received += 1; + stats.last_candidate_recv_time = Some(SystemTime::now()); + } + // Forward to listener (no deduplication for blocks - SessionProcessor handles it) // None notar_cert for broadcasts - certificate comes separately or via query if let Some(listener) = self.listener.upgrade() { @@ -2856,6 +2972,44 @@ impl ReceiverImpl { .collect() } + fn build_activity_snapshot( + &self, + active_weight: ValidatorWeight, + last_activity: Vec>, + ) -> ReceiverActivitySnapshot { + let sources = self + .sources + .iter() + .map(|s| SourceActivitySnapshot { + source_idx: s.source_idx, + weight: s.weight, + adnl_id_base64: key_to_base64(&s.adnl_id), + in_messages: s.in_messages, + out_messages: s.out_messages, + in_broadcasts: s.in_broadcasts, + out_broadcasts: s.out_broadcasts, + last_recv_time: s.last_recv_time, + last_send_time: s.last_send_time, + votes_in_notarize: s.votes_in_notarize, + votes_in_finalize: s.votes_in_finalize, + votes_in_skip: s.votes_in_skip, + certs_in_notar: s.certs_in_notar, + certs_in_final: s.certs_in_final, + certs_in_skip: s.certs_in_skip, + candidates_received: s.candidates_received, + candidate_requests_sent: s.candidate_requests_sent, + candidate_requests_received: 
s.candidate_requests_received, + last_vote_recv_time: s.last_vote_recv_time, + last_notar_cert_recv_time: s.last_notar_cert_recv_time, + last_final_cert_recv_time: s.last_final_cert_recv_time, + last_candidate_recv_time: s.last_candidate_recv_time, + duplicate_votes: s.duplicate_votes, + duplicate_broadcasts: s.duplicate_broadcasts, + }) + .collect(); + ReceiverActivitySnapshot { active_weight, last_activity, sources } + } + /// Debug dump of receiver state fn debug_dump(&self) { if !log::log_enabled!(log::Level::Debug) { @@ -4114,8 +4268,12 @@ impl ReceiverWrapper { let active_weight = receiver_impl.calculate_active_weight(ACTIVITY_THRESHOLD); let last_activity = receiver_impl.get_last_activity(); + let snapshot = receiver_impl.build_activity_snapshot( + active_weight, + last_activity.clone(), + ); if let Some(listener) = receiver_impl.listener.upgrade() { - listener.on_activity(active_weight, last_activity); + listener.on_activity(active_weight, last_activity, snapshot); } next_active_weight_time = SystemTime::now() + ACTIVE_WEIGHT_RECOMPUTE_PERIOD; @@ -4178,7 +4336,7 @@ impl ReceiverWrapper { /// Compute overlay ID matching C++ consensus.overlayId /// /// CRITICAL: Must match C++ implementation exactly. 
- /// See: docs/ton-node-cpp-alpenglow/validator/consensus/private-overlay.cpp + /// See: ton-node-cpp/validator/consensus/private-overlay.cpp fn compute_overlay_id( session_id: &SessionId, nodes: &[SessionNode], diff --git a/src/node/simplex/src/session.rs b/src/node/simplex/src/session.rs index 5b4d0c9..ffc8373 100644 --- a/src/node/simplex/src/session.rs +++ b/src/node/simplex/src/session.rs @@ -142,9 +142,14 @@ impl ReceiverListener for ReceiverListenerImpl { } /// Handle activity updates from the receiver - fn on_activity(&self, active_weight: ValidatorWeight, last_activity: Vec>) { + fn on_activity( + &self, + active_weight: ValidatorWeight, + last_activity: Vec>, + snapshot: crate::receiver::ReceiverActivitySnapshot, + ) { self.task_queue.post_closure(Box::new(move |processor: &mut SessionProcessor| { - processor.on_activity(active_weight, last_activity); + processor.on_activity(active_weight, last_activity, snapshot); })); } diff --git a/src/node/simplex/src/session_description.rs b/src/node/simplex/src/session_description.rs index 2d99ce6..3640744 100644 --- a/src/node/simplex/src/session_description.rs +++ b/src/node/simplex/src/session_description.rs @@ -279,6 +279,7 @@ impl SessionDescription { /// Check if the given validator index is this validator #[inline] + #[cfg(test)] pub fn is_self(&self, idx: ValidatorIndex) -> bool { idx == self.self_idx } @@ -328,6 +329,7 @@ impl SessionDescription { } /// Is this the first slot in its leader window? + #[cfg(test)] pub fn is_first_in_window(&self, slot: SlotIndex) -> bool { slot.is_first_in_window(self.options.slots_per_leader_window) } @@ -344,6 +346,7 @@ impl SessionDescription { } /// Is this node the leader for the given slot's window? 
+ #[cfg(test)] pub fn is_self_leader(&self, slot: SlotIndex) -> bool { self.get_leader(slot) == self.self_idx } diff --git a/src/node/simplex/src/session_processor.rs b/src/node/simplex/src/session_processor.rs index 3889b3d..a3a776c 100644 --- a/src/node/simplex/src/session_processor.rs +++ b/src/node/simplex/src/session_processor.rs @@ -10,6 +10,7 @@ //! //! Contains the core consensus algorithm in a single-threaded context. //! This module is crate-private. +//! C++ cross-reference: [ton-blockchain/ton](https://github.com/ton-blockchain/ton) (`testnet/validator/consensus/simplex`). //! //! # Architecture //! @@ -78,12 +79,13 @@ use crate::{ session_description::SessionDescription, simplex_state::{ BlockFinalizedEvent, FinalizationReachedEvent, NotarizationReachedEvent, SimplexEvent, - SimplexState, SimplexStateOptions, SkipCertificateReachedEvent, SlotSkippedEvent, Vote, + SimplexState, SkipCertificateReachedEvent, SlotSkippedEvent, Vote, }, startup_recovery::{CandidateHash, SessionStartupRecoveryListener, SignatureBytes}, task_queue::{post_callback_closure, CallbackTaskQueuePtr, TaskPtr, TaskQueuePtr}, utils::{ - extract_vote_and_signature, sign_vote, threshold_33, threshold_66, verify_vote_signature, + extract_consensus_gen_utime_ms, extract_vote_and_signature, sign_vote, threshold_33, + threshold_66, verify_vote_signature, }, BlockCandidatePriority, ConsensusOverlayManagerPtr, MetricsHandle, PrivateKey, RawVoteData, SessionId, SessionListenerPtr, ValidatorWeight, SIMPLEX_ROUNDLESS, @@ -93,12 +95,13 @@ use consensus_common::{ }; use std::{ collections::{BTreeMap, HashMap, HashSet, VecDeque}, + fmt::{Display, Formatter}, mem::discriminant, sync::{ atomic::{AtomicBool, AtomicU32, Ordering}, Arc, }, - time::{Duration, SystemTime}, + time::{Duration, SystemTime, UNIX_EPOCH}, }; use ton_api::{ deserialize_boxed, deserialize_typed, serialize_boxed, @@ -118,18 +121,20 @@ use ton_api::{ IntoBoxed, }; use ton_block::{ - error, fail, sha256_digest, BlockIdExt, 
BlockSignaturesPure, BlockSignaturesSimplex, - BlockSignaturesVariant, BocFlags, CryptoSignature, CryptoSignaturePair, Error, Result, UInt256, - ValidatorBaseInfo, + base64_encode, error, fail, sha256_digest, BlockIdExt, BlockSignaturesPure, + BlockSignaturesSimplex, BlockSignaturesVariant, BocFlags, CryptoSignature, CryptoSignaturePair, + Error, Result, UInt256, ValidatorBaseInfo, }; /* Constants */ -/// Maximum timeout for next awake time (1 day) -/// Used as default "far future" value when no specific timeout is scheduled -const MAX_AWAKE_TIMEOUT: Duration = Duration::from_secs(86400); +/// Maximum timeout for next awake time. +/// +/// TODO(simplex-timing): experimental 10ms wake fallback for simnet/testnet validation. +/// Restore the old "far future" behavior after the wake-discipline fixes are validated. +const MAX_AWAKE_TIMEOUT: Duration = Duration::from_millis(10); /// Maximum generation time for collation - warn if exceeded const MAX_GENERATION_TIME: Duration = Duration::from_millis(1000); @@ -151,55 +156,156 @@ const CANDIDATE_REQUEST_DELAY: Duration = Duration::from_secs(1); /// Under network partitions, a single request may time out; we must retry, but not spam. const CANDIDATE_REQUEST_RETRY_INTERVAL: Duration = Duration::from_secs(2); -/// Maximum parent chain depth for resolution tracking -/// Protects against excessive recursion in update_resolution_cache_chain +/// Maximum empty-parent ancestry depth to walk when resolving the expected +/// normal tip for empty candidates. const MAX_CHAIN_DEPTH: u32 = 10000; -/// Warning threshold for deep parent chain recursion -/// Logs a warning if recursion depth reaches this level -const DEEP_RECURSION_WARNING_THRESHOLD: u32 = 100; +/// SessionProcessor always enforces C++ `WaitForParent` readiness before dispatching validation. 
+/// +/// Masterchain stale-parent protection remains validator-side, matching the C++ split where +/// simplex waits for parent/skip readiness and `block-validator.cpp` owns accepted-head checks. -/// Maximum time to wait for parent resolution before timeout -/// Candidates waiting longer than this are considered failed -const MAX_PARENT_WAIT_TIME: Duration = Duration::from_secs(600); // 10 minutes +/// Maximum number of recently-finalized blocks to show in validation section dump. +const RECENT_FINALIZED_DUMP_WINDOW: Duration = Duration::from_secs(10); -/// Integration knob: avoid generating NON-EMPTY blocks on non-finalized parents. -/// -/// When `true`, shardchain sessions use the masterchain-style empty-block rule -/// (`finalized_head_seqno + 1 < new_seqno`) instead of the C++ shardchain rule -/// (MC lag threshold). This was needed before optimistic validation was implemented. +/// Observability state for stall diagnostics. /// -/// Now that ValidatorGroup uses candidate-native validation (run_validate_query_any_candidate) -/// and check_validation() accepts notarized parents, this flag is set to `false` for C++ parity. -const DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION: bool = false; +/// Tracks cursor-change timestamps and consensus milestone times so the dump +/// can report how long since each frontier moved and when the last cert arrived. 
+struct SessionObservability { + prev_first_non_finalized: SlotIndex, + prev_first_non_progressed: SlotIndex, + last_finalized_cursor_change_at: SystemTime, + last_progression_change_at: SystemTime, + last_notarization_at: Option, + last_notarization_slot: Option, + last_notar_cert_at: Option, + last_notar_cert_slot: Option, + last_final_cert_at: Option, + last_final_cert_slot: Option, + last_mc_applied_block_id: Option, +} + +impl SessionObservability { + fn new(now: SystemTime) -> Self { + Self { + prev_first_non_finalized: SlotIndex(0), + prev_first_non_progressed: SlotIndex(0), + last_finalized_cursor_change_at: now, + last_progression_change_at: now, + last_notarization_at: None, + last_notarization_slot: None, + last_notar_cert_at: None, + last_notar_cert_slot: None, + last_final_cert_at: None, + last_final_cert_slot: None, + last_mc_applied_block_id: None, + } + } +} + +/// Health check finding kind for structured stall diagnosis. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HealthFindingKind { + ZeroFinalizationSpeed, + ProgressGap, + LowActivity, + StandstillTriggers, + CandidateGiveups, + SkipVoteDominance, + ValidatorIsolated, + CertVerifyFailures, +} + +/// Single health check finding with severity and human-readable summary. +#[derive(Debug, Clone)] +struct HealthFinding { + kind: HealthFindingKind, + severity: log::Level, + summary: String, +} + +/// Lifecycle phase of a non-finalized slot for stall diagnosis. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(dead_code)] +pub(crate) enum SlotWaitPhase { + WaitingForCandidate, + WaitingForParentBase, + WaitingForNotarization, + NotarizedWaitingForFinalization, + Skipped, + Finalized, + TimeoutSkipped, +} -/// Controls whether SessionProcessor blocks validation submission on C++-style WaitForParent -/// readiness (`parent finalized/notarized + full skip-gap coverage`). 
-#[derive(Clone, Copy, Debug, Eq, PartialEq)] +impl Display for SlotWaitPhase { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::WaitingForCandidate => write!(f, "WaitingForCandidate"), + Self::WaitingForParentBase => write!(f, "WaitingForParentBase"), + Self::WaitingForNotarization => write!(f, "WaitingForNotarization"), + Self::NotarizedWaitingForFinalization => write!(f, "NotarizedWaitFinalization"), + Self::Skipped => write!(f, "Skipped"), + Self::Finalized => write!(f, "Finalized"), + Self::TimeoutSkipped => write!(f, "TimeoutSkipped"), + } + } +} + +/// Per-slot diagnostic for non-finalized slots. +#[derive(Debug)] #[allow(dead_code)] -enum ParentReadinessMode { - /// Keep C++ parity behavior in SessionProcessor. - StrictWaitForParent, - /// Allow sending candidates to validator-side validation before parent readiness converges. - RelaxedAllowEarlyValidation, +pub(crate) struct SlotDiagnostic { + pub slot: SlotIndex, + pub window_idx: WindowIndex, + pub phase: SlotWaitPhase, + pub reason: String, + pub has_pending_block: bool, + pub available_parent: bool, + pub voted_notar: bool, + pub voted_skip: bool, + pub voted_final: bool, + pub has_notar_cert: bool, + pub has_final_cert: bool, + pub has_skip_cert: bool, + pub notar_weight_pct: f64, + pub final_weight_pct: f64, + pub skip_weight_pct: f64, + pub notar_or_skip_weight_pct: f64, + pub is_timeout_skipped: bool, } -/// Controls whether SessionProcessor enforces MC accepted-head ordering before forwarding. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +/// Window-level summary for dump grouping. +#[derive(Debug)] #[allow(dead_code)] -enum McAcceptedHeadMode { - /// Keep MC accepted-head gate in SessionProcessor (`check_mc_validation_ready`). - StrictSessionProcessorGate, - /// Delegate MC stale protection to validator-side checks. 
- ValidatorSideOnly, +pub(crate) struct WindowDiagnostic { + pub window_idx: WindowIndex, + pub slot_begin: SlotIndex, + pub slot_end: SlotIndex, + pub leader_idx: ValidatorIndex, + pub had_timeouts: bool, + pub slots: Vec, } -/// Default mode for current pass: allow validator-side attempts even while parent readiness is -/// still converging in SessionProcessor. -const PARENT_READINESS_MODE: ParentReadinessMode = ParentReadinessMode::RelaxedAllowEarlyValidation; +/// Candidate funnel totals for validation inventory. +struct CandidateTotals { + received_total: usize, + received_unvalidated: usize, + validated_not_notarized: usize, + notarized_not_finalized: usize, + finalized_recent: usize, + other_omitted: usize, +} -/// Default mode for current pass: keep MC stale protection on validator side. -const MC_ACCEPTED_HEAD_MODE: McAcceptedHeadMode = McAcceptedHeadMode::ValidatorSideOnly; +impl CandidateTotals { + fn pct(&self, value: usize) -> f64 { + if self.received_total == 0 { + 0.0 + } else { + 100.0 * value as f64 / self.received_total as f64 + } + } +} /// Tracks per-anomaly cooldowns and delta baselines for health alert deduplication. /// All timestamps use `SystemTime` (via `self.now()`) for deterministic testing. 
@@ -209,7 +315,6 @@ pub(crate) struct HealthAlertState { last_cert_fail_warn: SystemTime, last_finalization_speed_warn: SystemTime, last_finalization_nonzero_at: SystemTime, - last_parent_aging_warn: SystemTime, last_progress_warn: SystemTime, last_skip_ratio_warn: SystemTime, last_standstill_warn: SystemTime, @@ -234,7 +339,6 @@ impl HealthAlertState { last_cert_fail_warn: warn_base, last_finalization_speed_warn: warn_base, last_finalization_nonzero_at: now, - last_parent_aging_warn: warn_base, last_progress_warn: warn_base, last_skip_ratio_warn: warn_base, last_standstill_warn: warn_base, @@ -360,6 +464,8 @@ struct LocalChainHead { parent_info: crate::block::CandidateParentInfo, /// Resolved BlockIdExt of the generated candidate (for seqno derivation and explicit parent hint) block_id: BlockIdExt, + /// Exact generation time extracted from ConsensusExtraData, if available. + gen_utime_ms: Option, } /* @@ -408,6 +514,8 @@ struct GeneratedBlockDesc { tl_candidate_data: CandidateData, /// Signature for FSM Candidate signature: Vec, + /// Exact generation time extracted from ConsensusExtraData, if available. + gen_utime_ms: Option, } /* @@ -456,7 +564,7 @@ struct ReceivedCandidate { source_idx: ValidatorIndex, /// Candidate ID hash (from RawCandidateId.hash) /// This is computed from TL candidateHashData, NOT the block's root_hash - /// Used for matching parent references in parent resolution + /// Used for matching parent references during candidate metadata lookups #[allow(dead_code)] // May be used for debugging/diagnostics candidate_id_hash: UInt256, /// Serialized CandidateHashData TL bytes @@ -477,36 +585,17 @@ struct ReceivedCandidate { /// Collated data (extracted from TL) #[allow(dead_code)] collated_data: crate::BlockPayloadPtr, + /// Exact generation time extracted from ConsensusExtraData, if available. 
+ gen_utime_ms: Option, /// Time when candidate was received (for latency tracking) #[allow(dead_code)] receive_time: SystemTime, /// True if this is an empty block (inherits parent's BlockIdExt) is_empty: bool, - /// Parent candidate ID (None for genesis/first in epoch) - /// Used for recursive parent resolution + /// Parent candidate ID (None for genesis/first in epoch). + /// Used for empty-parent tip checks, explicit-parent collation hints, and + /// restart-seeded metadata lookups. parent_id: Option, - /// Cached resolution status: true if entire parent chain is available - /// Updated by update_resolution_cache_chain when parents arrive - is_fully_resolved: bool, -} - -/// Tracks candidates waiting for parent chain resolution -/// -/// When a candidate is received but its parent is not yet available, -/// it's queued here until the parent arrives. This enables recursive -/// parent resolution - if the parent itself has a missing parent, -/// the chain is resolved depth-first. -/// -/// Reference: C++ candidate-resolver.cpp ResolveCandidate bus message -struct PendingParentResolution { - /// The raw candidate waiting for parent(s) - raw_candidate: RawCandidate, - /// Slot of this candidate - slot: SlotIndex, - /// Source validator index (leader) - source_idx: ValidatorIndex, - /// Time when candidate was received (for timeout) - receive_time: SystemTime, } /// Pending validation entry @@ -535,6 +624,7 @@ struct GeneratedCandidateValidationWatch { } #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(dead_code)] enum McValidationReadiness { Ready, WaitingForAcceptedHead, @@ -698,6 +788,9 @@ pub(crate) struct SessionProcessor { /// `on_candidate_received` self-loop, so `resolve_parent_block_id()` can /// find the parent immediately for chained precollation. generated_parent_cache: HashMap, + /// Exact generation timestamps for locally generated parents before the + /// async self-receive path populates `received_candidates`. 
+ generated_parent_gen_utime_ms_cache: HashMap, /// Locally generated candidates that have not yet validated successfully. /// /// Used to surface warnings and metrics when the self-loop or validation @@ -761,6 +854,8 @@ pub(crate) struct SessionProcessor { validation_latency_histogram: metrics::Histogram, /// Histogram for collation latency (time to generate a block) collation_latency_histogram: metrics::Histogram, + /// Histogram for how late `check_all()` runs relative to its scheduled wake. + check_all_wake_slip_histogram: metrics::Histogram, /// Gauge for current active weight from network active_weight_gauge: metrics::Gauge, /// Result status counter for validation requests @@ -939,24 +1034,6 @@ pub(crate) struct SessionProcessor { /// Candidate request throttling: (slot, hash) → next allowed request time. requested_candidates: HashMap, - /* - ======================================================================== - Pending Parent Resolution (Recursive Candidate Resolution) - - Tracks candidates waiting for their parent chain to be resolved. - When a candidate is received but its parent is not yet available, - it's queued here until the parent arrives. - - Reference: C++ consensus.cpp get_resolved_candidate, bus.h ResolveCandidate - ======================================================================== - */ - /// Map: parent_id → Vec of candidates waiting for this parent - /// - /// When a candidate's parent is missing, we queue the candidate here. - /// When a parent arrives (in on_candidate_received), we check this map - /// and process any waiting candidates. 
- pending_parent_resolutions: HashMap>, - /* ======================================================================== Misbehavior Tracking @@ -1036,6 +1113,10 @@ pub(crate) struct SessionProcessor { pub(crate) receiver_health_counters: Arc, /// Local cert verify fail total (for delta-based anomaly detection) pub(crate) cert_verify_fails_total: u64, + /// Observability state for stall diagnostics (cursor ages, milestone timestamps) + observability: SessionObservability, + /// Latest receiver activity snapshot (updated periodically via on_activity) + last_receiver_snapshot: Option, } impl SessionProcessor { @@ -1176,7 +1257,7 @@ impl SessionProcessor { /* ======================================================================== - INT-2: Per-slot stage tracking accessors (for latency metrics) + Per-slot stage tracking accessors (for latency metrics) These accessors track milestone events within a slot for latency measurement: first candidate received, first notarized, first finalized. @@ -1249,18 +1330,6 @@ impl SessionProcessor { .and_then(|rt| rt.validated_candidate_data.as_ref()) } - /// Check if any validator has validated candidate data for this slot. - fn slot_has_validated_candidate_from( - &self, - slot: SlotIndex, - validator_idx: ValidatorIndex, - ) -> bool { - self.slot_entry(slot) - .and_then(|e| e.runtime.as_ref()) - .and_then(|rt| rt.validated_candidate_data.as_ref()) - .map_or(false, |vc| vc.source_idx == validator_idx) - } - /// Create new session processor /// /// The processor is created with empty state. Bootstrap state is applied @@ -1295,16 +1364,10 @@ impl SessionProcessor { initial_block_seqno ); - // Initialize SimplexState FSM with C++-compatible options. - // - // We keep `require_finalized_parent=false` (C++ mode) so the FSM can parent on notarized - // blocks and avoid deadlock when a slot is notarized but not finalized/skipped yet. - // + // Initialize SimplexState FSM. 
// SIMPLEX_ROUNDLESS: // - We pass `SIMPLEX_ROUNDLESS` in callbacks to bypass round-based invariants. - let simplex_state_options = SimplexStateOptions::cpp_compatible(); - - let simplex_state = SimplexState::new(&description, simplex_state_options)?; + let simplex_state = SimplexState::new(&description)?; let initial_standstill_slots = simplex_state.get_tracked_slots_interval(); let initial_progress_slot = simplex_state.get_first_non_progressed_slot().value(); @@ -1316,22 +1379,23 @@ impl SessionProcessor { receiver.set_standstill_slots(initial_standstill_slots.0, initial_standstill_slots.1); log::info!( - "Session {} SIMPLEX MODE: require_finalized_parent=false (C++ parenting enabled). \ - Optimistic validation: candidate-native path (notarized parents accepted). \ - DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION={}. \ - parent_readiness_mode={:?}, mc_accepted_head_mode={:?}.", + "Session {} SIMPLEX MODE: C++ parenting enabled (notarized parents accepted). \ + Candidate-native validation enabled. 
\ + WaitForParent gating=strict, MC stale protection=validator-side.", session_id.to_hex_string(), - DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION, - PARENT_READINESS_MODE, - MC_ACCEPTED_HEAD_MODE, ); log::info!( - "Session {} SimplexState FSM initialized: slots_per_window={}, \ - require_finalized_parent=false", + "Session {} SimplexState FSM initialized: slots_per_window={}", session_id.to_hex_string(), description.opts().slots_per_leader_window, ); + log::warn!( + "Session {}: TEMP experimental MAX_AWAKE_TIMEOUT={}ms is enabled; restore the old \ + far-future fallback after timing validation is complete", + session_id.to_hex_string(), + MAX_AWAKE_TIMEOUT.as_millis(), + ); // Initialize metrics let metrics_receiver = description.get_metrics_receiver().clone(); @@ -1341,6 +1405,7 @@ impl SessionProcessor { slot_duration_histogram, validation_latency_histogram, collation_latency_histogram, + check_all_wake_slip_histogram, active_weight_gauge, validates_counter, collates_counter, @@ -1420,6 +1485,7 @@ impl SessionProcessor { earliest_collation_time: None, local_chain_head: None, generated_parent_cache: HashMap::new(), + generated_parent_gen_utime_ms_cache: HashMap::new(), generated_candidates_waiting_validation: HashMap::new(), // Validation state pending_validations: HashMap::new(), @@ -1439,6 +1505,7 @@ impl SessionProcessor { slot_duration_histogram, validation_latency_histogram, collation_latency_histogram, + check_all_wake_slip_histogram, active_weight_gauge, validates_counter, collates_counter, @@ -1482,8 +1549,6 @@ impl SessionProcessor { accepted_normal_head_block_id: None, // Candidate request tracking requested_candidates: HashMap::new(), - // Pending parent resolution - pending_parent_resolutions: HashMap::new(), finalized_delivery_sent: HashSet::new(), // Misbehavior tracking misbehavior_reports: Vec::new(), @@ -1520,6 +1585,8 @@ impl SessionProcessor { health_alert_state: HealthAlertState::new(now, health_alert_cooldown), receiver_health_counters, 
cert_verify_fails_total: 0, + observability: SessionObservability::new(now), + last_receiver_snapshot: None, }; // Increment errors_counter metric with startup errors (for metrics consistency) @@ -1561,6 +1628,21 @@ impl SessionProcessor { self.description.get_session_creation_time() } + /// Format a duration as "X.Ys ago" or "never". + fn fmt_ago(now: SystemTime, t: Option) -> String { + t.and_then(|t| now.duration_since(t).ok()) + .map(|d| format!("{:.1}s ago", d.as_secs_f64())) + .unwrap_or_else(|| "never".to_string()) + } + + /// Format a duration as "X.Ys" or "never". + fn fmt_dur(now: SystemTime, t: SystemTime) -> String { + now.duration_since(t) + .ok() + .map(|d| format!("{:.1}s", d.as_secs_f64())) + .unwrap_or_else(|| "?".to_string()) + } + /* Metrics initialization */ @@ -1580,6 +1662,7 @@ impl SessionProcessor { metrics::Histogram, // slot_duration_histogram metrics::Histogram, // validation_latency_histogram metrics::Histogram, // collation_latency_histogram + metrics::Histogram, // check_all_wake_slip_histogram metrics::Gauge, // active_weight_gauge ResultStatusCounter, // validates_counter ResultStatusCounter, // collates_counter @@ -1632,6 +1715,8 @@ impl SessionProcessor { let validation_latency_histogram = sink.register_histogram(&"time:validation_latency".into()); let collation_latency_histogram = sink.register_histogram(&"time:collation_latency".into()); + let check_all_wake_slip_histogram = + sink.register_histogram(&"time:check_all_wake_slip_ms".into()); let broadcast_validation_latency_histogram = sink.register_histogram(&"time:broadcast_validation_latency".into()); @@ -1721,6 +1806,7 @@ impl SessionProcessor { slot_duration_histogram, validation_latency_histogram, collation_latency_histogram, + check_all_wake_slip_histogram, active_weight_gauge, validates_counter, collates_counter, @@ -2209,6 +2295,7 @@ impl SessionProcessor { /// } /// ``` pub fn set_mc_finalized_block(&mut self, applied_top: BlockIdExt) { + 
self.observability.last_mc_applied_block_id = Some(applied_top.clone()); let session_shard = self.description.get_shard(); if applied_top.shard() != session_shard { log::trace!( @@ -2239,6 +2326,7 @@ impl SessionProcessor { if new_val > consensus { self.last_consensus_finalized_seqno = Some(new_val); } + self.wake_now(); } /// Get the last masterchain finalized seqno @@ -2251,7 +2339,7 @@ impl SessionProcessor { /// Determines if an empty block should be generated for finalization recovery /// - /// Empty blocks are a TON-specific extension (not in Alpenglow White Paper) that + /// Empty blocks are a TON-specific extension that /// allows the consensus to continue when the blockchain finalization is lagging /// behind. Instead of generating a new block with transactions, validators /// re-sign the previous block to help it get a FinalizeCertificate. @@ -2303,25 +2391,23 @@ impl SessionProcessor { return true; } - if self.description.get_shard().is_masterchain() - || DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION - { + if self.description.get_shard().is_masterchain() { // Masterchain: consensus-finalized seqno must be at most 1 behind new seqno. // C++ parity: block-producer.cpp uses `last_consensus_finalized_seqno_` which // advances on FinalizeBlock(is_final) and on BlockFinalizedInMasterchain. match self.last_consensus_finalized_seqno { Some(finalized) => finalized + 1 < new_seqno, - None => false, // No finalization yet, can't be behind + None => false, } } else { - // Shardchain: MC finalized can be up to threshold behind - // Threshold is configurable via empty_block_mc_lag_threshold option + // Shardchain: MC finalized can be up to threshold behind. + // C++ parity: block-producer.cpp `last_mc_finalized_seqno_ + 8 < new_seqno`. 
match ( self.last_mc_finalized_seqno, self.description.opts().empty_block_mc_lag_threshold, ) { (Some(mc_finalized), Some(threshold)) => mc_finalized + threshold < new_seqno, - _ => false, // No MC finalization yet or threshold not set + _ => false, } } } @@ -2342,13 +2428,18 @@ impl SessionProcessor { } } - /// Reset next awake time to far future + /// Reset next awake time to the fallback poll horizon. /// /// Called at the beginning of check_all() before collecting timeouts from all sources. pub fn reset_next_awake_time(&mut self) { self.next_awake_time = self.now() + MAX_AWAKE_TIMEOUT; } + /// Force the main loop to run `check_all()` again immediately. + fn wake_now(&mut self) { + self.set_next_awake_time(self.now()); + } + /* Delayed actions */ @@ -2439,6 +2530,168 @@ impl SessionProcessor { ); } + /// Collect current health findings without logging or cooldown gating. + /// + /// Used by both `debug_dump()` (for the stall conclusion) and `run_health_checks()` + /// (for cooldown-gated alerts). 
+ fn collect_health_findings(&self) -> Vec { + let now = self.now(); + let mut findings = Vec::new(); + + let first_non_finalized = self.simplex_state.get_first_non_finalized_slot().0; + let first_non_progressed = self.simplex_state.get_first_non_progressed_slot().0; + let window_size = self.description.opts().slots_per_leader_window; + let total_weight = self.description.get_total_weight(); + let active_weight = self.active_weight; + + // Progress gap + if first_non_progressed > first_non_finalized { + let gap = first_non_progressed - first_non_finalized; + if gap > window_size { + let sev = if gap > 2 * window_size { log::Level::Error } else { log::Level::Warn }; + findings.push(HealthFinding { + kind: HealthFindingKind::ProgressGap, + severity: sev, + summary: format!( + "progress gap={gap} (nf={first_non_finalized} np={first_non_progressed} \ + window={window_size})" + ), + }); + } + } + + // Zero finalization speed + let stall_warn_secs = self.description.opts().health_stall_warning_secs; + let stall_duration = now + .duration_since(self.health_alert_state.last_finalization_nonzero_at) + .unwrap_or_default(); + if stall_duration >= Duration::from_secs(stall_warn_secs) { + let stall_err_secs = self.description.opts().health_stall_error_secs; + let sev = if stall_duration >= Duration::from_secs(stall_err_secs) { + log::Level::Error + } else { + log::Level::Warn + }; + findings.push(HealthFinding { + kind: HealthFindingKind::ZeroFinalizationSpeed, + severity: sev, + summary: format!("no local finalization for {:.1}s", stall_duration.as_secs_f64()), + }); + } + + // Low activity + let t66 = threshold_66(total_weight); + if active_weight < t66 { + let t33 = threshold_33(total_weight); + let sev = if active_weight < t33 { log::Level::Error } else { log::Level::Warn }; + let pct = if total_weight > 0 { + (active_weight as f64 / total_weight as f64) * 100.0 + } else { + 0.0 + }; + findings.push(HealthFinding { + kind: HealthFindingKind::LowActivity, + severity: sev, + 
summary: format!("active_weight={active_weight} ({pct:.0}%) < th66={t66}"), + }); + } + + // Cert verify failures + let current_cert_fails = self.cert_verify_fails_total; + let prev_cert_fails = self.health_alert_state.prev_cert_verify_fails; + if current_cert_fails > prev_cert_fails { + findings.push(HealthFinding { + kind: HealthFindingKind::CertVerifyFailures, + severity: log::Level::Warn, + summary: format!( + "cert_verify_fail delta={} total={}", + current_cert_fails - prev_cert_fails, + current_cert_fails + ), + }); + } + + // Standstill triggers + let current_standstill = + self.receiver_health_counters.standstill_triggers.load(Ordering::Relaxed); + let prev_standstill = self.health_alert_state.prev_standstill_triggers; + if current_standstill > prev_standstill { + findings.push(HealthFinding { + kind: HealthFindingKind::StandstillTriggers, + severity: log::Level::Warn, + summary: format!( + "standstill_triggers delta={} total={}", + current_standstill - prev_standstill, + current_standstill + ), + }); + } + + // Candidate giveups + let current_giveups = + self.receiver_health_counters.candidate_giveups.load(Ordering::Relaxed); + let prev_giveups = self.health_alert_state.prev_candidate_giveups; + if current_giveups > prev_giveups { + findings.push(HealthFinding { + kind: HealthFindingKind::CandidateGiveups, + severity: log::Level::Warn, + summary: format!( + "candidate_giveups delta={} total={}", + current_giveups - prev_giveups, + current_giveups + ), + }); + } + + // Skip vote dominance + let delta_notar = self + .votes_in_notarize_total + .saturating_sub(self.health_alert_state.prev_votes_in_notarize); + let delta_final = self + .votes_in_finalize_total + .saturating_sub(self.health_alert_state.prev_votes_in_finalize); + let delta_skip = + self.votes_in_skip_total.saturating_sub(self.health_alert_state.prev_votes_in_skip); + let delta_total = delta_notar + delta_final + delta_skip; + let skip_ratio_min_delta = (self.description.get_total_nodes() as 
u64).max(2) / 2; + if delta_total >= skip_ratio_min_delta { + let progress_votes = delta_notar + delta_final; + let skip_to_progress = delta_skip as f64 / (progress_votes.max(1) as f64); + if skip_to_progress >= 3.0 { + let sev = if skip_to_progress >= 8.0 && progress_votes == 0 { + log::Level::Error + } else { + log::Level::Warn + }; + let skip_share = if delta_total > 0 { + 100.0 * delta_skip as f64 / delta_total as f64 + } else { + 0.0 + }; + findings.push(HealthFinding { + kind: HealthFindingKind::SkipVoteDominance, + severity: sev, + summary: format!( + "skip_share={skip_share:.0}% skip={delta_skip} notar={delta_notar} \ + final={delta_final}" + ), + }); + } + } + + // Validator isolation + let session_age = now.duration_since(self.session_creation_time()).unwrap_or_default(); + if session_age > Duration::from_secs(60) && active_weight <= 1 && total_weight > 1 { + findings.push(HealthFinding { + kind: HealthFindingKind::ValidatorIsolated, + severity: log::Level::Error, + summary: format!("only self active, session_age={:.0}s", session_age.as_secs_f64()), + }); + } + + findings + } + /// Public health check dump for periodic monitoring /// /// Called from session main loop for periodic health checks. @@ -2555,46 +2808,7 @@ impl SessionProcessor { } } - // 4. 
Parent resolution aging: oldest pending resolution exceeds threshold - let parent_warn_secs = self.description.opts().health_parent_aging_warning_secs; - let parent_err_secs = self.description.opts().health_parent_aging_error_secs; - if !self.pending_parent_resolutions.is_empty() { - let mut oldest_age = Duration::ZERO; - for entries in self.pending_parent_resolutions.values() { - for entry in entries { - if let Ok(age) = now.duration_since(entry.receive_time) { - if age > oldest_age { - oldest_age = age; - } - } - } - } - if oldest_age > Duration::from_secs(parent_warn_secs) - && now - .duration_since(self.health_alert_state.last_parent_aging_warn) - .unwrap_or_default() - >= cooldown - { - self.health_alert_state.last_parent_aging_warn = now; - self.health_warnings_counter.increment(1); - let pending_count = self.pending_parent_resolutions.len(); - if oldest_age > Duration::from_secs(parent_err_secs) { - log::error!( - "SIMPLEX_HEALTH anomaly=parent_aging session={session_prefix} \ - oldest_secs={:.0} pending_count={pending_count}", - oldest_age.as_secs_f64(), - ); - } else { - log::warn!( - "SIMPLEX_HEALTH anomaly=parent_aging session={session_prefix} \ - oldest_secs={:.0} pending_count={pending_count}", - oldest_age.as_secs_f64(), - ); - } - } - } - - // 5. Cert verify failures (delta-based) + // 4. Cert verify failures (delta-based) let current_cert_fails = self.cert_verify_fails_total; let prev_cert_fails = self.health_alert_state.prev_cert_verify_fails; if current_cert_fails > prev_cert_fails @@ -2613,7 +2827,7 @@ impl SessionProcessor { ); } - // 6. Standstill trigger rate (delta-based, from receiver) + // 5. Standstill trigger rate (delta-based, from receiver) let current_standstill = self.receiver_health_counters.standstill_triggers.load(Ordering::Relaxed); let prev_standstill = self.health_alert_state.prev_standstill_triggers; @@ -2633,7 +2847,7 @@ impl SessionProcessor { ); } - // 7. Candidate request giveups (delta-based, from receiver) + // 6. 
Candidate request giveups (delta-based, from receiver) let current_giveups = self.receiver_health_counters.candidate_giveups.load(Ordering::Relaxed); let prev_giveups = self.health_alert_state.prev_candidate_giveups; @@ -2655,7 +2869,7 @@ impl SessionProcessor { ); } - // 8. Skip/notar/final ratio anomaly (delta-based, inbound vote stream). + // 7. Skip/notar/final ratio anomaly (delta-based, inbound vote stream). let current_notar = self.votes_in_notarize_total; let current_final = self.votes_in_finalize_total; let current_skip = self.votes_in_skip_total; @@ -2719,7 +2933,7 @@ impl SessionProcessor { } } - // 9. Validator isolation: only self is active for extended period + // 8. Validator isolation: only self is active for extended period let isolation_threshold = Duration::from_secs(60); let session_age = now.duration_since(self.session_creation_time()).unwrap_or_default(); if session_age > isolation_threshold @@ -2747,12 +2961,174 @@ impl SessionProcessor { } } + /// Compute candidate funnel totals for validation inventory dump. 
+ fn compute_candidate_totals(&self, now: SystemTime) -> CandidateTotals { + let received_total = self.received_candidates.len(); + let mut received_unvalidated = 0usize; + let mut validated_not_notarized = 0usize; + let mut notarized_not_finalized = 0usize; + let mut finalized_recent = 0usize; + let mut other_omitted = 0usize; + + for (id, _rc) in &self.received_candidates { + let is_finalized = self.finalized_blocks.contains(id); + let is_notarized = + self.simplex_state.get_notarized_block_hash(&self.description, id.slot).as_ref() + == Some(&id.hash); + let is_approved = self.approved.contains_key(id); + let is_pending = self.pending_validations.contains_key(id); + + if is_finalized { + let is_recent = self.finalized_pending_body.get(id).map_or_else( + || { + // Already materialized: check receive time as proxy + false + }, + |entry| { + now.duration_since(entry.finalized_at) + .map(|d| d <= RECENT_FINALIZED_DUMP_WINDOW) + .unwrap_or(false) + }, + ); + // Also check if it was recently finalized by checking last_finalization_time proximity + let recent_by_time = now + .duration_since(self.last_finalization_time) + .map(|d| d <= RECENT_FINALIZED_DUMP_WINDOW) + .unwrap_or(false); + if is_recent || recent_by_time { + finalized_recent += 1; + } else { + other_omitted += 1; + } + } else if is_notarized { + notarized_not_finalized += 1; + } else if is_approved || (!is_pending && !self.rejected.contains(id)) { + validated_not_notarized += 1; + } else { + received_unvalidated += 1; + } + } + + CandidateTotals { + received_total, + received_unvalidated, + validated_not_notarized, + notarized_not_finalized, + finalized_recent, + other_omitted, + } + } + + /// Dump validation inventory with lifecycle-bucketed blocks. 
+ fn dump_validation_inventory(&self, r: &mut String, now: SystemTime, totals: &CandidateTotals) { + r.push_str(" validation:\n"); + + let mut received_rows = Vec::new(); + let mut validated_rows = Vec::new(); + let mut notarized_rows = Vec::new(); + let mut finalized_rows = Vec::new(); + + for (id, rc) in &self.received_candidates { + let is_finalized = self.finalized_blocks.contains(id); + let is_notarized = self.simplex_state.has_notarized_block(id.slot); + let is_approved = self.approved.contains_key(id); + let is_pending = self.pending_validations.contains_key(id); + let is_rejected = self.rejected.contains(id); + + let mut flags = Vec::new(); + if is_pending { + flags.push("pending_validation"); + } + if is_approved { + flags.push("approved"); + } + if is_rejected { + flags.push("rejected"); + } + if is_notarized { + flags.push("notarized"); + } + if is_finalized { + flags.push("finalized"); + } + if rc.is_empty { + flags.push("empty"); + } + let flags_str = if flags.is_empty() { "-".to_string() } else { flags.join(",") }; + + let age = Self::fmt_ago(now, Some(rc.receive_time)); + let line = format!( + " slot {} src={} candidate={} block=({}) flags=[{}] recv={}\n", + rc.slot, + rc.source_idx, + &id.hash.to_hex_string()[..8], + rc.block_id, + flags_str, + age, + ); + + if is_finalized { + let is_recent = now + .duration_since(self.last_finalization_time) + .map(|d| d <= RECENT_FINALIZED_DUMP_WINDOW) + .unwrap_or(false); + if is_recent { + finalized_rows.push(line); + } + // older finalized: omitted + } else if is_notarized { + notarized_rows.push(line); + } else if is_approved || (!is_pending && !is_rejected) { + validated_rows.push(line); + } else { + received_rows.push(line); + } + } + + r.push_str(&format!(" received ({:.1}%):\n", totals.pct(totals.received_unvalidated),)); + for row in &received_rows { + r.push_str(row); + } + + r.push_str(&format!( + " validated ({:.1}%):\n", + totals.pct(totals.validated_not_notarized), + )); + for row in &validated_rows 
{ + r.push_str(row); + } + + r.push_str(&format!( + " notarized ({:.1}%):\n", + totals.pct(totals.notarized_not_finalized), + )); + for row in ¬arized_rows { + r.push_str(row); + } + + r.push_str(&format!(" finalized ({:.1}%):\n", totals.pct(totals.finalized_recent),)); + for row in &finalized_rows { + r.push_str(row); + } + + r.push_str(&format!( + " other: omitted={} total_received={}\n", + totals.other_omitted, totals.received_total, + )); + } + /// Produce detailed debug dump of session state /// /// Includes: - /// - Session-level info (validators, weights, timing) - /// - Collation/validation state - /// - SimplexState FSM dump (via SimplexState::debug_dump) + /// - Stall conclusion with health findings + /// - Session header, shard info, frontiers with cursor ages + /// - Consensus milestone timestamps (finalization, notarization, cert times) + /// - Heads (finalized, accepted, MC applied) + /// - Candidate funnel statistics with percentages + /// - Collation state with per-window grouping and leader identity + /// - Validation inventory with lifecycle buckets + /// - Peer diagnostics with typed vote/cert/candidate stats + /// - Standstill diagnostic grid (on stall) /// /// # Arguments /// * `is_stalled` - If true, consensus is stalled (no finalizations for ROUND_DEBUG_PERIOD). 
@@ -2766,7 +3142,6 @@ impl SessionProcessor { let now = self.now(); let fsm_first_non_finalized_slot = self.simplex_state.get_first_non_finalized_slot(); let fsm_first_non_progressed_slot = self.simplex_state.get_first_non_progressed_slot(); - // Use current slot's started_at time let slot_duration = now.duration_since(self.slot_started_at(fsm_first_non_progressed_slot)); let total_weight = self.description.get_total_weight(); let slot_dur_secs = slot_duration.map(|d| d.as_secs_f64()).unwrap_or(0.0); @@ -2774,8 +3149,8 @@ impl SessionProcessor { .duration_since(self.session_creation_time()) .map(|d| d.as_secs_f64()) .unwrap_or(0.0); + let shard = self.description.get_shard(); - // Stalled consensus: log error and increment error counter if is_stalled { let time_since_finalization = now .duration_since(self.last_finalization_time) @@ -2794,29 +3169,23 @@ impl SessionProcessor { self.increment_error(); } - // INFO level: Compact health status (always logged when info enabled) - // Provides quick health check without enabling debug + // INFO level: compact health status line if log::log_enabled!(log::Level::Info) { let status = if is_stalled { "STALLED" } else { "OK" }; + let head_seqno = + self.finalized_head_seqno.map(|s| s.to_string()).unwrap_or_else(|| "?".to_string()); log::info!( - "Session {} health [{}]: slot_nf={}, slot_np={}, time={:.1}s, slot_dur={:.1}s, \ - active={}/{} ({:.0}%), pending_val={}, approved={}, finalized={}", + "Session {} health [{}]: shard={}:{:016x} slot_nf={} slot_np={} finalized_head_seqno={}", &self.session_id().to_hex_string()[..8], status, + shard.workchain_id(), + shard.shard_prefix_with_tag(), fsm_first_non_finalized_slot, fsm_first_non_progressed_slot, - session_time, - slot_dur_secs, - self.active_weight, - total_weight, - 100.0 * self.active_weight as f64 / total_weight as f64, - self.pending_validations.len(), - self.approved.len(), - self.finalized_blocks.len(), + head_seqno, ); } - // Full details: logged to DEBUG in normal 
mode, INFO in stall mode let should_dump_full = if is_stalled { log::log_enabled!(log::Level::Info) } else { @@ -2827,183 +3196,341 @@ impl SessionProcessor { return; } - let mut result = String::new(); + let health_findings = self.collect_health_findings(); + let window_diags = self.simplex_state.collect_window_diagnostics(&self.description); - // Session header - result.push_str(&format!("Session {} dump:\n", self.session_id().to_hex_string())); + let mut r = String::with_capacity(4096); + let status_str = if is_stalled { "STALLED" } else { "OK" }; - // Timing - result.push_str(&format!(" - slot_duration: {:.3}s\n", slot_dur_secs)); - result.push_str(&format!(" - session_time: {:.3}s\n", session_time)); + // ---- Conclusion (stalled only) ---- + r.push_str(&format!( + "Session {} dump [{}]:\n", + self.session_id().to_hex_string(), + status_str, + )); + if is_stalled { + r.push_str(" conclusion:\n"); + for f in &health_findings { + r.push_str(&format!(" - {:?}: {}\n", f.kind, f.summary)); + } + // Frontier-based reason from first non-finalized slot diagnostic + if let Some(wd) = window_diags.first() { + if let Some(sd) = wd.slots.first() { + r.push_str(&format!( + " - frontier_reason: slot {} {} ({})\n", + sd.slot, sd.phase, sd.reason + )); + } + } + if health_findings.is_empty() { + r.push_str(" - none\n"); + } + } - // Session info (show FSM slot boundaries) - result.push_str(&format!( - " - first_non_finalized_slot: {} (fsm)\n - first_non_progressed_slot: {} (fsm)\n - validators_count: {}\n - local_idx: {}\n", - fsm_first_non_finalized_slot, - fsm_first_non_progressed_slot, - self.description.get_total_nodes(), - self.description.get_self_idx() + // ---- Shard ---- + r.push_str(&format!( + " shard={}:{:016x}\n", + shard.workchain_id(), + shard.shard_prefix_with_tag(), )); - // Weights - let total_weight = self.description.get_total_weight(); - result.push_str(&format!( - " - total_weight: {}\n - threshold_66: {} ({:.2}%)\n - threshold_33: {} ({:.2}%)\n", - 
total_weight, - threshold_66(total_weight), - 100.0 * threshold_66(total_weight) as f64 / total_weight as f64, - threshold_33(total_weight), - 100.0 * threshold_33(total_weight) as f64 / total_weight as f64 + // ---- Header ---- + let t66 = threshold_66(total_weight); + let t33 = threshold_33(total_weight); + let active_pct = if total_weight > 0 { + 100.0 * self.active_weight as f64 / total_weight as f64 + } else { + 0.0 + }; + r.push_str(" header:\n"); + r.push_str(&format!( + " validators={} local={} session_time={:.1}s slot_duration={:.1}s\n", + self.description.get_total_nodes(), + self.description.get_self_idx(), + session_time, + slot_dur_secs, )); - result.push_str(&format!( - " - active_weight: {} ({:.2}%)\n", + r.push_str(&format!( + " total_weight={total_weight} th66={t66} th33={t33} active_weight={} ({active_pct:.1}%)\n", self.active_weight, - 100.0 * self.active_weight as f64 / total_weight as f64 )); - // Inactive validators (similar to validator-session session_processor.rs) - let mut inactive_validators = Vec::new(); - for (idx, last_time) in self.last_activity.iter().enumerate() { - if idx == usize::from(self.description.get_self_idx()) { - continue; - } - let is_active = if let Some(last_activity) = last_time { - if let Ok(elapsed) = now.duration_since(*last_activity) { - elapsed < crate::utils::ACTIVITY_THRESHOLD - } else { - false - } - } else { - false + // ---- Frontiers ---- + let nf_age = Self::fmt_dur(now, self.observability.last_finalized_cursor_change_at); + let np_age = Self::fmt_dur(now, self.observability.last_progression_change_at); + r.push_str(" frontiers:\n"); + r.push_str(&format!( + " first_non_finalized={} (unchanged {})\n", + fsm_first_non_finalized_slot, nf_age, + )); + r.push_str(&format!( + " first_non_progressed={} (unchanged {})\n", + fsm_first_non_progressed_slot, np_age, + )); + + // Milestone timestamps + let fmt_milestone = + |label: &str, seqno: Option, slot: Option, ts: Option| { + let seqno_str = + seqno.map(|s| 
format!("seqno={s}")).unwrap_or_else(|| "seqno=?".to_string()); + let slot_str = slot.map(|s| format!(" slot={s}")).unwrap_or_default(); + let age = Self::fmt_ago(now, ts); + format!(" {label}: {seqno_str}{slot_str}, {age}\n") }; + r.push_str(&fmt_milestone( + "last_finalization", + self.finalized_head_seqno, + self.finalized_head_slot, + Some(self.last_finalization_time), + )); + r.push_str(&fmt_milestone( + "last_notarization", + None, + self.observability.last_notarization_slot, + self.observability.last_notarization_at, + )); + r.push_str(&fmt_milestone( + "last_final_cert", + None, + self.observability.last_final_cert_slot, + self.observability.last_final_cert_at, + )); + r.push_str(&fmt_milestone( + "last_notar_cert", + None, + self.observability.last_notar_cert_slot, + self.observability.last_notar_cert_at, + )); - if !is_active { - let last_str = last_time - .and_then(|t| now.duration_since(t).ok()) - .map(|d| format!("{:.0}s", d.as_secs_f64())) - .unwrap_or_else(|| "?".to_string()); - inactive_validators.push(format!("v{:03}/{}", idx, last_str)); - } + // ---- Heads ---- + r.push_str(" heads:\n"); + r.push_str(&format!( + " finalized_head_seqno={}\n", + self.finalized_head_seqno.map(|s| s.to_string()).unwrap_or_else(|| "?".to_string()), + )); + if let Some(ref bid) = self.finalized_head_block_id { + r.push_str(&format!( + " finalized_head=slot {} id=({})\n", + self.finalized_head_slot.map(|s| s.to_string()).unwrap_or_else(|| "?".to_string()), + bid, + )); } - if !inactive_validators.is_empty() { - result.push_str(&format!(" - inactive: [{}]\n", inactive_validators.join(", "))); + r.push_str(&format!( + " last_consensus_finalized_seqno={}\n", + self.last_consensus_finalized_seqno + .map(|s| s.to_string()) + .unwrap_or_else(|| "?".to_string()), + )); + if let Some(ref bid) = self.accepted_normal_head_block_id { + r.push_str(&format!( + " accepted_normal_head=seqno {} id=({})\n", + self.accepted_normal_head_seqno, bid + )); + } else { + r.push_str(&format!( + 
" accepted_normal_head=seqno {}\n", + self.accepted_normal_head_seqno + )); } - - // Collation state (per-slot state for current slot) - let current_slot = fsm_first_non_progressed_slot; - result.push_str(&format!( - " - collation: pending_gen={}, generated={}, sent_gen={}, precollated={}\n", - self.slot_is_pending_generate(current_slot), - self.slot_is_generated(current_slot), - self.slot_is_sent_generated(current_slot), - self.precollated_blocks.len() + if let Some(ref bid) = self.observability.last_mc_applied_block_id { + r.push_str(&format!(" last_mc_applied=({})\n", bid)); + } + r.push_str(&format!( + " last_mc_finalized_seqno={}\n", + self.last_mc_finalized_seqno.map(|s| s.to_string()).unwrap_or_else(|| "?".to_string()), )); - // Validation state (session-level) - result.push_str(&format!( - " - validation: pending={}, approved={}, rejected={}, validated_queue={}\n", + // ---- Statistics ---- + let totals = self.compute_candidate_totals(now); + r.push_str(" statistics:\n"); + r.push_str(&format!( + " candidates: received={} validated={} ({:.1}%) notarized={} \ + ({:.1}%) finalized={} ({:.1}%) other={} ({:.1}%)\n", + totals.received_total, + totals.received_total - totals.received_unvalidated, + totals.pct(totals.received_total - totals.received_unvalidated), + totals.notarized_not_finalized + totals.finalized_recent + totals.other_omitted, + totals.pct( + totals.notarized_not_finalized + totals.finalized_recent + totals.other_omitted + ), + totals.finalized_recent, + totals.pct(totals.finalized_recent), + totals.other_omitted, + totals.pct(totals.other_omitted), + )); + if let Some(ref snap) = self.last_receiver_snapshot { + let total_in_msgs: u64 = snap.sources.iter().map(|s| s.in_messages).sum(); + let total_out_msgs: u64 = snap.sources.iter().map(|s| s.out_messages).sum(); + let total_in_bcasts: u64 = snap.sources.iter().map(|s| s.in_broadcasts).sum(); + let total_out_bcasts: u64 = snap.sources.iter().map(|s| s.out_broadcasts).sum(); + let total_dup_votes: 
u64 = snap.sources.iter().map(|s| s.duplicate_votes).sum(); + let total_dup_bcasts: u64 = snap.sources.iter().map(|s| s.duplicate_broadcasts).sum(); + let total_req_sent: u64 = snap.sources.iter().map(|s| s.candidate_requests_sent).sum(); + let total_req_recv: u64 = + snap.sources.iter().map(|s| s.candidate_requests_received).sum(); + r.push_str(&format!( + " traffic: msgs_in={total_in_msgs} msgs_out={total_out_msgs} \ + bcasts_in={total_in_bcasts} bcasts_out={total_out_bcasts}\n" + )); + r.push_str(&format!( + " votes_in: notar={} final={} skip={}\n", + self.votes_in_notarize_total, + self.votes_in_finalize_total, + self.votes_in_skip_total, + )); + r.push_str(&format!( + " duplicates: votes={total_dup_votes} broadcasts={total_dup_bcasts} \ + request_candidates_sent={total_req_sent} request_candidates_recv={total_req_recv}\n" + )); + } + r.push_str(&format!( + " pending: validations={} approvals={} rejections={} finalized_pending_body={}\n", self.pending_validations.len(), self.approved.len(), self.rejected.len(), - self.validated_candidates.len() + self.finalized_pending_body.len(), )); - let metrics_snapshot = self.metrics_receiver.snapshot(); - let metric_counter = |name: &str| metrics_snapshot.counters.get(name).copied().unwrap_or(0); - result.push_str(&format!( - " - metrics: recv_broadcast={}, recv_query={}, drop_old={}, drop_future={}, \ - drop_unexpected_sender={}, drop_conflicting_slot={}, generated_missed_validation={}, \ + // ---- Collation (per-window) ---- + let current_slot = fsm_first_non_progressed_slot; + r.push_str(" collation:\n"); + r.push_str(&format!( + " current_slot={} pending_gen={} generated={} sent_gen={} precollated={} \ generated_waiting_validation={}\n", - metric_counter("simplex_candidate_received_broadcast"), - metric_counter("simplex_candidate_received_query"), - metric_counter("simplex_candidate_precheck_drop_old_slot"), - metric_counter("simplex_candidate_precheck_drop_future_slot"), - 
metric_counter("simplex_candidate_precheck_drop_unexpected_sender"), - metric_counter("simplex_candidate_precheck_drop_conflicting_slot"), - metric_counter("simplex_generated_candidate_validation_missed"), + current_slot, + self.slot_is_pending_generate(current_slot), + self.slot_is_generated(current_slot), + self.slot_is_sent_generated(current_slot), + self.precollated_blocks.len(), self.generated_candidates_waiting_validation.len(), )); - - // Nodes list (with activity info) - result.push_str(" - nodes:\n"); - for i in 0..self.description.get_total_nodes() { - let validator_idx = ValidatorIndex::from(i); - let public_key_hash = self.description.get_source_public_key_hash(validator_idx); - let weight = self.description.get_node_weight(validator_idx); - let is_self = self.description.is_self(validator_idx); - let is_leader = - self.description.is_self_leader(fsm_first_non_finalized_slot) && is_self; - - // Check if there's validated candidate data from this source - let has_candidate = self.slot_has_validated_candidate_from(current_slot, validator_idx); - - // Activity info - let last_activity_time = self.last_activity.get(i as usize).and_then(|t| *t); - let last_activity_delay = last_activity_time.and_then(|t| now.duration_since(t).ok()); - let is_active = - last_activity_delay.map(|d| d < crate::utils::ACTIVITY_THRESHOLD).unwrap_or(false); - let last_activity_str = last_activity_delay - .map(|d| format!("{:6.2}s", d.as_secs_f64())) - .unwrap_or_else(|| " N/A ".to_string()); - - // Status: "self" for local validator, "inactive" for inactive, blank for active - let status_str = if is_self { - "self " - } else if is_active { - " " - } else { - "inactive" - }; - - result.push_str(&format!( - " - {}: {} last_activity={}, weight={}, pubkey_hash={}{}{}\n", - validator_idx, - status_str, - last_activity_str, - weight, - public_key_hash, - if is_leader { " [LEADER]" } else { "" }, - if has_candidate { " [HAS_CANDIDATE]" } else { "" }, + for wd in &window_diags { + let 
leader_pubkey = + base64_encode(self.description.get_source_public_key_hash(wd.leader_idx).data()); + let leader_adnl = + base64_encode(self.description.get_source_adnl_id(wd.leader_idx).data()); + r.push_str(&format!( + " window {} slots=[{}..{}] leader={} pubkey_b64={} adnl_b64={}\n", + wd.window_idx, + wd.slot_begin, + wd.slot_end, + wd.leader_idx, + leader_pubkey, + leader_adnl, )); + for sd in &wd.slots { + let mut flags = Vec::new(); + if sd.voted_notar { + flags.push("Voted"); + } + if sd.voted_skip { + flags.push("VotedSkip"); + } + if sd.voted_final { + flags.push("VotedFinal"); + } + if sd.has_pending_block { + flags.push("Pending"); + } + if sd.is_timeout_skipped { + flags.push("TimeoutSkipped"); + } + let flags_str = if flags.is_empty() { "none".to_string() } else { flags.join("|") }; + let mut certs = Vec::new(); + if sd.has_notar_cert { + certs.push("notar"); + } + if sd.has_final_cert { + certs.push("final"); + } + if sd.has_skip_cert { + certs.push("skip"); + } + let certs_str = if certs.is_empty() { "none".to_string() } else { certs.join("|") }; + r.push_str(&format!( + " {} phase={} reason={} notar={:.0}% final={:.0}% \ + skip={:.0}% flags=[{}] certs=[{}]\n", + sd.slot, + sd.phase, + sd.reason, + sd.notar_weight_pct, + sd.final_weight_pct, + sd.skip_weight_pct, + flags_str, + certs_str, + )); + } } - // Delayed actions - if !self.delayed_actions.is_empty() { - result.push_str(&format!(" - delayed_actions: {}\n", self.delayed_actions.len())); - for (i, action) in self.delayed_actions.iter().enumerate() { - let expires_in = action - .expiration_time - .duration_since(now) - .map(|d| format!("{:.3}s", d.as_secs_f64())) - .unwrap_or_else(|_| "expired".to_string()); - result.push_str(&format!(" - action {}: expires_in={}\n", i, expires_in)); + // ---- Validation inventory ---- + self.dump_validation_inventory(&mut r, now, &totals); + + // ---- Peers ---- + if let Some(ref snap) = self.last_receiver_snapshot { + r.push_str(" peers:\n"); + for src in 
&snap.sources { + let is_self = src.source_idx == self.description.get_self_idx().0 as u32; + let vi = ValidatorIndex::from(src.source_idx); + let weight = self.description.get_node_weight(vi); + let weight_pct = if total_weight > 0 { + 100.0 * weight as f64 / total_weight as f64 + } else { + 0.0 + }; + let pubkey_b64 = + base64_encode(self.description.get_source_public_key_hash(vi).data()); + let last_act = Self::fmt_ago(now, src.last_recv_time); + let last_vote = Self::fmt_ago(now, src.last_vote_recv_time); + let last_final_cert = Self::fmt_ago(now, src.last_final_cert_recv_time); + let last_notar_cert = Self::fmt_ago(now, src.last_notar_cert_recv_time); + let last_cand = Self::fmt_ago(now, src.last_candidate_recv_time); + let marker = if is_self { " (self)" } else { "" }; + r.push_str(&format!( + " {} adnl_b64={} pubkey_b64={} weight={} ({weight_pct:.1}%) \ + last_activity={last_act} last_vote={last_vote} last_final_cert={last_final_cert} \ + last_notar_cert={last_notar_cert} last_candidate={last_cand} \ + votes[n/f/s]={}/{}/{} certs[n/f/s]={}/{}/{} candidates={} \ + req[s/r]={}/{}{marker}\n", + vi, + src.adnl_id_base64, + pubkey_b64, + weight, + src.votes_in_notarize, + src.votes_in_finalize, + src.votes_in_skip, + src.certs_in_notar, + src.certs_in_final, + src.certs_in_skip, + src.candidates_received, + src.candidate_requests_sent, + src.candidate_requests_received, + )); } } - // SimplexState dump (full format for debug dumps) - result.push_str(" - simplex_state:\n"); - let fsm_dump = self.simplex_state.debug_dump(&self.description, true); - // Indent FSM dump - for line in fsm_dump.lines() { - result.push_str(&format!(" {}\n", line)); + // ---- Health findings ---- + if !health_findings.is_empty() { + r.push_str(" health_findings:\n"); + for f in &health_findings { + r.push_str(&format!(" - [{:?}] {:?}: {}\n", f.severity, f.kind, f.summary)); + } } - // C++ parity: standstill diagnostic dump (only on stall) + // ---- Standstill diagnostic (C++ parity, stall 
only) ---- if is_stalled { let diagnostic = self.simplex_state.standstill_diagnostic_dump(&self.description); if !diagnostic.is_empty() { - result.push_str(" - standstill_diagnostic:\n"); + r.push_str(" standstill_diagnostic:\n"); for line in diagnostic.lines() { - result.push_str(&format!(" {}\n", line)); + r.push_str(&format!(" {}\n", line)); } } } - // Log full dump: ERROR for stalled (critical), DEBUG for health check if is_stalled { - log::error!("{}", result); + log::error!("{}", r); } else { - log::debug!("{}", result); + log::debug!("{}", r); } } @@ -3043,17 +3570,24 @@ impl SessionProcessor { // Increment metrics counter self.check_all_counter.increment(1); + let now = self.now(); + let wake_slip = now.duration_since(self.next_awake_time).unwrap_or_default(); + self.check_all_wake_slip_histogram.record(wake_slip.as_millis() as f64); + // Reset awake time to far future, will be updated by various checks self.reset_next_awake_time(); // Stalled consensus detection - let now = self.now(); // Debug dump if no finalizations for ROUND_DEBUG_PERIOD (stalled consensus) if now >= self.round_debug_at { self.debug_dump(true); // is_stalled=true: full dump to INFO level self.round_debug_at = now + ROUND_DEBUG_PERIOD; } + // Release delayed gates before evaluating validation readiness so retries + // can re-enter validation in the same `check_all()` pass. + self.process_delayed_actions(); + // Check validation (process pending validations) self.check_validation(); @@ -3078,22 +3612,27 @@ impl SessionProcessor { self.set_next_awake_time(fsm_timeout); } + // Ensure we wake up for stall detection even when FSM has no pending timeouts + self.set_next_awake_time(self.round_debug_at); + // Persist pool state (first_nonannounced_window) when window advances self.maybe_store_pool_state(); // Check collation (am I leader? should I generate?) 
self.check_collation(); - // Check pending parent resolution timeouts - self.check_pending_parent_timeouts(); - - // Process delayed actions first - self.process_delayed_actions(); - - self.first_non_finalized_slot_gauge - .set(self.simplex_state.get_first_non_finalized_slot().0 as f64); - self.first_non_progressed_slot_gauge - .set(self.simplex_state.get_first_non_progressed_slot().0 as f64); + let cur_nf = self.simplex_state.get_first_non_finalized_slot(); + let cur_np = self.simplex_state.get_first_non_progressed_slot(); + if cur_nf != self.observability.prev_first_non_finalized { + self.observability.last_finalized_cursor_change_at = now; + self.observability.prev_first_non_finalized = cur_nf; + } + if cur_np != self.observability.prev_first_non_progressed { + self.observability.last_progression_change_at = now; + self.observability.prev_first_non_progressed = cur_np; + } + self.first_non_finalized_slot_gauge.set(cur_nf.0 as f64); + self.first_non_progressed_slot_gauge.set(cur_np.0 as f64); // Debug state dump self.log_consensus_state("check_all"); @@ -3159,8 +3698,49 @@ impl SessionProcessor { // received_candidates map which is populated asynchronously. 
self.generated_parent_cache .get(&parent_id) - .cloned() - .or_else(|| self.received_candidates.get(&parent_id).map(|c| c.block_id.clone())) + .cloned() + .or_else(|| self.received_candidates.get(&parent_id).map(|c| c.block_id.clone())) + } + + fn resolve_parent_gen_utime_ms( + &self, + parent: &crate::block::CandidateParentInfo, + ) -> Option { + let parent_id = RawCandidateId { slot: parent.slot, hash: parent.hash.clone() }; + self.generated_parent_gen_utime_ms_cache + .get(&parent_id) + .copied() + .or_else(|| self.received_candidates.get(&parent_id).and_then(|c| c.gen_utime_ms)) + .or_else(|| { + self.local_chain_head.as_ref().and_then(|head| { + if head.parent_info.slot == parent.slot && head.parent_info.hash == parent.hash + { + head.gen_utime_ms + } else { + None + } + }) + }) + } + + fn compute_collation_start_time( + &self, + parent: Option<&crate::block::CandidateParentInfo>, + ) -> SystemTime { + let now = self.now(); + let Some(parent) = parent else { + return now; + }; + let Some(parent_gen_utime_ms) = self.resolve_parent_gen_utime_ms(parent) else { + return now; + }; + + let target_rate = self.description.opts().target_rate; + let earliest_from_parent = UNIX_EPOCH + .checked_add(Duration::from_millis(parent_gen_utime_ms).saturating_add(target_rate)) + .unwrap_or(now); + let latest_reasonable = now.checked_add(target_rate).unwrap_or(now); + earliest_from_parent.max(now).min(latest_reasonable) } /// Advance `earliest_collation_time` by `target_rate` from now. @@ -3230,10 +3810,8 @@ impl SessionProcessor { } // Don't generate if already generated or pending for this slot. - // However, if we have a local chain head with a deferred precollation for the - // next slot in this window, retry it — the parent may have been notarized since - // the initial deferral, but the retry in handle_notarization_reached could have - // missed it (e.g., local_chain_head was updated between defer and event). 
+ // However, if we have a local chain head and no queued precollation for the + // next slot in this window, keep the pipeline full by retrying it here. if self.slot_is_generated(current_slot) || self.slot_is_pending_generate(current_slot) { if let Some(ref head) = self.local_chain_head { let next_slot = SlotIndex(head.slot.0 + 1); @@ -3401,6 +3979,20 @@ impl SessionProcessor { } } + let slot_start_time = self.compute_collation_start_time(parent.as_ref()); + let now = self.now(); + if slot_start_time > now { + log::trace!( + "Session {} invoke_collation: deferring slot {} until {:?}", + self.session_id().to_hex_string(), + slot, + slot_start_time + ); + self.slot_set_pending_generate(slot, false); + self.set_next_awake_time(slot_start_time); + return; + } + // Derive `new_seqno` from the locked parent (C++ behavior). // // Reference: C++ block-producer.cpp: @@ -3442,7 +4034,7 @@ impl SessionProcessor { // // Without this, empty blocks would fail with: // `generated_block: empty block for slot sX has no parent`. 
- let request = AsyncRequestImpl::new(request_id, true, self.now()); + let request = AsyncRequestImpl::new(request_id, true, slot_start_time); let precollated_block = PrecollatedBlock { request: request.clone(), candidate: None, @@ -3483,7 +4075,7 @@ impl SessionProcessor { let request_id = self.precollated_blocks_next_request_id; self.precollated_blocks_next_request_id += 1; - let request = AsyncRequestImpl::new(request_id, true, self.now()); + let request = AsyncRequestImpl::new(request_id, true, slot_start_time); let precollated_block = PrecollatedBlock { request: request.clone(), candidate: None, parent: parent.clone() }; @@ -3899,7 +4491,20 @@ impl SessionProcessor { // Capture parent at collation start (same as invoke_collation) let parent = self.simplex_state.get_available_parent(&self.description, slot); - let request = AsyncRequestImpl::new(request_id, true, self.now()); + let slot_start_time = self.compute_collation_start_time(parent.as_ref()); + let now = self.now(); + if slot_start_time > now { + log::trace!( + "Session {} invoke_collation_retry: deferring slot {} retry until {:?}", + self.session_id().to_hex_string(), + slot, + slot_start_time + ); + self.set_next_awake_time(slot_start_time); + return; + } + + let request = AsyncRequestImpl::new(request_id, true, slot_start_time); let precollated_block = PrecollatedBlock { request: request.clone(), candidate: None, parent: parent.clone() }; @@ -4191,6 +4796,12 @@ impl SessionProcessor { crate::block::CandidateParentInfo { slot, hash: prepared.candidate_hash.clone() }; let raw_id = RawCandidateId { slot, hash: prepared.candidate_hash.clone() }; self.generated_parent_cache.insert(raw_id.clone(), prepared.block_id_ext.clone()); + if let Some(gen_utime_ms) = prepared.gen_utime_ms { + self.generated_parent_gen_utime_ms_cache.insert( + RawCandidateId { slot, hash: prepared.candidate_hash.clone() }, + gen_utime_ms, + ); + } self.track_generated_candidate_for_validation(raw_id.clone()); let slot_window = 
self.description.get_window_idx(slot); @@ -4199,6 +4810,7 @@ impl SessionProcessor { slot, parent_info: candidate_parent_info, block_id: prepared.block_id_ext.clone(), + gen_utime_ms: prepared.gen_utime_ms, }); log::trace!( @@ -4253,7 +4865,7 @@ impl SessionProcessor { slot ); - // Update state (INT-2: per-slot state) + // Update per-slot state self.slot_set_pending_generate(slot, false); self.slot_set_generated(slot, true); self.slot_set_sent_generated(slot, true); @@ -4419,6 +5031,7 @@ impl SessionProcessor { let computed_file_hash = consensus_common::utils::get_hash_from_block_payload(data); let computed_collated_file_hash = consensus_common::utils::get_hash_from_block_payload(collated_data); + let gen_utime_ms = extract_consensus_gen_utime_ms(collated_data.data()); let block_id_ext = BlockIdExt { shard_id: self.description.get_shard().clone(), @@ -4445,6 +5058,7 @@ impl SessionProcessor { candidate_hash, tl_candidate_data, signature, + gen_utime_ms, }) } @@ -4501,6 +5115,7 @@ impl SessionProcessor { candidate_hash, tl_candidate_data, signature, + gen_utime_ms: None, }) } @@ -4603,38 +5218,17 @@ impl SessionProcessor { // the same window was just generated locally (block-producer.cpp `parent = id`). // Fall back to FSM available_base for the first slot in a window or if the // local chain head is stale. - // - // When require_notarized_parent_for_collation is enabled, the local chain head - // is only used if the parent slot is already notarized (or finalized). This - // matches C++ WaitForParent semantics where validators defer notarization until - // the parent is notarized, so broadcasting a candidate with a non-notarized - // parent is wasteful. Instead of returning early, fall through to the FSM - // available_base path — this ensures collation proceeds if the FSM already has - // a notarized base for this slot (e.g., notarization arrived but the retry in - // handle_notarization_reached didn't match the local_chain_head). 
let parent = if let Some(ref head) = self.local_chain_head { if head.window == target_window && head.slot + 1 == target_slot { - if self.description.opts().require_notarized_parent_for_collation - && !self.simplex_state.has_notarized_block(head.slot) - { - log::debug!( - "Session {} precollate_block: local_chain_head parent s{} \ - not yet notarized, falling through to FSM base", - &self.session_id().to_hex_string()[..8], - head.slot, - ); - None - } else { - log::trace!( - "Session {} precollate_block: using local_chain_head for slot {} \ - (parent=s{}:{})", - &self.session_id().to_hex_string()[..8], - target_slot, - head.parent_info.slot, - &head.parent_info.hash.to_hex_string()[..8], - ); - Some(head.parent_info.clone()) - } + log::trace!( + "Session {} precollate_block: using local_chain_head for slot {} \ + (parent=s{}:{})", + &self.session_id().to_hex_string()[..8], + target_slot, + head.parent_info.slot, + &head.parent_info.hash.to_hex_string()[..8], + ); + Some(head.parent_info.clone()) } else { None } @@ -4724,6 +5318,7 @@ impl SessionProcessor { } self.local_chain_head = None; self.generated_parent_cache.clear(); + self.generated_parent_gen_utime_ms_cache.clear(); } /* @@ -4866,8 +5461,6 @@ impl SessionProcessor { Vote::Notarize(v) => v.slot, Vote::Finalize(v) => v.slot, Vote::Skip(v) => v.slot, - Vote::NotarizeFallback(v) => v.slot, - Vote::SkipFallback(v) => v.slot, }; log::trace!( @@ -5295,14 +5888,6 @@ impl SessionProcessor { } } } - _ => { - log::warn!( - "Session {} on_certificate: REJECTED - unexpected vote type: {:?}", - self.session_id().to_hex_string(), - cert.vote - ); - return; - } } // Immediately process any state changes @@ -5761,6 +6346,10 @@ impl SessionProcessor { // Extract actual block data from RawCandidate (not the TL wrapper) // This is what validation/finalization callbacks consume. 
+ let gen_utime_ms = raw_candidate + .block + .as_block() + .and_then(|block| extract_consensus_gen_utime_ms(&block.collated_data)); let (block_data, collated_data) = match raw_candidate.block.as_block() { Some(block) => ( consensus_common::ConsensusCommonFactory::create_block_payload(block.data.clone()), @@ -5775,9 +6364,7 @@ impl SessionProcessor { ), }; - // Compute is_fully_resolved based on parent chain availability let parent_id = raw_candidate.parent_id.clone(); - let is_fully_resolved = self.compute_is_fully_resolved(&parent_id); // Build CandidateHashData TL bytes for signature verification // This is the data that was hashed to produce candidate_id_hash @@ -5814,12 +6401,14 @@ impl SessionProcessor { ) }; + let parent_metadata_present = + parent_id.as_ref().is_none_or(|parent| self.received_candidates.contains_key(parent)); log::trace!( - "Session {} on_candidate_received: slot={} parent={:?} is_fully_resolved={}", + "Session {} on_candidate_received: slot={} parent={:?} parent_metadata_present={}", self.session_id().to_hex_string(), slot, parent_id.as_ref().map(|p| p.slot), - is_fully_resolved, + parent_metadata_present, ); // Clone data needed for DB save before moving into ReceivedCandidate @@ -5838,10 +6427,10 @@ impl SessionProcessor { file_hash, data: block_data, collated_data, + gen_utime_ms, receive_time, is_empty, parent_id: parent_id.clone(), - is_fully_resolved, }, ); @@ -5869,7 +6458,7 @@ impl SessionProcessor { // DEBUG: Short pattern for quick grep (RECV = candidate received) log::debug!( "Session {} RECV candidate: slot={slot}, hash={}, seqno={received_seqno}, \ - from=v{:03}, empty={is_empty}, resolved={is_fully_resolved}", + from=v{:03}, empty={is_empty}, parent_metadata_present={parent_metadata_present}", &self.session_id().to_hex_string()[..8], &id_hash.to_hex_string()[..8], leader_idx, @@ -5877,68 +6466,33 @@ impl SessionProcessor { // TRACE: Method name pattern for detailed tracking log::trace!( "Session {} on_candidate_received: 
slot={slot}, hash={}, seqno={received_seqno}, \ - source={leader_idx}, empty={is_empty}, parent={:?}, resolved={is_fully_resolved}", + source={leader_idx}, empty={is_empty}, parent={:?}, parent_metadata_present={parent_metadata_present}", self.session_id().to_hex_string(), id_hash.to_hex_string(), parent_id.as_ref().map(|p| format!("{}:{}", p.slot, p.hash.to_hex_string())), ); // 8. Process notarization/finalization signature-sets if provided (from query response) - // This can be done immediately, regardless of parent resolution status + // This can be done immediately, regardless of parent-metadata availability. // Clone id_hash before use for certificates let id_hash_for_cert = id_hash.clone(); if let Some(ref cert_bytes) = notar_cert { self.process_received_notar_cert(slot, &id_hash_for_cert, cert_bytes); } - // 9. Update resolution cache for dependent candidates - self.update_resolution_cache_chain(&candidate_id); - - // 10. Try to resolve any candidates waiting for this one as their parent - self.try_resolve_waiting_candidates(&candidate_id); - - // 11. Register candidate based on resolution status - if is_fully_resolved { - // Candidate is fully resolved - register for validation - log::trace!( - "Session {} on_candidate_received: registering fully resolved candidate \ - slot={slot} id={:?}", - self.session_id().to_hex_string(), - id_hash, - ); - - // Optimistic validation: candidates with non-finalized (notarized-only) parents - // are accepted and forwarded to check_validation(), which validates them as soon - // as the parent slot is notarized in the FSM. No finalized-head gating. 
- if let Some(ref p) = parent_id { - let parent_is_finalized_head = self - .finalized_head_block_id - .as_ref() - .and_then(|head| self.received_candidates.get(p).map(|r| &r.block_id == head)) - .unwrap_or(false); - - if !parent_is_finalized_head { - log::debug!( - "Session {} on_candidate_received: candidate slot={} hash={} has \ - non-finalized parent (slot={}), will validate optimistically.", - &self.session_id().to_hex_string()[..8], - slot, - &id_hash.to_hex_string()[..8], - p.slot - ); - } - } - - self.register_resolved_candidate(raw_candidate, slot, leader_idx, receive_time); - } else { - // Candidate's parent chain is not fully resolved - queue for parent resolution - log::trace!( - "Session {} on_candidate_received: queueing candidate slot={slot} for parent \ - resolution", - self.session_id().to_hex_string(), + // 9. Admit the candidate immediately; check_validation() owns the remaining + // WaitForParent gate and, for empties, waits until the expected normal tip can be + // reconstructed from locally known parent metadata. 
+ if !parent_metadata_present { + log::debug!( + "Session {} on_candidate_received: slot={} hash={} is missing parent metadata, \ + but ingress no longer parks candidates behind a simplex-local resolution queue", + &self.session_id().to_hex_string()[..8], + slot, + &id_hash.to_hex_string()[..8], ); - self.queue_for_parent_resolution(raw_candidate, slot, leader_idx, receive_time); } + self.register_candidate_for_validation(raw_candidate, slot, leader_idx, receive_time); // Immediately process the new candidate (don't wait for next awake) self.check_all(); @@ -6095,6 +6649,7 @@ impl SessionProcessor { &mut self, active_weight: ValidatorWeight, last_activity: Vec>, + snapshot: crate::receiver::ReceiverActivitySnapshot, ) { if self.active_weight != active_weight { log::debug!( @@ -6109,6 +6664,7 @@ impl SessionProcessor { self.active_weight_gauge.set(active_weight as f64); } self.last_activity = last_activity; + self.last_receiver_snapshot = Some(snapshot); } pub fn on_standstill_trigger(&mut self, notification: StandstillTriggerNotification) { @@ -6139,45 +6695,26 @@ impl SessionProcessor { /* ======================================================================== - Recursive Parent Resolution + Empty Parent Tip Metadata Reference: C++ consensus.cpp get_resolved_candidate, get_resolved_candidate_inner Reference: C++ candidate-resolver.cpp resolve_candidate_inner Reference: C++ pool.cpp maybe_resolve_request - When a candidate is received, we check if its parent chain is fully - resolved (all parents available in received_candidates). If not, we - queue the candidate for parent resolution and request the missing parent. - When a parent arrives, we process all waiting candidates recursively. + Non-empty candidates enter `pending_validations` immediately after ingress + and rely on strict `WaitForParent` plus validator-side state resolution. 
+ Simplex only keeps the minimal metadata walk needed to resolve the + expected normal tip for empty candidates when parent metadata is not yet + present locally. ======================================================================== */ - /// Compute whether a candidate's parent chain is fully resolved - /// - /// A candidate is fully resolved if: - /// - It has no parent (genesis/first in epoch), OR - /// - Its parent exists in received_candidates AND parent.is_fully_resolved == true - /// - /// This function does NOT modify state - it just checks the current status. - fn compute_is_fully_resolved(&self, parent_id: &Option) -> bool { - match parent_id { - None => true, // No parent = genesis/first in epoch = fully resolved - Some(parent) => { - match self.received_candidates.get(parent) { - None => false, // Parent not yet received - Some(parent_received) => parent_received.is_fully_resolved, - } - } - } - } - - /// Find the first missing parent in a candidate's parent chain + /// Find the first missing metadata record needed to resolve an empty + /// candidate's expected normal tip. /// - /// Walks up the parent chain until finding a parent that is not in received_candidates. - /// Returns None if all parents are available (candidate is fully resolved). - /// - /// Uses MAX_CHAIN_DEPTH to prevent infinite loops on malformed data. - fn find_first_missing_parent( + /// We only walk through empty ancestors; a non-empty parent already defines + /// the normal tip we need for the C++ `event->state->as_normal()` check. 
+ fn find_first_missing_parent_metadata( &self, candidate: &RawCandidate, ) -> Option { @@ -6188,254 +6725,75 @@ impl SessionProcessor { depth += 1; if depth > MAX_CHAIN_DEPTH { log::error!( - "Session {} find_first_missing_parent: exceeded \ + "Session {} find_first_missing_parent_metadata: exceeded \ MAX_CHAIN_DEPTH={MAX_CHAIN_DEPTH} for candidate slot={}", self.session_id().to_hex_string(), candidate.id.slot, ); self.increment_error(); - return None; // Treat as resolved to avoid infinite loops - } - - match self.received_candidates.get(&parent_id) { - None => { - // This parent is missing - return it - log::trace!( - "Session {} find_first_missing_parent: missing parent slot={} hash={} for \ - candidate slot={}", - self.session_id().to_hex_string(), - parent_id.slot, - &parent_id.hash.to_hex_string()[..8], - candidate.id.slot, - ); - return Some(parent_id); - } - Some(parent_received) => { - if !parent_received.is_fully_resolved { - // Parent exists but is not fully resolved - find ITS missing parent - current_parent = parent_received.parent_id.clone(); - } else { - // Parent is fully resolved - we're done - return None; - } - } + return None; } - } - - // No missing parent found - None - } - /// Queue a candidate for parent resolution - /// - /// Called when a candidate is received but its parent chain is not fully resolved. - /// The candidate is stored in pending_parent_resolutions and a request for the - /// missing parent is scheduled. 
- fn queue_for_parent_resolution( - &mut self, - raw_candidate: RawCandidate, - slot: SlotIndex, - source_idx: ValidatorIndex, - receive_time: SystemTime, - ) { - // Find the first missing parent in the chain - let missing_parent = match self.find_first_missing_parent(&raw_candidate) { - Some(p) => p, - None => { - // No missing parent - shouldn't happen if caller checked is_fully_resolved + let Some(parent_received) = self.received_candidates.get(&parent_id) else { log::trace!( - "Session {} queue_for_parent_resolution: no missing parent for slot={slot} \ - but was queued", + "Session {} find_first_missing_parent_metadata: missing parent slot={} hash={} \ + for candidate slot={}", self.session_id().to_hex_string(), + parent_id.slot, + &parent_id.hash.to_hex_string()[..8], + candidate.id.slot, ); - return; - } - }; - - log::trace!( - "Session {} queue_for_parent_resolution: queuing slot={slot} waiting for parent \ - slot={} hash={}", - self.session_id().to_hex_string(), - missing_parent.slot, - &missing_parent.hash.to_hex_string()[..8], - ); - - let key = missing_parent.clone(); - let pending = PendingParentResolution { raw_candidate, slot, source_idx, receive_time }; - - self.pending_parent_resolutions.entry(key).or_default().push(pending); - - // Request the missing parent immediately (no delay). Parent-cascade requests are - // catch-up traffic: the candidate was already produced long ago and won't arrive - // via broadcast, so the 1-second CANDIDATE_REQUEST_DELAY only adds latency. - self.request_candidate(missing_parent.slot, missing_parent.hash, Some(Duration::ZERO)); - } - - /// Update the `is_fully_resolved` cache for a specific candidate and its descendants. - /// - /// A candidate is keyed by `RawCandidateId` (slot, candidate_id_hash). - /// This must be called when: - /// - a candidate is inserted into `received_candidates`, OR - /// - a parent candidate's resolution status may have changed. 
- fn update_resolution_cache_chain(&mut self, id: &RawCandidateId) { - // NOTE: This used to be recursive; on single-host nets we can receive an old missing parent - // late (after hundreds of descendants already exist), which produced deep recursion warnings - // and risks stack overflow. Keep the semantics but do it iteratively. - let session_id_hex = self.session_id().to_hex_string(); - let mut stack: Vec<(RawCandidateId, u32)> = vec![(id.clone(), 0)]; - let mut visited: HashSet = HashSet::new(); - let mut max_depth_seen: u32 = 0; - - while let Some((cur_id, depth)) = stack.pop() { - max_depth_seen = max_depth_seen.max(depth); - - log::trace!( - "Session {} update_resolution_cache_chain: slot={} hash={} depth={}", - &session_id_hex, - cur_id.slot, - &cur_id.hash.to_hex_string()[..8], - depth, - ); - - if depth >= MAX_CHAIN_DEPTH { - log::error!( - "Session {} update_resolution_cache_chain: exceeded \ - MAX_CHAIN_DEPTH={MAX_CHAIN_DEPTH} slot={} hash={}, aborting", - &session_id_hex, - cur_id.slot, - &cur_id.hash.to_hex_string()[..8], - ); - self.increment_error(); - continue; - } - - if !visited.insert(cur_id.clone()) { - continue; - } - - // Compute resolution status for this exact candidate (identified by RawCandidateId). - let is_resolved = match self.received_candidates.get(&cur_id) { - Some(candidate) => self.compute_is_fully_resolved(&candidate.parent_id), - None => continue, + return Some(parent_id); }; - // Update the is_fully_resolved flag if it changed. 
- if let Some(candidate) = self.received_candidates.get_mut(&cur_id) { - if candidate.is_fully_resolved != is_resolved { - let old_resolved = candidate.is_fully_resolved; - candidate.is_fully_resolved = is_resolved; - log::trace!( - "Session {} update_resolution_cache_chain: slot={} hash={} \ - is_fully_resolved: {old_resolved} -> {is_resolved}", - &session_id_hex, - candidate.slot, - &cur_id.hash.to_hex_string()[..8], - ); - } + if !parent_received.is_empty { + return None; } - // If this candidate is now resolved, update descendants that depend on it. - if is_resolved { - // Collect dependent candidate keys first to avoid borrow conflicts. - let mut dependent_keys: Vec = Vec::new(); - for (child_id, child) in &self.received_candidates { - if let Some(parent) = &child.parent_id { - if parent == &cur_id { - dependent_keys.push(child_id.clone()); - } - } - } - - for child_id in dependent_keys { - stack.push((child_id, depth + 1)); - } - } + current_parent = parent_received.parent_id.clone(); } - // Still report unusually deep chains (informational), but avoid spamming WARNs. - if max_depth_seen >= DEEP_RECURSION_WARNING_THRESHOLD { - log::debug!( - "Session {} update_resolution_cache_chain: deep dependency chain \ - depth={max_depth_seen} start_slot={} start_hash={}", - &session_id_hex, - id.slot, - &id.hash.to_hex_string()[..8], - ); - } + None } - /// Process all candidates waiting for a specific parent + /// Ensure an empty candidate can resolve the expected normal tip before approval. /// - /// Called when a parent candidate arrives. Takes all waiting candidates - /// from pending_parent_resolutions and processes them. 
- fn try_resolve_waiting_candidates(&mut self, parent_id: &RawCandidateId) { - // Take all waiting candidates (removes from map) - let waiting = match self.pending_parent_resolutions.remove(parent_id) { - Some(v) => v, - None => return, // No candidates waiting for this parent - }; - - log::trace!( - "Session {} try_resolve_waiting_candidates: {} candidates waiting for parent s{}:{}", - self.session_id().to_hex_string(), - waiting.len(), - parent_id.slot, - &parent_id.hash.to_hex_string()[..8], - ); - - // Process each waiting candidate - for pending in waiting { - self.process_candidate_with_resolved_parent(pending); + /// Unlike the old simplex-local waiting queue, this is an on-demand check in the + /// validation path: if metadata is missing, we request the next missing parent and + /// keep the candidate in `pending_validations`. + fn ensure_empty_parent_tip_ready( + &mut self, + raw_candidate: &RawCandidate, + slot: SlotIndex, + ) -> bool { + if self.resolve_parent_normal_tip(raw_candidate).is_some() { + return true; } - } - - /// Process a candidate whose parent just arrived - /// - /// Re-checks resolution status and either registers the candidate - /// (if fully resolved) or re-queues it (if still waiting for a grandparent). 
- fn process_candidate_with_resolved_parent(&mut self, pending: PendingParentResolution) { - // Update resolution cache for this candidate - self.update_resolution_cache_chain(&pending.raw_candidate.id); - - // Check if the candidate is now fully resolved - let is_resolved = self.compute_is_fully_resolved(&pending.raw_candidate.parent_id); - if is_resolved { + if let Some(missing_parent) = self.find_first_missing_parent_metadata(raw_candidate) { log::trace!( - "Session {} process_candidate_with_resolved_parent: candidate slot={} is now \ - fully resolved", + "Session {} ensure_empty_parent_tip_ready: requesting missing parent metadata \ + slot={} hash={} for empty candidate slot={}", self.session_id().to_hex_string(), - pending.slot, - ); - // Register as a resolved candidate - self.register_resolved_candidate( - pending.raw_candidate, - pending.slot, - pending.source_idx, - pending.receive_time, + missing_parent.slot, + &missing_parent.hash.to_hex_string()[..8], + slot, ); + self.request_candidate(missing_parent.slot, missing_parent.hash, Some(Duration::ZERO)); } else { log::trace!( - "Session {} process_candidate_with_resolved_parent: candidate slot={} still \ - waiting for grandparent", + "Session {} ensure_empty_parent_tip_ready: empty candidate slot={} is still \ + waiting for the accepted normal head or restart-seeded parent metadata", self.session_id().to_hex_string(), - pending.slot, - ); - // Still has missing parents - re-queue - self.queue_for_parent_resolution( - pending.raw_candidate, - pending.slot, - pending.source_idx, - pending.receive_time, + slot, ); } + + false } - /// Register a fully resolved candidate for validation - /// - /// Called when a candidate's entire parent chain is available. - /// Adds the candidate to pending_validations and tracks latency metrics. - fn register_resolved_candidate( + /// Register a candidate for validation once it has been accepted at ingress. 
+ fn register_candidate_for_validation( &mut self, raw_candidate: RawCandidate, slot: SlotIndex, @@ -6451,7 +6809,7 @@ impl SessionProcessor { || self.rejected.contains(&candidate_id) { log::trace!( - "Session {} register_resolved_candidate: candidate already known: {:?}", + "Session {} register_candidate_for_validation: candidate already known: {:?}", self.session_id().to_hex_string(), candidate_id, ); @@ -6459,7 +6817,7 @@ impl SessionProcessor { } log::trace!( - "Session {} register_resolved_candidate: registering candidate slot={} hash={}", + "Session {} register_candidate_for_validation: registering candidate slot={} hash={}", self.session_id().to_hex_string(), slot, &candidate_id.hash.to_hex_string()[..8], @@ -6471,8 +6829,7 @@ impl SessionProcessor { PendingValidation { raw_candidate, slot, receive_time, source_idx }, ); - // Track first candidate received in this slot (for latency metrics) - // Only track for fully resolved candidates in the current slot (progress cursor) + // Track first candidate received in this slot (for latency metrics). let first_non_progressed_slot = self.simplex_state.get_first_non_progressed_slot(); if !self.slot_first_candidate_received(slot) && slot == first_non_progressed_slot { self.slot_set_first_candidate_received(slot, true); @@ -6484,45 +6841,6 @@ impl SessionProcessor { } } - /// Check timeouts for pending parent resolutions - /// - /// Called from check_all(). Candidates waiting longer than MAX_PARENT_WAIT_TIME - /// are logged as errors and removed. 
- fn check_pending_parent_timeouts(&mut self) { - let now = self.now(); - let mut timed_out_keys: Vec = Vec::new(); - let session_id = self.session_id().to_hex_string(); - - for (key, pending_list) in &self.pending_parent_resolutions { - for pending in pending_list { - if let Ok(elapsed) = now.duration_since(pending.receive_time) { - if elapsed > MAX_PARENT_WAIT_TIME { - log::error!( - "Session {session_id} check_pending_parent_timeouts: candidate \ - slot={} timed out waiting for parent ({}s > {}s)", - pending.slot, - elapsed.as_secs(), - MAX_PARENT_WAIT_TIME.as_secs(), - ); - timed_out_keys.push(key.clone()); - break; // One timeout per key is enough to remove the whole list - } - } - } - } - - // Increment error count for timed out entries - let timeout_count = timed_out_keys.len(); - for _ in 0..timeout_count { - self.increment_error(); - } - - // Remove timed out entries - for key in timed_out_keys { - self.pending_parent_resolutions.remove(&key); - } - } - /* Validation processing Reference: validator-session/src/session_processor.rs try_approve_block, candidate_decision_* @@ -6612,47 +6930,6 @@ impl SessionProcessor { true } - fn check_mc_validation_ready( - &self, - pending: &PendingValidation, - ) -> Result { - let expected_tip = self.resolve_parent_normal_tip(&pending.raw_candidate); - let expected_seqno = match expected_tip.as_ref() { - Some(block_id) => block_id.seq_no, - None if pending.raw_candidate.parent_id.is_none() => { - self.description.get_initial_block_seqno().saturating_sub(1) - } - None => { - fail!("Cannot resolve parent normal tip for MC candidate"); - } - }; - - if self.accepted_normal_head_seqno < expected_seqno { - return Ok(McValidationReadiness::WaitingForAcceptedHead); - } - if self.accepted_normal_head_seqno > expected_seqno { - fail!( - "Stale MC candidate parent: accepted head seqno {} already passed expected {}", - self.accepted_normal_head_seqno, - expected_seqno - ); - } - - if let (Some(accepted_tip), Some(expected_tip)) = - 
(self.accepted_normal_head_block_id.as_ref(), expected_tip.as_ref()) - { - if accepted_tip != expected_tip { - fail!( - "Stale MC candidate parent: accepted head {} does not match expected {}", - accepted_tip, - expected_tip - ); - } - } - - Ok(McValidationReadiness::Ready) - } - /// Check pending validations and send to higher layer for validation /// /// Called from check_all(). Iterates all pending_validations and forwards @@ -6663,22 +6940,36 @@ impl SessionProcessor { fn check_validation(&mut self) { check_execution_time!(10_000); instrument!(); + let now = self.now(); // Collect candidates to validate. // A candidate is eligible when: - // 1. It is fully resolved (parent chain data available — enforced by register_resolved_candidate). + // 1. It has been admitted into pending_validations. // 2. Parent chain is C++ WaitForParent-ready (notar/final parent + gap skip coverage). - // 3. It is not already being validated, approved, or rejected. + // 3. Empty candidates can resolve the expected `event->state->as_normal()` tip from + // locally known metadata (requesting the next missing parent on demand if needed). + // 4. MC stale-parent protection is handled in validator-side candidate-native validation. + // 5. It is not already being validated, approved, or rejected. 
let mut to_validate: Vec<(RawCandidateId, SlotIndex, ValidatorIndex, SystemTime)> = Vec::new(); - let mut to_reject: Vec<(RawCandidateId, SlotIndex, Error)> = Vec::new(); let candidate_ids: Vec = self.pending_validations.keys().cloned().collect(); for candidate_id in candidate_ids { - let pending = match self.pending_validations.get(&candidate_id) { - Some(p) => p, - None => continue, - }; + let (slot, source_idx, receive_time, raw_candidate, wait_for_parent_ready) = + match self.pending_validations.get(&candidate_id) { + Some(p) => ( + p.slot, + p.source_idx, + p.receive_time, + p.raw_candidate.clone(), + self.is_wait_for_parent_ready(p), + ), + None => continue, + }; + + if !wait_for_parent_ready { + continue; + } // Skip if already being validated or decided if self.pending_approve.contains(&candidate_id) { @@ -6703,52 +6994,50 @@ impl SessionProcessor { } } - let wait_for_parent_ready = self.is_wait_for_parent_ready(pending); - if !wait_for_parent_ready - && matches!(PARENT_READINESS_MODE, ParentReadinessMode::StrictWaitForParent) - { - continue; - } - - if self.description.get_shard().is_masterchain() - && !pending.raw_candidate.block.is_empty() - && matches!(MC_ACCEPTED_HEAD_MODE, McAcceptedHeadMode::StrictSessionProcessorGate) - { - match self.check_mc_validation_ready(pending) { - Ok(McValidationReadiness::Ready) => {} - Ok(McValidationReadiness::WaitingForAcceptedHead) => continue, - Err(e) => { - to_reject.push((candidate_id.clone(), pending.slot, e)); - continue; + if !raw_candidate.block.is_empty() { + if let Some(parent_id) = raw_candidate.parent_id.as_ref() { + let parent_info = crate::block::CandidateParentInfo { + slot: parent_id.slot, + hash: parent_id.hash.clone(), + }; + if let Some(parent_gen_utime_ms) = + self.resolve_parent_gen_utime_ms(&parent_info) + { + let earliest_validation_time = UNIX_EPOCH + .checked_add(Duration::from_millis(parent_gen_utime_ms)) + .and_then(|parent_gen_time| { + parent_gen_time + 
.checked_add(self.description.opts().min_block_interval) + }); + let Some(earliest_validation_time) = earliest_validation_time else { + log::warn!( + "Session {} check_validation: invalid parent_gen_utime_ms {} for \ + parent slot {}", + self.session_id().to_hex_string(), + parent_gen_utime_ms, + parent_info.slot, + ); + continue; + }; + if now < earliest_validation_time { + self.set_next_awake_time(earliest_validation_time); + continue; + } } } } - // Empty blocks skip ValidatorGroup validation but still need FSM-tip reference - // check (performed in try_approve_block). C++ block-validator.cpp rejects unless - // block == event->state->as_normal(). - if pending.raw_candidate.block.is_empty() { - to_validate.push(( - candidate_id.clone(), - pending.slot, - pending.source_idx, - pending.receive_time, - )); + if raw_candidate.block.is_empty() { + if !self.ensure_empty_parent_tip_ready(&raw_candidate, slot) { + continue; + } + + to_validate.push((candidate_id.clone(), slot, source_idx, receive_time)); continue; } - to_validate.push(( - candidate_id.clone(), - pending.slot, - pending.source_idx, - pending.receive_time, - )); - } - - for (candidate_id, slot, err) in to_reject { - self.candidate_decision_fail(slot, candidate_id, err); + to_validate.push((candidate_id.clone(), slot, source_idx, receive_time)); } - // Process each candidate for (candidate_id, slot, source_idx, receive_time) in to_validate { self.try_approve_block(&candidate_id, slot, source_idx, receive_time); @@ -6826,7 +7115,8 @@ impl SessionProcessor { // Handle empty blocks: C++ block-validator.cpp rejects unless the referenced // block equals event->state->as_normal(). We resolve the expected block from - // the parent chain and compare before approving. + // the parent chain and compare before approving; if metadata is still missing, + // the candidate stays pending and waits for the next repair round. 
if pending.raw_candidate.block.is_empty() { let referenced_block = pending.raw_candidate.block.block_id().clone(); let expected = self.resolve_parent_normal_tip(&pending.raw_candidate); @@ -6858,17 +7148,22 @@ impl SessionProcessor { ); } None => { - log::warn!( - "Session {} try_approve_block: empty block REJECTED — cannot resolve \ - parent normal tip for {:?}", + if let Some(missing_parent) = + self.find_first_missing_parent_metadata(&pending.raw_candidate) + { + self.request_candidate( + missing_parent.slot, + missing_parent.hash, + Some(Duration::ZERO), + ); + } + self.pending_approve.remove(candidate_id); + log::trace!( + "Session {} try_approve_block: empty block still waiting for parent \ + normal tip for {:?}", self.session_id().to_hex_string(), cid, ); - self.candidate_decision_fail( - slot, - cid, - error!("Cannot resolve parent normal tip for empty candidate"), - ); } } return; @@ -7044,7 +7339,7 @@ impl SessionProcessor { self.candidate_decision_ok_internal(candidate_id, slot, receive_time); // Wake immediately so check_all() runs in the very next main-loop iteration - self.set_next_awake_time(self.now()); + self.wake_now(); } /// Internal helper for successful validation (used by both normal and empty block paths) @@ -7057,7 +7352,7 @@ impl SessionProcessor { // Remove from pending_approve self.pending_approve.remove(&candidate_id); - // Get and remove from pending_validations (INT-2: per-slot state) + // Get and remove from pending_validations (per-slot state) let pending = match self.pending_validations.remove(&candidate_id) { Some(p) => p, None => { @@ -7211,6 +7506,7 @@ impl SessionProcessor { ); // Remove from pending_approve to allow re-validation processor.pending_approve.remove(&candidate_id_copy); + processor.wake_now(); }, ); @@ -7370,7 +7666,6 @@ impl SessionProcessor { self.votes_out_total_counter.increment(1); self.votes_out_skip_counter.increment(1); } - _ => {} } // WaitCandidateInfoStored parity (C++ consensus.cpp): @@ -7858,15 +8153,6 
@@ impl SessionProcessor { // Remove pending candidate requests for slots < up_to_slot self.requested_candidates.retain(|id, _| id.slot >= up_to_slot); - // Remove pending parent resolutions for old slots. - // Parent resolution is hash-based; slot is informational only, so we only use - // candidate slots (first-seen) to bound memory usage. - // TODO: implement cleanup of pending parent resolutions - //self.pending_parent_resolutions.retain(|_parent_hash, pending_list| { - // pending_list.retain(|p| p.slot >= up_to_slot); - // !pending_list.is_empty() - //}); - // Clear SlotRuntime for old slots (keep SlotEntry for outcome emission) // TODO: LK: optimize this for slot_idx in 0..up_to_slot.value() { @@ -7942,6 +8228,12 @@ impl SessionProcessor { fn handle_notarization_reached(&mut self, event: NotarizationReachedEvent) { check_execution_time!(1_000); + let now = self.now(); + self.observability.last_notarization_at = Some(now); + self.observability.last_notarization_slot = Some(event.slot); + self.observability.last_notar_cert_at = Some(now); + self.observability.last_notar_cert_slot = Some(event.slot); + log::trace!( "Session {} notarization reached: slot={} block={} sigs={}", self.session_id().to_hex_string(), @@ -8060,30 +8352,11 @@ impl SessionProcessor { self.increment_error(); } } - - // When require_notarized_parent_for_collation is active, precollation for the - // next slot may have been deferred because this parent wasn't yet notarized. - // Now that the notarization arrived, retry precollation. 
- if self.description.opts().require_notarized_parent_for_collation { - if let Some(ref head) = self.local_chain_head { - if head.slot == event.slot { - let next_slot = SlotIndex(head.slot.0 + 1); - log::debug!( - "Session {} handle_notarization_reached: parent s{} now notarized, \ - retrying precollate_block for slot {}", - &self.session_id().to_hex_string()[..8], - event.slot, - next_slot, - ); - self.precollate_block(next_slot); - } - } - } } /// Handle skip certificate reached event /// - /// Called when FSM determines skip threshold reached for a slot (C++ mode only). + /// Called when FSM determines skip threshold reached for a slot. /// Serializes and broadcasts the skip certificate to all validators. /// /// Reference: C++ pool.cpp creates skip certificate and broadcasts it @@ -8152,6 +8425,10 @@ impl SessionProcessor { fn handle_finalization_reached(&mut self, event: FinalizationReachedEvent) { check_execution_time!(1_000); + let now = self.now(); + self.observability.last_final_cert_at = Some(now); + self.observability.last_final_cert_slot = Some(event.slot); + log::trace!( "Session {} finalization reached: slot={} block={} sigs={}", self.session_id().to_hex_string(), @@ -8387,7 +8664,7 @@ impl SessionProcessor { slot, parent_info ); - }); + }); log::trace!( "Session {} notify_generate_slot: explicit parent for slot {}: {}", @@ -9073,8 +9350,6 @@ impl SessionStartupRecoveryListener for SessionProcessor { Vote::Notarize(v) => v.slot, Vote::Finalize(v) => v.slot, Vote::Skip(v) => v.slot, - Vote::NotarizeFallback(v) => v.slot, - Vote::SkipFallback(v) => v.slot, }; log::trace!( "Session {}: recovery_mark_slot_voted_on_restart(slot={})", @@ -9217,7 +9492,7 @@ impl SessionStartupRecoveryListener for SessionProcessor { continue; } - // Seed a minimal received candidate record for parent resolution + // Seed a minimal received candidate record for restart-side parent/tip lookups. 
self.received_candidates.insert( candidate_id, ReceivedCandidate { @@ -9230,10 +9505,10 @@ impl SessionStartupRecoveryListener for SessionProcessor { file_hash: block_id.file_hash.clone(), data: consensus_common::ConsensusCommonFactory::create_block_payload(Vec::new()), collated_data: consensus_common::ConsensusCommonFactory::create_block_payload(Vec::new()), + gen_utime_ms: None, receive_time: self.now(), is_empty, parent_id: block.parent.clone(), - is_fully_resolved: true, }, ); } @@ -9267,17 +9542,15 @@ impl SessionStartupRecoveryListener for SessionProcessor { .map(|p| format!("s{}:{}", p.slot.value(), &p.hash.to_hex_string()[..8])), ); - let is_fully_resolved = self.compute_is_fully_resolved(&parent); - if let Some(existing) = self.received_candidates.get_mut(&candidate_id) { existing.source_idx = leader_idx; existing.candidate_hash_data_bytes = candidate_hash_data_bytes; existing.block_id.clone_from(&block_id); existing.root_hash.clone_from(&block_id.root_hash); existing.file_hash.clone_from(&block_id.file_hash); + existing.gen_utime_ms = None; existing.is_empty = is_empty; existing.parent_id = parent; - existing.is_fully_resolved = is_fully_resolved; return; } @@ -9295,10 +9568,10 @@ impl SessionStartupRecoveryListener for SessionProcessor { collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( Vec::new(), ), + gen_utime_ms: None, receive_time: self.now(), is_empty, parent_id: parent, - is_fully_resolved, }, ); } diff --git a/src/node/simplex/src/simplex_state.rs b/src/node/simplex/src/simplex_state.rs index d1d2f8d..7295bb3 100644 --- a/src/node/simplex/src/simplex_state.rs +++ b/src/node/simplex/src/simplex_state.rs @@ -7,237 +7,42 @@ * This software is provided "AS IS", WITHOUT WARRANTY OF ANY KIND. */ -//! SimplexState - Core Consensus State Machine +//! SimplexState - Core Simplex FSM //! -//! This module implements the core consensus state machine based on: -//! 
- Solana Alpenglow White Paper (May 2025), Algorithm 1 (page 21) and Algorithm 2 (page 22) -//! - C++ reference: `simplex/consensus.cpp`, `simplex/pool.cpp` +//! Protocol references: +//! - [Simplex protocol spec (`Simplex.md`)](https://github.com/ton-blockchain/simplex-docs) +//! - [TON C++ sources](https://github.com/ton-blockchain/ton) (`testnet/validator/consensus/simplex`) //! -//! ## White Paper Algorithm -//! -//! ### Algorithm 1: Event Handlers -//! -//! ```text -//! upon Block(s, hash, hashparent) do -//! if tryNotar(Block(s, hash, hashparent)) then -//! checkPendingBlocks() -//! else if Voted ∉ state[s] then -//! pendingBlocks[s] ← Block(s, hash, hashparent) -//! -//! upon Timeout(s) do -//! if Voted ∉ state[s] then -//! trySkipWindow(s) -//! -//! upon BlockNotarized(s, hash(b)) do -//! state[s] ← state[s] ∪ {BlockNotarized(hash(b))} -//! tryFinal(s, hash(b)) -//! -//! upon ParentReady(window, hash(b)) do -//! state[window.first_slot] ← state[window.first_slot] ∪ {ParentReady(hash(b))} -//! checkPendingBlocks() -//! setTimeouts(window) -//! -//! upon SafeToNotar(s, hash(b)) do -//! trySkipWindow(s) -//! if ItsOver ∉ state[s] then -//! broadcast NotarFallbackVote(s, hash(b)) -//! state[s] ← state[s] ∪ {BadWindow} -//! -//! upon SafeToSkip(s) do -//! trySkipWindow(s) -//! if ItsOver ∉ state[s] then -//! broadcast SkipFallbackVote(s) -//! state[s] ← state[s] ∪ {BadWindow} -//! ``` -//! -//! ### Algorithm 2: Helper Functions -//! -//! ```text -//! function tryNotar(Block(s, hash, hashparent)) -//! if Voted ∈ state[s] then return false -//! firstSlot ← (s is the first slot in leader window) -//! if firstSlot then -//! canVote ← ParentReady(hashparent) ∈ state[s] -//! else -//! canVote ← VotedNotar(hashparent) ∈ state[s-1] -//! if canVote then -//! broadcast NotarVote(s, hash) -//! state[s] ← state[s] ∪ {Voted, VotedNotar(hash)} -//! pendingBlocks[s] ← ⊥ -//! tryFinal(s, hash) -//! return true -//! return false -//! -//! function tryFinal(s, hash(b)) -//! 
if BlockNotarized(hash(b)) ∈ state[s] and VotedNotar(hash(b)) ∈ state[s] -//! and BadWindow ∉ state[s] then -//! broadcast FinalVote(s) -//! state[s] ← state[s] ∪ {ItsOver} -//! -//! function trySkipWindow(s) -//! for k ∈ windowSlots(s) do -//! if Voted ∉ state[k] then -//! broadcast SkipVote(k) -//! state[k] ← state[k] ∪ {Voted, BadWindow} -//! pendingBlocks[k] ← ⊥ -//! -//! function checkPendingBlocks() -//! for s : pendingBlocks[s] ≠ ⊥ do -//! tryNotar(pendingBlocks[s]) -//! ``` -//! -//! ## C++ Implementation vs Alpenglow White Paper -//! -//! The C++ reference uses a **simplified protocol** without fallback votes. -//! Rust supports both modes via `enable_fallback_protocol` constructor parameter. -//! -//! ### Vote Types -//! -//! | Vote Type | White Paper | C++ (wire) | Rust (internal) | -//! |-----------|-------------|------------|-----------------| -//! | Notarize | ✅ | ✅ | ✅ | -//! | Finalize | ✅ | ✅ | ✅ | -//! | Skip | ✅ | ✅ | ✅ | -//! | NotarizeFallback | ✅ | ❌ | ✅ (filtered) | -//! | SkipFallback | ✅ | ❌ | ✅ (filtered) | -//! -//! ### `enable_fallback_protocol` Option -//! -//! - **`false` (default, C++ compatible)**: 3 vote types only (Notarize, Finalize, Skip). -//! - Fallback votes are filtered in `broadcast_vote()` -//! - `SafeToNotar` / `SafeToSkip` events are NOT processed -//! - `Notarize + Skip` from same validator is **allowed** (C++ pool.cpp behavior) -//! - Timeout check: `its_over` (voted_final in C++) blocks timeout -//! -//! - **`true` (full Alpenglow)**: All 5 vote types. -//! - Full White Paper algorithm with fallback mechanism -//! - `Notarize + Skip` from same validator is **misbehavior** -//! - Timeout check: `is_voted` (any vote) blocks timeout -//! -//! ### C++ Differences from White Paper -//! -//! 1. **No fallback votes**: C++ TL schema has no `NotarizeFallback` / `SkipFallback` -//! -//! 2. **Leader Windows**: C++ uses explicit `LeaderWindow` struct with `available_bases`, -//! `slots[]`, and `had_timeouts` flag. 
Windows are created lazily. -//! -//! 3. **ParentReady is per-window**: In C++, `ParentReady` event is received per-window, -//! not per-slot. The `available_bases` set in `LeaderWindow` tracks valid parents. -//! -//! 4. **Timeout behavior**: -//! - C++ `alarm()` checks `voted_final` (not `is_voted`), allowing Skip after Notarize -//! - C++ uses `multimap` for timeouts -//! - Fresh timeout scheduled on `LeaderWindowObserved` (= `on_window_base_ready`) -//! -//! 5. **trySkipWindow**: C++ iterates all window slots and checks `voted_final`, -//! not `is_voted`, allowing Skip after Notarize -//! -//! 6. **Vote thresholds**: -//! - `BlockNotarized`: notar(b) >= 2/3 (certificate threshold) -//! - `SafeToNotar`: notar(b) >= 1/3 AND notar(b) + skip >= 2/3 -//! - `SafeToSkip`: skip + sum(notar) - max(notar) >= 1/3 -//! -//! 7. **Block identification**: Uses `BlockIdExt` (full block ID) in votes, -//! not just hash. Parent references use `CandidateParentInfo` (slot + hash). -//! -//! 8. **Empty blocks**: C++ supports empty blocks for finalization recovery -//! (not in White Paper). Empty block has `block = None`, must have parent. -//! -//! ## Design Principles -//! -//! 1. **No external dependencies** - TL, network, etc. are not used directly -//! 2. **Event-based output** - All actions produce `SimplexEvent` (no callbacks) -//! 3. **Self-contained timing** - FSM manages its own timeouts via `check_all()` + `get_next_timeout()` -//! 4. **Independent testing** - FSM can be unit tested without network or TL dependencies -//! 5. **C++ compatible** - Block types match C++ implementation (BlockIdExt in votes) -//! -//! ## Architecture -//! -//! ```text -//! ┌─────────────────────────────────────────────────────────────────────┐ -//! │ SimplexState FSM │ -//! │ │ -//! │ ┌─────────────────────────────┐ ┌─────────────────────────────┐ │ -//! │ │ Consensus State │ │ Vote Accounting │ │ -//! │ │ (from SimplexConsensusImpl) │ │ (from SimplexPoolImpl) │ │ -//! │ │ │ │ │ │ -//! 
│ │ - leader_windows │ │ - slot_votes │ │ -//! │ │ - pending_slots │ │ - notarize weights │ │ -//! │ │ - first_non_finalized_slot │ │ - skip weights │ │ -//! │ │ - timeout state │ │ - certificate tracking │ │ -//! │ └─────────────────────────────┘ └─────────────────────────────┘ │ -//! │ │ -//! │ ┌─────────────────────────────────────────────────────────────┐ │ -//! │ │ Event Queue (VecDeque) │ │ -//! │ │ │ │ -//! │ │ - BroadcastVote(vote) │ │ -//! │ │ - BlockFinalized(slot, block) │ │ -//! │ │ - SlotSkipped(slot) │ │ -//! │ └─────────────────────────────────────────────────────────────┘ │ -//! │ │ -//! │ Input API: Output (pull events): │ -//! │ - on_candidate(desc, ...) - pull_event() -> SimplexEvent │ -//! │ - on_vote(desc, ...) - pending_event_count() │ -//! │ - check_all(desc) - has_pending_events() │ -//! └─────────────────────────────────────────────────────────────────────┘ -//! ``` -//! -//! ## Event Model -//! -//! Instead of callbacks, SimplexState produces events that are queued internally: -//! 1. Call FSM methods (`on_candidate`, `on_vote`, `check_all`) -//! 2. Pull events with `pull_event()` until it returns `None` -//! 3. Process each event (broadcast vote, notify listener, etc.) -//! -//! This enables: -//! - **Testing**: Inspect produced events without mocking -//! - **Debugging**: Dump event queue for diagnostics -//! - **Tracing**: All FSM outputs go through a single mechanism -//! -//! ## Timeout Model -//! -//! The FSM controls its own timing. Clients should: -//! 1. Call `check_all(desc)` whenever `get_next_timeout()` has elapsed (or earlier is ok) -//! 2. `check_all()` processes pending timeouts and updates `next_timeout` -//! 3. After any event (`on_candidate`, `on_vote`), `next_timeout` may change +//! Flow summary: +//! 1. `on_candidate()` stores and validates candidates, then attempts `try_notar()`. +//! 2. `on_vote()` updates per-slot vote weights and calls `check_thresholds_and_trigger()`. +//! 3. 
Threshold handling emits certificates/events and can trigger `try_final()`. +//! 4. `check_all()` drives timeout handling via `process_timeouts()` / `try_skip_window()`. +//! 5. Callers drain `SimplexEvent` via `pull_event()` and execute side effects. //! //! ## Usage //! //! ```ignore -//! use simplex::{SimplexState, SimplexEvent}; +//! use simplex::{RawVoteData, SimplexEvent, SimplexState}; //! -//! // Create FSM -//! let mut state = SimplexState::new(&session_description, false).expect("Invalid session config"); +//! let mut state = SimplexState::new(&session_description)?; +//! let _ = state.on_candidate(&session_description, candidate); +//! let _ = state.on_vote( +//! &session_description, +//! validator_idx, +//! vote, +//! signature, +//! RawVoteData::default(), +//! ); +//! state.check_all(&session_description); //! -//! // Main loop -//! loop { -//! let timeout = state.get_next_timeout(); -//! // wait for timeout or incoming event... -//! -//! // Process incoming events (handle errors as needed) -//! if let Err(e) = state.on_candidate(&session_description, candidate) { -//! log::warn!("Misbehavior: {}", e); -//! } -//! if let Err(e) = state.on_vote(&session_description, validator_idx, vote) { -//! log::warn!("Misbehavior: {}", e); -//! } -//! -//! // Check for timeouts and pending actions -//! state.check_all(&session_description); -//! -//! // Process all produced events -//! while let Some(event) = state.pull_event() { -//! match event { -//! SimplexEvent::BroadcastVote(vote) => { -//! receiver.send_vote(vote); -//! } -//! SimplexEvent::BlockFinalized(e) => { -//! listener.on_block_committed(e.slot, e.block, ...); -//! } -//! SimplexEvent::SlotSkipped(e) => { -//! listener.on_block_skipped(e.slot); -//! } +//! while let Some(event) = state.pull_event() { +//! match event { +//! SimplexEvent::BroadcastVote(v) => receiver.send_vote(v), +//! SimplexEvent::BlockFinalized(e) => { +//! listener.on_block_committed(e.slot, e.block_hash, ...); //! } +//! 
_ => {} //! } //! } //! ``` @@ -255,6 +60,7 @@ use crate::{ ConflictReason, ConflictingVoteType, MisbehaviorProof, VoteDescriptor, VoteResult, }, session_description::SessionDescription, + session_processor::{SlotDiagnostic, SlotWaitPhase, WindowDiagnostic}, RawVoteData, ValidatorWeight, }; use std::{ @@ -267,107 +73,6 @@ use std::{ }; use ton_block::{error, fail, BlockIdExt, Result, UInt256}; -/* - ============================================================================ - SimplexState Options - ============================================================================ -*/ - -/// Configuration options for SimplexState -/// -/// Controls behavior of the consensus FSM, particularly around fallback -/// protocol and C++ compatibility. -#[derive(Clone, Debug)] -pub struct SimplexStateOptions { - /// Enable fallback protocol (SafeToNotar/SafeToSkip and fallback votes) - /// - /// When `false` (default, C++ compatible): - /// - SafeToNotar/SafeToSkip events are not processed - /// - Fallback votes (NotarizeFallback, SkipFallback) are not broadcast - /// - /// When `true` (full Alpenglow): - /// - Full White Paper algorithm with fallback mechanism - pub enable_fallback_protocol: bool, - - /// Allow skip vote after notarize for same slot (and vice versa) - /// - /// When `true` (default, C++ compatible): - /// - Notarize + Skip from same validator is ALLOWED - /// - Matches C++ pool.cpp behavior - /// - /// When `false` (Alpenglow strict mode): - /// - Notarize + Skip from same validator is MISBEHAVIOR - pub allow_skip_after_notarize: bool, - - /// Require parent to be finalized (not just notarized) for block generation - /// - /// When `false` (default, C++ compatible): - /// - Parent can be notarized OR finalized to build child block - /// - Matches C++ pool.cpp behavior (parent_slot->state->notarized.has_value()) - /// - Allows progress even when finalization is delayed - /// - Prevents deadlock when some validators vote skip while others vote finalize - /// - /// 
When `true` (strict mode): - /// - Parent must be finalized before child block can be generated - /// - Use for testing sequential finalization scenarios - /// - WARNING: Can cause deadlock if finalization is blocked - /// - /// C++ uses notarized parent check (not finalized) for collation availability. - pub require_finalized_parent: bool, -} - -impl Default for SimplexStateOptions { - fn default() -> Self { - Self { - // C++ compatible mode by default - enable_fallback_protocol: false, - allow_skip_after_notarize: true, - // C++ allows notarized blocks as parents (not just finalized) - require_finalized_parent: false, - } - } -} - -impl SimplexStateOptions { - /// Create options for C++ compatible mode (default) - pub fn cpp_compatible() -> Self { - Self::default() - } - - /// Create options for full Alpenglow mode - #[allow(dead_code)] - pub fn alpenglow() -> Self { - Self { - enable_fallback_protocol: true, - allow_skip_after_notarize: false, - require_finalized_parent: false, - } - } - - /// Create options for strict sequential mode (for testing deadlock scenarios) - /// - /// WARNING: This mode requires parent to be finalized, which can cause deadlock - /// if some validators vote skip while others vote finalize. Use only for testing. 
- #[allow(dead_code)] - pub fn strict_sequential() -> Self { - Self { - enable_fallback_protocol: false, - allow_skip_after_notarize: true, - require_finalized_parent: true, - } - } -} - -/* - ============================================================================ - Constants - ============================================================================ -*/ - -/// Maximum number of notar-fallback votes allowed per validator per slot -/// Reference: C++ pool.cpp TooManyFallbackVotesMisbehaviorProof (>3 = misbehavior) -const MAX_NOTAR_FALLBACK_VOTES_PER_VALIDATOR: usize = 3; - /* ============================================================================ Vote Types for FSM @@ -421,26 +126,6 @@ pub struct SkipVote { pub slot: SlotIndex, } -/// Notarization fallback vote - fallback notarization for a block -/// -/// Algorithm 1: upon SafeToNotar(s, hash(b)) do broadcast NotarFallbackVote(s, hash(b)) -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct NotarizeFallbackVote { - /// Slot number - pub slot: SlotIndex, - /// Candidate hash being notarized via fallback - pub block_hash: UInt256, -} - -/// Skip fallback vote - fallback skip for a slot -/// -/// Algorithm 1: upon SafeToSkip(s) do broadcast SkipFallbackVote(s) -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct SkipFallbackVote { - /// Slot to skip - pub slot: SlotIndex, -} - /// Vote enum - all vote types for FSM processing /// /// Reference: C++ Vote variant in consensus-bus.h @@ -452,10 +137,6 @@ pub enum Vote { Finalize(FinalizeVote), /// Skip vote Skip(SkipVote), - /// Notarization fallback vote - NotarizeFallback(NotarizeFallbackVote), - /// Skip fallback vote - SkipFallback(SkipFallbackVote), } /* @@ -552,8 +233,7 @@ pub struct NotarizationReachedEvent { /// /// This event is emitted when a slot receives enough skip votes /// (2/3 + 1 of validator weight). 
Used for broadcasting the skip -/// certificate to other validators (C++ mode only - Alpenglow paper -/// doesn't have explicit skip certificates). +/// certificate to other validators. #[derive(Clone, Debug)] pub struct SkipCertificateReachedEvent { /// Slot number @@ -591,7 +271,7 @@ pub struct FinalizationReachedEvent { /// - `BlockFinalized` → Notify SessionListener::on_block_committed /// - `SlotSkipped` → Notify SessionListener::on_block_skipped /// - `NotarizationReached` → Cache serialized notarization certificate in receiver -/// - `SkipCertificateReached` → Broadcast skip certificate to validators (C++ mode only) +/// - `SkipCertificateReached` → Relay skip certificate to validators /// - `FinalizationReached` → Cache finalization certificate and relay to peers #[derive(Clone, Debug)] pub enum SimplexEvent { @@ -620,7 +300,7 @@ pub enum SimplexEvent { /// /// Reference: C++ pool.cpp skip certificate creation /// Used to broadcast skip certificate to validators. - /// Only emitted in C++ compatibility mode. + /// Emitted once per newly stored skip certificate. SkipCertificateReached(SkipCertificateReachedEvent), /// A finalization threshold was reached (certificate created) @@ -655,15 +335,15 @@ struct Slot { available_base: Option, /// Pending block candidate waiting for parent/conditions - /// Alpenglow: pendingBlocks[s] + /// Pending candidate for slot retry logic. pending_block: Option, /// Has this node voted in this slot? - /// Alpenglow: Voted ∈ state[s] + /// Local voted flag for this slot. is_voted: bool, /// Block we voted to notarize (if any) - /// Alpenglow: VotedNotar(hash) ∈ state[s] + /// Local notar vote marker for this slot. voted_notar: Option, /// Have we voted to skip this slot? @@ -686,7 +366,7 @@ struct Slot { voted_final: bool, /// Observed notarization certificate for a block - /// Alpenglow: BlockNotarized(hash(b)) ∈ state[s] + /// Observed notarization certificate for this slot. 
observed_notar_certificate: Option, /// Has this slot reached skip certificate threshold (2/3)? @@ -696,12 +376,12 @@ struct Slot { skipped: bool, /// Is consensus finished for this slot? - /// Alpenglow: ItsOver ∈ state[s] - its_over: bool, + /// Local completion marker for this slot. + is_completed: bool, - /// Have we entered fallback mode for this slot's window? - /// Alpenglow: BadWindow ∈ state[s] - is_bad_window: bool, + /// Did this slot get marked by timeout-driven skip handling? + /// Window skip marker for this slot. + is_timeout_skipped: bool, } impl Slot { @@ -829,27 +509,6 @@ struct ValidatorVotes { /// Finalize vote raw bytes (stored for misbehavior proofs) /// Uses Arc for memory-efficient sharing finalize_raw: Option, - - /// Skip fallback vote (at most one per validator) - fallback_skip: Option, - - /// Skip fallback vote signature (stored for certificate creation, if fallback is added to TL) - #[allow(dead_code)] - fallback_skip_signature: Option>, - - /// Skip fallback vote raw bytes (stored for misbehavior proofs) - /// Uses Arc for memory-efficient sharing - fallback_skip_raw: Option, - - /// Notar fallback votes (up to MAX_NOTAR_FALLBACK_VOTES_PER_VALIDATOR) - /// Key is candidate hash, value is raw bytes for misbehavior proofs - /// Uses Arc for memory-efficient sharing - fallback_notarize: HashMap, - - /// Notar fallback vote signatures (stored for certificate creation, if fallback is added to TL) - /// Key is candidate hash - #[allow(dead_code)] - fallback_notarize_signatures: HashMap>, } /* @@ -911,24 +570,18 @@ struct SlotVotes { /// Total weight that voted notarize OR skip notarize_or_skip_weight: ValidatorWeight, - /// Total weight that voted skip OR skip-fallback - skip_or_skip_fallback_weight: ValidatorWeight, + /// Total weight that voted skip + skip_weight: ValidatorWeight, /// Have we published BlockNotarized event? block_notarized_published: bool, - /// Have we published SafeToSkip event? 
- safe_to_skip_published: bool, - /// Have we published BlockFinalized event? block_finalized_published: bool, /// Have we published SlotSkipped event? slot_skipped_published: bool, - /// Blocks (by candidate_hash) for which we've published SafeToNotar - safe_to_notar_blocks: HashSet, - /// Cached notarization certificate (only one per slot) /// Created when notarization threshold (2/3) is reached. /// Used by candidate resolver to respond to requestCandidate queries. @@ -953,12 +606,10 @@ impl SlotVotes { notarize_weight_by_block: HashMap::new(), finalize_weight_by_block: HashMap::new(), notarize_or_skip_weight: 0, - skip_or_skip_fallback_weight: 0, + skip_weight: 0, block_notarized_published: false, - safe_to_skip_published: false, block_finalized_published: false, slot_skipped_published: false, - safe_to_notar_blocks: HashSet::new(), notarize_certificate: None, finalize_certificate: None, skip_certificate: None, @@ -1036,11 +687,6 @@ impl SlotVotes { Ok(true) } - /// Get validator votes with bounds checking (returns None if out of bounds) - fn get_validator_votes(&self, validator_idx: ValidatorIndex) -> Option<&ValidatorVotes> { - self.votes.get(validator_idx.value() as usize) - } - /// Get mutable validator votes with bounds checking (returns None if out of bounds) fn get_validator_votes_mut( &mut self, @@ -1118,7 +764,6 @@ impl SlotVotes { /// # Returns /// /// SkipCert containing the vote and all matching signatures - #[allow(dead_code)] fn create_skip_cert(&self, slot: SlotIndex) -> SkipCert { let vote = SkipVote { slot }; let signatures = self.collect_skip_signatures(); @@ -1373,9 +1018,6 @@ pub(crate) struct SimplexState { /// C++ parity: `params_.max_leader_window_desync` bound used in consensus/pool ingress. max_leader_window_desync: u32, - /// SimplexState options (fallback protocol, etc.) - opts: SimplexStateOptions, - /// Throttle counter for `ensure_window_exists` rejection warnings. 
/// Prevents log flooding when standstill re-broadcasts reference far-future windows. window_reject_count: u64, @@ -1399,7 +1041,6 @@ impl SimplexState { /// # Arguments /// /// * `desc` - Session description with validators and options - /// * `opts` - SimplexState-specific options (fallback protocol, etc.) /// /// # Errors /// @@ -1411,9 +1052,9 @@ impl SimplexState { /// /// ```ignore /// // C++ compatible mode (default) - /// let state = SimplexState::new(&desc, SimplexStateOptions::default())?; + /// let state = SimplexState::new(&desc)?; /// ``` - pub fn new(desc: &SessionDescription, opts: SimplexStateOptions) -> Result { + pub fn new(desc: &SessionDescription) -> Result { let slots_per_window = desc.opts().slots_per_leader_window; let num_validators = desc.get_total_nodes() as usize; @@ -1427,10 +1068,9 @@ impl SimplexState { } log::trace!( - "SimplexState::new: initializing FSM with {} validators, {} slots/window, opts={:?}", + "SimplexState::new: initializing FSM with {} validators, {} slots/window", num_validators, - slots_per_window, - opts + slots_per_window ); let first_block_timeout = desc.opts().first_block_timeout; @@ -1454,7 +1094,6 @@ impl SimplexState { target_rate_timeout, slots_per_leader_window: slots_per_window, max_leader_window_desync: desc.opts().max_leader_window_desync, - opts, window_reject_count: 0, }; @@ -1683,12 +1322,9 @@ impl SimplexState { Vote::Notarize(v) => v.slot, Vote::Finalize(v) => v.slot, Vote::Skip(v) => v.slot, - Vote::NotarizeFallback(v) => v.slot, - Vote::SkipFallback(v) => v.slot, }; let window_idx = desc.get_window_idx(slot); let offset = desc.get_slot_offset_in_window(slot) as usize; - let clear_pending_on_skip = self.opts.enable_fallback_protocol; // After restart recovery sets `first_non_finalized_slot`, we may prune old leader windows // by advancing `leader_window_offset`. 
Votes for slots in pruned windows are irrelevant @@ -1721,38 +1357,21 @@ impl SimplexState { &v.block_hash.to_hex_string()[..8] ); } - Vote::NotarizeFallback(v) => { - window.slots[offset].is_voted = true; - window.slots[offset].voted_notar = - Some(CandidateParentInfo { slot, hash: v.block_hash.clone() }); - log::trace!( - "SimplexState::mark_slot_voted_on_restart: slot {} marked voted_notar_fb={}:{}", - slot.value(), - slot.value(), - &v.block_hash.to_hex_string()[..8] - ); - } Vote::Finalize(_) => { // C++: slot->state->voted_final = true window.slots[offset].is_voted = true; - window.slots[offset].its_over = true; + window.slots[offset].is_completed = true; window.slots[offset].voted_final = true; log::trace!( "SimplexState::mark_slot_voted_on_restart: slot {} marked voted_final=true", slot.value() ); } - Vote::Skip(_) | Vote::SkipFallback(_) => { + Vote::Skip(_) => { // C++: slot->state->voted_skip = true window.slots[offset].is_voted = true; window.slots[offset].voted_skip = true; - window.slots[offset].is_bad_window = true; - // C++ mode preserves pending candidate on skip/restart-skip: - // consensus.cpp gates candidate intake by voted_notar, not voted_skip. - // Keep Alpenglow behavior unchanged (clear pending on skip). 
- if clear_pending_on_skip { - window.slots[offset].pending_block = None; - } + window.slots[offset].is_timeout_skipped = true; log::trace!( "SimplexState::mark_slot_voted_on_restart: slot {} marked voted_skip=true", slot.value() @@ -1790,7 +1409,6 @@ impl SimplexState { // end_slot = window * slots_per_leader_window let start_slot = (first_nonannounced_window.value() - 1) * slots_per_window; let end_slot = first_nonannounced_window.value() * slots_per_window; - let clear_pending_on_skip = self.opts.enable_fallback_protocol; let mut skip_count = 0u32; for slot_num in start_slot..end_slot { @@ -1806,9 +1424,9 @@ impl SimplexState { // Ensure window exists let _ = self.window_at_mut(window_idx); - // Check if slot is already finalized (its_over in C++) + // Check if slot is already finalized (is_completed in C++) let should_skip = if let Some(window) = self.get_window(window_idx) { - offset < window.slots.len() && !window.slots[offset].its_over + offset < window.slots.len() && !window.slots[offset].is_completed } else { false }; @@ -1820,12 +1438,7 @@ impl SimplexState { // Reference: C++ consensus.cpp start_up() sets voted_skip=true before publishing SkipVote. window.slots[offset].is_voted = true; window.slots[offset].voted_skip = true; - window.slots[offset].is_bad_window = true; - // C++ mode preserves pending candidate across restart skip voting. - // Keep Alpenglow fallback behavior unchanged. 
- if clear_pending_on_skip { - window.slots[offset].pending_block = None; - } + window.slots[offset].is_timeout_skipped = true; log::trace!( "SimplexState::generate_restart_skip_votes: queueing skip for slot {}", @@ -1882,12 +1495,6 @@ impl SimplexState { self.events.pop_front() } - /// Get the number of pending events in the queue - #[allow(dead_code)] - pub fn pending_event_count(&self) -> usize { - self.events.len() - } - /// Check if there are any pending events #[cfg(test)] pub fn has_pending_events(&self) -> bool { @@ -1957,10 +1564,6 @@ impl SimplexState { format!("finalize {}", Self::format_block(v.slot, &v.block_hash)) } Vote::Skip(v) => format!("skip {}", v.slot), - Vote::NotarizeFallback(v) => { - format!("notar-fb {}", Self::format_block(v.slot, &v.block_hash)) - } - Vote::SkipFallback(v) => format!("skip-fb {}", v.slot), } } @@ -2027,31 +1630,8 @@ impl SimplexState { self.events.push_front(event); } - /// Push a broadcast vote event - /// - /// Filters out fallback votes (NotarizeFallback, SkipFallback) when - /// enable_fallback_protocol is false. + /// Push a broadcast vote event. fn broadcast_vote(&mut self, vote: Vote) { - // Filter fallback votes in C++ compatible mode - if !self.opts.enable_fallback_protocol { - match &vote { - Vote::NotarizeFallback(v) => { - log::warn!( - "SimplexState::broadcast_vote: FILTERED notar-fb (fallback disabled) slot={}", - v.slot - ); - return; - } - Vote::SkipFallback(v) => { - log::warn!( - "SimplexState::broadcast_vote: FILTERED skip-fb (fallback disabled) slot={}", - v.slot - ); - return; - } - _ => {} // Allow Notarize, Finalize, Skip - } - } self.push_event_back(SimplexEvent::BroadcastVote(vote)); } @@ -2062,15 +1642,10 @@ impl SimplexState { Reference: C++ pool.cpp check_invariants() - The C++ implementation (enable_fallback_protocol=false) checks: + The C++ implementation checks: 1. notarize + finalize must have same candidate id (if both exist) 2. 
finalize + skip is misbehavior (conflicting votes) - Full Alpenglow (enable_fallback_protocol=true) is stricter: - 1. notarize + skip is misbehavior (a validator cannot hedge) - 2. notarize + finalize must have same candidate id - 3. finalize + skip is misbehavior - These invariants are checked at the start of check_all() and check_thresholds_and_trigger() to ensure state consistency. */ @@ -2099,23 +1674,13 @@ impl SimplexState { /// Check invariants for a single validator's votes in a slot /// - /// # Invariants (C++ compatible, enable_fallback_protocol=false) - /// /// 1. If both notarize and finalize exist, they must be for the same candidate: /// `notarize.id == finalize.id` /// /// 2. Finalize + Skip is conflicting (misbehavior): /// `!(finalize.is_some() && skip.is_some())` /// - /// # Invariants (Full Alpenglow, enable_fallback_protocol=true) - /// - /// All of the above, plus: - /// - /// 3. Notarize + Skip is conflicting (a validator cannot hedge): - /// `!(notarize.is_some() && skip.is_some())` - /// /// Reference: C++ pool.cpp check_invariants() - /// Reference: Solana Alpenglow White Paper (May 2025), voting rules fn check_validator_invariants( &self, _desc: &SessionDescription, @@ -2139,7 +1704,7 @@ impl SimplexState { ); } - // Invariant 2: Finalize + Skip is misbehavior (applies to both modes) + // Invariant 2: Finalize + Skip is misbehavior. 
assert!( !(votes.finalize.is_some() && votes.skip.is_some()), "SimplexState INVARIANT VIOLATION: {}/{} has both finalize and skip votes \ @@ -2149,21 +1714,6 @@ impl SimplexState { votes.finalize, votes.skip ); - - // Invariant 3: Notarize + Skip is misbehavior when not allowed - // When allow_skip_after_notarize=false (Alpenglow strict mode): - // A validator cannot hedge by voting both notarize and skip - if !self.opts.allow_skip_after_notarize { - assert!( - !(votes.notarize.is_some() && votes.skip.is_some()), - "SimplexState INVARIANT VIOLATION: {}/{} has both notarize \ - and skip votes (notarize={:?}, skip={:?})", - validator_idx, - slot, - votes.notarize, - votes.skip - ); - } } /* @@ -2183,7 +1733,7 @@ impl SimplexState { /// /// Reference: C++ set_timeouts() /// - /// Alpenglow Algorithm 2: + /// Pseudocode equivalent: /// ```text /// function setTimeouts(s) // s is first slot of window /// for i ∈ windowSlots(s) do // set timeouts for all slots @@ -2196,7 +1746,7 @@ impl SimplexState { // C++ consensus.cpp: // timeout_base_ = Timestamp::in(first_block_timeout_); // base = now + first_block // alarm_timestamp() = Timestamp::in(target_rate, timeout_base_); // alarm = base + target_rate - // First alarm fires at: now + first_block_timeout + target_rate (both modes). + // First alarm fires at: now + first_block_timeout + target_rate. self.timeout_base = Some(desc.get_time() + self.first_block_timeout); let first_timeout = self.first_block_timeout + self.target_rate_timeout; self.skip_timestamp = Some(desc.get_time() + first_timeout); @@ -2259,16 +1809,11 @@ impl SimplexState { } } - /// Process expired timeouts + /// Process expired timeouts. /// - /// Reference: C++ alarm() - /// - /// Alpenglow Algorithm 1: - /// ```text - /// upon Timeout(s) do - /// if Voted ∉ state[s] then - /// trySkipWindow(s) - /// ``` + /// Rule mapping: + /// - `Simplex.md` timeout-to-skip transition for the active leader window. 
+ /// - C++ `consensus.cpp::alarm()` driving skip escalation. fn process_timeouts(&mut self, desc: &SessionDescription) { // Check if we have a timeout pending let Some(mut skip_timestamp) = self.skip_timestamp else { @@ -2303,10 +1848,6 @@ impl SimplexState { // Ensure window exists self.ensure_window_exists(window_idx); - // Check if we should skip the timeout: - // - Alpenglow (enable_fallback_protocol=true): Check is_voted (any vote blocks skip) - // - C++ compatible (enable_fallback_protocol=false): Check voted_final OR voted_skip - // // C++ alarm() checks voted_final and fires once per window (one-shot alarm). // Rust process_timeouts fires per-slot, so we must also check voted_skip to // prevent repeated skip vote broadcasts for the same window. @@ -2314,15 +1855,10 @@ impl SimplexState { let should_skip_timeout = { let window = self.get_window(window_idx); if let Some(window) = window { - if self.opts.enable_fallback_protocol { - // Alpenglow: Any vote blocks timeout (Voted ∈ state[s]) - window.slots[offset].is_voted - } else { - // C++: voted_final or voted_skip blocks timeout. - // C++ alarm is one-shot so only checks voted_final, but Rust fires - // per-slot so we also check voted_skip to avoid re-broadcasting. - window.slots[offset].voted_final || window.slots[offset].voted_skip - } + // C++: voted_final or voted_skip blocks timeout. + // C++ alarm is one-shot so only checks voted_final, but Rust fires + // per-slot so we also check voted_skip to avoid re-broadcasting. 
+ window.slots[offset].voted_final || window.slots[offset].voted_skip } else { continue; } @@ -2331,17 +1867,14 @@ impl SimplexState { // Skip if condition is met if !should_skip_timeout { // Get slot state for logging - let (is_voted, its_over) = self + let (is_voted, is_completed) = self .get_window(window_idx) - .map(|w| (w.slots[offset].is_voted, w.slots[offset].its_over)) + .map(|w| (w.slots[offset].is_voted, w.slots[offset].is_completed)) .unwrap_or((false, false)); log::trace!( - "SimplexState::process_timeouts: ({}/{}) timeout expired, voted={}, its_over={} -> skip window", - window_idx, - slot_id, - is_voted, - its_over + "SimplexState::process_timeouts: ({window_idx}/{slot_id}) timeout expired, \ + voted={is_voted}, is_completed={is_completed} -> skip window" ); // Mark window as having timeouts @@ -2349,10 +1882,9 @@ impl SimplexState { window.had_timeouts = true; } - // Alpenglow: trySkipWindow(s) self.try_skip_window(window_idx); - // C++ compatibility: skip entire remaining window at once, then STOP. + // C++ behavior: skip entire remaining window at once, then STOP. // Reference: C++ consensus.cpp alarm(): // C++ fires alarm once and skips ALL remaining slots in the window, // then sets timeout_slot_ = window_end. Crucially, C++ does NOT @@ -2362,24 +1894,22 @@ impl SimplexState { // of actual window advancement, firing for future windows with // only target_rate delay instead of first_block_timeout + target_rate, // causing nodes to vote skip before leaders can produce blocks. 
- if !self.opts.enable_fallback_protocol { - let window_end_slot = (window_idx + 1) * self.slots_per_leader_window; - if self.skip_slot < window_end_slot { - log::debug!( - "SimplexState::process_timeouts: C++ window skip: \ - advancing skip_slot {} -> {} (window_end)", - self.skip_slot, - window_end_slot - ); - self.skip_slot = window_end_slot; - } - // Do NOT reschedule — let advance_leader_window_on_progress_cursor() - // re-arm via set_timeouts() with proper first_block_timeout when - // the next window actually starts. - self.skip_timestamp = None; - self.timeout_base = None; - break; + let window_end_slot = (window_idx + 1) * self.slots_per_leader_window; + if self.skip_slot < window_end_slot { + log::debug!( + "SimplexState::process_timeouts: C++ window skip: \ + advancing skip_slot {} -> {} (window_end)", + self.skip_slot, + window_end_slot + ); + self.skip_slot = window_end_slot; } + // Do NOT reschedule — let advance_leader_window_on_progress_cursor() + // re-arm via set_timeouts() with proper first_block_timeout when + // the next window actually starts. + self.skip_timestamp = None; + self.timeout_base = None; + break; } } } @@ -2411,7 +1941,7 @@ impl SimplexState { let max_delay = desc.opts().max_backoff_delay; // Only back off first_block_timeout, not target_rate_timeout. - // C++ reference (consensus.cpp:98-99) only backs off first_block_timeout_s_, + // C++ reference (`consensus.cpp`) only backs off first_block_timeout_s_, // keeping target_rate_s_ constant. Backing off target_rate causes the full // rotation of 16 slots to take 16s instead of 8s, making blocks from remote // leaders arrive after the skip timeout and preventing finalization. @@ -2444,7 +1974,7 @@ impl SimplexState { /// Handle incoming block candidate /// - /// Reference: Alpenglow Algorithm 1, "upon Block(s, hash, hashparent) do" + /// Reference: protocol event handler for candidate reception. 
/// /// ```text /// if tryNotar(Block(s, hash, hashparent)) then @@ -2472,7 +2002,7 @@ impl SimplexState { ); // Validate leader index - // Alpenglow: Each slot has a designated leader from the set of nodes + // Each slot has a designated leader from the validator set. if leader.value() >= self.num_validators as u32 { log::trace!( "SimplexState::on_candidate: ({}/{}) INVALID leader {} >= max {}, dropping", @@ -2537,17 +2067,17 @@ impl SimplexState { parent ); - // Alpenglow: if tryNotar(Block(s, hash, hashparent)) then + // If try_notar succeeds, continue with pending checks. if self.try_notar(desc, slot, &candidate.id.hash, parent.as_ref()) { log::trace!( "SimplexState::on_candidate: ({}/{}) try_notar succeeded, checking pending blocks", window_idx, slot ); - // Alpenglow: checkPendingBlocks() + // Retry any queued pending candidates. self.check_pending_blocks(desc); } - // Alpenglow: else if Voted ∉ state[s] then + // Otherwise, store as pending when slot is not already dominated. else { let offset = desc.get_slot_offset_in_window(slot) as usize; @@ -2558,13 +2088,8 @@ impl SimplexState { // pending — the pending retry (`check_pending_blocks`) will notarize it once // the parent base propagates through skip certs. // - // Alpenglow uses the stricter `is_voted` (any local vote blocks storage). 
let dominated = if let Some(window) = self.get_window(window_idx) { - if self.opts.enable_fallback_protocol { - window.slots[offset].is_voted - } else { - window.slots[offset].voted_notar.is_some() - } + window.slots[offset].voted_notar.is_some() } else { false }; @@ -2642,8 +2167,6 @@ impl SimplexState { Vote::Notarize(v) => v.slot, Vote::Finalize(v) => v.slot, Vote::Skip(v) => v.slot, - Vote::NotarizeFallback(v) => v.slot, - Vote::SkipFallback(v) => v.slot, }; let window_idx = desc.get_window_idx(slot); @@ -2696,16 +2219,9 @@ impl SimplexState { Vote::Finalize(v) => { self.handle_finalize_vote(desc, validator_idx, v, signature, raw_vote) } - Vote::NotarizeFallback(v) => { - self.handle_notar_fallback_vote(validator_idx, v, raw_vote) - } - Vote::SkipFallback(v) => { - self.handle_skip_fallback_vote(desc, validator_idx, v, raw_vote) - } }; - // Check thresholds after successful vote processing - // This is called once for all vote types, including notar-fallback + // Check thresholds after successful vote processing. if result.is_applied() && slot >= self.first_non_finalized_slot { self.check_thresholds_and_trigger(desc, slot); } @@ -2813,9 +2329,6 @@ impl SimplexState { return VoteResult::SlotAlreadyFinalized; } - // Capture before mutable borrow - let allow_skip_after_notarize = self.opts.allow_skip_after_notarize; - let slot_votes = self.slot_votes_at(slot); let Some(votes) = slot_votes.get_validator_votes_mut(validator_idx) else { return VoteResult::Rejected(format!( @@ -2884,35 +2397,6 @@ impl SimplexState { } } - // C++ pool.cpp check_invariants() does NOT check notarize+skip conflict. - // Only finalize+skip is misbehavior in C++. 
- // - // When allow_skip_after_notarize=true (C++ compatible mode): - // Notarize + Skip is ALLOWED (matches C++ behavior) - // - // When allow_skip_after_notarize=false (Alpenglow strict mode): - // Notarize + Skip is MISBEHAVIOR (in Alpenglow, once you vote notarize - // on the fast path, you shouldn't also vote skip) - if !allow_skip_after_notarize && votes.skip.is_some() { - log::trace!( - "SimplexState::handle_notarize_vote: ({}/{}) {} has skip, rejecting notarize", - window_idx, - slot, - validator_idx - ); - // Use stored raw bytes from existing skip vote and new raw bytes for proof - let existing_raw = votes.skip_raw.clone().unwrap_or_default(); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::Skip, - VoteDescriptor::Notarize(vote.block_hash.clone()), - existing_raw, - raw_vote, - ConflictReason::NotarizeAfterSkip, - )); - } - // Record vote, signature, and raw bytes (for certificate creation and misbehavior proofs) let had_notarize_or_skip = votes.notarize.is_some() || votes.skip.is_some(); votes.notarize = Some(vote.clone()); @@ -2975,9 +2459,6 @@ impl SimplexState { return VoteResult::SlotAlreadyFinalized; } - // Capture before mutable borrow - let allow_skip_after_notarize = self.opts.allow_skip_after_notarize; - let slot_votes = self.slot_votes_at(slot); let Some(votes) = slot_votes.get_validator_votes_mut(validator_idx) else { return VoteResult::Rejected(format!( @@ -2996,39 +2477,9 @@ impl SimplexState { return VoteResult::Duplicate; } - // C++ pool.cpp check_invariants() does NOT check notarize+skip conflict. - // Only finalize+skip is misbehavior in C++. 
- // - // When allow_skip_after_notarize=true (C++ compatible mode): - // Skip + Notarize is ALLOWED (matches C++ behavior) - // - // When allow_skip_after_notarize=false (Alpenglow strict mode): - // Skip + Notarize is MISBEHAVIOR (in Alpenglow, once you vote skip - // you shouldn't also vote notarize for the same slot) - if !allow_skip_after_notarize && votes.notarize.is_some() { - let existing_notar = votes.notarize.as_ref().unwrap(); - log::trace!( - "SimplexState::handle_skip_vote: ({}/{}) {} has notarize, rejecting skip", - window_idx, - slot, - validator_idx - ); - // Use stored raw bytes from existing notarize vote and new raw bytes for proof - let existing_raw = votes.notarize_raw.clone().unwrap_or_default(); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::Notarize(existing_notar.block_hash.clone()), - VoteDescriptor::Skip, - existing_raw, - raw_vote, - ConflictReason::NotarizeAfterSkip, - )); - } - // Record vote, signature, and raw bytes (for certificate creation and misbehavior proofs) let had_notarize_or_skip = votes.notarize.is_some() || votes.skip.is_some(); - let had_skip_or_skip_fallback = votes.skip.is_some() || votes.fallback_skip.is_some(); + let had_skip = votes.skip.is_some(); votes.skip = Some(vote); votes.skip_signature = Some(signature); votes.skip_raw = Some(raw_vote); @@ -3038,22 +2489,22 @@ impl SimplexState { if !had_notarize_or_skip { slot_votes.notarize_or_skip_weight += weight; } - if !had_skip_or_skip_fallback { - slot_votes.skip_or_skip_fallback_weight += weight; + if !had_skip { + slot_votes.skip_weight += weight; } if log::log_enabled!(log::Level::Trace) { let total_weight = desc.get_total_weight(); log::trace!( - "SimplexState::handle_skip_vote: ({}/{}) {} +{} -> n|s={}({:.0}%) s|fb={}({:.0}%)", + "SimplexState::handle_skip_vote: ({}/{}) {} +{} -> n|s={}({:.0}%) skip={}({:.0}%)", window_idx, slot, validator_idx, weight, slot_votes.notarize_or_skip_weight, 100.0 * 
slot_votes.notarize_or_skip_weight as f64 / total_weight as f64, - slot_votes.skip_or_skip_fallback_weight, - 100.0 * slot_votes.skip_or_skip_fallback_weight as f64 / total_weight as f64 + slot_votes.skip_weight, + 100.0 * slot_votes.skip_weight as f64 / total_weight as f64 ); } @@ -3180,45 +2631,6 @@ impl SimplexState { )); } - // Check conflicts with fallback votes - if let Some((first_fb_hash, first_fb_raw)) = votes.fallback_notarize.iter().next() { - log::trace!( - "SimplexState::handle_finalize_vote: ({}/{}) {} has notar-fb, rejecting finalize", - window_idx, - slot, - validator_idx - ); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::NotarizeFallback(first_fb_hash.clone()), - VoteDescriptor::Finalize(vote.block_hash.clone()), - first_fb_raw.clone(), - raw_vote, - ConflictReason::FinalizeAfterNotarFallback, - )); - } - - if votes.fallback_skip.is_some() { - log::trace!( - "SimplexState::handle_finalize_vote: ({}/{}) {} has skip-fb, rejecting finalize", - window_idx, - slot, - validator_idx - ); - // Use stored raw bytes from existing skip-fallback vote and new raw bytes for proof - let existing_raw = votes.fallback_skip_raw.clone().unwrap_or_default(); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::SkipFallback, - VoteDescriptor::Finalize(vote.block_hash.clone()), - existing_raw, - raw_vote, - ConflictReason::FinalizeAfterSkipFallback, - )); - } - // Record vote, signature, and raw bytes (for certificate creation and misbehavior proofs) votes.finalize = Some(vote.clone()); votes.finalize_signature = Some(signature); @@ -3248,254 +2660,39 @@ impl SimplexState { VoteResult::Applied } - /// Handle notar-fallback vote - /// - /// Reference: C++ handle_vote - /// - /// # Arguments + /// Check thresholds and trigger internal FSM events. 
/// - /// * `validator_idx` - Validator index - /// * `vote` - Notar-fallback vote content - /// * `raw_vote` - Serialized vote bytes (stored for misbehavior proofs) - fn handle_notar_fallback_vote( - &mut self, - validator_idx: ValidatorIndex, - vote: NotarizeFallbackVote, - raw_vote: RawVoteData, - ) -> VoteResult { - let slot = vote.slot; - let window_idx = slot.window_index(self.slots_per_leader_window); + /// Rule mapping: + /// - `Simplex.md` threshold transitions for `Notarize`, `Finalize`, and `Skip`. + /// - C++ `pool.cpp::check_and_publish_events`. + fn check_thresholds_and_trigger(&mut self, desc: &SessionDescription, slot_id: SlotIndex) { + // Check invariants at the start of threshold processing + self.check_invariants(desc); - if slot < self.first_non_finalized_slot { - log::trace!( - "SimplexState::handle_notar_fallback_vote: {} < first_non_finalized={}, ignoring", - slot, - self.first_non_finalized_slot - ); - return VoteResult::SlotAlreadyFinalized; - } + let threshold_66 = desc.get_threshold_66(); - // First pass: check conditions - { - let slot_votes = self.slot_votes_at(slot); - let Some(votes) = slot_votes.get_validator_votes(validator_idx) else { - return VoteResult::Rejected(format!( - "validator {} out of bounds for slot {}", - validator_idx, slot - )); - }; - - if votes.fallback_notarize.len() >= MAX_NOTAR_FALLBACK_VOTES_PER_VALIDATOR { - log::trace!( - "SimplexState::handle_notar_fallback_vote: ({}/{}) {} too many notar-fb ({})", - window_idx, - slot, - validator_idx, - votes.fallback_notarize.len() - ); - // Note: Exceeding max votes is rejected but not classic misbehavior - return VoteResult::Rejected(format!( - "validator {} exceeded max notar-fallback votes ({}) for {}", - validator_idx, MAX_NOTAR_FALLBACK_VOTES_PER_VALIDATOR, slot - )); - } - - if let Some(ref finalize) = votes.finalize { - log::trace!( - "SimplexState::handle_notar_fallback_vote: ({}/{}) {} has finalize, rejecting notar-fb", - window_idx, - slot, - validator_idx - ); 
- // Use stored raw bytes from existing finalize vote and new raw bytes for proof - let existing_raw = votes.finalize_raw.clone().unwrap_or_default(); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::Finalize(finalize.block_hash.clone()), - VoteDescriptor::NotarizeFallback(vote.block_hash.clone()), - existing_raw, - raw_vote, - ConflictReason::NotarFallbackAfterFinalize, - )); - } - } - - // Second pass: insert vote - let slot_votes = self.slot_votes_at(slot); - let Some(votes) = slot_votes.get_validator_votes_mut(validator_idx) else { - return VoteResult::Rejected(format!( - "validator {} out of bounds for slot {}", - validator_idx, slot - )); - }; - - // Check if already voted for this block hash - if votes.fallback_notarize.contains_key(&vote.block_hash) { - log::trace!( - "SimplexState::handle_notar_fallback_vote: {}, {}, duplicate hash={}, ignoring", - slot, - validator_idx, - vote.block_hash.to_hex_string() - ); - return VoteResult::Duplicate; - } - - // Insert vote with raw bytes - votes.fallback_notarize.insert(vote.block_hash.clone(), raw_vote); - log::trace!( - "SimplexState::handle_notar_fallback_vote: {}, {}, hash={}, fallback_count={}", - slot, - validator_idx, - vote.block_hash.to_hex_string(), - votes.fallback_notarize.len() - ); - VoteResult::Applied - } - - /// Handle skip-fallback vote - /// - /// Reference: C++ handle_vote - /// - /// # Arguments - /// - /// * `desc` - Session description - /// * `validator_idx` - Validator index - /// * `vote` - Skip-fallback vote content - /// * `raw_vote` - Serialized vote bytes (stored for misbehavior proofs) - fn handle_skip_fallback_vote( - &mut self, - desc: &SessionDescription, - validator_idx: ValidatorIndex, - vote: SkipFallbackVote, - raw_vote: RawVoteData, - ) -> VoteResult { - let slot = vote.slot; - let window_idx = desc.get_window_idx(slot); - - if slot < self.first_non_finalized_slot { - log::trace!( - 
"SimplexState::handle_skip_fallback_vote: {} < first_non_finalized={}, ignoring", - slot, - self.first_non_finalized_slot - ); - return VoteResult::SlotAlreadyFinalized; - } - - // First pass: check conditions - let weight = desc.get_node_weight(validator_idx); - { - let slot_votes = self.slot_votes_at(slot); - let Some(votes) = slot_votes.get_validator_votes(validator_idx) else { - return VoteResult::Rejected(format!( - "validator {} out of bounds for slot {}", - validator_idx, slot - )); - }; - - // Already voted (not an error) - if votes.fallback_skip.is_some() { - log::trace!( - "SimplexState::handle_skip_fallback_vote: {}, {} duplicate skip-fallback, ignoring", - slot, - validator_idx - ); - return VoteResult::Duplicate; - } - - if let Some(ref finalize) = votes.finalize { - log::trace!( - "SimplexState::handle_skip_fallback_vote: ({}/{}) {} has finalize, rejecting skip-fb", - window_idx, - slot, - validator_idx - ); - // Use stored raw bytes from existing finalize vote and new raw bytes for proof - let existing_raw = votes.finalize_raw.clone().unwrap_or_default(); - return VoteResult::Misbehavior(MisbehaviorProof::conflicting_types( - slot, - validator_idx, - VoteDescriptor::Finalize(finalize.block_hash.clone()), - VoteDescriptor::SkipFallback, - existing_raw, - raw_vote, - ConflictReason::SkipFallbackAfterFinalize, - )); - } - } - - // Second pass: update state - let slot_votes = self.slot_votes_at(slot); - let Some(votes) = slot_votes.get_validator_votes_mut(validator_idx) else { - return VoteResult::Rejected(format!( - "validator {} out of bounds for slot {}", - validator_idx, slot - )); - }; - - let had_skip_or_skip_fallback = votes.skip.is_some() || votes.fallback_skip.is_some(); - votes.fallback_skip = Some(vote); - votes.fallback_skip_raw = Some(raw_vote); - - // Update weights - if !had_skip_or_skip_fallback { - slot_votes.skip_or_skip_fallback_weight += weight; - } - - if log::log_enabled!(log::Level::Trace) { - let total_weight = 
desc.get_total_weight(); - log::trace!( - "SimplexState::handle_skip_fallback_vote: ({}/{}) {} +{} -> s|fb={}({:.0}%)", - window_idx, - slot, - validator_idx, - weight, - slot_votes.skip_or_skip_fallback_weight, - 100.0 * slot_votes.skip_or_skip_fallback_weight as f64 / total_weight as f64 - ); - } - - VoteResult::Applied - } - - /// Check thresholds and trigger internal FSM events - /// - /// Reference: C++ check_and_publish_events - fn check_thresholds_and_trigger(&mut self, desc: &SessionDescription, slot_id: SlotIndex) { - // Check invariants at the start of threshold processing - self.check_invariants(desc); - - let threshold_66 = desc.get_threshold_66(); - let threshold_33 = desc.get_threshold_33(); - - // Clone data we need to avoid borrow issues - let (notarize_weights, finalize_weights, flags) = { - let Some(sv) = self.slot_votes.get(&slot_id) else { - return; + // Clone data we need to avoid borrow issues + let (notarize_weights, finalize_weights, flags) = { + let Some(sv) = self.slot_votes.get(&slot_id) else { + return; }; ( sv.notarize_weight_by_block.clone(), sv.finalize_weight_by_block.clone(), ( sv.block_notarized_published, - sv.safe_to_skip_published, sv.block_finalized_published, sv.slot_skipped_published, - sv.notarize_or_skip_weight, - sv.skip_or_skip_fallback_weight, - sv.safe_to_notar_blocks.clone(), + sv.skip_weight, ), ) }; let ( block_notarized_published, - safe_to_skip_published, block_finalized_published, slot_skipped_published, - notarize_or_skip_weight, - skip_or_skip_fallback_weight, - safe_to_notar_blocks, + skip_weight, ) = flags; let window_idx = desc.get_window_idx(slot_id); @@ -3553,74 +2750,6 @@ impl SimplexState { } self.on_block_notarized(desc, slot_id, block.clone()); } - - // SafeToNotar: skip(s) + notar(b) >= 2/3 AND notar(b) >= 1/3 - // Reference: Alpenglow White Paper Section 2.5 - // "SafeToNotar(s, hash(b)): Moreover: skip(s) + notar(b) ≥ 2/3 and notar(b) ≥ 1/3" - // - // Only relevant when notar alone isn't enough for 
BlockNotarized. - // If notar(b) >= 2/3, BlockNotarized triggers via normal path - no fallback needed. - // - // SKIP when enable_fallback_protocol = false (C++ compatible mode) - if self.opts.enable_fallback_protocol { - let skip_plus_notar_b = skip_or_skip_fallback_weight + *weight; - if !safe_to_notar_blocks.contains(block) - && *weight < threshold_66 // notar alone isn't enough for normal path - && *weight >= threshold_33 - && skip_plus_notar_b >= threshold_66 - { - log::trace!( - "SimplexState::check_thresholds: ({}/{}) SAFE_TO_NOTAR {}:{} notar={}({:.0}%) skip+notar={}({:.0}%)", - window_idx, - slot_id, - slot_id, - &block.to_hex_string()[..8], - weight, - 100.0 * *weight as f64 / total_weight as f64, - skip_plus_notar_b, - 100.0 * skip_plus_notar_b as f64 / total_weight as f64 - ); - - if let Some(sv) = self.slot_votes.get_mut(&slot_id) { - sv.safe_to_notar_blocks.insert(block.clone()); - } - self.on_safe_to_notar(slot_id, block.clone()); - } - } - } - - // SafeToSkip: skip(s) + sum(notar(b)) - max(notar(b)) >= 1/3 - // Reference: Alpenglow White Paper Section 2.5 - // "SafeToSkip(s): Moreover: skip(s) + sum(notar(b)) − max_b(notar(b)) >= 1/3" - // - // notarize_or_skip_weight = skip + sum(notar) because each validator votes skip OR notar - // So the condition: skip + sum(notar) - max(notar) >= 1/3 - // Becomes: notarize_or_skip_weight - max(notar) >= threshold_33 - // Or: notarize_or_skip_weight >= threshold_33 + max(notar) - // - // Only relevant when skip alone isn't enough for SlotSkipped. - // If skip >= 2/3, SlotSkipped triggers via normal path - no fallback needed. 
- // - // SKIP when enable_fallback_protocol = false (C++ compatible mode) - if self.opts.enable_fallback_protocol && !safe_to_skip_published { - let max_notarize = notarize_weights.values().max().copied().unwrap_or(0); - if skip_or_skip_fallback_weight < threshold_66 // skip alone isn't enough for normal path - && notarize_or_skip_weight >= threshold_33 + max_notarize - { - log::trace!( - "SimplexState::check_thresholds: ({}/{}) SAFE_TO_SKIP n|s={}({:.0}%) max_notar={}", - window_idx, - slot_id, - notarize_or_skip_weight, - 100.0 * notarize_or_skip_weight as f64 / total_weight as f64, - max_notarize - ); - - if let Some(sv) = self.slot_votes.get_mut(&slot_id) { - sv.safe_to_skip_published = true; - } - self.on_safe_to_skip(slot_id); - } } // BlockFinalized: finalize(b) >= 2/3 @@ -3761,7 +2890,7 @@ impl SimplexState { } } - // SlotSkipped: skip_or_skip_fallback >= 2/3 (skip certificate) + // SlotSkipped: skip >= 2/3 (skip certificate) // This means finalization is no longer possible for this slot. // We only emit this if we haven't already finalized the slot. // C++ doesn't gate on sequential order - events are emitted as thresholds are reached. @@ -3773,17 +2902,17 @@ impl SimplexState { if !slot_skipped_published && !block_finalized_published && can_emit_skip - && skip_or_skip_fallback_weight >= threshold_66 + && skip_weight >= threshold_66 { log::trace!( - "SimplexState::check_thresholds: ({}/{}) SKIPPED s|fb={}({:.0}%)", + "SimplexState::check_thresholds: ({}/{}) SKIPPED skip={}({:.0}%)", window_idx, slot_id, - skip_or_skip_fallback_weight, - 100.0 * skip_or_skip_fallback_weight as f64 / total_weight as f64 + skip_weight, + 100.0 * skip_weight as f64 / total_weight as f64 ); - // Create and cache skip certificate, emit event (C++ mode only for broadcast) + // Create and cache skip certificate, then emit relay event. 
let skip_cert = if let Some(sv) = self.slot_votes.get_mut(&slot_id) { sv.slot_skipped_published = true; @@ -3817,14 +2946,10 @@ impl SimplexState { self.push_event_back(SimplexEvent::SlotSkipped(SlotSkippedEvent { slot: slot_id })); - // Emit SkipCertificateReached event for broadcasting (C++ mode only) - // Alpenglow paper doesn't require explicit skip certificate broadcast - if !self.opts.enable_fallback_protocol { - if let Some(cert) = skip_cert { - self.push_event_back(SimplexEvent::SkipCertificateReached( - SkipCertificateReachedEvent { slot: slot_id, certificate: cert }, - )); - } + if let Some(cert) = skip_cert { + self.push_event_back(SimplexEvent::SkipCertificateReached( + SkipCertificateReachedEvent { slot: slot_id, certificate: cert }, + )); } // Update notarized-parent chain tracking (C++ pool.cpp parity, always maintained): @@ -3868,7 +2993,7 @@ impl SimplexState { /// upon BlockNotarized(s, hash(b)) do /// - /// Reference: Alpenglow Algorithm 1 + /// Reference: protocol event handler for notarized block. /// /// ```text /// state[s] ← state[s] ∪ {BlockNotarized(hash(b))} @@ -3903,7 +3028,7 @@ impl SimplexState { self.ensure_window_exists(window_idx); - // Alpenglow: state[s] ← state[s] ∪ {BlockNotarized(hash(b))} + // Record observed notarization certificate in slot state. if let Some(window) = self.get_window_mut(window_idx) { window.slots[offset].observed_notar_certificate = Some(CandidateParentInfo { slot, hash: block_hash.clone() }); @@ -3939,141 +3064,39 @@ impl SimplexState { // This prevents stale updates and also prevents overwriting a deadline // that was freshly set by advance_leader_window_on_progress_cursor when // notarization of the last window slot caused a window transition. 
- if !self.opts.enable_fallback_protocol { - let next_slot = slot + 1; - if self.skip_slot <= slot { - if let Some(base) = self.timeout_base { - let window_start = - self.current_leader_window_idx.window_start(self.slots_per_leader_window); - - // C++ timeout_slot_ = slot+2 normally, slot+1 at window end. - // Rust skip_slot = C++ timeout_slot_ - 1. - let is_window_end = next_slot.value() % self.slots_per_leader_window == 0; - let cpp_timeout_slot = - if is_window_end { next_slot.value() } else { next_slot.value() + 1 }; - - let offset = cpp_timeout_slot - window_start.value(); - let new_deadline = base + self.target_rate_timeout * offset; - - log::debug!( - "SimplexState::on_block_notarized: advancing skip timer: \ - skip_slot {} -> {next_slot}, deadline at base+{}*target_rate", - self.skip_slot, - offset, - ); - self.skip_slot = next_slot; - self.skip_timestamp = Some(new_deadline); - } + let next_slot = slot + 1; + if self.skip_slot <= slot { + if let Some(base) = self.timeout_base { + let window_start = + self.current_leader_window_idx.window_start(self.slots_per_leader_window); + + // C++ timeout_slot_ = slot+2 normally, slot+1 at window end. + // Rust skip_slot = C++ timeout_slot_ - 1. + let is_window_end = next_slot.value() % self.slots_per_leader_window == 0; + let cpp_timeout_slot = + if is_window_end { next_slot.value() } else { next_slot.value() + 1 }; + + let offset = cpp_timeout_slot - window_start.value(); + let new_deadline = base + self.target_rate_timeout * offset; + + log::debug!( + "SimplexState::on_block_notarized: advancing skip timer: \ + skip_slot {} -> {next_slot}, deadline at base+{}*target_rate", + self.skip_slot, + offset, + ); + self.skip_slot = next_slot; + self.skip_timestamp = Some(new_deadline); } } - // Alpenglow: tryFinal(s, hash(b)) + // Attempt finalization after notarization update. 
self.try_final(desc, slot, &block_hash); } - /// upon SafeToNotar(s, hash(b)) do - /// - /// Reference: Alpenglow Algorithm 1 - /// - /// ```text - /// trySkipWindow(s) - /// if ItsOver ∉ state[s] then - /// broadcast NotarFallbackVote(s, hash(b)) - /// state[s] ← state[s] ∪ {BadWindow} - /// ``` - fn on_safe_to_notar(&mut self, slot: SlotIndex, block_hash: UInt256) { - log::trace!( - "SimplexState::on_safe_to_notar: slot={}, block_hash={}", - slot, - block_hash.to_hex_string() - ); - - if slot < self.first_non_finalized_slot { - return; - } - - let window_idx = slot.window_index(self.slots_per_leader_window); - let offset = slot.offset_in_window(self.slots_per_leader_window) as usize; - - // Alpenglow: trySkipWindow(s) - self.try_skip_window(window_idx); - - // Alpenglow: if ItsOver ∉ state[s] then - self.ensure_window_exists(window_idx); - - // Check if we should broadcast (without holding mutable borrow) - let should_broadcast = - self.get_window(window_idx).map(|w| !w.slots[offset].its_over).unwrap_or(false); - - if should_broadcast { - // Alpenglow: broadcast NotarFallbackVote(s, hash(b)) - log::trace!( - "SimplexState::on_safe_to_notar: ({}/{}) broadcasting notar-fb for {}:{}, marking BadWindow", - window_idx, - slot, - slot, - &block_hash.to_hex_string()[..8] - ); - - self.broadcast_vote(Vote::NotarizeFallback(NotarizeFallbackVote { slot, block_hash })); - - // Alpenglow: state[s] ← state[s] ∪ {BadWindow} - if let Some(window) = self.get_window_mut(window_idx) { - window.slots[offset].is_bad_window = true; - } - } - } - - /// upon SafeToSkip(s) do - /// - /// Reference: Alpenglow Algorithm 1 - /// - /// ```text - /// trySkipWindow(s) - /// if ItsOver ∉ state[s] then - /// broadcast SkipFallbackVote(s) - /// state[s] ← state[s] ∪ {BadWindow} - /// ``` - fn on_safe_to_skip(&mut self, slot: SlotIndex) { - log::trace!("SimplexState::on_safe_to_skip: slot={}", slot); - - if slot < self.first_non_finalized_slot { - return; - } - - let window_idx = 
slot.window_index(self.slots_per_leader_window); - let offset = slot.offset_in_window(self.slots_per_leader_window) as usize; - - // Alpenglow: trySkipWindow(s) - self.try_skip_window(window_idx); - - // Alpenglow: if ItsOver ∉ state[s] then - self.ensure_window_exists(window_idx); - - // Check if we should broadcast (without holding mutable borrow) - let should_broadcast = - self.get_window(window_idx).map(|w| !w.slots[offset].its_over).unwrap_or(false); - - if should_broadcast { - // Alpenglow: broadcast SkipFallbackVote(s) - log::trace!( - "SimplexState::on_safe_to_skip: ({}/{}) broadcasting skip-fb, marking BadWindow", - window_idx, - slot - ); - - self.broadcast_vote(Vote::SkipFallback(SkipFallbackVote { slot })); - - // Alpenglow: state[s] ← state[s] ∪ {BadWindow} - if let Some(window) = self.get_window_mut(window_idx) { - window.slots[offset].is_bad_window = true; - } - } - } - /// upon ParentReady(window, hash(b)) do /// - /// Reference: C++ handle ParentReady event, Alpenglow Algorithm 1 + /// Reference: C++ parent-ready event handling. /// /// # Errors /// @@ -4141,7 +3164,7 @@ impl SimplexState { self.ensure_window_exists(window_idx); - // Alpenglow: state[window.first_slot] ← state[window.first_slot] ∪ {ParentReady(hash(b))} + // Store newly available parent base for this window. if let Some(window) = self.get_window_mut(window_idx) { let is_new = window.available_bases.insert(parent.clone()); log::trace!( @@ -4179,10 +3202,10 @@ impl SimplexState { } } - // Alpenglow: checkPendingBlocks() + // Retry pending candidates after parent-base update. self.check_pending_blocks(desc); - // Alpenglow: setTimeouts(window) with adaptive backoff + // Arm timeout schedule for the window with adaptive backoff. 
if self.current_leader_window_idx < window_idx { log::trace!( "SimplexState::on_window_base_ready: ({}/{}) advancing window {}->{}", @@ -4232,7 +3255,9 @@ impl SimplexState { /// function tryNotar(Block(s, hash, hashparent)) /// - /// Reference: Alpenglow Algorithm 2 + /// Rule mapping: + /// - `Simplex.md` helper `tryNotar`. + /// - C++ `consensus.cpp::on_candidate_to_notarize` / `try_notarize`. /// /// ```text /// if Voted ∈ state[s] then return false @@ -4261,43 +3286,15 @@ impl SimplexState { self.ensure_window_exists(window_idx); - // "Already voted" semantics differ by mode: - // - Alpenglow (enable_fallback_protocol=true): any local vote blocks notar - // - C++ compatible (enable_fallback_protocol=false): skip does NOT block notar - // - // Reference (C++): consensus.cpp on_candidate_to_notarize checks only voted_notar, + // C++ parity: consensus.cpp on_candidate_to_notarize checks only voted_notar, // allowing Notarize after Skip. if let Some(window) = self.get_window(window_idx) { let slot_state = &window.slots[offset]; - let already_voted = if self.opts.enable_fallback_protocol { - // Alpenglow: Voted ∈ state[s] - // - // Invariant (debug only): if any "local decision" flag is set, - // then `is_voted` must also be set. - debug_assert!( - !slot_state.voted_skip || slot_state.is_voted, - "SimplexState invariant violated: voted_skip implies is_voted (slot={})", - slot.value() - ); - debug_assert!( - slot_state.voted_notar.is_none() || slot_state.is_voted, - "SimplexState invariant violated: voted_notar implies is_voted (slot={})", - slot.value() - ); - debug_assert!( - !slot_state.its_over || slot_state.is_voted, - "SimplexState invariant violated: its_over implies is_voted (slot={})", - slot.value() - ); - - slot_state.is_voted - } else { - // C++ parity: only voted_notar gates notarization. 
C++ try_notarize() - // does NOT check voted_final/its_over — a slot that was finalized on a - // previous run can still be re-notarized after restart (the later - // auto-finalize simply skips re-broadcasting). - slot_state.voted_notar.is_some() - }; + // C++ parity: only voted_notar gates notarization. C++ try_notarize() + // does NOT check voted_final/is_completed — a slot that was finalized on a + // previous run can still be re-notarized after restart (the later + // auto-finalize simply skips re-broadcasting). + let already_voted = slot_state.voted_notar.is_some(); if already_voted { log::trace!("SimplexState::try_notar: slot {} already voted", slot); @@ -4332,22 +3329,22 @@ impl SimplexState { &block_hash.to_hex_string()[..8] ); - // Alpenglow: broadcast NotarVote(s, hash) + // Broadcast notarize vote. self.broadcast_vote(Vote::Notarize(NotarizeVote { slot, block_hash: block_hash.clone(), })); - // Alpenglow: state[s] ← state[s] ∪ {Voted, VotedNotar(hash)} + // Mark local notar vote state. if let Some(window) = self.get_window_mut(window_idx) { window.slots[offset].is_voted = true; window.slots[offset].voted_notar = Some(CandidateParentInfo { slot, hash: block_hash.clone() }); - // Alpenglow: pendingBlocks[s] ← ⊥ + // Clear pending candidate after successful notar vote. window.slots[offset].pending_block = None; } - // Alpenglow: tryFinal(s, hash) + // Try to finalize immediately if conditions are met. self.try_final(desc, slot, block_hash); return true; @@ -4358,11 +3355,13 @@ impl SimplexState { /// function tryFinal(s, hash(b)) /// - /// Reference: Alpenglow Algorithm 2 + /// Rule mapping: + /// - `Simplex.md` helper `tryFinal`. + /// - C++ `consensus.cpp` finalize gating on local slot state. 
/// /// ```text /// if BlockNotarized(hash(b)) ∈ state[s] and VotedNotar(hash(b)) ∈ state[s] - /// and BadWindow ∉ state[s] then + /// and TimeoutSkipped ∉ state[s] then /// broadcast FinalVote(s) /// state[s] ← state[s] ∪ {ItsOver} /// ``` @@ -4375,45 +3374,36 @@ impl SimplexState { let should_vote_final = if let Some(window) = self.get_window(window_idx) { let slot_state = &window.slots[offset]; - // Alpenglow: BlockNotarized(hash(b)) ∈ state[s] + // Slot has observed notarization for this hash. let has_notar_cert = slot_state .observed_notar_certificate .as_ref() .map(|c| c.hash == *block_hash) .unwrap_or(false); - // Alpenglow: VotedNotar(hash(b)) ∈ state[s] + // Local node voted notarize for this hash. let voted_notar = slot_state.voted_notar.as_ref().map(|c| c.hash == *block_hash).unwrap_or(false); - // Alpenglow: BadWindow ∉ state[s] - // C++ try_vote_final does NOT check bad_window — it only checks - // voted_skip, voted_final, and voted_notar==notar_cert. - let not_bad_window = if self.opts.enable_fallback_protocol { - !slot_state.is_bad_window - } else { - true // C++ doesn't check bad_window in try_vote_final - }; - let not_its_over = !slot_state.its_over; + let not_is_completed = !slot_state.is_completed; // C++: do not auto-finalize if we already voted skip for this slot. // Reference: C++ consensus.cpp: `!voted_skip && !voted_final && voted_notar==id` - // Both modes now match C++ strictly: once voted_skip, never finalize. + // C++ rule: once voted_skip, never auto-finalize this slot. let not_voted_skip = !slot_state.voted_skip; - let result = - has_notar_cert && voted_notar && not_bad_window && not_its_over && not_voted_skip; + let result = has_notar_cert && voted_notar && not_is_completed && not_voted_skip; - // Log when finalize is blocked specifically by voted_skip (Alpenglow mode only) + // Log when finalize is blocked by local skip vote. 
if has_notar_cert && voted_notar && !not_voted_skip { log::warn!( "SimplexState::try_final: ({}/{}) FINALIZE BLOCKED by voted_skip! \ - cert={} notar={} bad_window={} its_over={} voted_skip={}", + cert={} notar={} bad_window={} is_completed={} voted_skip={}", window_idx, slot, has_notar_cert, voted_notar, - slot_state.is_bad_window, - slot_state.its_over, + slot_state.is_timeout_skipped, + slot_state.is_completed, slot_state.voted_skip, ); } @@ -4425,13 +3415,13 @@ impl SimplexState { if slot_state.is_voted { flags.push("V"); } - if slot_state.is_bad_window { + if slot_state.is_timeout_skipped { flags.push("Bad"); } if slot_state.voted_skip { flags.push("Skip"); } - if slot_state.its_over { + if slot_state.is_completed { flags.push("Over"); } if slot_state.pending_block.is_some() { @@ -4514,16 +3504,16 @@ impl SimplexState { &block_hash.to_hex_string()[..8] ); - // Alpenglow: broadcast FinalVote(s) + // Broadcast finalize vote. self.broadcast_vote(Vote::Finalize(FinalizeVote { slot, block_hash: block_hash.clone(), })); - // Alpenglow: state[s] ← state[s] ∪ {ItsOver} + // Mark slot locally completed. // C++: slot->state->voted_final = true if let Some(window) = self.get_window_mut(window_idx) { - window.slots[offset].its_over = true; + window.slots[offset].is_completed = true; window.slots[offset].voted_final = true; } } @@ -4531,13 +3521,15 @@ impl SimplexState { /// function trySkipWindow(s) /// - /// Reference: Alpenglow Algorithm 2 + /// Rule mapping: + /// - `Simplex.md` helper `trySkipWindow`. + /// - C++ `consensus.cpp::alarm()` skip broadcast loop. 
/// /// ```text /// for k ∈ windowSlots(s) do /// if Voted ∉ state[k] then /// broadcast SkipVote(k) - /// state[k] ← state[k] ∪ {Voted, BadWindow} + /// state[k] ← state[k] ∪ {Voted, TimeoutSkipped} /// pendingBlocks[k] ← ⊥ /// ``` fn try_skip_window(&mut self, window_idx: WindowIndex) { @@ -4545,28 +3537,17 @@ impl SimplexState { let start_slot = window_idx * self.slots_per_leader_window; let num_slots = self.slots_per_leader_window as usize; - let enable_fallback = self.opts.enable_fallback_protocol; - - // Collect slots to skip - // - Alpenglow (enable_fallback_protocol=true): Skip only unvoted slots (Voted ∉ state[k]) - // - C++ compatible (enable_fallback_protocol=false): Skip all non-finalized slots - // + // Collect slots to skip. // C++ alarm() checks voted_final, not voted_notar: // Reference: C++ consensus.cpp alarm(): if (!affected_slot->voted_final) let mut slots_to_skip = Vec::new(); if let Some(window) = self.get_window(window_idx) { - // Alpenglow: for k ∈ windowSlots(s) do for i in 0..num_slots { - let should_skip = if enable_fallback { - // Alpenglow: if Voted ∉ state[k] then - !window.slots[i].is_voted - } else { - // C++: if !voted_final — once this node votes final, it cannot - // vote skip. This prevents split-brain deadlocks where some - // nodes vote skip and others vote final. - // Reference: C++ consensus.cpp alarm(): if (!affected_slot->voted_final) - !window.slots[i].voted_final - }; + // C++: if !voted_final — once this node votes final, it cannot + // vote skip. This prevents split-brain deadlocks where some + // nodes vote skip and others vote final. + // Reference: C++ consensus.cpp alarm(): if (!affected_slot->voted_final) + let should_skip = !window.slots[i].voted_final; if should_skip { slots_to_skip.push(start_slot + i as u32); } @@ -4587,39 +3568,32 @@ impl SimplexState { ); } - // Skip each unvoted slot - // Alpenglow: broadcast SkipVote(k) + // Skip each eligible slot. 
for slot in slots_to_skip { self.broadcast_vote(Vote::Skip(SkipVote { slot })); - // Alpenglow: state[k] ← state[k] ∪ {Voted, BadWindow} - // Alpenglow: pendingBlocks[k] ← ⊥ let offset = slot.offset_in_window(self.slots_per_leader_window) as usize; if let Some(window) = self.get_window_mut(window_idx) { window.slots[offset].is_voted = true; window.slots[offset].voted_skip = true; - window.slots[offset].is_bad_window = true; + window.slots[offset].is_timeout_skipped = true; // C++ alarm() only sets voted_skip — it does NOT clear pending_block. // The async try_notarize() coroutine can still complete after a skip // vote, producing both Skip and Notar votes for the same slot. - // Only clear pending_block in Alpenglow mode (strict Voted gate). - if enable_fallback { - window.slots[offset].pending_block = None; - } } } } /// function checkPendingBlocks() /// - /// Reference: Alpenglow Algorithm 2 + /// Reference: protocol helper logic. /// /// ```text /// for s : pendingBlocks[s] ≠ ⊥ do // iterate with increasing s /// tryNotar(pendingBlocks[s]) /// ``` fn check_pending_blocks(&mut self, desc: &SessionDescription) { - // Alpenglow: for s : pendingBlocks[s] ≠ ⊥ do (iterate with increasing s) + // Iterate pending slots in increasing order and retry notarization. // // Take ownership of pending slots for processing. Slots that still need retry // are pushed directly to `self.pending_slots` (which is now empty). @@ -4648,7 +3622,7 @@ impl SimplexState { }); if let Some((candidate_hash, parent)) = pending_info { - // Alpenglow: tryNotar(pendingBlocks[s]) + // Retry notarization for pending candidate. 
log::trace!( "SimplexState::check_pending_blocks: ({}/{}) trying pending {}", window_idx, @@ -4731,7 +3705,6 @@ impl SimplexState { } /// Get current leader window index - #[allow(dead_code)] pub fn get_current_leader_window_idx(&self) -> WindowIndex { self.current_leader_window_idx } @@ -4923,50 +3896,13 @@ impl SimplexState { if slot >= window_start && slot < window_end { let offset = (slot - window_start) as usize; - return window.slots[offset].its_over; + return window.slots[offset].is_completed; } } false } - /// Check if this node should generate a block for the current slot - #[allow(dead_code)] - pub fn should_generate_block( - &self, - desc: &SessionDescription, - ) -> Option<(SlotIndex, Option)> { - let slot = self.first_non_finalized_slot; - let window_idx = desc.get_window_idx(slot); - let offset = desc.get_slot_offset_in_window(slot) as usize; - - // Check if we're the leader - if !desc.is_self_leader(slot) { - return None; - } - - // Check if already voted - if let Some(window) = self.get_window(window_idx) { - if window.slots[offset].is_voted { - return None; - } - - // Get parent from available bases or previous slot - let parent = if desc.is_first_in_window(slot) { - // For first slot, pick any available base - window.available_bases.iter().next().cloned().flatten() - } else { - // For other slots, use voted_notar from previous slot - let prev_offset = offset - 1; - window.slots[prev_offset].voted_notar.clone() - }; - - return Some((slot, parent)); - } - - None - } - /// Get available parent for block generation at a given slot /// /// This is derived from per-slot `Slot.available_base` (C++ pool.cpp `SlotState::available_base`) @@ -4976,10 +3912,6 @@ impl SimplexState { /// - `available_base == Some(None)` → genesis base (parent is None) /// - `available_base == Some(Some(id))` → use `id` as parent /// - /// Parent validity depends on `require_finalized_parent` option: - /// - `require_finalized_parent=false` (C++ mode, default): parent can be 
notarized OR finalized - /// - `require_finalized_parent=true` (strict mode): parent must be finalized - /// /// Reference: C++ pool.cpp `SlotState::available_base`, block-producer.cpp `get_parent()`. pub fn get_available_parent( &self, @@ -5008,9 +3940,6 @@ impl SimplexState { /// - `available_base == Some(None)` → genesis base (parent is available) /// - `available_base == Some(Some(id))` → parent is available if it is valid /// - /// Parent validity depends on the `require_finalized_parent` option: - /// - `require_finalized_parent=false` (C++ mode, default): parent can be notarized OR finalized - /// - `require_finalized_parent=true` (strict mode): parent must be finalized pub fn has_available_parent(&self, desc: &SessionDescription, slot: SlotIndex) -> bool { let base = self.get_slot_available_base(desc, slot); match base { @@ -5024,81 +3953,17 @@ impl SimplexState { /// /// A parent is valid if: /// - Slot is finalized (< first_non_finalized_slot), OR - /// - When `require_finalized_parent = false`: slot has observed notarization certificate + /// - Slot has observed notarization certificate pub fn is_parent_valid(&self, parent_slot: SlotIndex) -> bool { // Finalized slots are always valid parents if parent_slot < self.first_non_finalized_slot { return true; } - // Strict mode: require finalized parent - if self.opts.require_finalized_parent { - return false; - } - - // C++ mode: notarized block is valid parent - // Check if the slot has observed notarization certificate + // Notarized block is a valid parent. self.has_notarized_block(parent_slot) } - /// Get indices of validators who voted finalize for a block in a slot - /// - /// Returns indices of validators who have finalize votes matching the block. - /// Used by SessionProcessor to collect signatures for on_block_committed. 
- #[allow(dead_code)] // Replaced by certificate.signatures - pub fn get_finalize_voters( - &self, - slot: SlotIndex, - block_hash: &UInt256, - ) -> Vec { - if let Some(slot_votes) = self.slot_votes.get(&slot) { - slot_votes - .votes - .iter() - .enumerate() - .filter_map(|(idx, v)| { - if let Some(ref finalize) = v.finalize { - if finalize.block_hash == *block_hash { - return Some(ValidatorIndex::from(idx)); - } - } - None - }) - .collect() - } else { - Vec::new() - } - } - - /// Get indices of validators who voted notarize for a block in a slot - /// - /// Returns indices of validators who have notarize votes matching the block. - /// Note: Prefer `get_notarize_certificate` which includes actual signatures. - #[allow(dead_code)] - pub fn get_notarize_voters( - &self, - slot: SlotIndex, - block_hash: &UInt256, - ) -> Vec { - if let Some(slot_votes) = self.slot_votes.get(&slot) { - slot_votes - .votes - .iter() - .enumerate() - .filter_map(|(idx, v)| { - if let Some(ref notarize) = v.notarize { - if notarize.block_hash == *block_hash { - return Some(ValidatorIndex::from(idx)); - } - } - None - }) - .collect() - } else { - Vec::new() - } - } - /// Get cached notarization certificate for a block in a slot /// /// Returns the cached certificate if notarization threshold (2/3) was reached, @@ -5114,7 +3979,11 @@ impl SimplexState { self.slot_votes.get(&slot).and_then(|sv| sv.get_notarize_certificate(block_hash)) } - /// Set notarization certificate from external source (query response) + /// Set notarization certificate from external source (query response). + /// + /// Rule mapping: + /// - `Simplex.md` certificate import path for notarized state. + /// - C++ `pool.cpp::handle_foreign_certificate` / `handle_saved_certificate`. /// /// Updates vote accounting with votes from the certificate so FSM recognizes /// the block as notarized. 
Called when we receive a candidate + notar cert @@ -5250,7 +4119,11 @@ impl SimplexState { Ok(true) } - /// Set finalization certificate from external source + /// Set finalization certificate from external source. + /// + /// Rule mapping: + /// - `Simplex.md` finalize-certificate state transition. + /// - C++ `pool.cpp::handle_foreign_certificate`. /// /// Updates FSM state as if we had received enough finalize votes to create /// the certificate. This is used when receiving a `consensus.simplex.certificate` @@ -5436,7 +4309,11 @@ impl SimplexState { Ok(true) } - /// Set skip certificate from external source (C++ parity) + /// Set skip certificate from external source. + /// + /// Rule mapping: + /// - `Simplex.md` skip-certificate transition. + /// - C++ `pool.cpp::handle_foreign_certificate`. /// /// Updates FSM state as if we had received enough skip votes to create /// the certificate. This is used when receiving a `consensus.simplex.certificate` @@ -5519,7 +4396,7 @@ impl SimplexState { if !had_notarize_or_skip { sv.notarize_or_skip_weight += weight; } - sv.skip_or_skip_fallback_weight += weight; + sv.skip_weight += weight; } } @@ -5563,13 +4440,10 @@ impl SimplexState { // C++ parity (pool.cpp handle_saved_certificate): re-gossip every newly // accepted certificate regardless of origin. // - // SkipCertificateReached is only relevant in C++-compatible mode - // (Alpenglow paper does not require explicit skip certificate broadcast). - if !self.opts.enable_fallback_protocol { - self.push_event_back(SimplexEvent::SkipCertificateReached( - SkipCertificateReachedEvent { slot, certificate: certificate.clone() }, - )); - } + self.push_event_back(SimplexEvent::SkipCertificateReached(SkipCertificateReachedEvent { + slot, + certificate: certificate.clone(), + })); Ok(true) } @@ -5593,7 +4467,7 @@ impl SimplexState { /// Used for testing vote accounting. 
#[cfg(test)] pub fn get_skip_weight(&self, slot: SlotIndex) -> ValidatorWeight { - self.slot_votes.get(&slot).map_or(0, |sv| sv.skip_or_skip_fallback_weight) + self.slot_votes.get(&slot).map_or(0, |sv| sv.skip_weight) } /// Check if a slot has a finalize certificate @@ -5633,23 +4507,6 @@ impl SimplexState { .unwrap_or(0) } - /// Get the candidate stored in slot state (if any) - /// - /// Returns the pending block or the voted_notar block info. - /// Used for retrieving block data during finalization. - #[allow(dead_code)] - pub fn get_slot_candidate(&self, slot: SlotIndex) -> Option<&Candidate> { - let window_idx = slot.window_index(self.slots_per_leader_window); - let offset = slot.offset_in_window(self.slots_per_leader_window) as usize; - - if let Some(window) = self.get_window(window_idx) { - if offset < window.slots.len() { - return window.slots[offset].pending_block.as_ref(); - } - } - None - } - /* ======================================================================== Notarized-Parent Chain Base Propagation (C++ pool.cpp parity) @@ -5809,9 +4666,11 @@ impl SimplexState { self.check_pending_blocks(desc); } - /// Advance progress cursor through all progressed slots + /// Advance progress cursor through all progressed slots. /// - /// Reference: C++ pool.cpp maybe_publish_new_leader_windows(): + /// Rule mapping: + /// - `Simplex.md` progress cursor over notarized/skipped slots. + /// - C++ `pool.cpp::maybe_publish_new_leader_windows`: /// `while (slot(now_).notarized || slot(now_).skipped) ++now_` /// /// This helper is always called to keep `first_non_progressed_slot` up-to-date with consensus progress. 
@@ -5999,13 +4858,13 @@ impl SimplexState { if slot.is_voted { flags.push("V"); } - if slot.is_bad_window { + if slot.is_timeout_skipped { flags.push("Bad"); } if slot.voted_skip { flags.push("Skip"); } - if slot.its_over { + if slot.is_completed { flags.push("Over"); } if slot.pending_block.is_some() { @@ -6033,21 +4892,15 @@ impl SimplexState { .unwrap_or_else(|| ("-".to_string(), "-".to_string(), "-".to_string())); // Get current slot vote weights - let (notar_weight, skip_weight, final_weight, notar_or_skip, skip_or_fb) = self + let (notar_weight, skip_weight, final_weight, notar_or_skip) = self .slot_votes .get(¤t_slot) .map(|sv| { let max_notar = sv.notarize_weight_by_block.values().max().copied().unwrap_or(0); let max_final = sv.finalize_weight_by_block.values().max().copied().unwrap_or(0); - ( - max_notar, - sv.skip_or_skip_fallback_weight, - max_final, - sv.notarize_or_skip_weight, - sv.skip_or_skip_fallback_weight, - ) + (max_notar, sv.skip_weight, max_final, sv.notarize_or_skip_weight) }) - .unwrap_or((0, 0, 0, 0, 0)); + .unwrap_or((0, 0, 0, 0)); // Get available bases for current window (formatted list) let bases_list: String = self @@ -6081,7 +4934,7 @@ impl SimplexState { "SimplexState: {current_window_idx}/{current_slot} \ first_non_finalized={} first_non_progressed={} flags=[{slot_flags}] \ notar={}({:.0}%) skip={}({:.0}%) final={}({:.0}%) n|s={}({:.0}%) \ - s|fb={}({:.0}%) th66/33={}({:.0}%)/{}({:.0}%) bases=[{bases_list}] \ + th66/33={}({:.0}%)/{}({:.0}%) bases=[{bases_list}] \ voted={voted_notar_short} cert={notar_cert_short} evts=[{events_list}]", self.first_non_finalized_slot, self.first_non_progressed_slot, @@ -6093,8 +4946,6 @@ impl SimplexState { pct(final_weight), notar_or_skip, pct(notar_or_skip), - skip_or_fb, - pct(skip_or_fb), threshold_66, pct(threshold_66), threshold_33, @@ -6129,12 +4980,11 @@ impl SimplexState { result.push_str(&format!( " - {current_slot} weights: notar={notar_weight}({:.1}%), \ skip={skip_weight}({:.1}%), 
final={final_weight}({:.1}%), \ - n|s={notar_or_skip}({:.1}%), s|fb={skip_or_fb}({:.1}%)\n", + n|s={notar_or_skip}({:.1}%)\n", pct(notar_weight), pct(skip_weight), pct(final_weight), - pct(notar_or_skip), - pct(skip_or_fb) + pct(notar_or_skip) )); // State info @@ -6182,13 +5032,13 @@ impl SimplexState { if slot.is_voted { flags.push("Voted"); } - if slot.is_bad_window { - flags.push("BadWindow"); + if slot.is_timeout_skipped { + flags.push("TimeoutSkipped"); } if slot.voted_skip { flags.push("VotedSkip"); } - if slot.its_over { + if slot.is_completed { flags.push("ItsOver"); } if slot.pending_block.is_some() { @@ -6226,9 +5076,6 @@ impl SimplexState { if sv.block_notarized_published { pub_flags.push("Notarized"); } - if sv.safe_to_skip_published { - pub_flags.push("SafeToSkip"); - } if sv.block_finalized_published { pub_flags.push("Finalized"); } @@ -6239,12 +5086,12 @@ impl SimplexState { if pub_flags.is_empty() { "none".to_string() } else { pub_flags.join("|") }; result.push_str(&format!( - " - s{}: n|s={}({:.1}%), s|fb={}({:.1}%), published=[{}]\n", + " - s{}: n|s={}({:.1}%), skip={}({:.1}%), published=[{}]\n", slot_id, sv.notarize_or_skip_weight, pct(sv.notarize_or_skip_weight), - sv.skip_or_skip_fallback_weight, - pct(sv.skip_or_skip_fallback_weight), + sv.skip_weight, + pct(sv.skip_weight), pub_flags_str )); @@ -6309,7 +5156,7 @@ impl SimplexState { if let Some(sv) = self.slot_votes.get(&slot_idx) { for j in 0..num_validators { let vv = &sv.votes[j]; - let has_skip = vv.skip.is_some() || vv.fallback_skip.is_some(); + let has_skip = vv.skip.is_some(); if vv.finalize.is_some() { sb.push('F'); } else if vv.notarize.is_some() && has_skip { @@ -6343,6 +5190,115 @@ impl SimplexState { sb } + + /// Collect structured diagnostics for non-finalized slots, grouped by leader window. + /// + /// Each window reports its leader identity and each non-finalized slot within it + /// reports its wait phase, vote weight percentages, and flags. 
+ pub fn collect_window_diagnostics(&self, desc: &SessionDescription) -> Vec { + let total_weight = desc.get_total_weight(); + let pct = |w: u64| -> f64 { + if total_weight == 0 { + 0.0 + } else { + 100.0 * w as f64 / total_weight as f64 + } + }; + let first_nf = self.first_non_finalized_slot; + + let mut windows = Vec::new(); + for window in &self.leader_windows { + let w_start = window.start_slot; + let w_end = w_start + window.slots.len() as u32; + + let mut slot_diags = Vec::new(); + for (i, slot) in window.slots.iter().enumerate() { + let slot_idx = w_start + i as u32; + if slot_idx < first_nf { + continue; + } + + let sv = self.slot_votes.get(&slot_idx); + + let max_notar_weight = sv + .map(|v| v.notarize_weight_by_block.values().max().copied().unwrap_or(0)) + .unwrap_or(0); + let max_final_weight = sv + .map(|v| v.finalize_weight_by_block.values().max().copied().unwrap_or(0)) + .unwrap_or(0); + let skip_weight = sv.map(|v| v.skip_weight).unwrap_or(0); + let notar_or_skip_weight = sv.map(|v| v.notarize_or_skip_weight).unwrap_or(0); + let has_notar_cert = sv.map(|v| v.notarize_certificate.is_some()).unwrap_or(false); + let has_final_cert = sv.map(|v| v.finalize_certificate.is_some()).unwrap_or(false); + let has_skip_cert = sv.map(|v| v.skip_certificate.is_some()).unwrap_or(false); + + let (phase, reason) = if slot.is_completed && has_final_cert { + (SlotWaitPhase::Finalized, "finalized".to_string()) + } else if slot.is_timeout_skipped { + (SlotWaitPhase::TimeoutSkipped, "bad_window_active".to_string()) + } else if slot.skipped { + (SlotWaitPhase::Skipped, "skip_cert_reached".to_string()) + } else if slot.observed_notar_certificate.is_some() { + if has_final_cert { + (SlotWaitPhase::Finalized, "final_cert_present".to_string()) + } else { + ( + SlotWaitPhase::NotarizedWaitingForFinalization, + format!("final_weight_below_th66 ({:.0}%)", pct(max_final_weight)), + ) + } + } else if slot.pending_block.is_some() { + if slot.available_base.is_none() { + 
(SlotWaitPhase::WaitingForParentBase, "no_available_base".to_string()) + } else { + ( + SlotWaitPhase::WaitingForNotarization, + format!("notar_weight_below_th66 ({:.0}%)", pct(max_notar_weight)), + ) + } + } else if slot.available_base.is_none() { + (SlotWaitPhase::WaitingForParentBase, "no_available_base".to_string()) + } else { + (SlotWaitPhase::WaitingForCandidate, "no_pending_block".to_string()) + }; + + slot_diags.push(SlotDiagnostic { + slot: slot_idx, + window_idx: window.window_idx, + phase, + reason, + has_pending_block: slot.pending_block.is_some(), + available_parent: slot.available_base.is_some(), + voted_notar: slot.voted_notar.is_some(), + voted_skip: slot.voted_skip, + voted_final: slot.voted_final, + has_notar_cert, + has_final_cert, + has_skip_cert, + notar_weight_pct: pct(max_notar_weight), + final_weight_pct: pct(max_final_weight), + skip_weight_pct: pct(skip_weight), + notar_or_skip_weight_pct: pct(notar_or_skip_weight), + is_timeout_skipped: slot.is_timeout_skipped, + }); + } + + if slot_diags.is_empty() { + continue; + } + + let leader_idx = desc.get_leader(w_start); + windows.push(WindowDiagnostic { + window_idx: window.window_idx, + slot_begin: w_start, + slot_end: SlotIndex(w_end.0.saturating_sub(1)), + leader_idx, + had_timeouts: window.had_timeouts, + slots: slot_diags, + }); + } + windows + } } /* diff --git a/src/node/simplex/src/startup_recovery.rs b/src/node/simplex/src/startup_recovery.rs index cd2e424..2dcc672 100644 --- a/src/node/simplex/src/startup_recovery.rs +++ b/src/node/simplex/src/startup_recovery.rs @@ -35,7 +35,7 @@ //! - [`SessionStartupRecoveryListener`]: Object-safe trait for recovery operations //! - [`SessionStartupRecoveryProcessor`]: Coordinator that loads bootstrap and drives recovery //! -//! See [`RESTART-RECOMMIT-PLAN.md`] for detailed design documentation. +//! See the startup recovery section in crate-level docs for design details. 
use crate::{ block::{RawCandidateId, SlotIndex, ValidatorIndex, WindowIndex}, @@ -183,18 +183,18 @@ pub(crate) trait SessionStartupRecoveryListener { /// * `block_hash` - The hash of the finalized block fn recovery_seed_finalized_block(&mut self, slot: SlotIndex, block_hash: CandidateHash); - /// Seed ALL finalized blocks into received_candidates for parent resolution. + /// Seed ALL finalized blocks into `received_candidates` for restart-side parent/tip lookups. /// /// After restart, `received_candidates` is empty, but collation/validation require /// parent `BlockIdExt` to be resolvable. This seeds all finalized blocks so their - /// Block idExt can be looked up during parent resolution. + /// `BlockIdExt` can be looked up after restart without waiting for new bodies. /// /// # Arguments /// /// * `finalized_blocks` - All finalized blocks from bootstrap fn recovery_seed_received_candidates(&mut self, finalized_blocks: &[FinalizedBlockRecord]); - /// Seed a candidate into `received_candidates` for parent resolution. + /// Seed a candidate into `received_candidates` for restart-side parent/tip lookups. /// /// After restart, collation uses the FSM progress cursor (`first_non_progressed_slot`) /// and can require a notarized (but not finalized) parent candidate's `BlockIdExt`. @@ -551,13 +551,13 @@ impl SessionStartupRecoveryProcessor { ); self.restore_notar_cert_cache(listener, &receiver_boot.notar_certs)?; - // Step 9b: Seed notarized candidates into received_candidates for parent resolution + // Step 9b: Seed notarized candidates into `received_candidates` for post-restart lookups. // // This ensures post-restart collation can resolve `BlockIdExt` for notarized parents // without relying on `requestCandidate` (which may be impossible in single-node tests). 
log::debug!( target: LOG_TARGET, - "Session {}: step 9b/12 - seeding {} candidate infos into received_candidates for parent resolution", + "Session {}: step 9b/12 - seeding {} candidate infos into received_candidates for post-restart parent/tip lookups", self.session_id.to_hex_string(), self.candidate_info_map.len() ); @@ -620,7 +620,7 @@ impl SessionStartupRecoveryProcessor { Ok(()) } - /// Seed notarized candidates into `received_candidates` for parent resolution. + /// Seed notarized candidates into `received_candidates` for post-restart parent/tip lookups. /// /// Uses `candidate_info_map` to reconstruct minimal metadata (BlockIdExt + parent id + hash data bytes) /// for candidates that have a stored NotarCert record. @@ -938,13 +938,13 @@ impl SessionStartupRecoveryProcessor { /// `BlockFinalized(last_known_finalized_block, true)` after loading. /// /// This notification seeds ALL finalized blocks into `received_candidates` - /// for parent resolution, then notifies about the last finalized block. + /// for restart-side parent/tip lookups, then notifies about the last finalized block. fn notify_last_finalized_block( &self, listener: &mut dyn SessionStartupRecoveryListener, finalized_blocks: &[FinalizedBlockRecord], ) { - // First, seed ALL finalized blocks into received_candidates for parent resolution + // First, seed ALL finalized blocks into `received_candidates` for restart-side lookups. 
listener.recovery_seed_received_candidates(finalized_blocks); // Find the last block with is_final=true diff --git a/src/node/simplex/src/tests/test_crypto.rs b/src/node/simplex/src/tests/test_crypto.rs index 411bdbf..e2de328 100644 --- a/src/node/simplex/src/tests/test_crypto.rs +++ b/src/node/simplex/src/tests/test_crypto.rs @@ -240,9 +240,7 @@ fn test_compute_candidate_id_hash_with_block_id() { */ use crate::{ - simplex_state::{ - FinalizeVote, NotarizeFallbackVote, NotarizeVote, SkipFallbackVote, SkipVote, Vote, - }, + simplex_state::{FinalizeVote, NotarizeVote, SkipVote, Vote}, utils::{ extract_vote, sign_vote, tl_unsigned_to_vote, verify_vote_signature, vote_to_tl_unsigned, }, @@ -306,35 +304,6 @@ fn test_skip_vote_roundtrip() { assert_eq!(vote, vote_back); } -/// Test NotarizeFallbackVote conversion returns error -/// -/// Note: C++ protocol doesn't support fallback votes in TL format. -#[test] -fn test_notarize_fallback_vote_roundtrip() { - let block_hash = UInt256::from([7u8; 32]); - - let vote = Vote::NotarizeFallback(NotarizeFallbackVote { - slot: SlotIndex::new(45), - block_hash: block_hash.clone(), - }); - - // Fallback votes are not supported in C++ TL format - should return error - let result = vote_to_tl_unsigned(&vote); - assert!(result.is_err(), "NotarizeFallback should not be supported in TL"); -} - -/// Test SkipFallbackVote conversion returns error -/// -/// Note: C++ protocol doesn't support fallback votes in TL format. 
-#[test] -fn test_skip_fallback_vote_roundtrip() { - let vote = Vote::SkipFallback(SkipFallbackVote { slot: SlotIndex::new(60) }); - - // Fallback votes are not supported in C++ TL format - should return error - let result = vote_to_tl_unsigned(&vote); - assert!(result.is_err(), "SkipFallback should not be supported in TL"); -} - /// Test vote signing and verification /// /// # C++ Reference (pool.cpp) diff --git a/src/node/simplex/src/tests/test_misbehavior.rs b/src/node/simplex/src/tests/test_misbehavior.rs index 3e0db88..04203ce 100644 --- a/src/node/simplex/src/tests/test_misbehavior.rs +++ b/src/node/simplex/src/tests/test_misbehavior.rs @@ -111,15 +111,6 @@ fn test_conflict_reason_descriptions() { "notarize and finalize for different blocks" ); assert_eq!(ConflictReason::FinalizeAfterSkip.description(), "finalize after skip"); - assert_eq!(ConflictReason::NotarizeAfterSkip.description(), "notarize after skip"); - assert_eq!( - ConflictReason::FinalizeAfterNotarFallback.description(), - "finalize after notar-fallback" - ); - assert_eq!( - ConflictReason::FinalizeAfterSkipFallback.description(), - "finalize after skip-fallback" - ); } #[test] @@ -377,15 +368,15 @@ fn test_misbehavior_proof_accessors() { assert_eq!(proof.hash1(), Some(&hash1)); assert_eq!(proof.hash2(), Some(&hash2)); - let notarize_hash = UInt256::rand(); + let finalize_hash = UInt256::rand(); let proof2 = MisbehaviorProof::conflicting_types( SlotIndex::new(88), ValidatorIndex(22), VoteDescriptor::Skip, - VoteDescriptor::Notarize(notarize_hash.clone()), + VoteDescriptor::Finalize(finalize_hash.clone()), RawVoteData::default(), RawVoteData::default(), - ConflictReason::NotarizeAfterSkip, + ConflictReason::FinalizeAfterSkip, ); assert_eq!(proof2.slot(), SlotIndex::new(88)); assert_eq!(proof2.validator_idx(), ValidatorIndex(22)); @@ -394,5 +385,5 @@ fn test_misbehavior_proof_accessors() { assert_eq!(proof2.hash2(), None); // But we can access the vote descriptors assert_eq!(proof2.existing_vote(), 
Some(&VoteDescriptor::Skip)); - assert_eq!(proof2.new_vote(), Some(&VoteDescriptor::Notarize(notarize_hash))); + assert_eq!(proof2.new_vote(), Some(&VoteDescriptor::Finalize(finalize_hash))); } diff --git a/src/node/simplex/src/tests/test_receiver.rs b/src/node/simplex/src/tests/test_receiver.rs index 4e1d554..24840a3 100644 --- a/src/node/simplex/src/tests/test_receiver.rs +++ b/src/node/simplex/src/tests/test_receiver.rs @@ -11,9 +11,9 @@ //! Tests receiver communication with multiple instances using in-process overlay. //! Similar structure to `test_consensus.rs` and `catchain/tests/test_catchain_network.rs` //! -//! Note: This test was moved from `tests/test_receiver.rs` to internal tests -//! as part of CODE-2 (receiver privatization). The test now uses `crate::` -//! imports to access internal types like `Receiver`, `ReceiverListener`, etc. +//! Note: This test was moved from `tests/test_receiver.rs` to internal tests. +//! It now uses `crate::` imports to access internal types like `Receiver`, +//! `ReceiverListener`, etc. 
use crate::{ receiver::{Receiver, ReceiverListener, ReceiverListenerPtr}, @@ -177,7 +177,12 @@ impl ReceiverListener for TestReceiverListener { ); } - fn on_activity(&self, active_weight: ValidatorWeight, _last_activity: Vec>) { + fn on_activity( + &self, + active_weight: ValidatorWeight, + _last_activity: Vec>, + _snapshot: crate::receiver::ReceiverActivitySnapshot, + ) { self.stats.active_weight_updates.fetch_add(1, Ordering::Relaxed); self.stats.last_active_weight.store(active_weight, Ordering::Relaxed); log::trace!( @@ -1088,8 +1093,6 @@ fn certificate_slot(cert: &CertificateBoxed) -> u32 { crate::simplex_state::Vote::Notarize(v) => v.slot.value(), crate::simplex_state::Vote::Finalize(v) => v.slot.value(), crate::simplex_state::Vote::Skip(v) => v.slot.value(), - crate::simplex_state::Vote::NotarizeFallback(v) => v.slot.value(), - crate::simplex_state::Vote::SkipFallback(v) => v.slot.value(), } } diff --git a/src/node/simplex/src/tests/test_session_processor.rs b/src/node/simplex/src/tests/test_session_processor.rs index 6f5f960..0445742 100644 --- a/src/node/simplex/src/tests/test_session_processor.rs +++ b/src/node/simplex/src/tests/test_session_processor.rs @@ -15,7 +15,6 @@ use super::*; use crate::{ block::ValidatorIndex, receiver::Receiver, - simplex_state::SimplexStateOptions, task_queue::{CallbackTaskQueuePtr, TaskQueuePtr}, SessionId, SessionNode, SessionOptions, SIMPLEX_ROUNDLESS, }; @@ -36,17 +35,13 @@ use std::{ }; use ton_api::{ deserialize_boxed, - ton::{ - consensus::{ - candidatedata::Block as CandidateDataBlock, - simplex::{ - certificate::Certificate, unsignedvote::SkipVote, vote::Vote as TlVote, - votesignature::VoteSignature, votesignatureset::VoteSignatureSet, CandidateAndCert, - Certificate as CertificateBoxed, VoteSignature as VoteSignatureBoxed, - }, - CandidateData, CandidateParent, + ton::consensus::{ + simplex::{ + certificate::Certificate, unsignedvote::SkipVote, vote::Vote as TlVote, + votesignature::VoteSignature, 
votesignatureset::VoteSignatureSet, CandidateAndCert, + Certificate as CertificateBoxed, VoteSignature as VoteSignatureBoxed, }, - validator_session::candidate::Candidate as TlCandidate, + CandidateData, }, IntoBoxed, }; @@ -325,11 +320,16 @@ impl consensus_common::SessionListener for MockListener { fn get_approved_candidate( &self, _source: PublicKey, - _root_hash: UInt256, + root_hash: UInt256, _file_hash: UInt256, _collated_data_hash: UInt256, _callback: ValidatorBlockCandidateCallback, ) { + panic!( + "unexpected legacy get_approved_candidate request in session_processor MockListener \ + (root_hash={}); active simplex tests must not use this callback", + root_hash.to_hex_string() + ); } } @@ -435,12 +435,16 @@ impl consensus_common::SessionListener for RecordingListener { fn get_approved_candidate( &self, _source: PublicKey, - _root_hash: UInt256, + root_hash: UInt256, _file_hash: UInt256, _collated_data_hash: UInt256, _callback: ValidatorBlockCandidateCallback, ) { - // No-op + panic!( + "unexpected legacy get_approved_candidate request in session_processor \ + RecordingListener (root_hash={}); active simplex tests must not use this callback", + root_hash.to_hex_string() + ); } } @@ -776,11 +780,9 @@ fn test_genesis_collation_expected_seqno_uses_initial_block_seqno() { #[test] fn test_should_generate_empty_block_uses_committed_head_at_session_start() { - // With DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION=false (optimistic validation), - // shardchains use the MC lag threshold rule for empty-block generation, not the - // finalized-head rule. Only masterchain still uses finalized-head gating. - // - // Verify that masterchain sessions still use finalized-head empty-block gating. + // Masterchain uses finalized-head gating for empty-block generation (C++ parity: + // `last_consensus_finalized_seqno_ + 1 < new_seqno`), while shardchains use + // the MC lag threshold rule. Verify masterchain path. 
let nodes = create_test_validators(4); let local_idx = 0; let initial_block_seqno = 47; @@ -1019,66 +1021,6 @@ fn test_out_of_order_mode_does_not_run_commit_chain_recovery_for_missing_body() ); } -#[test] -fn test_update_resolution_cache_chain_handles_deep_descendant_chains() { - // Regression: in single-host nets, we can receive an old missing candidate late (e.g. slot ~5), - // when hundreds of descendants already exist. update_resolution_cache_chain used to recurse - // over descendants and could hit very deep recursion; it must handle deep chains safely. - let mut fixture = TestFixture::new(4); - - // Build a linear parent->child chain longer than the previous recursion warning threshold. - const N: usize = 256; - let mut ids: Vec = Vec::with_capacity(N); - - for i in 0..N { - let slot = SlotIndex::new(i as u32); - let candidate_hash = UInt256::rand(); - let id = RawCandidateId { slot, hash: candidate_hash.clone() }; - - let parent_id = if i == 0 { None } else { Some(ids[i - 1].clone()) }; - - let block_id = BlockIdExt::with_params( - ShardIdent::masterchain(), - i as u32, - UInt256::rand(), - UInt256::rand(), - ); - - fixture.processor.received_candidates.insert( - id.clone(), - ReceivedCandidate { - slot, - source_idx: ValidatorIndex::new(0), - candidate_id_hash: candidate_hash.clone(), - candidate_hash_data_bytes: vec![1, 2, 3], - block_id: block_id.clone(), - root_hash: block_id.root_hash.clone(), - file_hash: block_id.file_hash.clone(), - data: consensus_common::ConsensusCommonFactory::create_block_payload( - vec![0xAA].into(), - ), - collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( - vec![0xBB].into(), - ), - receive_time: fixture.description.get_time(), - is_empty: false, - parent_id, - is_fully_resolved: false, - }, - ); - - ids.push(id); - } - - // Trigger an update from the root; resolution should propagate to all descendants. 
- fixture.processor.update_resolution_cache_chain(&ids[0]); - - for id in ids { - let r = fixture.processor.received_candidates.get(&id).expect("candidate missing"); - assert!(r.is_fully_resolved, "candidate {:?} should be resolved", id.slot); - } -} - #[test] fn test_check_all_updates_awake_time() { let mut fixture = TestFixture::new(4); @@ -1501,10 +1443,10 @@ fn test_time_isolation_between_tests() { } // ============================================================================ -// Batch Finalization Tests (BATCH-COMMIT-1 / TEST-BATCH-1) +// Batch Finalization Tests // ============================================================================ -/// TEST-BATCH-1: Notarized parents + finalized descendant (Case A) +/// Notarized parents + finalized descendant (Case A) /// /// Scenario: /// - slot 1: notarized (NotarCert), not finalized @@ -1526,9 +1468,9 @@ fn test_time_isolation_between_tests() { /// /// Status: PLACEHOLDER - full test requires FSM integration #[test] -#[ignore] // TODO(TEST-BATCH-1): requires FSM events + candidate resolution infrastructure +#[ignore] // TODO: requires FSM events + candidate resolution infrastructure fn test_batch_finalization_notarized_parents_finalized_descendant() { - // This test is a placeholder documenting the expected behavior for BATCH-COMMIT-1. + // This test is a placeholder documenting the expected behavior. // // Full implementation requires: // 1. Creating a SessionProcessor with RecordingListener that captures is_final @@ -1545,11 +1487,11 @@ fn test_batch_finalization_notarized_parents_finalized_descendant() { // For now, this serves as documentation of the expected behavior and a reminder // to implement the full test once the FSM integration infrastructure is in place. 
- todo!("TEST-BATCH-1: implement full batch finalization integration test"); + todo!("implement full batch finalization integration test"); } // ============================================================================ -// SIMPLEX_ROUNDLESS Mode Tests (ROUNDLESS-1) +// SIMPLEX_ROUNDLESS Mode Tests // ============================================================================ /// Test that SIMPLEX_ROUNDLESS constant is u32::MAX @@ -1559,28 +1501,6 @@ fn test_simplex_roundless_constant_value() { assert_eq!(SIMPLEX_ROUNDLESS, 0xFFFFFFFF, "SIMPLEX_ROUNDLESS should be 0xFFFFFFFF"); } -/// Test that SessionProcessor uses C++-compatible parenting options -/// -/// We keep `require_finalized_parent=false` (C++ mode) to avoid deadlock when a slot is -/// notarized but not finalized/skipped yet. ValidatorGroup limitation is handled separately -/// by forcing EMPTY collation on non-finalized parents. -#[test] -fn test_simplex_state_options_require_finalized_parent() { - // Default (cpp_compatible) should have require_finalized_parent=false - let cpp_compat = SimplexStateOptions::cpp_compatible(); - assert!( - !cpp_compat.require_finalized_parent, - "cpp_compatible() should have require_finalized_parent=false" - ); - - // With optimistic validation, the collation gate is disabled: - // ValidatorGroup now uses candidate-native validation, so non-finalized parents are allowed. 
- assert!( - !DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION, - "DISABLE_NON_FINALIZED_PARENTS_FOR_COLLATION should be false with optimistic validation enabled" - ); -} - // ============================================================================ // Direct Emission Model Tests (SIMPLEX_ROUNDLESS cleanup) // ============================================================================ @@ -1765,6 +1685,24 @@ fn insert_received_candidate( block_id: BlockIdExt, is_empty: bool, parent_id: Option, +) { + insert_received_candidate_with_gen_utime_ms( + processor, + candidate_id, + block_id, + is_empty, + parent_id, + None, + ); +} + +fn insert_received_candidate_with_gen_utime_ms( + processor: &mut SessionProcessor, + candidate_id: &RawCandidateId, + block_id: BlockIdExt, + is_empty: bool, + parent_id: Option, + gen_utime_ms: Option, ) { processor.received_candidates.insert( candidate_id.clone(), @@ -1780,10 +1718,10 @@ fn insert_received_candidate( collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( Vec::new(), ), + gen_utime_ms, receive_time: SystemTime::now(), is_empty, parent_id, - is_fully_resolved: true, }, ); } @@ -1840,14 +1778,6 @@ fn skip_slot(fixture: &mut TestFixture, slot: SlotIndex) { .expect("set_skip_certificate failed"); } -fn strict_wait_for_parent_mode_enabled() -> bool { - matches!(PARENT_READINESS_MODE, ParentReadinessMode::StrictWaitForParent) -} - -fn strict_mc_session_gate_mode_enabled() -> bool { - matches!(MC_ACCEPTED_HEAD_MODE, McAcceptedHeadMode::StrictSessionProcessorGate) -} - #[test] fn test_check_validation_forwards_candidate_with_notarized_parent() { let mut fixture = TestFixture::new_shard(4); @@ -1864,21 +1794,12 @@ fn test_check_validation_forwards_candidate_with_notarized_parent() { let time = fixture.description.get_time(); insert_pending_validation(&mut fixture.processor, &child_id, raw_candidate, time); - // Before notarization: - // - strict mode: candidate stays blocked by WaitForParent - // - relaxed 
mode: candidate is forwarded early to validator-side checks + // Before notarization the candidate must stay blocked by WaitForParent. fixture.processor.check_validation(); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict mode: candidate must not be forwarded when parent is not notarized" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "relaxed mode: candidate should be forwarded before parent notarization" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate must not be forwarded when parent is not notarized" + ); // Notarize the parent slot notarize_slot(&mut fixture, parent_slot, &parent_hash); @@ -1910,23 +1831,16 @@ fn test_check_validation_blocks_candidate_with_non_notarized_parent() { let time = fixture.description.get_time(); insert_pending_validation(&mut fixture.processor, &child_id, raw_candidate, time); - // Parent is NOT notarized — candidate must stay in pending_validations + // Parent is NOT notarized — candidate must stay in pending_validations. 
fixture.processor.check_validation(); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict mode: candidate must NOT be forwarded when parent slot is not notarized" - ); - assert!( - fixture.processor.pending_validations.contains_key(&child_id), - "strict mode: candidate must remain in pending_validations" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "relaxed mode: candidate should be forwarded even when parent is not notarized" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate must NOT be forwarded when parent slot is not notarized" + ); + assert!( + fixture.processor.pending_validations.contains_key(&child_id), + "candidate must remain in pending_validations" + ); } #[test] @@ -2185,23 +2099,16 @@ fn test_check_validation_chains_notarized_parent_to_descendant() { // First check_validation: // - A (genesis) should pass - // - B should wait in strict mode, but can pass in relaxed mode + // - B should wait until parent slot A is notarized fixture.processor.check_validation(); assert!( fixture.processor.pending_approve.contains(&id_a), "genesis candidate A must be forwarded" ); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&id_b), - "strict mode: candidate B must wait until parent slot is notarized" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&id_b), - "relaxed mode: candidate B may be forwarded before parent notarization" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&id_b), + "candidate B must wait until parent slot is notarized" + ); // Notarize slot 0 (A's slot) notarize_slot(&mut fixture, slot_a, &hash_a); @@ -2233,39 +2140,21 @@ fn test_check_validation_wait_for_parent_requires_gap_skip_certificates() { let time = fixture.description.get_time(); insert_pending_validation(&mut fixture.processor, &child_id, 
raw_candidate, time); - // Parent notarized, but gap skips missing: - // - strict mode: still blocked - // - relaxed mode: forwarded early + // Parent notarized, but gap skips missing: still blocked. notarize_slot(&mut fixture, parent_slot, &parent_hash); fixture.processor.check_validation(); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict mode: candidate must be blocked until all intermediate slots are skip-certified" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "relaxed mode: candidate can be forwarded before skip-gap coverage converges" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate must be blocked until all intermediate slots are skip-certified" + ); - // Add skip cert for slot 1 only: - // - strict mode: still blocked - // - relaxed mode: remains forwarded + // Add skip cert for slot 1 only: still blocked. skip_slot(&mut fixture, SlotIndex::new(1)); fixture.processor.check_validation(); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict mode: candidate must remain blocked when gap skip coverage is partial" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "relaxed mode: candidate remains eligible while skip-gap coverage is partial" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate must remain blocked when gap skip coverage is partial" + ); // Add skip cert for slot 2 -> now eligible. 
skip_slot(&mut fixture, SlotIndex::new(2)); @@ -2296,18 +2185,114 @@ fn test_check_validation_wait_for_parent_rejects_parent_hash_mismatch() { notarize_slot(&mut fixture, parent_slot, ¬arized_parent_hash); fixture.processor.check_validation(); - if strict_wait_for_parent_mode_enabled() { - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict mode: candidate must not be forwarded when parent hash mismatches notarized block" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id) - || fixture.processor.rejected.contains(&child_id), - "relaxed mode: candidate should not be stalled only by WaitForParent mismatch" - ); - } + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate must not be forwarded when parent hash mismatches notarized block" + ); + assert!( + fixture.processor.pending_validations.contains_key(&child_id), + "candidate should remain pending while WaitForParent readiness cannot be proven" + ); +} + +#[test] +fn test_on_candidate_received_non_empty_does_not_wait_for_unresolved_ancestor_chain() { + let mut fixture = TestFixture::new(4); + + let grandparent_id = + RawCandidateId { slot: SlotIndex::new(0), hash: UInt256::from([0x71; 32]) }; + let parent_id = RawCandidateId { slot: SlotIndex::new(1), hash: UInt256::from([0x72; 32]) }; + let parent_block_id = BlockIdExt::with_params( + ShardIdent::masterchain(), + 10, + UInt256::from([0x73; 32]), + UInt256::from([0x74; 32]), + ); + insert_received_candidate( + &mut fixture.processor, + &parent_id, + parent_block_id, + false, + Some(grandparent_id), + ); + + let (leader_source, child_id, broadcast) = make_signed_block_broadcast_with_parent( + &fixture, + 2, + vec![0x55, 0x66, 0x77], + Some(parent_id.clone()), + ); + + fixture.processor.on_candidate_received(leader_source, broadcast, None); + + assert!( + fixture.processor.pending_validations.contains_key(&child_id), + "non-empty candidate must be admitted immediately even if only 
ancestor metadata is missing" + ); + assert!( + fixture.processor.requested_candidates.is_empty(), + "non-empty admission must not trigger ancestor prefetch requests" + ); + + let received = fixture + .processor + .received_candidates + .get(&child_id) + .expect("child candidate must be stored"); + assert!( + received.parent_id.as_ref() == Some(&parent_id), + "stored child candidate must preserve the explicit parent link" + ); +} + +#[test] +fn test_on_candidate_received_empty_waits_in_pending_validation_and_requests_missing_parent() { + let mut fixture = TestFixture::new(4); + + let parent_id = RawCandidateId { slot: SlotIndex::new(1), hash: UInt256::from([0x81; 32]) }; + let referenced_block = BlockIdExt::with_params( + ShardIdent::masterchain(), + 20, + UInt256::from([0x82; 32]), + UInt256::from([0x83; 32]), + ); + let (leader_source, child_id, broadcast) = make_signed_empty_block_broadcast_with_parent( + &fixture, + 2, + parent_id.clone(), + referenced_block, + ); + + fixture.processor.on_candidate_received(leader_source, broadcast, None); + + assert!( + fixture.processor.pending_validations.contains_key(&child_id), + "empty candidate must enter pending_validations immediately after ingress" + ); + assert!( + fixture.processor.requested_candidates.is_empty(), + "ingress must not prefetch parent metadata before WaitForParent is satisfied" + ); + + notarize_slot(&mut fixture, parent_id.slot, &parent_id.hash); + fixture.processor.check_validation(); + + assert!( + fixture.processor.pending_validations.contains_key(&child_id), + "empty candidate must remain pending while parent metadata is still missing" + ); + assert!( + fixture.processor.requested_candidates.contains_key(&parent_id), + "validation path must request the missing parent metadata on demand" + ); + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "empty candidate must not enter pending_approve until the parent normal tip is resolvable" + ); + assert!( + 
!fixture.processor.rejected.contains(&child_id), + "missing parent metadata must defer empty approval instead of rejecting it" + ); } // ============================================================================ @@ -2321,7 +2306,6 @@ fn reset_health_alert_time(processor: &mut SessionProcessor, base: SystemTime) { let s = &mut processor.health_alert_state; s.last_progress_warn = base; s.last_activity_warn = base; - s.last_parent_aging_warn = base; s.last_cert_fail_warn = base; s.last_skip_ratio_warn = base; s.last_standstill_warn = base; @@ -2567,6 +2551,38 @@ fn test_update_collation_pacing_advances_on_repeated_calls() { ); } +#[test] +fn test_compute_collation_start_time_caps_parent_delay_to_target_rate() { + let mut fixture = TestFixture::new(4); + let base_time = SystemTime::UNIX_EPOCH + Duration::from_secs(1_700_000_000); + fixture.processor.set_time(base_time); + + let parent_id = RawCandidateId { slot: SlotIndex::new(0), hash: UInt256::rand() }; + let parent_block_id = + BlockIdExt::with_params(ShardIdent::masterchain(), 1, UInt256::rand(), UInt256::rand()); + let parent_gen_utime_ms = + base_time.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_millis() as u64 + 500; + + insert_received_candidate_with_gen_utime_ms( + &mut fixture.processor, + &parent_id, + parent_block_id, + false, + None, + Some(parent_gen_utime_ms), + ); + + let parent_info = + crate::block::CandidateParentInfo { slot: parent_id.slot, hash: parent_id.hash.clone() }; + let start_time = fixture.processor.compute_collation_start_time(Some(&parent_info)); + + assert_eq!( + start_time, + base_time + fixture.description.opts().target_rate, + "collation start time should not be delayed by more than one target_rate from now" + ); +} + #[test] fn test_check_collation_blocks_before_earliest_time() { let mut fixture = TestFixture::new(4); @@ -2580,7 +2596,11 @@ fn test_check_collation_blocks_before_earliest_time() { fixture.processor.check_collation(); - 
assert_eq!(fixture.processor.get_next_awake_time(), gate_time); + assert_eq!( + fixture.processor.get_next_awake_time(), + base_time + MAX_AWAKE_TIMEOUT, + "with the temporary 10ms fallback poll, next_awake_time stays at the earlier fallback wake" + ); } #[test] @@ -2615,13 +2635,13 @@ fn test_check_collation_pacing_gate_is_idempotent() { fixture.processor.reset_next_awake_time(); fixture.processor.check_collation(); - assert_eq!(fixture.processor.get_next_awake_time(), gate_time); + assert_eq!(fixture.processor.get_next_awake_time(), base_time + MAX_AWAKE_TIMEOUT); assert_eq!(fixture.processor.earliest_collation_time, Some(gate_time)); fixture.processor.reset_next_awake_time(); fixture.processor.check_collation(); - assert_eq!(fixture.processor.get_next_awake_time(), gate_time); + assert_eq!(fixture.processor.get_next_awake_time(), base_time + MAX_AWAKE_TIMEOUT); } #[test] @@ -2727,24 +2747,16 @@ fn test_candidate_query_fallback_miss_returns_empty() { assert!(inner.candidate.is_empty(), "candidate bytes should be empty when not found"); } -fn make_signed_block_broadcast( +fn make_signed_block_broadcast_with_parent( fixture: &TestFixture, slot: u32, block_data: Vec, + parent_id: Option, ) -> (u32, RawCandidateId, CandidateData) { let collated_data: Vec = vec![]; let root_hash = UInt256::from_slice(&sha256_digest(&block_data)); let shard = fixture.processor.description.get_shard().clone(); - let tl_inner = TlCandidate { - src: UInt256::default(), - round: slot as i32, - root_hash: root_hash.clone(), - data: block_data.into(), - collated_data: collated_data.clone().into(), - }; - let candidate_bytes = consensus_common::serialize_tl_boxed_object!(&tl_inner.into_boxed()); - let block_id = BlockIdExt { shard_id: shard, seq_no: slot, @@ -2752,12 +2764,14 @@ fn make_signed_block_broadcast( file_hash: root_hash, }; let collated_file_hash = UInt256::from_slice(&sha256_digest(&collated_data)); + let data_bytes = block_data; + let collated_data_bytes = collated_data; let 
candidate_hash = crate::utils::compute_candidate_id_hash_u32( slot, Some(&block_id), Some(&collated_file_hash), - None, + parent_id.as_ref().map(|p| (p.slot.value(), &p.hash)), ); let session_id = fixture.processor.session_id().clone(); @@ -2767,14 +2781,65 @@ fn make_signed_block_broadcast( crate::utils::sign_candidate_u32(&session_id, slot, &candidate_hash, leader_key) .expect("signing failed"); - let broadcast = CandidateData::Consensus_Block(CandidateDataBlock { - slot: slot as i32, - candidate: candidate_bytes.into(), - parent: CandidateParent::Consensus_CandidateWithoutParents, - signature: signature.into(), - }); + let candidate_id = RawCandidateId { slot: SlotIndex::new(slot), hash: candidate_hash }; + let block = crate::block::BlockCandidate { + id: block_id, + collated_file_hash, + data: data_bytes, + collated_data: collated_data_bytes, + creator: leader_key.clone(), + }; + let raw_candidate = crate::block::RawCandidate::new( + candidate_id.clone(), + parent_id, + leader_idx, + block, + signature, + ); + let serialized = raw_candidate.serialize(false).expect("serialize RawCandidate"); + let msg = deserialize_boxed(&serialized).expect("deserialize CandidateData"); + let broadcast = msg.downcast::().expect("downcast CandidateData"); + + (leader_idx.value(), candidate_id, broadcast) +} + +fn make_signed_block_broadcast( + fixture: &TestFixture, + slot: u32, + block_data: Vec, +) -> (u32, RawCandidateId, CandidateData) { + make_signed_block_broadcast_with_parent(fixture, slot, block_data, None) +} + +fn make_signed_empty_block_broadcast_with_parent( + fixture: &TestFixture, + slot: u32, + parent_id: RawCandidateId, + referenced_block: BlockIdExt, +) -> (u32, RawCandidateId, CandidateData) { + let candidate_hash = crate::utils::compute_candidate_id_hash_empty( + &referenced_block, + (parent_id.slot, &parent_id.hash), + ); + let session_id = fixture.processor.session_id().clone(); + let leader_idx = 
fixture.processor.description.get_leader(SlotIndex::new(slot)); + let leader_key = fixture.processor.description.get_source_public_key(leader_idx); + let signature = + crate::utils::sign_candidate_u32(&session_id, slot, &candidate_hash, leader_key) + .expect("signing failed"); let candidate_id = RawCandidateId { slot: SlotIndex::new(slot), hash: candidate_hash }; + let raw_candidate = crate::block::RawCandidate::new_empty( + candidate_id.clone(), + parent_id, + leader_idx, + referenced_block, + signature, + ); + let serialized = raw_candidate.serialize(false).expect("serialize RawCandidate"); + let msg = deserialize_boxed(&serialized).expect("deserialize CandidateData"); + let broadcast = msg.downcast::().expect("downcast CandidateData"); + (leader_idx.value(), candidate_id, broadcast) } @@ -2924,7 +2989,7 @@ fn test_candidate_precheck_progress_gap_uses_progress_cursor() { } #[test] -fn test_register_resolved_candidate_keeps_slot_behind_progress_cursor_until_finalized() { +fn test_register_candidate_for_validation_keeps_slot_behind_progress_cursor_until_finalized() { let mut fixture = TestFixture::new(4); fixture.drain_receiver_actions(); @@ -2940,7 +3005,7 @@ fn test_register_resolved_candidate_keeps_slot_behind_progress_cursor_until_fina let raw_candidate = make_test_non_empty_candidate(candidate_id.clone(), None, &fixture.nodes); let receive_time = fixture.description.get_time(); - fixture.processor.register_resolved_candidate( + fixture.processor.register_candidate_for_validation( raw_candidate, slot, fixture.description.get_self_idx(), @@ -3049,10 +3114,10 @@ fn test_has_real_candidate_body_returns_false_for_stub() { collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( Vec::new(), ), + gen_utime_ms: None, receive_time: fixture.processor.now(), is_empty: false, parent_id: None, - is_fully_resolved: true, }, ); @@ -3185,7 +3250,25 @@ fn test_set_mc_finalized_block_ignores_mismatched_shard() { } #[test] -fn 
test_check_validation_waits_for_mc_applied_head_before_submitting() { +fn test_set_mc_finalized_block_wakes_processor() { + let mut fixture = TestFixture::new(4); + let base_time = SystemTime::UNIX_EPOCH + Duration::from_secs(1_700_000_000); + fixture.processor.set_time(base_time); + fixture.processor.next_awake_time = base_time + Duration::from_secs(60); + + let mc_registered_top = + BlockIdExt::with_params(ShardIdent::masterchain(), 42, UInt256::rand(), UInt256::rand()); + fixture.processor.set_mc_finalized_block(mc_registered_top); + + assert_eq!( + fixture.processor.get_next_awake_time(), + base_time, + "MC finalization should wake the FSM immediately" + ); +} + +#[test] +fn test_check_validation_does_not_wait_for_mc_applied_head_before_submitting() { let mut fixture = TestFixture::new(4); let parent_slot = SlotIndex::new(0); @@ -3210,40 +3293,118 @@ fn test_check_validation_waits_for_mc_applied_head_before_submitting() { notarize_slot(&mut fixture, parent_slot, &parent_id.hash); fixture.processor.check_validation(); - if strict_mc_session_gate_mode_enabled() { - assert!( - fixture.processor.pending_validations.contains_key(&child_id), - "strict MC gate: candidate must stay pending until accepted head reaches parent normal tip" - ); - assert!( - !fixture.processor.pending_approve.contains(&child_id), - "strict MC gate: candidate must not be submitted while accepted head is behind" - ); - assert!( - !fixture.processor.rejected.contains(&child_id), - "strict MC gate: candidate should wait, not reject, while applied head is behind" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "validator-side MC mode: candidate should be submitted without SessionProcessor wait" - ); - assert!( - !fixture.processor.rejected.contains(&child_id), - "validator-side MC mode: candidate should not be rejected by SessionProcessor gate" - ); - } + assert!( + fixture.processor.pending_approve.contains(&child_id), + "candidate should be submitted 
without a SessionProcessor wait on the accepted MC head" + ); + assert!( + !fixture.processor.rejected.contains(&child_id), + "SessionProcessor should not reject while validator-side MC stale protection owns this check" + ); +} + +#[test] +fn test_check_all_releases_validation_retry_before_revalidation() { + let mut fixture = TestFixture::new_shard(4); + + let parent_slot = SlotIndex::new(0); + let child_slot = SlotIndex::new(1); + let parent_id = RawCandidateId { slot: parent_slot, hash: UInt256::rand() }; + let child_id = RawCandidateId { slot: child_slot, hash: UInt256::rand() }; + let parent_block_id = BlockIdExt::with_params( + fixture.description.get_shard().clone(), + 1, + UInt256::rand(), + UInt256::rand(), + ); + + insert_received_candidate(&mut fixture.processor, &parent_id, parent_block_id, false, None); + + let raw_candidate = + make_test_non_empty_candidate(child_id.clone(), Some(parent_id.clone()), &fixture.nodes); + let time = fixture.description.get_time(); + insert_pending_validation(&mut fixture.processor, &child_id, raw_candidate, time); + notarize_slot(&mut fixture, parent_slot, &parent_id.hash); + + fixture.processor.pending_approve.insert(child_id.clone()); + let child_id_for_release = child_id.clone(); + fixture.processor.post_delayed_action(time, move |processor| { + processor.pending_approve.remove(&child_id_for_release); + }); + + fixture.processor.check_all(); + + assert!( + fixture.processor.pending_approve.contains(&child_id), + "retry gate release should happen before check_validation so the candidate is resubmitted in the same pass" + ); +} + +#[test] +fn test_check_validation_waits_for_min_block_interval() { + let opts = + SessionOptions { min_block_interval: Duration::from_millis(500), ..Default::default() }; + let mut fixture = TestFixture::new_with_shard_and_local_idx( + 4, + 0, + ShardIdent::with_tagged_prefix(0, 0x8000_0000_0000_0000).unwrap(), + opts, + ); + let base_time = SystemTime::UNIX_EPOCH + 
Duration::from_secs(1_700_000_000); + fixture.processor.set_time(base_time); + fixture.processor.next_awake_time = base_time + Duration::from_secs(60); + + let parent_slot = SlotIndex::new(0); + let child_slot = SlotIndex::new(1); + let parent_id = RawCandidateId { slot: parent_slot, hash: UInt256::rand() }; + let child_id = RawCandidateId { slot: child_slot, hash: UInt256::rand() }; + let parent_block_id = BlockIdExt::with_params( + fixture.description.get_shard().clone(), + 1, + UInt256::rand(), + UInt256::rand(), + ); + let parent_gen_utime_ms = + base_time.duration_since(SystemTime::UNIX_EPOCH).unwrap().as_millis() as u64; + + insert_received_candidate_with_gen_utime_ms( + &mut fixture.processor, + &parent_id, + parent_block_id, + false, + None, + Some(parent_gen_utime_ms), + ); + + let raw_candidate = + make_test_non_empty_candidate(child_id.clone(), Some(parent_id.clone()), &fixture.nodes); + insert_pending_validation(&mut fixture.processor, &child_id, raw_candidate, base_time); + notarize_slot(&mut fixture, parent_slot, &parent_id.hash); + + fixture.processor.check_validation(); + + assert!( + !fixture.processor.pending_approve.contains(&child_id), + "candidate should not be submitted before the min_block_interval elapses" + ); + assert_eq!( + fixture.processor.get_next_awake_time(), + base_time + Duration::from_millis(500), + "validation should arm a wake for the min_block_interval deadline" + ); - fixture.processor.set_mc_finalized_block(parent_block_id); + fixture.advance_time(Duration::from_millis(600)); + fixture.processor.next_awake_time = base_time + Duration::from_secs(60); fixture.processor.check_validation(); + assert!( fixture.processor.pending_approve.contains(&child_id), - "candidate should be submitted once the exact accepted head reaches the parent normal tip" + "candidate should be submitted after the min_block_interval elapses" ); } #[test] -fn test_check_validation_rejects_mc_candidate_with_wrong_exact_parent_head() { +fn 
test_check_validation_does_not_reject_mc_candidate_with_wrong_exact_parent_head() { let mut fixture = TestFixture::new(4); let parent_slot = SlotIndex::new(0); @@ -3279,25 +3440,14 @@ fn test_check_validation_rejects_mc_candidate_with_wrong_exact_parent_head() { notarize_slot(&mut fixture, parent_slot, &parent_id.hash); fixture.processor.check_validation(); - if strict_mc_session_gate_mode_enabled() { - assert!( - fixture.processor.rejected.contains(&child_id), - "strict MC gate: candidate must be rejected when exact accepted head disagrees with parent" - ); - assert!( - !fixture.processor.pending_validations.contains_key(&child_id), - "strict MC gate: stale candidate should be removed from pending after rejection" - ); - } else { - assert!( - fixture.processor.pending_approve.contains(&child_id), - "validator-side MC mode: candidate should be submitted without exact-head rejection" - ); - assert!( - !fixture.processor.rejected.contains(&child_id), - "validator-side MC mode: SessionProcessor should not reject on exact-head mismatch" - ); - } + assert!( + fixture.processor.pending_approve.contains(&child_id), + "candidate should be submitted without exact-head rejection in SessionProcessor" + ); + assert!( + !fixture.processor.rejected.contains(&child_id), + "SessionProcessor should not reject on exact-head mismatch; validator-side MC fork prevention covers this" + ); } #[test] @@ -3817,10 +3967,10 @@ fn test_finalized_empty_block_does_not_advance_seqno() { collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( vec![0xBB].into(), ), + gen_utime_ms: None, receive_time: fixture.description.get_time(), is_empty: true, parent_id: None, - is_fully_resolved: true, }, ); @@ -3983,7 +4133,7 @@ fn test_process_validated_candidates_before_fsm_timeout() { has_notarize, "slot 0 must be notarized (candidate was fed to FSM before timeout evaluation)" ); - // In C++ mode (allow_skip_after_notarize=true) a skip vote may follow + // In C++ mode a skip vote 
may follow // the notarize vote after the timeout fires -- that is harmless and // expected. The key property is that the notarize vote was emitted. } @@ -4091,6 +4241,7 @@ fn test_invalidate_local_chain_head_clears_state() { slot: SlotIndex::new(0), parent_info, block_id: block_id.clone(), + gen_utime_ms: None, }); fixture.processor.generated_parent_cache.insert(raw_id.clone(), block_id); @@ -4151,6 +4302,7 @@ fn test_reset_precollations_clears_chain_head() { slot: SlotIndex::new(0), parent_info, block_id, + gen_utime_ms: None, }); fixture.processor.reset_precollations(); @@ -4242,65 +4394,15 @@ fn test_on_collation_complete_publishes_future_slot_in_current_window() { } // ============================================================================ -// require_notarized_parent_for_collation tests +// C++ in-window collation parity tests // ============================================================================ -/// When the flag is enabled and the local chain head's parent slot is NOT notarized, -/// precollate_block must defer (return early without invoking collation). 
-#[test] -fn test_precollate_defers_when_parent_not_notarized_flag_enabled() { - let opts = SessionOptions { - slots_per_leader_window: 4, - require_notarized_parent_for_collation: true, - ..Default::default() - }; - let mut fixture = TestFixture::new_with_opts(4, opts); - - let parent_hash = UInt256::from([0xA1; 32]); - let block_id = BlockIdExt::with_params( - ShardIdent::masterchain(), - 1, - UInt256::from([0xA2; 32]), - UInt256::from([0xA3; 32]), - ); - - fixture.processor.local_chain_head = Some(LocalChainHead { - window: WindowIndex::new(0), - slot: SlotIndex::new(0), - parent_info: crate::block::CandidateParentInfo { - slot: SlotIndex::new(0), - hash: parent_hash.clone(), - }, - block_id: block_id.clone(), - }); - fixture - .processor - .generated_parent_cache - .insert(RawCandidateId { slot: SlotIndex::new(0), hash: parent_hash.clone() }, block_id); - - assert!( - !fixture.processor.simplex_state.has_notarized_block(SlotIndex::new(0)), - "precondition: slot 0 not notarized" - ); - - fixture.processor.precollate_block(SlotIndex::new(1)); - - assert!( - !fixture.processor.precollated_blocks.contains_key(&SlotIndex::new(1)), - "collation must NOT be invoked when parent is not notarized and flag is enabled" - ); -} - -/// When the flag is disabled, precollate_block uses the local chain head even -/// if the parent slot is not notarized (legacy optimistic pipelining). 
#[test] -fn test_precollate_proceeds_when_flag_disabled() { - let opts = SessionOptions { - slots_per_leader_window: 4, - require_notarized_parent_for_collation: false, - ..Default::default() - }; - let mut fixture = TestFixture::new_with_opts(4, opts); +fn test_precollate_same_window_slot_uses_local_chain_head_before_parent_notarization() { + let opts = SessionOptions { slots_per_leader_window: 4, ..Default::default() }; + let probe = TestFixture::new_with_opts(4, opts.clone()); + let local_idx = probe.description.get_leader(SlotIndex::new(1)).value() as usize; + let mut fixture = TestFixture::new_with_local_idx(4, local_idx, opts); let parent_hash = UInt256::from([0xB1; 32]); let block_id = BlockIdExt::with_params( @@ -4318,152 +4420,174 @@ fn test_precollate_proceeds_when_flag_disabled() { hash: parent_hash.clone(), }, block_id: block_id.clone(), + gen_utime_ms: None, }); - fixture - .processor - .generated_parent_cache - .insert(RawCandidateId { slot: SlotIndex::new(0), hash: parent_hash.clone() }, block_id); + fixture.processor.generated_parent_cache.insert( + RawCandidateId { slot: SlotIndex::new(0), hash: parent_hash.clone() }, + block_id.clone(), + ); + fixture.processor.last_consensus_finalized_seqno = Some(block_id.seq_no); assert!( !fixture.processor.simplex_state.has_notarized_block(SlotIndex::new(0)), - "precondition: slot 0 not notarized" + "precondition: slot 0 is not notarized yet" ); fixture.processor.precollate_block(SlotIndex::new(1)); - assert!( - fixture.processor.precollated_blocks.contains_key(&SlotIndex::new(1)), - "collation MUST proceed when flag is disabled (legacy behavior)" - ); + let precollated = fixture + .processor + .precollated_blocks + .get(&SlotIndex::new(1)) + .expect("same-window child slot must be precollated immediately"); + let parent = precollated.parent.as_ref().expect("precollated slot must capture parent"); + assert_eq!(parent.slot, SlotIndex::new(0)); + assert_eq!(parent.hash, parent_hash); } -/// When the flag is 
enabled and the parent IS notarized, precollation proceeds normally. #[test] -fn test_precollate_proceeds_when_parent_notarized_flag_enabled() { - let opts = SessionOptions { - slots_per_leader_window: 4, - require_notarized_parent_for_collation: true, - ..Default::default() - }; - let mut fixture = TestFixture::new_with_opts(4, opts); - - let parent_hash = UInt256::from([0xC1; 32]); - let block_id = BlockIdExt::with_params( - ShardIdent::masterchain(), - 1, - UInt256::from([0xC2; 32]), - UInt256::from([0xC3; 32]), - ); +fn test_precollate_first_slot_in_new_window_uses_fsm_available_base() { + let opts = SessionOptions { slots_per_leader_window: 2, ..Default::default() }; + let probe = TestFixture::new_with_opts(4, opts.clone()); + let local_idx = probe.description.get_leader(SlotIndex::new(2)).value() as usize; + let mut fixture = TestFixture::new_with_local_idx(4, local_idx, opts); + let stale_hash = UInt256::from([0xC0; 32]); fixture.processor.local_chain_head = Some(LocalChainHead { window: WindowIndex::new(0), - slot: SlotIndex::new(0), + slot: SlotIndex::new(1), parent_info: crate::block::CandidateParentInfo { - slot: SlotIndex::new(0), - hash: parent_hash.clone(), + slot: SlotIndex::new(1), + hash: stale_hash.clone(), }, - block_id: block_id.clone(), + block_id: BlockIdExt::with_params( + ShardIdent::masterchain(), + 50, + UInt256::from([0xC1; 32]), + UInt256::from([0xC2; 32]), + ), + gen_utime_ms: None, }); - fixture - .processor - .generated_parent_cache - .insert(RawCandidateId { slot: SlotIndex::new(0), hash: parent_hash.clone() }, block_id); - // Manually mark slot 0 as notarized in the FSM + let fsm_parent_id = RawCandidateId { slot: SlotIndex::new(1), hash: UInt256::from([0xC3; 32]) }; + let fsm_parent_block_id = BlockIdExt::with_params( + ShardIdent::masterchain(), + 11, + UInt256::from([0xC4; 32]), + UInt256::from([0xC5; 32]), + ); + insert_received_candidate( + &mut fixture.processor, + &fsm_parent_id, + fsm_parent_block_id.clone(), + false, + 
Some(RawCandidateId { slot: SlotIndex::new(0), hash: UInt256::from([0xC6; 32]) }), + ); + fixture.processor.simplex_state.on_block_notarized_for_test( &fixture.description, SlotIndex::new(0), - parent_hash.clone(), + UInt256::from([0xC7; 32]), ); - assert!( - fixture.processor.simplex_state.has_notarized_block(SlotIndex::new(0)), - "precondition: slot 0 must be notarized" + fixture.processor.simplex_state.on_block_notarized_for_test( + &fixture.description, + SlotIndex::new(1), + fsm_parent_id.hash.clone(), ); + fixture.processor.last_consensus_finalized_seqno = Some(fsm_parent_block_id.seq_no); - // Advance consensus-finalized seqno so should_generate_empty_block returns false - // (initial_block_seqno=1, parent seq_no=1, new_seqno=2, need finalized >= 1) - fixture.processor.last_consensus_finalized_seqno = Some(1); + assert_eq!( + fixture.processor.simplex_state.get_current_leader_window_idx(), + fixture.description.get_window_idx(SlotIndex::new(2)), + "precondition: slot 2 is now the first slot of the current leader window" + ); - fixture.processor.precollate_block(SlotIndex::new(1)); + fixture.processor.precollate_block(SlotIndex::new(2)); - assert!( - fixture.processor.precollated_blocks.contains_key(&SlotIndex::new(1)), - "collation must proceed when parent is notarized and flag is enabled" + let precollated = fixture + .processor + .precollated_blocks + .get(&SlotIndex::new(2)) + .expect("first slot in the new window must use the FSM base"); + let parent = precollated.parent.as_ref().expect("new-window slot must capture parent"); + assert_eq!(parent.slot, fsm_parent_id.slot); + assert_eq!(parent.hash, fsm_parent_id.hash); + assert_ne!( + parent.hash, stale_hash, + "new-window collation must ignore the stale local_chain_head and use FSM available_base" ); } -/// When require_notarized_parent_for_collation is enabled and the local chain head -/// parent becomes notarized, check_collation retries the deferred precollation even -/// if 
handle_notarization_reached didn't trigger (e.g., local_chain_head was stale). #[test] -fn test_check_collation_retries_deferred_precollation_after_notarization() { - let opts = SessionOptions { - slots_per_leader_window: 4, - require_notarized_parent_for_collation: true, - ..Default::default() - }; - let mut fixture = TestFixture::new_with_opts(4, opts); - - let parent_hash = UInt256::from([0xD1; 32]); - let block_id = BlockIdExt::with_params( +fn test_check_collation_resets_stale_local_chain_head_on_window_change() { + let opts = SessionOptions { slots_per_leader_window: 2, ..Default::default() }; + let probe = TestFixture::new_with_opts(4, opts.clone()); + let local_idx = probe.description.get_leader(SlotIndex::new(2)).value() as usize; + let mut fixture = TestFixture::new_with_local_idx(4, local_idx, opts); + + let stale_raw_id = RawCandidateId { slot: SlotIndex::new(1), hash: UInt256::from([0xD0; 32]) }; + let stale_block_id = BlockIdExt::with_params( ShardIdent::masterchain(), - 1, + 51, + UInt256::from([0xD1; 32]), UInt256::from([0xD2; 32]), - UInt256::from([0xD3; 32]), ); - - // Simulate: slot 0 was locally generated fixture.processor.local_chain_head = Some(LocalChainHead { window: WindowIndex::new(0), - slot: SlotIndex::new(0), + slot: stale_raw_id.slot, parent_info: crate::block::CandidateParentInfo { - slot: SlotIndex::new(0), - hash: parent_hash.clone(), + slot: stale_raw_id.slot, + hash: stale_raw_id.hash.clone(), }, - block_id: block_id.clone(), + block_id: stale_block_id.clone(), + gen_utime_ms: None, }); - fixture - .processor - .generated_parent_cache - .insert(RawCandidateId { slot: SlotIndex::new(0), hash: parent_hash.clone() }, block_id); - fixture.processor.slot_set_generated(SlotIndex::new(0), true); - fixture.processor.last_consensus_finalized_seqno = Some(1); + fixture.processor.generated_parent_cache.insert(stale_raw_id.clone(), stale_block_id); - // Slot 0 NOT notarized → precollation for slot 1 should defer (no FSM base either) - 
assert!(!fixture.processor.simplex_state.has_notarized_block(SlotIndex::new(0))); - - // check_collation: slot 0 already generated → tries precollate_block(1) as retry. - // But parent not notarized AND no FSM base → still deferred. - fixture.processor.check_collation(); - assert!( - !fixture.processor.precollated_blocks.contains_key(&SlotIndex::new(1)), - "slot 1 must not be collated yet (parent not notarized, no FSM base)" + let fsm_parent_id = RawCandidateId { slot: SlotIndex::new(1), hash: UInt256::from([0xD3; 32]) }; + let fsm_parent_block_id = BlockIdExt::with_params( + ShardIdent::masterchain(), + 12, + UInt256::from([0xD4; 32]), + UInt256::from([0xD5; 32]), + ); + insert_received_candidate( + &mut fixture.processor, + &fsm_parent_id, + fsm_parent_block_id.clone(), + false, + Some(RawCandidateId { slot: SlotIndex::new(0), hash: UInt256::from([0xD6; 32]) }), ); - // Now notarize slot 0 — this propagates FSM base to slot 1. fixture.processor.simplex_state.on_block_notarized_for_test( &fixture.description, SlotIndex::new(0), - parent_hash.clone(), + UInt256::from([0xD7; 32]), ); - assert!(fixture.processor.simplex_state.has_notarized_block(SlotIndex::new(0))); + fixture.processor.simplex_state.on_block_notarized_for_test( + &fixture.description, + SlotIndex::new(1), + fsm_parent_id.hash.clone(), + ); + fixture.processor.last_consensus_finalized_seqno = Some(fsm_parent_block_id.seq_no); - // check_collation: slot 0 generated → retries precollate_block(1). - // Now the local chain head parent IS notarized → precollation proceeds. fixture.processor.check_collation(); + assert!( - fixture.processor.precollated_blocks.contains_key(&SlotIndex::new(1)), - "check_collation must retry deferred precollation after parent notarization" + fixture.processor.local_chain_head.is_none(), + "window change must invalidate the stale local chain head" ); -} - -/// The default SessionOptions has require_notarized_parent_for_collation = true. 
-#[test] -fn test_require_notarized_parent_default_is_true() { - let opts = SessionOptions::default(); + let precollated = fixture + .processor + .precollated_blocks + .get(&SlotIndex::new(2)) + .expect("after clearing stale local state, collation must fall back to the FSM base"); + let parent = precollated.parent.as_ref().expect("precollated slot must capture parent"); + assert_eq!(parent.slot, fsm_parent_id.slot); + assert_eq!(parent.hash, fsm_parent_id.hash); assert!( - opts.require_notarized_parent_for_collation, - "require_notarized_parent_for_collation must default to true" + fixture.processor.generated_parent_cache.is_empty(), + "reset_precollations must also clear the old generated_parent_cache" ); } diff --git a/src/node/simplex/src/tests/test_simplex_state.rs b/src/node/simplex/src/tests/test_simplex_state.rs index ce0203c..5dba259 100644 --- a/src/node/simplex/src/tests/test_simplex_state.rs +++ b/src/node/simplex/src/tests/test_simplex_state.rs @@ -118,16 +118,6 @@ fn create_test_desc_weights( .unwrap() } -/// Helper to create SimplexStateOptions for C++ compatible mode -fn opts_cpp() -> SimplexStateOptions { - SimplexStateOptions::cpp_compatible() -} - -/// Helper to create SimplexStateOptions with notarized-parent chain enabled (pool.cpp parity mode) -fn opts_notarized_parent_chain() -> SimplexStateOptions { - opts_cpp() -} - /// Helper to create test candidate for FSM tests /// /// Creates a minimal candidate for testing. 
Uses stub block to avoid @@ -174,15 +164,10 @@ fn create_stub_block(block_id: BlockIdExt) -> crate::block::BlockCandidate { } } -/// Helper to create SimplexStateOptions for Alpenglow mode -fn opts_alpenglow() -> SimplexStateOptions { - SimplexStateOptions::alpenglow() -} - #[test] fn test_new_creates_initial_state() { let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); assert_eq!(state.first_non_finalized_slot, SlotIndex::new(0)); assert_eq!(state.current_leader_window_idx, WindowIndex::new(0)); @@ -216,7 +201,7 @@ fn test_new_validates_slots_per_leader_window() { SystemTime::now(), None, ) { - match SimplexState::new(&desc, opts_cpp()) { + match SimplexState::new(&desc) { Ok(_) => panic!("SimplexState::new should fail with slots_per_leader_window=0"), Err(err) => { let msg = err.to_string(); @@ -233,7 +218,7 @@ fn test_new_validates_slots_per_leader_window() { #[test] fn test_on_candidate_first_slot_with_genesis_parent() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Create candidate for slot 0 with genesis parent (None) let candidate = create_test_candidate(0, UInt256::default(), BlockIdExt::default(), None, 0); @@ -252,7 +237,7 @@ fn test_on_candidate_first_slot_with_genesis_parent() { #[test] fn test_on_candidate_stores_pending_when_no_parent() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Candidate for slot 1 with parent at slot 0, but parent isn't notarized yet // so it can't be resolved → candidate stored as pending @@ -283,7 +268,7 @@ fn 
test_on_candidate_stores_pending_when_no_parent() { #[test] fn test_on_vote_notarize_updates_weights() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let vote = Vote::Notarize(NotarizeVote { @@ -308,12 +293,10 @@ fn test_on_vote_notarize_updates_weights() { } #[test] -fn test_on_vote_skip_after_notarize_allowed_without_fallback() { - // When enable_fallback_protocol=false (C++ compatible mode): - // Skip after Notarize is ALLOWED per C++ pool.cpp check_invariants() - // C++ only checks finalize+skip as misbehavior, not notarize+skip +fn test_on_vote_skip_after_notarize_allowed() { + // Skip after Notarize is allowed per C++ pool.cpp check_invariants(). let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let notarize = Vote::Notarize(NotarizeVote { @@ -329,10 +312,7 @@ fn test_on_vote_skip_after_notarize_allowed_without_fallback() { // Skip should be ALLOWED (matches C++ behavior) let result = state.on_vote_test(&desc, ValidatorIndex::new(0), skip, Vec::new()); - assert!( - result.is_ok(), - "Skip after notarize should succeed with enable_fallback_protocol=false (C++ mode)" - ); + assert!(result.is_ok(), "Skip after notarize should succeed"); // Should have both notarize and skip let sv = state.slot_votes.get(&SlotIndex::new(0)).unwrap(); @@ -340,44 +320,10 @@ fn test_on_vote_skip_after_notarize_allowed_without_fallback() { assert!(sv.votes[0].skip.is_some()); } -#[test] -fn test_on_vote_skip_after_notarize_misbehavior_with_fallback() { - // When enable_fallback_protocol=true (full Alpenglow): - // Skip after Notarize is MISBEHAVIOR - // In Alpenglow, once you vote 
notarize (fast path), you shouldn't also vote skip - let desc = create_test_desc(4, 1); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let notarize = Vote::Notarize(NotarizeVote { - slot: SlotIndex::new(0), - block_hash: block.root_hash.clone(), - }); - let skip = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); - - // Notarize first - state - .on_vote_test(&desc, ValidatorIndex::new(0), notarize, Vec::new()) - .expect("on_vote should succeed"); - - // Skip should be REJECTED (misbehavior in full Alpenglow mode) - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), skip, Vec::new()); - assert!( - result.is_err(), - "Skip after notarize should fail with enable_fallback_protocol=true (Alpenglow mode)" - ); - - // Should only have notarize - let sv = state.slot_votes.get(&SlotIndex::new(0)).unwrap(); - assert!(sv.votes[0].notarize.is_some()); - assert!(sv.votes[0].skip.is_none()); -} - #[test] fn test_debug_dump_format() { let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Test full dump format let full_dump = state.debug_dump(&desc, true); @@ -402,12 +348,10 @@ fn test_debug_dump_format() { */ #[test] -fn test_notarize_after_skip_allowed_without_fallback() { - // When enable_fallback_protocol=false (C++ compatible mode): - // Notarize after Skip is ALLOWED per C++ pool.cpp check_invariants() - // C++ only checks finalize+skip as misbehavior, not notarize+skip +fn test_notarize_after_skip_allowed() { + // Notarize after Skip is allowed per C++ pool.cpp check_invariants(). 
let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let skip = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -423,10 +367,7 @@ fn test_notarize_after_skip_allowed_without_fallback() { // Notarize should be ALLOWED (matches C++ behavior) let result = state.on_vote_test(&desc, ValidatorIndex::new(0), notarize, Vec::new()); - assert!( - result.is_ok(), - "Notarize after skip should succeed with enable_fallback_protocol=false (C++ mode)" - ); + assert!(result.is_ok(), "Notarize after skip should succeed"); // Should have both skip and notarize let sv = state.slot_votes.get(&SlotIndex::new(0)).unwrap(); @@ -434,44 +375,10 @@ fn test_notarize_after_skip_allowed_without_fallback() { assert!(sv.votes[0].notarize.is_some()); } -#[test] -fn test_notarize_after_skip_misbehavior_with_fallback() { - // When enable_fallback_protocol=true (full Alpenglow): - // Notarize after Skip is MISBEHAVIOR - // In Alpenglow, once you vote skip, you shouldn't also vote notarize - let desc = create_test_desc(4, 1); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let skip = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); - let notarize = Vote::Notarize(NotarizeVote { - slot: SlotIndex::new(0), - block_hash: block.root_hash.clone(), - }); - - // Skip first - state - .on_vote_test(&desc, ValidatorIndex::new(0), skip, Vec::new()) - .expect("skip should succeed"); - - // Notarize should be REJECTED (misbehavior in full Alpenglow mode) - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), notarize, Vec::new()); - assert!( - result.is_err(), - "Notarize after skip should fail with enable_fallback_protocol=true (Alpenglow mode)" - ); - - // Should only have skip - let sv = 
state.slot_votes.get(&SlotIndex::new(0)).unwrap(); - assert!(sv.votes[0].skip.is_some()); - assert!(sv.votes[0].notarize.is_none()); -} - #[test] fn test_misbehavior_conflicting_notarize_votes() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block1 = BlockIdExt::default(); let mut block2 = BlockIdExt::default(); @@ -502,7 +409,7 @@ fn test_misbehavior_conflicting_notarize_votes() { #[test] fn test_misbehavior_finalize_after_skip() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let skip = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -519,80 +426,12 @@ fn test_misbehavior_finalize_after_skip() { assert!(result.is_err(), "Finalize after skip should return error"); } -#[test] -fn test_misbehavior_finalize_after_notar_fallback() { - let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let notar_fb = Vote::NotarizeFallback(NotarizeFallbackVote { - slot: SlotIndex::new(0), - block_hash: block.root_hash.clone(), - }); - let finalize = - Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block.root_hash }); - - // Notar-fallback first - state - .on_vote_test(&desc, ValidatorIndex::new(0), notar_fb, Vec::new()) - .expect("notar-fallback should succeed"); - - // Finalize should be rejected - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), finalize, Vec::new()); - assert!(result.is_err(), "Finalize after notar-fallback should return error"); -} - -#[test] -fn test_misbehavior_finalize_after_skip_fallback() { - let desc = create_test_desc(4, 1); - let 
mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let skip_fb = Vote::SkipFallback(SkipFallbackVote { slot: SlotIndex::new(0) }); - let finalize = - Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block.root_hash }); - - // Skip-fallback first - state - .on_vote_test(&desc, ValidatorIndex::new(0), skip_fb, Vec::new()) - .expect("skip-fallback should succeed"); - - // Finalize should be rejected - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), finalize, Vec::new()); - assert!(result.is_err(), "Finalize after skip-fallback should return error"); -} - -#[test] -fn test_misbehavior_too_many_notar_fallback() { - let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); - - // Send MAX_NOTAR_FALLBACK_VOTES_PER_VALIDATOR (3) notar-fallback votes - for i in 0..3u8 { - let hash = UInt256::from([i; 32]); - let vote = Vote::NotarizeFallback(NotarizeFallbackVote { - slot: SlotIndex::new(0), - block_hash: hash, - }); - state - .on_vote_test(&desc, ValidatorIndex::new(0), vote, Vec::new()) - .expect("notar-fallback should succeed"); - } - - // 4th vote should be rejected - let hash = UInt256::from([0xFFu8; 32]); - let vote = - Vote::NotarizeFallback(NotarizeFallbackVote { slot: SlotIndex::new(0), block_hash: hash }); - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), vote, Vec::new()); - assert!(result.is_err(), "Too many notar-fallback should return error"); -} - #[test] fn test_misbehavior_invalid_skip_range() { // SkipVote now uses a single slot field, not slot_begin/slot_end // This test verifies that skip votes work correctly with the new structure let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Valid skip 
vote for slot 0 let skip = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -600,23 +439,10 @@ fn test_misbehavior_invalid_skip_range() { assert!(result.is_ok(), "Valid skip vote should succeed"); } -#[test] -fn test_misbehavior_invalid_skip_fallback_range() { - // SkipFallbackVote now uses a single slot field, not slot_begin/slot_end - // This test verifies that skip fallback votes work correctly with the new structure - let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); - - // Valid skip fallback vote for slot 0 - let skip_fb = Vote::SkipFallback(SkipFallbackVote { slot: SlotIndex::new(0) }); - let result = state.on_vote_test(&desc, ValidatorIndex::new(0), skip_fb, Vec::new()); - assert!(result.is_ok(), "Valid skip fallback vote should succeed"); -} - #[test] fn test_misbehavior_notarize_finalize_hash_mismatch() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let hash_a = UInt256::from([0xAAu8; 32]); let hash_b = UInt256::from([0xBBu8; 32]); @@ -640,7 +466,7 @@ fn test_misbehavior_notarize_finalize_hash_mismatch() { #[test] fn test_misbehavior_finalize_notarize_hash_mismatch() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let hash_a = UInt256::from([0xAAu8; 32]); let hash_b = UInt256::from([0xBBu8; 32]); @@ -671,7 +497,7 @@ fn test_misbehavior_finalize_notarize_hash_mismatch() { #[test] fn test_notarize_threshold_66_triggers_block_notarized() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create 
SimplexState"); let block = BlockIdExt::default(); let vote = Vote::Notarize(NotarizeVote { @@ -693,119 +519,10 @@ fn test_notarize_threshold_66_triggers_block_notarized() { assert!(state.slot_votes.get(&SlotIndex::new(0)).unwrap().block_notarized_published); } -#[test] -fn test_safe_to_notar_requires_both_thresholds() { - // SafeToNotar requires ALL conditions (Alpenglow White Paper): - // 1. notar(b) >= 1/3 (threshold_33) - // 2. skip(s) + notar(b) >= 2/3 (threshold_66) - // 3. notar(b) < 2/3 (threshold_66) - fallback only when normal path fails - // - // SafeToNotar is a FALLBACK mechanism - it only triggers when notar alone - // isn't enough for BlockNotarized. If notar >= 2/3, BlockNotarized handles - // it via the normal path. - // - // NOTE: enable_fallback_protocol=true to test fallback threshold logic - let desc = create_test_desc(4, 1); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let vote = Vote::Notarize(NotarizeVote { - slot: SlotIndex::new(0), - block_hash: block.root_hash.clone(), - }); - - // With 4 validators: threshold_33 = 2, threshold_66 = 3 - - // 1 vote: threshold_33 not met - state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), Vec::new()).unwrap(); - assert!(!state - .slot_votes - .get(&SlotIndex::new(0)) - .unwrap() - .safe_to_notar_blocks - .contains(&block.root_hash)); - - // 2 votes: threshold_33 met, but skip + notar = 0 + 2 < 3 (threshold_66) - state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); - assert!( - !state - .slot_votes - .get(&SlotIndex::new(0)) - .unwrap() - .safe_to_notar_blocks - .contains(&block.root_hash), - "SafeToNotar should NOT trigger at threshold_33 alone" - ); - - // 3 votes: notar = 3 >= threshold_66 - // BlockNotarized triggers via normal path, so SafeToNotar should NOT trigger - // (fallback is not needed when normal path succeeds) - state.on_vote_test(&desc, 
ValidatorIndex::new(2), vote, Vec::new()).unwrap(); - assert!( - !state - .slot_votes - .get(&SlotIndex::new(0)) - .unwrap() - .safe_to_notar_blocks - .contains(&block.root_hash), - "SafeToNotar should NOT trigger when notar >= 2/3 (BlockNotarized handles it)" - ); - - // Verify that BlockNotarized WAS triggered - assert!( - state.slot_votes.get(&SlotIndex::new(0)).unwrap().block_notarized_published, - "BlockNotarized should have triggered" - ); -} - -#[test] -fn test_safe_to_notar_with_skip_votes() { - // Test that skip votes contribute to the skip + notar >= 2/3 condition - // NOTE: enable_fallback_protocol=true to test SafeToNotar logic - let desc = create_test_desc(4, 1); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); - - let block = BlockIdExt::default(); - let notar_vote = Vote::Notarize(NotarizeVote { - slot: SlotIndex::new(0), - block_hash: block.root_hash.clone(), - }); - let skip_vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); - - // With 4 validators: threshold_33 = 2, threshold_66 = 3 - - // 1 skip vote + 1 notar vote: notar = 1 < threshold_33 - state.on_vote_test(&desc, ValidatorIndex::new(0), skip_vote.clone(), Vec::new()).unwrap(); - state.on_vote_test(&desc, ValidatorIndex::new(1), notar_vote.clone(), Vec::new()).unwrap(); - assert!( - !state - .slot_votes - .get(&SlotIndex::new(0)) - .unwrap() - .safe_to_notar_blocks - .contains(&block.root_hash), - "SafeToNotar should NOT trigger: notar < threshold_33" - ); - - // Add another notar vote: notar = 2 >= threshold_33, skip + notar = 1 + 2 = 3 >= threshold_66 - state.on_vote_test(&desc, ValidatorIndex::new(2), notar_vote, Vec::new()).unwrap(); - assert!( - state - .slot_votes - .get(&SlotIndex::new(0)) - .unwrap() - .safe_to_notar_blocks - .contains(&block.root_hash), - "SafeToNotar should trigger: notar >= 1/3 AND skip + notar >= 2/3" - ); -} - #[test] fn test_finalize_threshold_66_triggers_block_finalized() { let desc = 
create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); @@ -831,47 +548,29 @@ fn test_finalize_threshold_66_triggers_block_finalized() { } #[test] -fn test_safe_to_skip_broadcasts_skip_fallback_but_no_slot_skipped() { - // SafeToSkip triggers at 1/3 threshold and calls try_skip_window - // which broadcasts Skip votes for unvoted slots. - // SlotSkipped is only emitted when skip certificate (2/3) is reached. - // NOTE: enable_fallback_protocol=true to test SafeToSkip logic +fn test_skip_below_certificate_does_not_emit_slot_skipped() { + // SlotSkipped is emitted only when skip certificate (2/3) is reached. + // This test verifies behavior below that threshold. let desc = create_test_desc(4, 1); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); - // threshold_33 for 4 validators = 2 - // SafeToSkip condition: notarize_or_skip_weight >= threshold_33 + max_notarize - // After 1 vote: notarize_or_skip_weight=1, threshold=2, max_notar=0, so 1 >= 2 is false + // threshold_66 for 4 validators = 3. state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), Vec::new()).unwrap(); - // After 1 vote, SafeToSkip should NOT have triggered + // After 1 vote, slot cannot be skipped yet. 
let events: Vec<_> = from_fn(|| state.pull_event()).collect(); assert!( !events.iter().any(|e| matches!(e, SimplexEvent::SlotSkipped(_))), "Should not trigger SlotSkipped after 1 skip vote" ); - // 2nd vote: notarize_or_skip_weight=2, 2 >= 2 is true → SafeToSkip triggers - // SafeToSkip calls try_skip_window which broadcasts Skip for unvoted slots - // But SlotSkipped is NOT emitted yet - only at skip certificate (2/3) + // 2nd vote is still below skip certificate threshold. state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); let events: Vec<_> = from_fn(|| state.pull_event()).collect(); - // Check that we have a SkipFallback vote broadcast (from SafeToSkip) - // or Skip vote (from try_skip_window for unvoted slots) - assert!( - events.iter().any(|e| matches!( - e, - SimplexEvent::BroadcastVote(Vote::Skip(_)) - | SimplexEvent::BroadcastVote(Vote::SkipFallback(_)) - )), - "Expected Skip or SkipFallback vote broadcast" - ); - // SlotSkipped should NOT be emitted yet (need 3 votes for threshold_66) assert!( !events.iter().any(|e| matches!(e, SimplexEvent::SlotSkipped(_))), @@ -883,17 +582,17 @@ fn test_safe_to_skip_broadcasts_skip_fallback_but_no_slot_skipped() { fn test_skip_certificate_threshold_66_triggers_slot_skipped() { // Skip certificate (>=2/3) triggers SlotSkipped event in check_thresholds_and_trigger let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); // Need 3 out of 4 for threshold_66 state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), Vec::new()).unwrap(); // Clear events from first vote - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); - // After 2 votes: SafeToSkip triggered, but NOT 
SlotSkipped (need 3 for skip certificate) + // After 2 votes: still below skip certificate threshold (need 3 votes) let events: Vec<_> = from_fn(|| state.pull_event()).collect(); assert!( !events.iter().any(|e| matches!(e, SimplexEvent::SlotSkipped(_))), @@ -930,16 +629,16 @@ fn test_skip_certificate_reached_event_emitted_in_cpp_mode() { // - SlotSkipped (internal progress) // - SkipCertificateReached (for broadcasting the skip certificate) let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let vote = Vote::Skip(SkipVote { slot }); // Need 3 out of 4 for threshold_66 state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![0]).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![1]).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // 3rd vote triggers skip certificate state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![2]).unwrap(); @@ -975,40 +674,10 @@ fn test_skip_certificate_reached_event_emitted_in_cpp_mode() { assert_eq!(signer_idxs, vec![0, 1, 2], "certificate should include signers 0,1,2"); } -#[test] -fn test_skip_certificate_reached_event_not_emitted_in_fallback_mode() { - // In full Alpenglow (fallback) mode, SlotSkipped is still emitted at threshold_66, - // but SkipCertificateReached should NOT be emitted (explicit skip cert broadcast is C++-mode-only). 
- let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, SimplexStateOptions::alpenglow()) - .expect("Failed to create SimplexState"); - - let slot = SlotIndex::new(0); - let vote = Vote::Skip(SkipVote { slot }); - - // Need 3 out of 4 for threshold_66 - state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![0]).unwrap(); - while state.pull_event().is_some() {} - state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![1]).unwrap(); - while state.pull_event().is_some() {} - - state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![2]).unwrap(); - let events: Vec<_> = from_fn(|| state.pull_event()).collect(); - - assert!( - events.iter().any(|e| matches!(e, SimplexEvent::SlotSkipped(_))), - "Expected SlotSkipped at skip threshold" - ); - assert!( - !events.iter().any(|e| matches!(e, SimplexEvent::SkipCertificateReached(_))), - "SkipCertificateReached must not be emitted in fallback mode" - ); -} - #[test] fn test_slot_skipped_not_emitted_if_finalized() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); @@ -1055,20 +724,20 @@ fn test_slot_skipped_not_emitted_if_finalized() { #[test] fn test_slot_skipped_not_emitted_twice() { - // SlotSkipped only emits at skip certificate (2/3), not at SafeToSkip + // SlotSkipped only emits at skip certificate (2/3) // With 5 validators, threshold_66 = (5*2+2)/3 = 4 let desc = create_test_desc(5, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); // threshold_66 for 5 validators = 4 // Send 3 votes - not enough for skip certificate state.on_vote_test(&desc, ValidatorIndex::new(0), 
vote.clone(), Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(2), vote.clone(), Vec::new()).unwrap(); let events: Vec<_> = from_fn(|| state.pull_event()).collect(); @@ -1110,7 +779,7 @@ fn test_slot_skipped_not_emitted_twice() { #[test] fn test_ignore_finalized_slot_candidate() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Finalize slot 0 via normal path (3 finalize votes) let block = BlockIdExt::default(); @@ -1124,7 +793,7 @@ fn test_ignore_finalized_slot_candidate() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote, Vec::new()).unwrap(); // Clear events and verify finalization - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.first_non_finalized_slot, SlotIndex::new(1)); // Try to send candidate for finalized slot @@ -1140,7 +809,7 @@ fn test_ignore_finalized_slot_candidate() { #[test] fn test_ignore_finalized_slot_vote() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Finalize slot 0 via normal path (3 finalize votes) let block = BlockIdExt::default(); @@ -1154,7 +823,7 @@ fn test_ignore_finalized_slot_vote() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote, Vec::new()).unwrap(); // Clear events and verify finalization - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.first_non_finalized_slot, SlotIndex::new(1)); // Try to send vote for finalized slot - should be rejected (benign, slot already 
finalized) @@ -1174,10 +843,10 @@ fn test_ignore_finalized_slot_vote() { #[test] fn test_candidate_without_parent_accepted() { - // C++ consensus.cpp:173 — C++ never rejects a candidate for missing parent. + // C++ `consensus.cpp` — C++ never rejects a candidate for missing parent. // It only validates parent_slot < candidate_slot when parent exists. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let candidate = Candidate::new( crate::block::CandidateId { @@ -1197,9 +866,9 @@ fn test_candidate_without_parent_accepted() { #[test] fn test_candidate_with_parent_slot_ge_rejected() { - // C++ consensus.cpp:173: parent_slot >= candidate_slot → misbehavior + // C++ `consensus.cpp`: parent_slot >= candidate_slot -> misbehavior let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let candidate = Candidate::new( crate::block::CandidateId { @@ -1223,9 +892,9 @@ fn test_candidate_with_parent_slot_ge_rejected() { #[test] fn test_candidate_with_valid_parent_accepted() { - // C++ consensus.cpp:173: parent_slot < candidate_slot → accepted + // C++ `consensus.cpp`: parent_slot < candidate_slot -> accepted let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let candidate = Candidate::new( crate::block::CandidateId { @@ -1250,7 +919,7 @@ fn test_candidate_with_valid_parent_accepted() { #[test] fn test_window_cleanup_after_finalization() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = 
SimplexState::new(&desc).expect("Failed to create SimplexState"); // Ensure windows 0 and 1 exist state.ensure_window_exists(WindowIndex::new(0)); @@ -1280,7 +949,7 @@ fn test_window_cleanup_after_finalization() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote1, Vec::new()).unwrap(); // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.first_non_finalized_slot, SlotIndex::new(2)); @@ -1296,7 +965,7 @@ fn test_window_cleanup_after_finalization() { #[test] fn test_duplicate_vote_same_block_ok() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let vote = Vote::Notarize(NotarizeVote { @@ -1318,7 +987,7 @@ fn test_duplicate_vote_same_block_ok() { #[test] fn test_invalid_validator_idx() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block = BlockIdExt::default(); let vote = @@ -1332,7 +1001,7 @@ fn test_invalid_validator_idx() { #[test] fn test_invalid_leader_in_candidate() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Create candidate with invalid leader (construct directly to test FSM validation) let candidate = Candidate::new( @@ -1360,7 +1029,7 @@ fn test_invalid_leader_in_candidate() { #[test] fn test_timeout_management() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // FSM is created with unarmed 
timeouts (skip_timestamp = None). // SessionProcessor::start() is responsible for calling reset_timeouts_on_start(). @@ -1378,7 +1047,7 @@ fn test_unarmed_fsm_no_skip_cascade_after_delay() { // Regression: the FSM must NOT fire skip votes when check_all() runs // with unarmed timeouts, even after an arbitrary delay. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Simulate 60 s overlay warmup delay without arming timeouts let future = desc.get_time() + Duration::from_secs(60); @@ -1386,12 +1055,10 @@ fn test_unarmed_fsm_no_skip_cascade_after_delay() { state.check_all(&desc); - let mut skip_count = 0; - while let Some(event) = state.pull_event() { - if matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_))) { - skip_count += 1; - } - } + let skip_count = drain_events(&mut state) + .into_iter() + .filter(|event| matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_)))) + .count(); assert_eq!(skip_count, 0, "unarmed FSM must produce NO skip votes regardless of clock delay"); } @@ -1399,7 +1066,7 @@ fn test_unarmed_fsm_no_skip_cascade_after_delay() { fn test_armed_timeouts_enable_skip_after_expiry() { // After reset_timeouts_on_start() the skip timer fires once the deadline elapses. 
let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let t0 = desc.get_time(); @@ -1409,24 +1076,20 @@ fn test_armed_timeouts_enable_skip_after_expiry() { // Immediately after arming, check_all should produce no skips state.check_all(&desc); - let mut skip_count = 0; - while let Some(event) = state.pull_event() { - if matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_))) { - skip_count += 1; - } - } + let skip_count = drain_events(&mut state) + .into_iter() + .filter(|event| matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_)))) + .count(); assert_eq!(skip_count, 0, "no skip votes before timeout expires"); // Advance past first_block_timeout + target_rate (defaults: 3s + 1s = 4s) desc.set_time(t0 + Duration::from_secs(5)); state.check_all(&desc); - let mut skip_count = 0; - while let Some(event) = state.pull_event() { - if matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_))) { - skip_count += 1; - } - } + let skip_count = drain_events(&mut state) + .into_iter() + .filter(|event| matches!(event, SimplexEvent::BroadcastVote(Vote::Skip(_)))) + .count(); assert!(skip_count > 0, "skip votes must fire after timeout expires"); } @@ -1436,7 +1099,7 @@ fn test_try_skip_window_preserves_pending_block_cpp_mode() { // pending_block. The async try_notarize() coroutine can still complete // after a skip vote, producing both Skip and Notar for the same slot. 
let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Store a candidate as pending at slot 0 let hash = UInt256::from([1u8; 32]); @@ -1444,7 +1107,7 @@ fn test_try_skip_window_preserves_pending_block_cpp_mode() { let candidate = create_test_candidate(0, hash.clone(), block_id, None, 0); let _ = state.on_candidate(&desc, candidate); // Drain events from on_candidate - while state.pull_event().is_some() {} + drain_events(&mut state); // Verify pending_block is set let pending_before = state @@ -1472,28 +1135,6 @@ fn test_try_skip_window_preserves_pending_block_cpp_mode() { ); } -#[test] -fn test_try_skip_window_clears_pending_block_alpenglow_mode() { - // Alpenglow mode: pendingBlocks[k] <- bottom after skip - let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create SimplexState"); - - let hash = UInt256::from([1u8; 32]); - let block_id = BlockIdExt::default(); - let candidate = create_test_candidate(0, hash.clone(), block_id, None, 0); - let _ = state.on_candidate(&desc, candidate); - while state.pull_event().is_some() {} - - state.try_skip_window_for_test(WindowIndex::new(0)); - - let pending_after = state - .get_window(WindowIndex::new(0)) - .and_then(|w| w.slots[0].pending_block.as_ref()) - .is_some(); - assert!(!pending_after, "Alpenglow mode: pending_block must be cleared after skip"); -} - /* ======================================================================== Available Parent Tests @@ -1504,7 +1145,7 @@ fn test_try_skip_window_clears_pending_block_alpenglow_mode() { fn test_has_available_parent_first_slot_with_genesis() { // First slot should have parent available when per-slot available_base is genesis (Some(None)). 
let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Slot 0 starts with genesis base (C++ SlotState::available_base = RawParentId{}). assert!( @@ -1517,7 +1158,7 @@ fn test_has_available_parent_first_slot_with_genesis() { fn test_has_available_parent_first_slot_no_bases() { // If a slot's available_base is unknown, it should not have a parent. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Create window 1 (slot 2 is first in window 1). Base is unknown by default. state.ensure_window_exists(WindowIndex::new(1)); @@ -1532,7 +1173,7 @@ fn test_has_available_parent_first_slot_no_bases() { fn test_has_available_parent_non_first_slot_no_prev_voted() { // Non-first slot without propagated available_base should not have parent. let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Slot 1 is second in window 0, but slot 0 hasn't progressed yet, so base is unknown. assert!( @@ -1547,7 +1188,7 @@ fn test_has_available_parent_non_first_slot_with_prev_notarized() { // In C++ mode, parent must be notarized (reach threshold), not just voted. 
// Reference: C++ pool.cpp checks parent_slot->state->notarized.has_value() let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Submit a candidate for slot 0 to trigger voting let candidate_hash = UInt256::default(); @@ -1582,7 +1223,7 @@ fn test_has_available_parent_non_first_slot_with_prev_notarized() { } // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // Now slot 0 is notarized, slot 1 should have parent available assert!( @@ -1599,7 +1240,7 @@ fn test_has_available_parent_non_first_slot_with_prev_notarized() { fn test_get_available_parent_first_slot_returns_genesis() { // get_available_parent for first slot should return genesis (None) when base is genesis. let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Slot 0 has genesis base (available_base = Some(None)). 
let parent = state.get_available_parent(&desc, SlotIndex::new(0)); @@ -1616,7 +1257,7 @@ fn test_get_available_parent_window_start_after_skipped_last_slot_uses_previous_ // - slot 0 notarized (base for slot 1 becomes slot 0 id) // - slot 1 skipped (base propagates to slot 2, the first slot of window 1) let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Notarize slot 0 (reach 2/3 threshold) let parent_hash = UInt256::from_slice(&[0xAA; 32]); @@ -1629,7 +1270,7 @@ fn test_get_available_parent_window_start_after_skipped_last_slot_uses_previous_ } // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // Skip slot 1 (reach 2/3 threshold), propagating base forward to slot 2. let skip_vote = Vote::Skip(SkipVote { slot: SlotIndex::new(1) }); @@ -1638,7 +1279,7 @@ fn test_get_available_parent_window_start_after_skipped_last_slot_uses_previous_ } // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // Slot 2 is first slot of window 1. Its base should be inherited from slot 1, // which was set to slot 0 on notarization. @@ -1655,7 +1296,7 @@ fn test_get_available_parent_non_first_slot_returns_notarized_block() { // In C++ mode, parent must be notarized (reach threshold), not just voted. 
// Reference: C++ pool.cpp checks parent_slot->state->notarized.has_value() let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Submit a candidate for slot 0 to trigger voting // Note: voted_notar uses the candidate hash (id.hash), not the block's root_hash @@ -1688,7 +1329,7 @@ fn test_get_available_parent_non_first_slot_returns_notarized_block() { } // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // Now slot 0 is notarized, get_available_parent should return the parent let parent = state.get_available_parent(&desc, SlotIndex::new(1)); @@ -1703,7 +1344,7 @@ fn test_get_available_parent_non_first_slot_returns_notarized_block() { fn test_get_available_parent_non_first_slot_returns_none_when_not_voted() { // Non-first slot without voted_notar should return None let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Slot 1 is second in window 0, but slot 0 hasn't voted yet let parent = state.get_available_parent(&desc, SlotIndex::new(1)); @@ -1714,7 +1355,7 @@ fn test_get_available_parent_non_first_slot_returns_none_when_not_voted() { fn test_get_available_parent_nonexistent_window() { // get_available_parent for a slot in non-existent window should return None let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Window 5 doesn't exist let parent = state.get_available_parent(&desc, SlotIndex::new(10)); @@ -1741,7 +1382,7 @@ fn test_late_candidate_with_notarize_votes_also_proceeds() { // This tests that the full voting pipeline works correctly let desc = 
create_test_desc(4, 2); // 4 validators, 2 slots per window - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let candidate_hash = UInt256::from_slice(&[0xCC; 32]); @@ -1843,7 +1484,7 @@ fn test_notarization_certificate_created_at_threshold() { // When notarization threshold (2/3) is reached, a notarization certificate // should be created and cached in slot_votes.notarize_certificates let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xAA; 32]); let vote = @@ -1881,14 +1522,14 @@ fn test_notarization_certificate_created_at_threshold() { fn test_notarization_reached_event_emitted() { // When notarization threshold is reached, NotarizationReached event should be emitted let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xBB; 32]); let vote = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); // Submit votes with dummy signatures state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); @@ -1917,14 +1558,14 @@ fn test_notarization_reached_event_emitted() { fn test_finalization_certificate_in_block_finalized_event() { // BlockFinalizedEvent should contain a finalization certificate with signatures let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed 
to create SimplexState"); let block_hash = UInt256::from_slice(&[0xCC; 32]); let vote = Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); // Submit votes with dummy signatures state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![10, 11]).unwrap(); @@ -1967,14 +1608,14 @@ fn test_finalization_reached_event_emitted() { // When finalization threshold is reached, FinalizationReached event should be emitted // (in addition to BlockFinalized) let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xEE; 32]); let vote = Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); // Submit votes with dummy signatures state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![10, 11]).unwrap(); @@ -2016,14 +1657,14 @@ fn test_finalization_reached_event_emitted() { fn test_finalization_reached_event_emitted_only_once() { // FinalizationReached should only be emitted once per block, even if more votes arrive let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xFF; 32]); let vote = Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); // Submit 3 votes (threshold reached) state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); @@ -2050,14 
+1691,14 @@ fn test_finalization_reached_event_emitted_only_once() { fn test_finalization_certificate_has_sufficient_weight() { // The finalization certificate should have weight >= threshold_66 let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xDD; 32]); let vote = Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); // Submit votes state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); @@ -2093,7 +1734,7 @@ fn test_finalization_certificate_has_sufficient_weight() { fn test_certificate_signatures_match_voters() { // Each signature in the certificate should correspond to a validator who voted let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xEE; 32]); let vote = @@ -2104,7 +1745,7 @@ fn test_certificate_signatures_match_voters() { let sig1 = vec![0xB0, 0xB1, 0xB2]; let sig2 = vec![0xC0, 0xC1, 0xC2]; - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), sig0.clone()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), sig1.clone()).unwrap(); @@ -2139,7 +1780,7 @@ fn test_notarization_certificate_not_duplicated() { // Multiple votes for the same block from same validator should not create // duplicate signatures in the certificate let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = 
SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0xFF; 32]); let vote = @@ -2186,7 +1827,7 @@ fn test_multiple_blocks_votes_tracked_separately() { // With 7 validators (weight 1 each): // threshold_66 = (7 * 2) / 3 + 1 = 5 (strict > 2/3, matches C++) let desc = create_test_desc(7, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash_a = UInt256::from_slice(&[0xAA; 32]); let block_hash_b = UInt256::from_slice(&[0xBB; 32]); @@ -2208,7 +1849,7 @@ fn test_multiple_blocks_votes_tracked_separately() { state.on_vote_test(&desc, ValidatorIndex::new(6), vote_b.clone(), vec![0x55]).unwrap(); // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // Check that block A has a certificate (reached threshold first) let sv = state.slot_votes.get(&SlotIndex::new(0)).unwrap(); @@ -2233,13 +1874,13 @@ fn test_multiple_blocks_votes_tracked_separately() { fn test_certificate_stores_vote_type() { // The certificate should store the correct vote type (Notarize vs Finalize) let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0x11; 32]); let vote = Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); @@ -2265,13 +1906,13 @@ fn test_certificate_stores_vote_type() { fn test_notarization_certificate_vote_type() { // Notarization certificate should contain NotarizeVote let desc = create_test_desc(4, 1); 
- let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0x22; 32]); let vote = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); - while state.pull_event().is_some() {} + drain_events(&mut state); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); @@ -2297,7 +1938,7 @@ fn test_notarization_certificate_vote_type() { fn test_certificate_get_notarize_certificate() { // Test the get_notarize_certificate public API let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::from_slice(&[0x33; 32]); let vote = @@ -2338,7 +1979,7 @@ fn test_skip_certificate_created_at_threshold() { // When skip threshold (2/3) is reached, internal skip_weight should be tracked // Note: Skip certificates are implicit in the FSM (via SlotSkipped event) let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -2346,9 +1987,9 @@ fn test_skip_certificate_created_at_threshold() { state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); - // After 2 votes, skip_or_skip_fallback_weight = 2, which is < 3 (threshold_66) + // After 2 votes, skip_weight = 2, which is < 3 (threshold_66) // Clear events - while state.pull_event().is_some() {} + drain_events(&mut state); // 3rd vote 
triggers skip certificate threshold state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![3]).unwrap(); @@ -2388,7 +2029,7 @@ fn test_skip_certificate_created_at_threshold() { fn test_set_notarize_certificate_updates_vote_accounting() { // When setting a certificate from external source, vote accounting should be updated let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let block_hash = UInt256::from_slice(&[0xAA; 32]); @@ -2422,7 +2063,7 @@ fn test_set_notarize_certificate_updates_vote_accounting() { fn test_set_notarize_certificate_idempotent() { // Calling set_notarize_certificate multiple times should not increase vote weight let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let block_hash = UInt256::from_slice(&[0xBB; 32]); @@ -2442,7 +2083,7 @@ fn test_set_notarize_certificate_idempotent() { .expect("should not conflict"); let weight_after_first = state.get_notarize_weight(slot, &block_hash); // Drain first-store events so we can assert duplicate store emits none. 
- while state.pull_event().is_some() {} + drain_events(&mut state); let stored2 = state .set_notarize_certificate(&desc, slot, &block_hash, cert.clone()) @@ -2468,7 +2109,7 @@ fn test_set_notarize_certificate_idempotent() { fn test_set_notarize_certificate_does_not_overwrite_existing() { // If there's already a certificate (from local votes), set should not overwrite let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let block_hash = UInt256::from_slice(&[0xCC; 32]); @@ -2514,7 +2155,7 @@ fn test_set_notarize_certificate_does_not_overwrite_existing() { fn test_set_notarize_certificate_sets_notarized_flag() { // Setting a certificate should set the block_notarized_published flag let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let block_hash = UInt256::from_slice(&[0xDD; 32]); @@ -2556,10 +2197,10 @@ fn test_set_notarize_certificate_emits_notarization_reached_for_tracked_slot() { // External notar cert ingestion must emit NotarizationReached so SessionProcessor can // persist + cache + relay the cert (same observable behavior as threshold-driven path). 
let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); let slot = SlotIndex::new(0); let block_hash = UInt256::from([0xAB; 32]); @@ -2592,13 +2233,13 @@ fn test_set_notarize_certificate_does_not_emit_notarization_reached_for_old_slot // For slots already finalized (slot < first_non_finalized_slot), SimplexState stores the cert // for restart/recommit support but must not emit NotarizationReached. let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Mark slot 0 as already finalized/old state.set_first_non_finalized_slot_for_test(SlotIndex::new(1)); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); let slot0 = SlotIndex::new(0); let block_hash = UInt256::from([0xCD; 32]); @@ -2624,7 +2265,7 @@ fn test_set_notarize_certificate_propagates_available_base_to_next_slot() { // update per-slot `available_base` and advance the progress cursor, matching // C++ pool.cpp behavior. 
let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot0 = SlotIndex::new(0); let slot1 = SlotIndex::new(1); @@ -2665,7 +2306,7 @@ fn test_set_notarize_certificate_propagates_available_base_to_next_slot() { #[test] fn test_set_notarize_certificate_conflict_different_block() { let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let slot = SlotIndex::new(0); let hash_a = UInt256::from_slice(&[0xA1; 32]); @@ -2691,37 +2332,15 @@ fn test_set_notarize_certificate_conflict_different_block() { /* ======================================================================== - Parent Validation Mode Tests + Parent Validation Tests ======================================================================== - - These tests verify the require_finalized_parent option behavior. - - - require_finalized_parent=false (C++ mode, default): - Parent can be notarized OR finalized to build child block. - This prevents deadlock when some validators vote skip while others vote finalize. - - - require_finalized_parent=true (strict mode): - Parent must be finalized before child block can be generated. - WARNING: Can cause deadlock in certain scenarios. 
*/ -/// Helper to create options with C++ mode (notarized parent OK) -fn opts_cpp_parent_mode() -> SimplexStateOptions { - SimplexStateOptions { require_finalized_parent: false, ..opts_cpp() } -} - -/// Helper to create options with strict mode (finalized parent required) -fn opts_strict_parent_mode() -> SimplexStateOptions { - SimplexStateOptions { require_finalized_parent: true, ..opts_cpp() } -} - #[test] -fn test_cpp_mode_notarized_parent_valid() { - // Test that with require_finalized_parent=false (C++ mode), a notarized block - // is a valid parent for the next slot, even if not finalized. +fn test_notarized_parent_valid() { + // A notarized block is a valid parent for the next slot, even if not finalized. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -2733,7 +2352,7 @@ fn test_cpp_mode_notarized_parent_valid() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote, Vec::new()).unwrap(); // Drain events - while state.pull_event().is_some() {} + drain_events(&mut state); // Slot 0 is notarized but NOT finalized assert!(state.has_notarized_block(SlotIndex::new(0)), "Slot 0 should be notarized"); @@ -2743,81 +2362,30 @@ fn test_cpp_mode_notarized_parent_valid() { "first_non_finalized_slot should still be 0 (not finalized)" ); - // In C++ mode, slot 0 should be valid parent for slot 1 - assert!( - state.is_parent_valid(SlotIndex::new(0)), - "C++ mode: notarized slot should be valid parent" - ); + assert!(state.is_parent_valid(SlotIndex::new(0)), "notarized slot should be valid parent"); } #[test] -fn test_strict_mode_notarized_parent_invalid() { - // Test that with require_finalized_parent=true (strict mode), a notarized block - // is NOT a valid parent until it is finalized. 
+fn test_finalized_parent_valid() { + // A finalized slot is always a valid parent. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_strict_parent_mode()).expect("Failed to create SimplexState"); - let block_hash = UInt256::default(); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); - // Notarize slot 0 (but don't finalize) let vote = - Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); + Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), vote, Vec::new()).unwrap(); - // Drain events - while state.pull_event().is_some() {} - - // Slot 0 is notarized but NOT finalized - assert!(state.has_notarized_block(SlotIndex::new(0)), "Slot 0 should be notarized"); + drain_events(&mut state); - // In strict mode, slot 0 should NOT be valid parent (not finalized) - assert!( - !state.is_parent_valid(SlotIndex::new(0)), - "Strict mode: notarized-only slot should NOT be valid parent" + assert_eq!( + state.first_non_finalized_slot, + SlotIndex::new(1), + "first_non_finalized_slot should be 1 after finalization" ); -} - -#[test] -fn test_finalized_parent_valid_in_both_modes() { - // Test that a finalized slot is a valid parent in both modes - let desc = create_test_desc(4, 2); - let block_hash = UInt256::default(); - - for (mode_name, opts) in - [("C++ mode", opts_cpp_parent_mode()), ("strict mode", opts_strict_parent_mode())] - { - let mut state = SimplexState::new(&desc, opts).expect("Failed to create SimplexState"); - - // Finalize slot 0 - let vote = Vote::Finalize(FinalizeVote { - slot: SlotIndex::new(0), - block_hash: block_hash.clone(), - }); - state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), 
Vec::new()).unwrap(); - state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), Vec::new()).unwrap(); - state.on_vote_test(&desc, ValidatorIndex::new(2), vote, Vec::new()).unwrap(); - - // Drain events - while state.pull_event().is_some() {} - - // first_non_finalized_slot should have advanced past slot 0 - assert_eq!( - state.first_non_finalized_slot, - SlotIndex::new(1), - "{}: first_non_finalized_slot should be 1 after finalization", - mode_name - ); - - // Finalized slot should be valid parent in both modes - assert!( - state.is_parent_valid(SlotIndex::new(0)), - "{}: finalized slot should be valid parent", - mode_name - ); - } + assert!(state.is_parent_valid(SlotIndex::new(0)), "finalized slot should be valid parent"); } #[test] @@ -2826,8 +2394,7 @@ fn test_events_emitted_when_threshold_reached() { // threshold is reached, regardless of slot order (no sequential gating). // This matches C++ behavior where events are emitted as thresholds are reached. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -2839,10 +2406,7 @@ fn test_events_emitted_when_threshold_reached() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote1, Vec::new()).unwrap(); // Collect events - slot 1 should be finalized immediately - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let events = drain_events(&mut state); let finalized_slots: Vec<_> = events .iter() @@ -2870,7 +2434,7 @@ fn test_skip_events_emitted_when_threshold_reached() { // Test that SlotSkipped events are emitted immediately when // threshold is reached, regardless of slot order. 
let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Skip slot 1 first (before slot 0) let vote1 = Vote::Skip(SkipVote { slot: SlotIndex::new(1) }); @@ -2879,10 +2443,7 @@ fn test_skip_events_emitted_when_threshold_reached() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote1, Vec::new()).unwrap(); // Collect events - slot 1 should be skipped immediately - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let events = drain_events(&mut state); let skipped_slots: Vec<_> = events .iter() @@ -2911,76 +2472,23 @@ fn test_skip_events_emitted_when_threshold_reached() { Deadlock Detection Tests ======================================================================== - These tests verify that C++ mode (require_finalized_parent=false) prevents + These tests verify that C++ parent selection prevents deadlock in scenarios where some validators vote skip while others vote finalize. Deadlock scenario: - 5 validators, threshold is 4 (80%) - Slot 0: 4 validators notarize, 2 skip, 3 finalize - - With strict mode: parent must be finalized, but only 3/5 finalize votes = no finalization + - If only finalized parents were accepted, 3/5 finalize votes would stall progression - Slot 1 cannot start because parent (slot 0) is not finalized = DEADLOCK - - With C++ mode: slot 0 is notarized, so it's valid parent for slot 1 = NO DEADLOCK + - With C++ parenting: slot 0 notarization is enough to progress = NO DEADLOCK */ -#[test] -fn test_deadlock_scenario_with_strict_mode() { - // Test that strict mode (require_finalized_parent=true) can deadlock - // when finalization is blocked by skip votes. 
- let desc = create_test_desc_weights(5, 2, vec![1, 1, 1, 1, 1]); // 5 validators, threshold=4 - let mut state = - SimplexState::new(&desc, opts_strict_parent_mode()).expect("Failed to create SimplexState"); - - let block_hash = UInt256::default(); - - // Slot 0: 4 validators notarize (threshold reached) - let notar_vote = - Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); - for idx in 0..4 { - state - .on_vote_test(&desc, ValidatorIndex::new(idx), notar_vote.clone(), Vec::new()) - .unwrap(); - } - - // Slot 0: 2 validators vote skip - let skip_vote = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); - state.on_vote_test(&desc, ValidatorIndex::new(3), skip_vote.clone(), Vec::new()).unwrap(); - state.on_vote_test(&desc, ValidatorIndex::new(4), skip_vote, Vec::new()).unwrap(); - - // Slot 0: 3 validators vote finalize (NOT enough for threshold=4) - let finalize_vote = - Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), block_hash: block_hash.clone() }); - for idx in 0..3 { - state - .on_vote_test(&desc, ValidatorIndex::new(idx), finalize_vote.clone(), Vec::new()) - .unwrap(); - } - - // Drain events - while state.pull_event().is_some() {} - - // Slot 0 is notarized but NOT finalized (only 3/5 finalize votes, need 4) - assert!(state.has_notarized_block(SlotIndex::new(0)), "Slot 0 should be notarized"); - assert_eq!( - state.first_non_finalized_slot, - SlotIndex::new(0), - "Slot 0 should NOT be finalized (only 3/5 votes)" - ); - - // In strict mode: slot 0 is NOT valid parent (not finalized) - // This would cause DEADLOCK - cannot proceed to slot 1 - assert!( - !state.is_parent_valid(SlotIndex::new(0)), - "Strict mode: notarized-but-not-finalized slot should NOT be valid parent (DEADLOCK)" - ); -} - #[test] fn test_no_deadlock_in_cpp_mode() { - // Test that C++ mode (require_finalized_parent=false) prevents deadlock + // C++ mode prevents deadlock // by allowing notarized blocks as parents. 
let desc = create_test_desc_weights(5, 2, vec![1, 1, 1, 1, 1]); // 5 validators, threshold=4 - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -3008,7 +2516,7 @@ fn test_no_deadlock_in_cpp_mode() { } // Drain events - while state.pull_event().is_some() {} + drain_events(&mut state); // Slot 0 is notarized but NOT finalized (only 3/5 finalize votes, need 4) assert!(state.has_notarized_block(SlotIndex::new(0)), "Slot 0 should be notarized"); @@ -3030,8 +2538,7 @@ fn test_no_deadlock_in_cpp_mode() { fn test_is_parent_valid_with_notarization() { // Test is_parent_valid with notarization - C++ mode allows notarized parent let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -3051,7 +2558,7 @@ fn test_is_parent_valid_with_notarization() { } // Drain events - while state.pull_event().is_some() {} + drain_events(&mut state); // Slot 0 is now notarized - in C++ mode, it should be valid parent assert!(state.has_notarized_block(SlotIndex::new(0)), "Slot 0 should be notarized"); @@ -3067,8 +2574,7 @@ fn test_out_of_order_finalization_abandons_earlier_slots() { // the earlier slot is effectively "abandoned" (no events emitted for it). 
// This matches C++ behavior: first_nonfinalized_slot_ = id.slot + 1 let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -3080,10 +2586,7 @@ fn test_out_of_order_finalization_abandons_earlier_slots() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote1, Vec::new()).unwrap(); // Slot 1 should be finalized immediately - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let mut events = drain_events(&mut state); let finalized_slots: Vec<_> = events .iter() @@ -3114,10 +2617,7 @@ fn test_out_of_order_finalization_abandons_earlier_slots() { ); // Collect events - slot 0 skip should NOT be emitted (already past) - events.clear(); - while let Some(event) = state.pull_event() { - events.push(event); - } + events = drain_events(&mut state); let skipped_slots: Vec<_> = events .iter() @@ -3141,8 +2641,7 @@ fn test_sequential_finalization_order() { // Test that when slots are finalized in order (0, then 1), // both events are emitted correctly. 
let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash = UInt256::default(); @@ -3154,10 +2653,7 @@ fn test_sequential_finalization_order() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote0, Vec::new()).unwrap(); // Drain events - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let mut events = drain_events(&mut state); assert_eq!( state.first_non_finalized_slot, SlotIndex::new(1), @@ -3172,9 +2668,7 @@ fn test_sequential_finalization_order() { state.on_vote_test(&desc, ValidatorIndex::new(2), vote1, Vec::new()).unwrap(); // Drain events - while let Some(event) = state.pull_event() { - events.push(event); - } + events.extend(drain_events(&mut state)); let finalized_slots: Vec<_> = events .iter() @@ -3208,8 +2702,7 @@ fn test_batch_finalization_later_slot_finalized_first() { // This tests the batch finalization behavior where finalizing a later slot // should trigger finalization of its parent chain. 
let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash_0 = UInt256::from([0u8; 32]); let block_hash_1 = UInt256::from([1u8; 32]); @@ -3278,10 +2771,7 @@ fn test_batch_finalization_later_slot_finalized_first() { } // Collect events - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let events = drain_events(&mut state); // Only slot 2's BlockFinalized event should be emitted // (Slots 0 and 1 are not finalized because we didn't send finalize votes for them) @@ -3312,8 +2802,7 @@ fn test_batch_finalization_multiple_slots_finalized_together() { // Test scenario: Finalize votes for slots 0, 1, 2 arrive in rapid succession // Each should trigger its own BlockFinalized event let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash_0 = UInt256::from([0u8; 32]); let block_hash_1 = UInt256::from([1u8; 32]); @@ -3391,10 +2880,7 @@ fn test_batch_finalization_multiple_slots_finalized_together() { } // Collect events - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let events = drain_events(&mut state); let finalized_slots: Vec<_> = events .iter() @@ -3418,11 +2904,10 @@ fn test_batch_finalization_multiple_slots_finalized_together() { #[test] fn test_notarized_parent_enables_child_finalization() { - // Test that in C++ mode (require_finalized_parent=false), a notarized parent + // In C++ mode, a notarized parent // is sufficient for a child block to proceed to finalization let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_cpp_parent_mode()).expect("Failed to create 
SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let block_hash_0 = UInt256::from([0u8; 32]); let block_hash_1 = UInt256::from([1u8; 32]); @@ -3476,10 +2961,7 @@ fn test_notarized_parent_enables_child_finalization() { } // Collect events - let mut events = Vec::new(); - while let Some(event) = state.pull_event() { - events.push(event); - } + let events = drain_events(&mut state); let finalized_slots: Vec<_> = events .iter() @@ -3505,8 +2987,8 @@ fn test_notarized_parent_enables_child_finalization() { } /// Helper to drain all events from state -fn drain_events(state: &mut SimplexState) { - while state.pull_event().is_some() {} +fn drain_events(state: &mut SimplexState) -> Vec { + from_fn(|| state.pull_event()).collect() } /* @@ -3523,7 +3005,7 @@ fn test_restart_local_vote_flags() { // - voted_skip = true // - voted_final = true let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); let hash0 = UInt256::from([0x11u8; 32]); @@ -3540,20 +3022,20 @@ fn test_restart_local_vote_flags() { "notar vote should set voted_notar(slot,hash)" ); assert!(!slot0.voted_skip, "notar vote should NOT set voted_skip"); - assert!(!slot0.its_over, "notar vote should NOT set its_over"); + assert!(!slot0.is_completed, "notar vote should NOT set is_completed"); state.mark_slot_voted_on_restart(&desc, &Vote::Skip(SkipVote { slot: SlotIndex::new(1) })); let slot1 = &state.get_window(WindowIndex::new(0)).unwrap().slots[1]; assert!(slot1.is_voted, "skip vote should set is_voted"); assert!(slot1.voted_skip, "skip vote should set voted_skip"); - assert!(slot1.is_bad_window, "skip vote should set is_bad_window"); + assert!(slot1.is_timeout_skipped, "skip vote should set is_timeout_skipped"); state.mark_slot_voted_on_restart( &desc, &Vote::Finalize(FinalizeVote { slot: SlotIndex::new(0), 
block_hash: hash0 }), ); let slot0 = &state.get_window(WindowIndex::new(0)).unwrap().slots[0]; - assert!(slot0.its_over, "final vote should set its_over (voted_final)"); + assert!(slot0.is_completed, "final vote should set is_completed (voted_final)"); } #[test] @@ -3561,10 +3043,10 @@ fn test_restart_skip_marks_state() { // Restart skip generation must mark local skip state before broadcasting. // Reference: C++ consensus.cpp start_up() L74-77. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Pretend slot 0 is already finalized (should not be skipped). - state.get_window_mut(WindowIndex::new(0)).unwrap().slots[0].its_over = true; + state.get_window_mut(WindowIndex::new(0)).unwrap().slots[0].is_completed = true; // first_nonannounced_window = 1 => previous window is 0 => slots [0,1] let queued = state.generate_restart_skip_votes(WindowIndex::new(1), 2); @@ -3574,7 +3056,7 @@ fn test_restart_skip_marks_state() { let slot1 = &state.get_window(WindowIndex::new(0)).unwrap().slots[1]; assert!(slot1.is_voted, "restart skip should set is_voted"); assert!(slot1.voted_skip, "restart skip should set voted_skip"); - assert!(slot1.is_bad_window, "restart skip should set is_bad_window"); + assert!(slot1.is_timeout_skipped, "restart skip should set is_timeout_skipped"); // Should enqueue a Skip vote for slot 1 let mut seen_skip_1 = false; @@ -3596,8 +3078,7 @@ fn test_restart_finalize_blocked_by_skip() { // Baseline: without voted_skip, try_final should broadcast Finalize after notar cert observed. 
{ - let mut state = - SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); state.mark_slot_voted_on_restart( &desc, &Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: hash0.clone() }), @@ -3616,14 +3097,13 @@ fn test_restart_finalize_blocked_by_skip() { // With voted_skip=true, try_final must not broadcast Finalize. { - let mut state = - SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); state.mark_slot_voted_on_restart( &desc, &Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: hash0.clone() }), ); - // Simulate persisted local skip state (without forcing BadWindow). + // Simulate persisted local skip state (without forcing TimeoutSkipped). let slot0 = &mut state.get_window_mut(WindowIndex::new(0)).unwrap().slots[0]; slot0.voted_skip = true; @@ -3644,7 +3124,7 @@ fn test_cpp_mode_local_notarize_after_skip() { // C++ allows Notarize after Skip from the same validator (skip is not a notar block). // Reference: C++ consensus.cpp on_candidate_to_notarize checks only voted_notar, not voted_skip. let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create SimplexState"); + let mut state = SimplexState::new(&desc).expect("Failed to create SimplexState"); // Locally vote skip for slot 0 (window 0). state.try_skip_window(WindowIndex::new(0)); @@ -3671,7 +3151,7 @@ fn test_notarized_parent_chain_state_tracked_in_default_mode_on_notarization() { // Notarized-parent chain fields (`available_base`, `skipped`, // `first_non_progressed_slot`) are maintained in the active C++-parity mode. 
let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let h0 = UInt256::from([0xC0u8; 32]); let vote0 = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: h0.clone() }); @@ -3696,7 +3176,7 @@ fn test_notarized_parent_chain_state_tracked_in_default_mode_on_notarization() { fn test_notarized_parent_chain_state_tracked_in_default_mode_on_skip_cert() { // Skip certificates must update the active C++-parity tracking state too. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let vote0 = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); state.on_vote_test(&desc, ValidatorIndex::new(0), vote0.clone(), vec![1]).unwrap(); @@ -3719,17 +3199,14 @@ fn test_notarized_parent_chain_state_tracked_in_default_mode_on_skip_cert() { /* ======================================================================== - Alpenglow Mode Compatibility Verification + Parent Chain Compatibility Verification ======================================================================== */ #[test] -fn test_alpenglow_mode_with_notarized_parent_chain_tracking() { - // Verify that Alpenglow's fallback protocol (`enable_fallback_protocol=true`) - // doesn't conflict with notarized-parent chain tracking (which is always maintained). - // The two features should be orthogonal. 
+fn test_notarized_parent_chain_tracking() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Progress slot0 via notarization let h0 = UInt256::from([0xF0u8; 32]); @@ -3742,20 +3219,20 @@ fn test_alpenglow_mode_with_notarized_parent_chain_tracking() { assert_eq!( state.first_non_progressed_slot, SlotIndex::new(1), - "first_non_progressed_slot should advance in Alpenglow mode" + "first_non_progressed_slot should advance" ); let w0 = state.get_window(WindowIndex::new(0)).unwrap(); assert_eq!( w0.slots[1].available_base, Some(Some(crate::block::CandidateParentInfo { slot: SlotIndex::new(0), hash: h0 })), - "slot1 base must be set in Alpenglow mode" + "slot1 base must be set" ); // Slot 0 is still inside window 0, so the leader window does not advance yet. assert_eq!( state.current_leader_window_idx, WindowIndex::new(0), - "Alpenglow mode should still remain in window 0 until progress crosses the boundary" + "should remain in window 0 until progress crosses the boundary" ); } @@ -3771,8 +3248,7 @@ fn test_notarized_parent_chain_startup_sets_slot0_base_and_first_non_progressed_ // - slot 0 available_base = RawParentId{} (genesis) // - now_ starts at 0 let desc = create_test_desc(4, 2); - let state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let state = SimplexState::new(&desc).expect("Failed to create state"); assert_eq!(state.first_non_progressed_slot, SlotIndex::new(0)); @@ -3787,8 +3263,7 @@ fn test_notarized_parent_chain_on_notarization_sets_next_base_and_advances_progr // - set available_base for next non-skipped slot to notarized id // - maybe_publish_new_leader_windows() advances now_ on notarized slots let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let 
mut state = SimplexState::new(&desc).expect("Failed to create state"); let h0 = UInt256::from([0xA0u8; 32]); let vote0 = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(0), block_hash: h0.clone() }); @@ -3814,8 +3289,7 @@ fn test_notarized_parent_chain_advances_to_next_window_only_after_window_progres // window size = 2 slots: [0,1] in window0, [2,3] in window1. // progress cursor should cross to 2 only after slot1 is progressed. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Progress slot0 via notarization let h0 = UInt256::from([0xA1u8; 32]); @@ -3862,8 +3336,7 @@ fn test_notarized_parent_chain_on_skip_propagates_base_and_advances_progress_cur // - propagate available_base forward when next base is unknown // - advance now_ on skipped slots let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Skip slot0 (genesis base) -> slot1 base should become genesis let vote0 = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -3889,8 +3362,7 @@ fn test_notarized_parent_chain_skipped_slot_is_skipped_by_next_nonskipped_on_not // 2) notarization of slot0 must set base for slot2 (skipping slot1) // This mirrors pool.cpp use of skip_intervals_ + next_nonskipped_slot_after(). 
let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Skip slot1 first (out-of-order) let vote1 = Vote::Skip(SkipVote { slot: SlotIndex::new(1) }); @@ -4004,7 +3476,7 @@ fn create_test_notar_cert( #[test] fn test_collect_cached_certificates_in_range_filters_and_sorts() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4072,7 +3544,7 @@ fn test_collect_cached_certificates_in_range_filters_and_sorts() { #[test] fn test_get_last_finalize_certificate_returns_highest_slot() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); assert!(state.get_last_finalize_certificate().is_none(), "no final certs yet"); @@ -4108,7 +3580,7 @@ fn test_get_last_finalize_certificate_returns_highest_slot() { #[test] fn test_set_finalize_certificate_stores_old_slot_without_tracking() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4154,8 +3626,7 @@ fn test_set_finalize_certificate_updates_vote_accounting() { // Test that set_finalize_certificate correctly updates // vote accounting when receiving a finalize certificate from an external source. 
let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let slot = SlotIndex::new(0); let block_hash = UInt256::from([0xAA; 32]); @@ -4182,8 +3653,7 @@ fn test_set_finalize_certificate_updates_vote_accounting() { fn test_set_finalize_certificate_deduplicates() { // Test that applying the same certificate twice doesn't change state. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let slot = SlotIndex::new(0); let block_hash = UInt256::from([0xBB; 32]); @@ -4197,7 +3667,7 @@ fn test_set_finalize_certificate_deduplicates() { .expect("should not conflict"); assert!(stored1, "first application should store"); // Drain first-store events so we can assert duplicate store emits none. 
- while state.pull_event().is_some() {} + drain_events(&mut state); // Apply second time let stored2 = state @@ -4216,7 +3686,7 @@ fn test_set_finalize_certificate_deduplicates() { #[test] fn test_set_skip_certificate_deduplicates_without_events() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let slot = SlotIndex::new(2); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4226,7 +3696,7 @@ fn test_set_skip_certificate_deduplicates_without_events() { .set_skip_certificate(&desc, slot, skip_cert.clone()) .expect("first set_skip_certificate should succeed"); assert!(stored1, "first skip certificate application should store"); - while state.pull_event().is_some() {} + drain_events(&mut state); let stored2 = state .set_skip_certificate(&desc, slot, skip_cert) @@ -4243,8 +3713,7 @@ fn test_set_skip_certificate_updates_vote_accounting() { // Test that set_skip_certificate correctly updates // vote accounting when receiving a skip certificate from an external source. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let slot = SlotIndex::new(0); @@ -4269,8 +3738,7 @@ fn test_set_skip_certificate_marks_slot_skipped() { // Test that set_skip_certificate marks the slot as skipped // in the window state. 
let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let slot = SlotIndex::new(1); @@ -4294,8 +3762,7 @@ fn test_set_skip_certificate_propagates_base() { // Test that set_skip_certificate propagates available_base // to the next slot (C++ pool.cpp parity). let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // First, notarize slot 0 to establish a base let slot0 = SlotIndex::new(0); @@ -4324,11 +3791,10 @@ fn test_set_skip_certificate_propagates_base() { #[test] fn test_set_skip_certificate_emits_slot_skipped_event_for_tracked_slot() { let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); let slot = SlotIndex::new(1); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4351,9 +3817,9 @@ fn test_set_skip_certificate_emits_slot_skipped_event_for_tracked_slot() { #[test] fn test_set_skip_certificate_emits_skip_cert_reached() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); - while state.pull_event().is_some() {} + drain_events(&mut state); let slot = SlotIndex::new(1); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4376,14 +3842,13 @@ fn test_set_skip_certificate_emits_skip_cert_reached() { #[test] fn 
test_set_skip_certificate_does_not_emit_slot_skipped_event_for_old_slot() { let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Mark slot 0 as already finalized/old state.set_first_non_finalized_slot_for_test(SlotIndex::new(1)); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); let slot0 = SlotIndex::new(0); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -4410,8 +3875,7 @@ fn test_external_finalize_certificate_for_missed_finalization_recovery() { // Scenario: Simulate a node that missed finalization votes but receives // the finalize certificate from a peer. This tests the recovery path. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Simulate: We have notarization for slot 0 but missed finalize votes let slot0 = SlotIndex::new(0); @@ -4448,8 +3912,7 @@ fn test_set_finalize_certificate_advances_progress_cursor_past_pre_skipped_slots // finalization must run progress-cursor advancement (`advance_present` parity) // before leader-window advancement so we don't stop on a baseless skipped slot. let desc = create_test_desc(4, 4); // 4 slots per window - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; // Pre-mark slots 1..3 as skipped (out of order, before slot 0 finalization). 
@@ -4497,10 +3960,10 @@ fn test_set_finalize_certificate_emits_block_finalized_and_finalization_reached_ // - FinalizationReached (standstill caching) // for tracked (non-old) slots. let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Clear any initial events - while state.pull_event().is_some() {} + drain_events(&mut state); let slot = SlotIndex::new(0); let block_hash = UInt256::from([0xEF; 32]); @@ -4551,7 +4014,7 @@ fn test_set_finalize_certificate_emits_block_finalized_and_finalization_reached_ These tests require more complex setup or time control: - test_try_final_blocked_by_bad_window: - - Set `BadWindow` and verify we do NOT broadcast Finalize in that window. + - Set `TimeoutSkipped` and verify we do NOT broadcast Finalize in that window. - test_try_skip_window_broadcasts_for_unvoted_on_timeout: - Trigger Timeout(s) via `check_all()` (time manipulation / deterministic clock) and verify @@ -4585,7 +4048,7 @@ fn test_set_finalize_certificate_emits_block_finalized_and_finalization_reached_ #[test] fn test_reject_far_future_vote() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let far_slot = state.first_too_new_vote_slot(); let vote = Vote::Notarize(NotarizeVote { slot: far_slot, block_hash: UInt256::rand() }); @@ -4602,7 +4065,7 @@ fn test_reject_far_future_vote() { #[test] fn test_accept_vote_at_boundary() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let boundary_slot = state.first_too_new_vote_slot() - 1; // Slot immediately before first_too_new should be accepted (not rejected by bounds check). 
@@ -4625,7 +4088,7 @@ fn test_accept_vote_at_boundary() { #[test] fn test_reject_far_future_candidate() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let max_future_slots = desc.opts().max_leader_window_desync.saturating_mul(desc.opts().slots_per_leader_window); @@ -4649,7 +4112,7 @@ fn test_reject_far_future_candidate() { #[test] fn test_reject_far_future_window_base_ready() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let max_future_slots = desc.opts().max_leader_window_desync.saturating_mul(desc.opts().slots_per_leader_window); @@ -4668,7 +4131,7 @@ fn test_reject_far_future_window_base_ready() { #[test] fn test_ensure_window_exists_capped() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let initial_len = state.leader_windows.len(); @@ -4687,7 +4150,7 @@ fn test_ensure_window_exists_capped() { #[test] fn test_vote_bound_with_advanced_finalization() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let expected_first_too_new = SlotIndex::new( ((5000 / desc.opts().slots_per_leader_window) + desc.opts().max_leader_window_desync + 1) * desc.opts().slots_per_leader_window, @@ -4727,7 +4190,7 @@ fn test_vote_bound_with_advanced_finalization() { #[test] fn test_is_slot_too_far_ahead_helper() { let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let state = SimplexState::new(&desc).expect("create"); let max_future_slots = desc.opts().max_leader_window_desync.saturating_mul(desc.opts().slots_per_leader_window); @@ -4740,7 +4203,7 
@@ fn test_is_slot_too_far_ahead_helper() { #[test] fn test_is_vote_slot_too_far_ahead_helper() { let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let state = SimplexState::new(&desc).expect("create"); let first_too_new = state.first_too_new_vote_slot(); assert!(!state.is_vote_slot_too_far_ahead(first_too_new - 1)); @@ -4751,8 +4214,7 @@ fn test_is_vote_slot_too_far_ahead_helper() { #[test] fn test_max_acceptable_slot_uses_progress_cursor_after_skip() { let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let max_future_slots = desc.opts().max_leader_window_desync.saturating_mul(desc.opts().slots_per_leader_window); @@ -4770,8 +4232,7 @@ fn test_max_acceptable_slot_uses_progress_cursor_after_skip() { #[test] fn test_vote_bound_uses_progress_cursor_after_skip() { let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let expected_first_too_new = SlotIndex::new( ((1 / desc.opts().slots_per_leader_window) + desc.opts().max_leader_window_desync + 1) * desc.opts().slots_per_leader_window, @@ -4791,7 +4252,7 @@ fn test_vote_bound_uses_progress_cursor_after_skip() { #[test] fn test_standstill_slot_grid_dump_empty_state() { let desc = create_test_desc(4, 2); - let state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let state = SimplexState::new(&desc).expect("create"); let grid = state.standstill_slot_grid_dump(&desc); @@ -4805,7 +4266,7 @@ fn test_standstill_slot_grid_dump_empty_state() { #[test] fn test_standstill_slot_grid_dump_with_votes() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = 
SimplexState::new(&desc).expect("create"); let block_id = BlockIdExt::with_params( ShardIdent::masterchain(), @@ -4850,7 +4311,7 @@ fn test_standstill_slot_grid_dump_with_votes() { #[test] fn test_standstill_slot_grid_dump_with_certs() { let desc = create_test_desc_weights(5, 2, vec![1, 1, 1, 1, 1]); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let block_id = BlockIdExt::with_params( ShardIdent::masterchain(), @@ -4887,7 +4348,7 @@ fn test_standstill_slot_grid_dump_with_certs() { #[test] fn test_standstill_diagnostic_dump_includes_last_final_cert_summary() { let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let block_id = BlockIdExt::with_params( ShardIdent::masterchain(), @@ -4940,7 +4401,7 @@ fn test_available_base_max_merge_keeps_higher_slot() { // When two propagations compete for the same target slot, max-merge must // keep the higher parent (slot first, then hash), mirroring C++ ordering. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_notarized_parent_chain()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let h0 = UInt256::from([0xB0u8; 32]); let h1 = UInt256::from([0xB1u8; 32]); @@ -4999,7 +4460,7 @@ fn test_available_base_skip_propagates_max_merge() { // Skip-propagation must max-merge into target slot: // if target has lower base and skipped slot has higher base, target must upgrade. 
let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_notarized_parent_chain()).expect("create"); + let mut state = SimplexState::new(&desc).expect("create"); let h_low = UInt256::from([0xC0u8; 32]); let h_high = UInt256::from([0xC1u8; 32]); @@ -5040,8 +4501,7 @@ fn test_stale_window_guard_current_leader_window_idx_updated_before_collation_ch // Setup: 4 validators, 2 slots per window. // Progress both slots in window 0 via notarization -> cursor crosses to window 1. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); assert_eq!(state.current_leader_window_idx, WindowIndex::new(0)); assert_eq!(state.first_non_progressed_slot, SlotIndex::new(0)); @@ -5098,8 +4558,7 @@ fn test_stale_window_guard_skip_also_advances_window() { // Same as above but using skip votes instead of notarization. // Window advancement via skips must also update current_leader_window_idx. let desc = create_test_desc(4, 2); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Skip slot 0 (3 out of 4 validators) let skip0 = Vote::Skip(SkipVote { slot: SlotIndex::new(0) }); @@ -5142,8 +4601,7 @@ fn test_candidate_stored_as_pending_despite_skip_vote_cpp_mode() { // when try_notar fails (base not propagated yet). // Reference: C++ consensus.cpp CandidateReceived only checks voted_notar. let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Cast local skip for all of window 1 (slots 4-7). 
state.try_skip_window(WindowIndex::new(1)); @@ -5180,7 +4638,7 @@ fn test_candidate_stored_as_pending_despite_skip_vote_cpp_mode() { fn test_cpp_mode_try_skip_window_preserves_existing_pending_block() { // Regression: in C++ mode, Skip must NOT drop an already buffered candidate. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let parent_hash = UInt256::from([0x91; 32]); let child_hash = UInt256::from([0x92; 32]); @@ -5205,7 +4663,7 @@ fn test_cpp_mode_try_skip_window_preserves_existing_pending_block() { fn test_cpp_mode_restart_skip_paths_preserve_existing_pending_block() { // Regression: restart skip paths in C++ mode must preserve pending candidates. let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let parent_hash = UInt256::from([0xA1; 32]); let child_hash = UInt256::from([0xA2; 32]); @@ -5243,7 +4701,7 @@ fn test_cold_start_delayed_parent_recovery_notarizes_pending_cpp_mode() { let base_time = SystemTime::UNIX_EPOCH + Duration::from_secs(1_700_000_000); desc.set_time(base_time); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); assert!(state.get_next_timeout().is_none(), "constructor path must not arm startup timeout"); // Candidate for slot 1 depends on slot 0 parent that is not available yet. @@ -5296,33 +4754,12 @@ fn test_cold_start_delayed_parent_recovery_notarizes_pending_cpp_mode() { ); } -#[test] -fn test_alpenglow_mode_skip_clears_existing_pending_block() { - // Guardrail: fallback/Alpenglow mode keeps pendingBlocks[k] <- ⊥ on skip. 
- let desc = create_test_desc(4, 2); - let mut state = SimplexState::new(&desc, opts_alpenglow()).expect("Failed to create state"); - - let parent_hash = UInt256::from([0xC1; 32]); - let child_hash = UInt256::from([0xC2; 32]); - let candidate = - create_test_candidate(1, child_hash, BlockIdExt::default(), Some((0, parent_hash)), 0); - state.on_candidate(&desc, candidate).unwrap(); - assert!(state.get_window(WindowIndex::new(0)).unwrap().slots[1].pending_block.is_some()); - - state.try_skip_window(WindowIndex::new(0)); - assert!( - state.get_window(WindowIndex::new(0)).unwrap().slots[1].pending_block.is_none(), - "Alpenglow mode must clear pending_block on skip" - ); -} - #[test] fn test_pending_block_notarized_after_base_propagates_via_skip_certs() { // Full lifecycle: candidate stored as pending after skip vote, then notarized // when skip certs propagate the genesis base through to the candidate's slot. let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Cast local skip for window 1 (slots 4-7) state.try_skip_window(WindowIndex::new(1)); @@ -5371,7 +4808,7 @@ fn test_candidate_dropped_when_voted_notar_cpp_mode() { // When voted_notar is already set for a slot, a second candidate with a different // hash must be correctly dropped (not stored as pending). let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Slot 0 has genesis base → first candidate succeeds immediately let h1 = UInt256::from([0x11; 32]); @@ -5412,8 +4849,7 @@ fn test_out_of_order_skip_certs_still_propagate_base_to_pending() { // processed, find_next_nonskipped_slot skips over s3 (already marked // skipped) and propagates genesis base directly to s4. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Cast local skip for window 1 (slots 4-7) state.try_skip_window(WindowIndex::new(1)); @@ -5496,8 +4932,7 @@ fn test_base_chains_through_already_skipped_slots() { // When slot 0's cert is finally processed, the chaining loop must // propagate the genesis base through slots 1→2→3→4→5→6→7. let desc = create_test_desc(4, 8); // 4 validators, 8 slots/window - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -5547,8 +4982,7 @@ fn test_base_chaining_enables_pending_block_at_intermediate_skipped_slot() { // because `find_next_nonskipped_slot` jumped past it. The chaining fix // ensures the base reaches it. let desc = create_test_desc(4, 8); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let signers = vec![ValidatorIndex::new(0), ValidatorIndex::new(1), ValidatorIndex::new(2)]; @@ -5605,8 +5039,7 @@ fn test_pending_block_not_overwritten_by_second_candidate_cpp_mode() { // C++ parity: first pending candidate wins. A second candidate with a different // hash for the same slot must be rejected (equivocation), keeping the original. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Cast local skip for window 1 (slots 4-7) so candidates go to pending state.try_skip_window(WindowIndex::new(1)); @@ -5658,12 +5091,12 @@ fn test_pending_block_not_overwritten_by_second_candidate_cpp_mode() { } #[test] -fn test_try_notar_not_blocked_by_its_over_after_finalize_restart_cpp_mode() { - // C++ parity: after restart with a persisted Finalize vote, its_over=true and +fn test_try_notar_not_blocked_by_is_completed_after_finalize_restart_cpp_mode() { + // C++ parity: after restart with a persisted Finalize vote, is_completed=true and // voted_final=true are set, but voted_notar remains None. C++ try_notarize() // does NOT check voted_final, so notarization must still proceed. let desc = create_test_desc(4, 1); - let mut state = SimplexState::new(&desc, opts_cpp()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Simulate restart recovery: mark slot 0 as having a persisted Finalize vote let finalize_vote = Vote::Finalize(FinalizeVote { @@ -5674,7 +5107,10 @@ fn test_try_notar_not_blocked_by_its_over_after_finalize_restart_cpp_mode() { // Verify preconditions let w0 = state.get_window(WindowIndex::new(0)).unwrap(); - assert!(w0.slots[0].its_over, "precondition: its_over must be true after Finalize restart"); + assert!( + w0.slots[0].is_completed, + "precondition: is_completed must be true after Finalize restart" + ); assert!( w0.slots[0].voted_final, "precondition: voted_final must be true after Finalize restart" @@ -5694,7 +5130,7 @@ fn test_try_notar_not_blocked_by_its_over_after_finalize_restart_cpp_mode() { events.iter().any( |e| matches!(e, SimplexEvent::BroadcastVote(Vote::Notarize(NotarizeVote { slot, .. 
})) if *slot == SlotIndex::new(0)) ), - "must emit NotarVote for slot 0 — its_over must NOT block try_notar in C++ mode, got: {:?}", + "must emit NotarVote for slot 0 — is_completed must NOT block try_notar in C++ mode, got: {:?}", events ); } @@ -5712,8 +5148,7 @@ fn test_notarized_parent_chain_genesis_base_propagates_across_skipped_windows() // Reference: C++ pool.cpp advance_present() reads slot_at(now_)->state->available_base // and publishes it via LeaderWindowObserved(now_, base). let desc = create_test_desc(4, 2); // 2 slots per window - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // Window 0 starts with genesis base assert!(state.has_available_parent(&desc, SlotIndex::new(0))); @@ -5724,7 +5159,7 @@ fn test_notarized_parent_chain_genesis_base_propagates_across_skipped_windows() state.on_vote_test(&desc, ValidatorIndex::new(0), skip_vote_0.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip_vote_0.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip_vote_0, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.first_non_progressed_slot, SlotIndex::new(1)); @@ -5733,7 +5168,7 @@ fn test_notarized_parent_chain_genesis_base_propagates_across_skipped_windows() state.on_vote_test(&desc, ValidatorIndex::new(0), skip_vote_1.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip_vote_1.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip_vote_1, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // Progress cursor should be at slot 2 (start of window 1) assert_eq!(state.first_non_progressed_slot, SlotIndex::new(2)); @@ -5775,8 +5210,7 @@ fn 
test_notarized_parent_chain_base_propagates_across_multiple_skipped_windows() // Verify that base propagation works across multiple consecutive skipped windows. // This is the sustained stall scenario: window 0 -> 1 -> 2 all skip without finalization. let desc = create_test_desc(4, 1); // 1 slot per window for simplicity - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); assert!(state.has_available_parent(&desc, SlotIndex::new(0))); assert_eq!(state.current_leader_window_idx, WindowIndex::new(0)); @@ -5786,7 +5220,7 @@ fn test_notarized_parent_chain_base_propagates_across_multiple_skipped_windows() state.on_vote_test(&desc, ValidatorIndex::new(0), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.current_leader_window_idx, WindowIndex::new(1)); assert!( @@ -5799,7 +5233,7 @@ fn test_notarized_parent_chain_base_propagates_across_multiple_skipped_windows() state.on_vote_test(&desc, ValidatorIndex::new(0), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.current_leader_window_idx, WindowIndex::new(2)); assert!( @@ -5812,7 +5246,7 @@ fn test_notarized_parent_chain_base_propagates_across_multiple_skipped_windows() state.on_vote_test(&desc, ValidatorIndex::new(0), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip, Vec::new()).unwrap(); - while 
state.pull_event().is_some() {} + drain_events(&mut state); assert_eq!(state.current_leader_window_idx, WindowIndex::new(3)); assert!( @@ -5834,8 +5268,7 @@ fn test_set_timeouts_arms_timeout_base() { // set_timeouts must set timeout_base = now + first_block_timeout // and skip_timestamp = timeout_base + target_rate. let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -5872,8 +5305,7 @@ fn test_notarization_rearm_uses_fixed_base_not_sliding() { // // After fix, Rust must produce the C++ answer: t0 + 5s. let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -5890,7 +5322,7 @@ fn test_notarization_rearm_uses_fixed_base_not_sliding() { state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![3]).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // skip_slot should advance to 1 (watching slot 1 now) assert_eq!( @@ -5922,8 +5354,7 @@ fn test_notarization_rearm_successive_slots() { // Notarize slots 0, 1, 2 in rapid succession — deadlines must follow the // fixed schedule: base+2*rate, base+3*rate, base+4*rate. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -5945,14 +5376,14 @@ fn test_notarization_rearm_successive_slots() { let candidate = create_test_candidate(slot_num, hash.clone(), BlockIdExt::default(), parent, 0); let _ = state.on_candidate(&desc, candidate); - while state.pull_event().is_some() {} + drain_events(&mut state); let vote = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(slot_num), block_hash: hash }); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![3]).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // C++ timeout_slot_ = slot+2 (non-end-of-window) → offset = slot+2 let expected = base + target_rate * (slot_num + 2); @@ -5976,8 +5407,7 @@ fn test_notarization_window_end_transitions_to_new_window() { // The guard `skip_slot <= slot` (C++ parity) prevents the per-notarization // timer update from overwriting the freshly set window 1 schedule. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -5998,14 +5428,14 @@ fn test_notarization_window_end_transitions_to_new_window() { let candidate = create_test_candidate(slot_num, hash.clone(), BlockIdExt::default(), parent, 0); let _ = state.on_candidate(&desc, candidate); - while state.pull_event().is_some() {} + drain_events(&mut state); let vote = Vote::Notarize(NotarizeVote { slot: SlotIndex::new(slot_num), block_hash: hash }); state.on_vote_test(&desc, ValidatorIndex::new(0), vote.clone(), vec![1]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), vote.clone(), vec![2]).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), vote, vec![3]).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); } // Window transition should have occurred @@ -6034,8 +5464,7 @@ fn test_skip_cert_does_not_move_timer() { // C++ does NOT touch the consensus alarm when a skip certificate arrives. // Skip certs flow through the pool layer only. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -6057,7 +5486,7 @@ fn test_skip_cert_does_not_move_timer() { state.on_vote_test(&desc, ValidatorIndex::new(0), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // Timer state must be UNCHANGED (skip_slot, skip_timestamp, timeout_base) assert_eq!(state.skip_slot, skip_slot_before, "skip_slot must NOT advance on skip cert"); @@ -6070,8 +5499,7 @@ fn test_window_skip_clears_timeout_base() { // When process_timeouts fires the C++ window-skip, both skip_timestamp // and timeout_base must be cleared (None). let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -6082,7 +5510,7 @@ fn test_window_skip_clears_timeout_base() { // Advance well past the first deadline to trigger process_timeouts desc.set_time(t0 + Duration::from_secs(10)); state.check_all(&desc); - while state.pull_event().is_some() {} + drain_events(&mut state); assert!(state.skip_timestamp.is_none(), "skip_timestamp must be None after C++ window-skip"); assert!(state.timeout_base.is_none(), "timeout_base must be None after C++ window-skip"); @@ -6100,8 +5528,7 @@ fn test_new_window_rearms_timeout_base() { // advance_leader_window_on_progress_cursor → set_timeouts must // re-arm timeout_base with (possibly backed-off) first_block_timeout. 
let desc = create_test_desc(4, 4); - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); let t0 = SystemTime::UNIX_EPOCH + Duration::from_secs(1_000_000); desc.set_time(t0); @@ -6113,7 +5540,7 @@ fn test_new_window_rearms_timeout_base() { // Trigger timeout to skip window 0 desc.set_time(t0 + Duration::from_secs(10)); state.check_all(&desc); - while state.pull_event().is_some() {} + drain_events(&mut state); assert!(state.timeout_base.is_none(), "base cleared after window skip"); // Now feed skip certs for all 4 slots (to let progress cursor cross window boundary) @@ -6125,7 +5552,7 @@ fn test_new_window_rearms_timeout_base() { state.on_vote_test(&desc, ValidatorIndex::new(1), skip.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip, Vec::new()).unwrap(); } - while state.pull_event().is_some() {} + drain_events(&mut state); // Window 0 had timeouts (had_timeouts=true), so adaptive backoff applies: // first_block_timeout *= timeout_increase_factor (1.05) @@ -6163,8 +5590,7 @@ fn test_new_window_rearms_timeout_base() { #[test] fn test_second_leader_collates_after_full_first_window_skip() { let desc = create_test_desc(4, 2); // 4 validators, 2 slots per window - let mut state = - SimplexState::new(&desc, opts_notarized_parent_chain()).expect("Failed to create state"); + let mut state = SimplexState::new(&desc).expect("Failed to create state"); // -- Skip entire window 0 (leader=v0 absent) -- @@ -6173,14 +5599,14 @@ fn test_second_leader_collates_after_full_first_window_skip() { state.on_vote_test(&desc, ValidatorIndex::new(0), skip0.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip0.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip0, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // Skip 
slot 1 (last slot in window 0) let skip1 = Vote::Skip(SkipVote { slot: SlotIndex::new(1) }); state.on_vote_test(&desc, ValidatorIndex::new(0), skip1.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(1), skip1.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), skip1, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // Verify window advanced and second leader has an available parent assert_eq!( @@ -6222,7 +5648,7 @@ fn test_second_leader_collates_after_full_first_window_skip() { state.on_vote_test(&desc, ValidatorIndex::new(1), notar2.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(2), notar2.clone(), Vec::new()).unwrap(); state.on_vote_test(&desc, ValidatorIndex::new(3), notar2, Vec::new()).unwrap(); - while state.pull_event().is_some() {} + drain_events(&mut state); // Verify slot 2 is notarized assert!( diff --git a/src/node/simplex/src/utils.rs b/src/node/simplex/src/utils.rs index 1e0eee8..f1904e0 100644 --- a/src/node/simplex/src/utils.rs +++ b/src/node/simplex/src/utils.rs @@ -53,7 +53,8 @@ use ton_api::{ IntoBoxed, }; use ton_block::{ - error, fail, sha256_digest, Block, BlockIdExt, Deserializable, Result, ShardIdent, UInt256, + error, fail, read_boc, sha256_digest, Block, BlockIdExt, ConsensusExtraData, Deserializable, + Result, ShardIdent, UInt256, }; /* @@ -103,7 +104,7 @@ pub(crate) fn install_simplex_panic_hook_once() { let bt = Backtrace::force_capture(); log::error!( - "FATAL PANIC (PANIC-1): thread={} location={} payload=\"{}\" backtrace={:?}", + "FATAL PANIC: thread={} location={} payload=\"{}\" backtrace={:?}", thread_name, location, payload, @@ -443,6 +444,17 @@ pub fn build_candidate_hash_data_bytes_empty( Extracts BlockIdExt and collated_file_hash from validatorSession.candidate bytes. */ +/// Extract `gen_utime_ms` from collated data if it carries `ConsensusExtraData`. 
+/// +/// Returns `None` for empty / invalid BOCs or when the collated data does not +/// carry Simplex `ConsensusExtraData`. +pub fn extract_consensus_gen_utime_ms(collated_data: &[u8]) -> Option { + let roots = read_boc(collated_data).ok()?.roots; + roots.into_iter().find_map(|root| { + ConsensusExtraData::construct_from_cell(root).ok().map(|extra| extra.gen_utime_ms) + }) +} + /// Block info extracted from candidate bytes #[derive(Clone, Debug)] pub struct ExtractedBlockInfo { @@ -658,15 +670,10 @@ pub fn sign_candidate_u32( - Vote::Notarize(NotarizeVote) - Vote::Finalize(FinalizeVote) - Vote::Skip(SkipVote) - - Vote::NotarizeFallback(NotarizeFallbackVote) - - Vote::SkipFallback(SkipFallbackVote) - TL types (ton_api::ton::simplex_consensus::*): - UnsignedVote::SimplexConsensus_NotarizeVote - UnsignedVote::SimplexConsensus_FinalizeVote - UnsignedVote::SimplexConsensus_SkipVote - - UnsignedVote::SimplexConsensus_NotarizeFallbackVote - - UnsignedVote::SimplexConsensus_SkipFallbackVote Wire format: consensus.simplex.vote { vote, signature } */ @@ -688,12 +695,9 @@ use ton_api::ton::consensus::simplex::{self as tl_simplex, unsignedvote as tl_un /// - FinalizeVote { id: CandidateId } /// - SkipVote { slot: int } /// -/// Fallback votes (NotarizeFallback, SkipFallback) are FSM-internal only and -/// should NOT be serialized to wire. This function returns an error for them. -/// /// # Returns /// -/// Ok(UnsignedVote) for wire-compatible votes, Err for fallback votes. +/// Serialized TL vote for wire-compatible vote types. 
pub fn vote_to_tl_unsigned(vote: &Vote) -> Result { match vote { Vote::Notarize(v) => { @@ -712,14 +716,6 @@ pub fn vote_to_tl_unsigned(vote: &Vote) -> Result { } Vote::Skip(v) => Ok(tl_unsigned::SkipVote { slot: v.slot.value() as i32 }.into_boxed()), - - Vote::NotarizeFallback(v) => { - Err(error!("NotarizeFallback cannot be serialized to wire (slot={})", v.slot)) - } - - Vote::SkipFallback(v) => { - Err(error!("SkipFallback cannot be serialized to wire (slot={})", v.slot)) - } } } @@ -728,7 +724,6 @@ pub fn vote_to_tl_unsigned(vote: &Vote) -> Result { /// Used when processing incoming votes from the network. /// /// Note: C++ protocol only sends 3 vote types (Notarize, Finalize, Skip). -/// Fallback votes are FSM-internal and never appear on the wire. /// /// # Errors /// @@ -793,13 +788,13 @@ pub fn serialize_unsigned_vote(vote: &tl_simplex::UnsignedVote) -> Vec { /// /// # Returns /// -/// Ok(signed vote) for wire-compatible votes, Err for fallback votes. +/// Signed vote for network broadcast. pub fn sign_vote( vote: &Vote, session_id: &SessionId, private_key: &PrivateKey, ) -> Result { - // Convert to TL unsigned vote (returns error for fallback votes) + // Convert to TL unsigned vote. 
let unsigned_vote = vote_to_tl_unsigned(vote)?; // Serialize for signing (boxed, as in C++) diff --git a/src/node/simplex/tests/test_collation.rs b/src/node/simplex/tests/test_collation.rs index f7592a6..d7dbb93 100644 --- a/src/node/simplex/tests/test_collation.rs +++ b/src/node/simplex/tests/test_collation.rs @@ -198,12 +198,16 @@ impl SessionListener for CollationTestListener { fn get_approved_candidate( &self, _source: PublicKey, - _root_hash: BlockHash, + root_hash: BlockHash, _file_hash: BlockHash, _collated_data_hash: BlockHash, _callback: ValidatorBlockCandidateCallback, ) { - // Not used in this test + panic!( + "unexpected legacy get_approved_candidate request in simplex collation test \ + (root_hash={}); active simplex flow must not use this callback", + root_hash.to_hex_string() + ); } } diff --git a/src/node/simplex/tests/test_consensus.rs b/src/node/simplex/tests/test_consensus.rs index e69be0c..7f2539b 100644 --- a/src/node/simplex/tests/test_consensus.rs +++ b/src/node/simplex/tests/test_consensus.rs @@ -21,7 +21,7 @@ use serde::{Deserialize, Serialize}; use simplex::*; use spin::mutex::SpinMutex; use std::{ - collections::{HashMap, HashSet}, + collections::HashSet, fs::{self, File}, io::{self, Cursor, LineWriter, Write}, path::Path, @@ -371,10 +371,6 @@ struct SessionInstance { finalized_seqnos: Arc>>, /// Session errors count session_errors_count: Arc, - /// Approved candidates storage for get_approved_candidate() during restart recovery. - /// Keyed by root_hash to match lookup semantics. - approved_candidates: - Arc>>>, /// Shared finalized block roots for harness-level introspection. 
finalized_blocks: FinalizedBlocksMap, _session: SessionPtr, @@ -384,10 +380,6 @@ struct SessionInstance { /// Listener wrapper that delegates to SessionInstance struct SessionInstanceListener { instance: SpinMutex>>, - /// Approved candidates storage - shared with SessionInstance but available immediately - /// before session creation to support get_approved_candidate() during startup recovery. - approved_candidates: - Arc>>>, /// Maximum finalized seqno - shared with SessionInstance, available immediately. max_finalized_seqno: Arc, /// Finalized seqnos set - shared with SessionInstance, available immediately. @@ -500,34 +492,6 @@ impl SessionListener for SessionInstance { return; } - // Store approved candidate for get_approved_candidate() during restart recovery. - // This allows us to respond to requestCandidate queries after restart. - { - let collated_data_bytes = _collated_data.data().to_vec(); - let data_bytes = data.data().to_vec(); - let file_hash = UInt256::from_slice(&sha256_digest(&data_bytes)); - let collated_file_hash = UInt256::from_slice(&sha256_digest(&collated_data_bytes)); - - let candidate = Arc::new(consensus_common::ValidatorBlockCandidate { - public_key: source_info.source.clone(), - id: BlockIdExt::with_params( - ShardIdent::masterchain(), - collated_data.seqno, // Use seqno for lookup consistency - root_hash.clone(), - file_hash, - ), - collated_file_hash, - data: consensus_common::ConsensusCommonFactory::create_block_payload(data_bytes), - collated_data: consensus_common::ConsensusCommonFactory::create_block_payload( - collated_data_bytes, - ), - }); - - if let Ok(mut map) = self.approved_candidates.lock() { - map.insert(root_hash, candidate); - } - } - callback(Ok(SystemTime::now())) } @@ -708,62 +672,18 @@ impl SessionListener for SessionInstance { fn get_approved_candidate( &self, - source: PublicKey, + _source: PublicKey, root_hash: BlockHash, - file_hash: BlockHash, - collated_data_hash: BlockHash, - callback: 
ValidatorBlockCandidateCallback, + _file_hash: BlockHash, + _collated_data_hash: BlockHash, + _callback: ValidatorBlockCandidateCallback, ) { - log::info!( - "SessionListener::get_approved_candidate: \ - request for block hash {:?} from source {:?} (self source #{})", - root_hash.to_hex_string(), - source.id(), - self.source_index + panic!( + "unexpected legacy get_approved_candidate request in simplex consensus test \ + (source #{}, root_hash={}); active simplex flow must not use this callback", + self.source_index, + root_hash.to_hex_string() ); - - // Lookup candidate by root_hash - let candidate = - self.approved_candidates.lock().ok().and_then(|map| map.get(&root_hash).cloned()); - - match candidate { - Some(c) => { - log::debug!( - "SessionListener::get_approved_candidate: found candidate for root_hash={} (source #{})", - root_hash.to_hex_string(), - self.source_index - ); - // Sanity: file_hash and collated_data_hash should match - if c.id.file_hash != file_hash { - log::warn!( - "get_approved_candidate: file_hash mismatch: stored={} requested={} (source #{})", - c.id.file_hash.to_hex_string(), - file_hash.to_hex_string(), - self.source_index - ); - } - if c.collated_file_hash != collated_data_hash { - log::warn!( - "get_approved_candidate: collated_data_hash mismatch: stored={} requested={} (source #{})", - c.collated_file_hash.to_hex_string(), - collated_data_hash.to_hex_string(), - self.source_index - ); - } - callback(Ok(c)); - } - None => { - log::warn!( - "SessionListener::get_approved_candidate: candidate not found for root_hash={} (source #{})", - root_hash.to_hex_string(), - self.source_index - ); - callback(Err(error!( - "approved candidate not found for root_hash={}", - root_hash.to_hex_string() - ))); - } - } } } @@ -898,39 +818,13 @@ impl SessionListener for SessionInstanceListener { root_hash: BlockHash, _file_hash: BlockHash, _collated_data_hash: BlockHash, - callback: ValidatorBlockCandidateCallback, + _callback: 
ValidatorBlockCandidateCallback, ) { - // Access approved_candidates directly - available even before SessionInstance is wired. - // This fixes Issue 2 from RESTART-GREMLIN-1: startup recovery calls get_approved_candidate - // during create_session, before listener.instance is set. - let candidates_count = self.approved_candidates.lock().ok().map(|m| m.len()).unwrap_or(0); - log::info!( - "[restart-gremlin] get_approved_candidate: root_hash={} candidates_stored={}", - &root_hash.to_hex_string()[..8], - candidates_count + panic!( + "unexpected legacy get_approved_candidate request in simplex consensus listener \ + (root_hash={}); active simplex flow must not use this callback", + root_hash.to_hex_string() ); - - // Lookup candidate by root_hash - let candidate = - self.approved_candidates.lock().ok().and_then(|m| m.get(&root_hash).cloned()); - - if let Some(cand) = candidate { - log::info!( - "[restart-gremlin] get_approved_candidate: FOUND block {} (seq_no={})", - &root_hash.to_hex_string()[..8], - cand.id.seq_no() - ); - callback(Ok(cand)); - } else { - log::warn!( - "[restart-gremlin] get_approved_candidate: NOT FOUND block {}", - &root_hash.to_hex_string()[..8] - ); - callback(Err(error!( - "Approved candidate not found for root_hash={}", - root_hash.to_hex_string() - ))); - } } } @@ -1177,19 +1071,11 @@ where let local_key = nodes[i].public_key.clone(); let initial_block_seqno = 1u32; // First block seqno=1 (seqno 0 is zerostate) - // Create approved_candidates storage BEFORE session creation. - // This fixes Issue 2 from RESTART-GREMLIN-1: get_approved_candidate() is called - // during create_session (startup recovery), before SessionInstance is wired. 
- let approved_candidates: Arc< - Mutex>>, - > = Arc::new(Mutex::new(HashMap::new())); - let max_finalized_seqno = Arc::new(AtomicU32::new(initial_block_seqno)); let finalized_seqnos: Arc>> = Arc::new(Mutex::new(HashSet::new())); let listener = Arc::new(SessionInstanceListener { instance: SpinMutex::new(Weak::new()), - approved_candidates: approved_candidates.clone(), max_finalized_seqno: max_finalized_seqno.clone(), finalized_seqnos: finalized_seqnos.clone(), }); @@ -1225,7 +1111,6 @@ where max_finalized_seqno: max_finalized_seqno.clone(), finalized_seqnos: finalized_seqnos.clone(), session_errors_count: Arc::new(AtomicU32::new(0)), - approved_candidates: approved_candidates.clone(), finalized_blocks: finalized_blocks.clone(), source_index: i as u32, _session: session, @@ -1389,7 +1274,7 @@ where }); 'main_loop: loop { - // PANIC-1: fail fast if any session thread panicked. + // Fail fast if any session thread panicked. // Otherwise the test may stall waiting for progress that will never happen. for (idx, inst) in instances.iter().enumerate() { let inst = inst.lock(); @@ -1506,30 +1391,14 @@ where let ctx = &session_contexts[node_idx]; - // CRITICAL: Preserve the OLD approved_candidates from the stopped instance. - // The new session's startup recovery will call get_approved_candidate() - // to restore candidates from persistent storage. These candidates were - // stored by the old session's on_generate_slot and on_candidate calls. 
- let ( - old_approved_candidates, - prev_next_seqno, - prev_commits, - finalized_seqnos, - ) = { + let (prev_next_seqno, prev_commits, finalized_seqnos) = { let inst = instances[node_idx].lock(); ( - inst.approved_candidates.clone(), inst.max_finalized_seqno.load(Ordering::SeqCst), inst.on_block_finalized_count.load(Ordering::SeqCst), inst.finalized_seqnos.clone(), ) }; - let candidates_count = - old_approved_candidates.lock().map(|m| m.len()).unwrap_or(0); - log::info!( - "[restart-gremlin] Preserving {} approved candidates from old instance for recovery", - candidates_count - ); // Create seqno counters BEFORE listener - they will be updated by // on_block_finalized during recovery, before SessionInstance is wired. @@ -1541,7 +1410,6 @@ where let new_listener = Arc::new(SessionInstanceListener { instance: SpinMutex::new(Weak::new()), - approved_candidates: old_approved_candidates.clone(), max_finalized_seqno: max_finalized_seqno.clone(), finalized_seqnos: finalized_seqnos.clone(), }); @@ -1595,7 +1463,6 @@ where max_finalized_seqno: max_finalized_seqno.clone(), finalized_seqnos: finalized_seqnos.clone(), session_errors_count: Arc::new(AtomicU32::new(0)), - approved_candidates: old_approved_candidates.clone(), finalized_blocks: finalized_blocks.clone(), source_index: node_idx as u32, _session: session, @@ -1630,8 +1497,6 @@ where .clone_from(&new_inst.finalized_seqnos); old_inst.session_errors_count = new_inst.session_errors_count.clone(); - old_inst.approved_candidates = - new_inst.approved_candidates.clone(); old_inst.finalized_blocks = new_inst.finalized_blocks.clone(); old_inst._session = new_inst._session.clone(); old_inst._listener = new_inst._listener.clone(); @@ -2488,15 +2353,11 @@ fn test_simplex_start_gate() { for i in 0..node_count { let local_key = nodes[i].public_key.clone(); let db_path = format!("{}_node{}", db_path_base, i); - let approved_candidates: Arc< - Mutex>>, - > = Arc::new(Mutex::new(HashMap::new())); let max_finalized_seqno = 
Arc::new(AtomicU32::new(initial_block_seqno)); let finalized_seqnos: Arc>> = Arc::new(Mutex::new(HashSet::new())); let listener = Arc::new(SessionInstanceListener { instance: SpinMutex::new(Weak::new()), - approved_candidates: approved_candidates.clone(), max_finalized_seqno: max_finalized_seqno.clone(), finalized_seqnos: finalized_seqnos.clone(), }); @@ -2528,7 +2389,6 @@ fn test_simplex_start_gate() { max_finalized_seqno, finalized_seqnos, session_errors_count: Arc::new(AtomicU32::new(0)), - approved_candidates, finalized_blocks: finalized_blocks.clone(), _session: session.clone(), _listener: listener.clone(), diff --git a/src/node/simplex/tests/test_restart.rs b/src/node/simplex/tests/test_restart.rs index 169fe5b..6a85fa9 100644 --- a/src/node/simplex/tests/test_restart.rs +++ b/src/node/simplex/tests/test_restart.rs @@ -12,8 +12,8 @@ //! same database path, and that post-restart behavior preserves key invariants: //! //! - **Round monotonicity**: round numbers do not reset after restart -//! - **Candidate fetch on restart**: restart recovery can retrieve approved candidates -//! via `SessionListener::get_approved_candidate` (used for candidate cache restoration) +//! - **Restart-state parity**: restart recovery restores simplex state without +//! relying on the legacy `SessionListener::get_approved_candidate` callback //! - **No session errors**: `SessionStats.errors_count` remains 0 //! //! NOTE: These tests intentionally avoid crate-private symbols. 
Deeper byte-level @@ -35,7 +35,7 @@ use std::{ time::{Duration, Instant, SystemTime}, }; use ton_block::{ - error, sha256_digest, BlockIdExt, BlockSignaturesVariant, BocFlags, BocWriter, BuilderData, + sha256_digest, BlockIdExt, BlockSignaturesVariant, BocFlags, BocWriter, BuilderData, Ed25519KeyOption, ShardIdent, UInt256, }; @@ -61,10 +61,6 @@ const PHASE_TIMEOUT: Duration = Duration::from_secs(240); struct RestartSingleSessionListener { public_key: PublicKey, - // Candidate storage (simulates validator persistent storage for this test process) - // Keyed by root_hash which is used by get_approved_candidate(). - candidates: Mutex>>, - // Progress tracking (slot-based, not round-based for SIMPLEX_ROUNDLESS mode) last_slot_seen: AtomicU32, last_finalized_slot: AtomicU32, @@ -84,9 +80,6 @@ struct RestartSingleSessionListener { /// Invariant: each seqno is finalized exactly once with the same block identity. finalized_seqnos: Mutex>, - // Recovery verification - approved_candidate_requests: AtomicU32, - // Error tracking from SessionStats max_errors_count: AtomicU32, } @@ -95,7 +88,6 @@ impl RestartSingleSessionListener { fn new(public_key: PublicKey, initial_block_seqno: u32) -> Self { Self { public_key, - candidates: Mutex::new(HashMap::new()), last_slot_seen: AtomicU32::new(0), last_finalized_slot: AtomicU32::new(0), finalized_blocks_count: AtomicU32::new(0), @@ -104,7 +96,6 @@ impl RestartSingleSessionListener { first_slot_after_restart: AtomicU32::new(u32::MAX), max_finalized_seqno: AtomicU32::new(initial_block_seqno), finalized_seqnos: Mutex::new(HashMap::new()), - approved_candidate_requests: AtomicU32::new(0), max_errors_count: AtomicU32::new(0), } } @@ -144,18 +135,9 @@ impl RestartSingleSessionListener { self.collation_count.load(Ordering::SeqCst) } - fn approved_candidate_requests(&self) -> u32 { - self.approved_candidate_requests.load(Ordering::SeqCst) - } - fn max_errors_count(&self) -> u32 { self.max_errors_count.load(Ordering::SeqCst) } - - 
#[allow(dead_code)] - fn candidate_store_len(&self) -> usize { - self.candidates.lock().map(|m| m.len()).unwrap_or(0) - } } impl SessionListener for RestartSingleSessionListener { @@ -240,11 +222,6 @@ impl SessionListener for RestartSingleSessionListener { ), }); - // Store candidate by root_hash for get_approved_candidate() during restart recovery - if let Ok(mut map) = self.candidates.lock() { - map.insert(root_hash, candidate.clone()); - } - callback(Ok(candidate)); } @@ -311,28 +288,15 @@ impl SessionListener for RestartSingleSessionListener { &self, _source: PublicKey, root_hash: BlockHash, - file_hash: BlockHash, - collated_data_hash: BlockHash, - callback: ValidatorBlockCandidateCallback, + _file_hash: BlockHash, + _collated_data_hash: BlockHash, + _callback: ValidatorBlockCandidateCallback, ) { - self.approved_candidate_requests.fetch_add(1, Ordering::SeqCst); - - // Lookup candidate by root_hash - let candidate = self.candidates.lock().ok().and_then(|map| map.get(&root_hash).cloned()); - - match candidate { - Some(c) => { - // Sanity: hashes must match request - assert_eq!(c.id.root_hash, root_hash); - assert_eq!(c.id.file_hash, file_hash); - assert_eq!(c.collated_file_hash, collated_data_hash); - callback(Ok(c)); - } - None => callback(Err(error!( - "approved candidate not found for root_hash={}", - root_hash.to_hex_string() - ))), - } + panic!( + "unexpected legacy get_approved_candidate request in simplex restart test \ + (root_hash={}); active restart flow must not use this callback", + root_hash.to_hex_string() + ); } } @@ -493,7 +457,7 @@ fn run_single_node_restart_test(test_name: &str) { let start = Instant::now(); while start.elapsed() < PHASE_TIMEOUT { if session_1.is_panicked() { - log::error!("PANIC-1: session panicked during phase 1 (restart test '{}')", test_name); + log::error!("session panicked during phase 1 (restart test '{}')", test_name); panic!("session panicked during phase 1"); } // Wait for N finalized callbacks before restart. 
@@ -551,7 +515,7 @@ fn run_single_node_restart_test(test_name: &str) { let start = Instant::now(); while start.elapsed() < PHASE_TIMEOUT { if session_2.is_panicked() { - log::error!("PANIC-1: session panicked during phase 2 (restart test '{}')", test_name); + log::error!("session panicked during phase 2 (restart test '{}')", test_name); panic!("session panicked during phase 2"); } if listener.first_slot_after_restart().is_some() { @@ -583,7 +547,7 @@ fn run_single_node_restart_test(test_name: &str) { let start = Instant::now(); while start.elapsed() < PHASE_TIMEOUT { if session_2.is_panicked() { - log::error!("PANIC-1: session panicked during phase 2 (restart test '{}')", test_name); + log::error!("session panicked during phase 2 (restart test '{}')", test_name); panic!("session panicked during phase 2"); } if listener.collation_count() >= collation_before + 2 { @@ -621,9 +585,8 @@ fn run_single_node_restart_test(test_name: &str) { "Restart test '{test_name}' complete: \ last_finalized_slot_before={last_finalized_slot_before}, \ first_collation_after_restart={first_after}, collations_before={collation_before}, \ - collations_after={}, approved_candidate_requests={}", - listener.collation_count(), - listener.approved_candidate_requests() + collations_after={}", + listener.collation_count() ); } diff --git a/src/node/simplex/tests/test_validation.rs b/src/node/simplex/tests/test_validation.rs index 15ff043..88ac635 100644 --- a/src/node/simplex/tests/test_validation.rs +++ b/src/node/simplex/tests/test_validation.rs @@ -211,12 +211,17 @@ impl SessionListener for ValidationTestListener { fn get_approved_candidate( &self, _source: PublicKey, - _root_hash: BlockHash, + root_hash: BlockHash, _file_hash: BlockHash, _collated_data_hash: BlockHash, _callback: ValidatorBlockCandidateCallback, ) { - // Not used in this test + panic!( + "unexpected legacy get_approved_candidate request in simplex validation test \ + (node_idx={}, root_hash={}); active simplex flow must not use 
this callback", + self.node_idx, + root_hash.to_hex_string() + ); } } diff --git a/src/node/src/network/node_network.rs b/src/node/src/network/node_network.rs index b58d5d2..d6ca950 100644 --- a/src/node/src/network/node_network.rs +++ b/src/node/src/network/node_network.rs @@ -177,6 +177,7 @@ impl NodeNetwork { cancellation_token.clone(), None, tokio::runtime::Handle::current(), + None, ); overlay.set_quic(quic.clone())?; Some(quic) diff --git a/src/node/src/validator/collator.rs b/src/node/src/validator/collator.rs index 200ad97..9cb1e52 100644 --- a/src/node/src/validator/collator.rs +++ b/src/node/src/validator/collator.rs @@ -2187,11 +2187,14 @@ impl Collator { let prev = max(mc_data.state().state()?.gen_time(), prev_now); log::trace!("{}: init_utime prev_time: {}", self.collated_block_descr, prev); let allow_same_timestamp = self.allow_same_timestamp(mc_data); + let now_ms = self.collator_settings.min_gen_utime_ms.map_or_else( + || self.engine.now_ms(), + |min_now_ms| self.engine.now_ms().max(min_now_ms), + ); // Compute gen_utime_ms first, then derive gen_utime from it (like C++). // This guarantees gen_utime_ms / 1000 == gen_utime, avoiding second-boundary // mismatches in ConsensusExtraData validation. 
- let (gen_utime, gen_utime_ms) = - Self::calc_utime(prev, self.engine.now_ms(), allow_same_timestamp); + let (gen_utime, gen_utime_ms) = Self::calc_utime(prev, now_ms, allow_same_timestamp); Ok((gen_utime, gen_utime_ms)) } diff --git a/src/node/src/validator/consensus.rs b/src/node/src/validator/consensus.rs index de28f0a..569d86c 100644 --- a/src/node/src/validator/consensus.rs +++ b/src/node/src/validator/consensus.rs @@ -600,7 +600,7 @@ pub enum ConsensusType { /// Old catchain-based validator-session (default) #[default] Catchain, - /// New Alpenglow-based simplex consensus + /// Simplex consensus Simplex, } diff --git a/src/node/src/validator/fabric.rs b/src/node/src/validator/fabric.rs index e6a93c4..4975003 100644 --- a/src/node/src/validator/fabric.rs +++ b/src/node/src/validator/fabric.rs @@ -285,7 +285,7 @@ pub async fn run_accept_block_query( pub async fn run_collate_query( shard: ShardIdent, - _min_ts: SystemTime, + min_ts: SystemTime, min_mc_seqno: u32, prev: &PrevBlockHistory, pipeline_context: PipelineContext, @@ -308,7 +308,14 @@ pub async fn run_collate_query( UInt256::from(collator_id.pub_key()?), engine.clone(), None, - CollatorSettings { is_simplex, ..Default::default() }, + CollatorSettings { + is_simplex, + min_gen_utime_ms: Some( + min_ts.duration_since(SystemTime::UNIX_EPOCH).unwrap_or_default().as_millis() + as u64, + ), + ..Default::default() + }, )?; let collate_result = collator.collate().await; diff --git a/src/node/src/validator/mod.rs b/src/node/src/validator/mod.rs index 5d4e4f4..1e5d087 100644 --- a/src/node/src/validator/mod.rs +++ b/src/node/src/validator/mod.rs @@ -62,6 +62,8 @@ pub struct CollatorSettings { pub lt_compatible: bool, // true when running under simplex consensus (passed from ValidatorGroup) pub is_simplex: bool, + // when set, collator must not choose gen_utime_ms earlier than this value + pub min_gen_utime_ms: Option, } impl CollatorSettings { diff --git a/src/node/src/validator/tests/test_session_id.rs 
b/src/node/src/validator/tests/test_session_id.rs index 862b4e0..796f5dd 100644 --- a/src/node/src/validator/tests/test_session_id.rs +++ b/src/node/src/validator/tests/test_session_id.rs @@ -695,7 +695,7 @@ fn test_simplex_empty_block_lag_threshold_matches_cpp_policy() { } #[test] -fn test_runtime_simplex_options_use_temporary_strict_collation_mode() { +fn test_runtime_simplex_options_keep_cpp_empty_block_policy() { let catchain_options = CatchainSessionOptions { proto_version: 4, max_block_size: 1024, @@ -708,26 +708,12 @@ fn test_runtime_simplex_options_use_temporary_strict_collation_mode() { &SimplexConfig::default(), &catchain_options, ); - assert!( - mc_opts.require_notarized_parent_for_collation, - "runtime manager-built simplex sessions must currently stay in strict mode" - ); assert_eq!(mc_opts.empty_block_mc_lag_threshold, None); let shard = ShardIdent::with_tagged_prefix(0, 0x8000_0000_0000_0000).unwrap(); let shard_opts = build_runtime_simplex_session_options(&shard, &SimplexConfig::default(), &catchain_options); - assert!( - shard_opts.require_notarized_parent_for_collation, - "runtime shard sessions must currently stay in strict mode" - ); assert_eq!(shard_opts.empty_block_mc_lag_threshold, Some(8)); - - let default_opts = simplex::SessionOptions::default(); - assert!( - default_opts.require_notarized_parent_for_collation, - "default session options must remain in strict mode for tests/manual sessions" - ); } #[test] @@ -751,3 +737,55 @@ fn test_mc_registered_top_for_shard_returns_session_specific_block_id() { "unknown shard must not produce MC-registered top block id" ); } + +#[test] +fn test_classify_no_current_session_health_skips_when_not_applicable() { + assert_eq!( + classify_no_current_session_health(false, ValidationStatus::Active, 0, 0, 0), + None, + "outside the current validator set there is no no-current-session health state" + ); + assert_eq!( + classify_no_current_session_health(true, ValidationStatus::Waiting, 0, 0, 0), + None, + 
"validation must be enabled before classifying a no-current-session state" + ); + assert_eq!( + classify_no_current_session_health(true, ValidationStatus::Active, 1, 0, 0), + None, + "an existing current session suppresses the no-current-session classification" + ); +} + +#[test] +fn test_classify_no_current_session_health_detects_benign_subset_gap() { + let reason = classify_no_current_session_health(true, ValidationStatus::Active, 0, 0, 0) + .expect("expected a no-current-session reason"); + + assert_eq!(reason, NoCurrentSessionHealthReason::NoCurrentSubsetOwned); + assert!( + !reason.should_warn(), + "not being selected into a current subset must stay non-actionable" + ); +} + +#[test] +fn test_classify_no_current_session_health_detects_future_only_gap() { + let reason = classify_no_current_session_health(true, ValidationStatus::Active, 0, 0, 2) + .expect("expected a no-current-session reason"); + + assert_eq!(reason, NoCurrentSessionHealthReason::FutureSubsetOnly); + assert!(!reason.should_warn(), "future-only ownership must not emit the actionable warning"); +} + +#[test] +fn test_classify_no_current_session_health_warns_for_owned_current_subset() { + let reason = classify_no_current_session_health(true, ValidationStatus::Active, 0, 1, 1) + .expect("expected a no-current-session reason"); + + assert_eq!(reason, NoCurrentSessionHealthReason::MissingOwnedCurrentSubsetSession); + assert!( + reason.should_warn(), + "owning a current subset without a current session must remain actionable" + ); +} diff --git a/src/node/src/validator/validator_group.rs b/src/node/src/validator/validator_group.rs index 74f1609..7b357da 100644 --- a/src/node/src/validator/validator_group.rs +++ b/src/node/src/validator/validator_group.rs @@ -1022,6 +1022,7 @@ impl ValidatorGroup { ) }) .await; + let min_ts = min_ts.max(request.get_creation_time()); if !is_collator && self.is_accelerated_consensus_enabled { log::info!( @@ -1064,7 +1065,7 @@ impl ValidatorGroup { // - 
last_committed_seqno: from `prev_block_ids` (finalized/committed head) // - last_collated_seqno: from `pipeline_context` (accelerated consensus), if any // - // With notarized-parent collation (require_finalized_parent=false), the parent + // With notarized-parent collation, the parent // can be *ahead* of last_committed_seqno (notarized but not yet finalized). // This is expected and allowed. The guard only rejects parents that are *behind* // our current collation head (going backward). diff --git a/src/node/src/validator/validator_manager.rs b/src/node/src/validator/validator_manager.rs index 72800c3..faef9d8 100644 --- a/src/node/src/validator/validator_manager.rs +++ b/src/node/src/validator/validator_manager.rs @@ -50,10 +50,6 @@ const MC_ACCELERATED_CONSENSUS_ENABLED: bool = true; #[cfg(not(feature = "xp25"))] const MC_ACCELERATED_CONSENSUS_ENABLED: bool = false; -// TODO(simplex-mc-parity): flip this back to `false` once runtime collation is ready to -// use the C++-compatible optimistic whole-window pipelining mode in production. 
-const SIMPLEX_RUNTIME_REQUIRE_NOTARIZED_PARENT_FOR_COLLATION: bool = true; - fn format_shard_short(shard: &ShardIdent) -> String { if shard.is_masterchain() { "MC".to_string() @@ -122,6 +118,7 @@ fn build_runtime_simplex_session_options( proto_version: catchain_options.proto_version as u32, slots_per_leader_window: cfg.slots_per_leader_window, target_rate: Duration::from_millis(np.target_rate_ms as u64), + min_block_interval: Duration::from_millis(np.min_block_interval_ms as u64), first_block_timeout: Duration::from_millis(np.first_block_timeout_ms as u64), first_block_timeout_multiplier: f32::from_bits(np.first_block_timeout_multiplier_bits) as f64, @@ -139,13 +136,13 @@ fn build_runtime_simplex_session_options( max_leader_window_desync: np.max_leader_window_desync, bad_signature_ban_duration: Duration::from_millis(np.bad_signature_ban_duration_ms as u64), candidate_resolve_rate_limit: np.candidate_resolve_rate_limit, - // TODO(simplex-mc-parity): temporary strict runtime mode until optimistic - // whole-window pipelining is enabled for production sessions. - require_notarized_parent_for_collation: - SIMPLEX_RUNTIME_REQUIRE_NOTARIZED_PARENT_FOR_COLLATION, max_block_size: catchain_options.max_block_size as usize, max_collated_data_size: catchain_options.max_collated_data_size as usize, + use_callback_thread: false, use_quic: cfg.use_quic, + no_empty_blocks_on_error_timeout: Duration::from_millis( + np.no_empty_blocks_on_error_timeout_ms as u64, + ), // C++ parity: shard sessions use lag threshold 8 for empty-block recovery. // MC sessions use internal consensus-finalized tracking and keep this unset. 
empty_block_mc_lag_threshold: simplex_empty_block_lag_threshold(shard), @@ -616,6 +613,47 @@ impl ValidationStatus { } } +#[derive(PartialEq, Eq, Clone, Copy, Debug)] +enum NoCurrentSessionHealthReason { + MissingOwnedCurrentSubsetSession, + FutureSubsetOnly, + NoCurrentSubsetOwned, +} + +impl NoCurrentSessionHealthReason { + fn label(&self) -> &'static str { + match self { + Self::MissingOwnedCurrentSubsetSession => "owned_current_subset_missing_session", + Self::FutureSubsetOnly => "future_subset_only", + Self::NoCurrentSubsetOwned => "no_current_subset_owned", + } + } + + fn should_warn(&self) -> bool { + matches!(self, Self::MissingOwnedCurrentSubsetSession) + } +} + +fn classify_no_current_session_health( + in_current_set: bool, + validation_status: ValidationStatus, + current_sessions_len: usize, + owned_current_shards: usize, + owned_future_shards: usize, +) -> Option { + if !in_current_set || !validation_status.allows_validate() || current_sessions_len > 0 { + return None; + } + + if owned_current_shards > 0 { + Some(NoCurrentSessionHealthReason::MissingOwnedCurrentSubsetSession) + } else if owned_future_shards > 0 { + Some(NoCurrentSessionHealthReason::FutureSubsetOnly) + } else { + Some(NoCurrentSessionHealthReason::NoCurrentSubsetOwned) + } +} + /// Local node's participation record for a single validator list. /// /// Stores local validator keys in the same local-key order that C++ uses for `temp_keys_`, @@ -698,6 +736,12 @@ struct ValidatorManagerImpl { current_sessions: HashMap>, /// Sessions for the next (future) validator set with pre-created engines. future_sessions: HashMap>, + /// Number of current subsets selected for the local node on the last successful + /// `update_shards()` pass. + owned_current_shards: usize, + /// Number of future subsets selected for the local node on the last successful + /// `update_shards()` pass. 
+ owned_future_shards: usize, validator_list_status: ValidatorListStatus, config: ValidatorManagerConfig, /// Session IDs that have been destroyed and must not be recreated until the next @@ -724,6 +768,8 @@ impl ValidatorManagerImpl { rt: rt.clone(), current_sessions: HashMap::default(), future_sessions: HashMap::default(), + owned_current_shards: 0, + owned_future_shards: 0, validator_list_status: ValidatorListStatus::default(), config, destroyed_sessions: HashSet::new(), @@ -1356,6 +1402,8 @@ impl ValidatorManagerImpl { self.engine.set_validation_status(ValidationStatus::Disabled); self.sync_complete = false; + self.owned_current_shards = 0; + self.owned_future_shards = 0; let existing_validator_sessions: HashSet = self.current_sessions.keys().chain(self.future_sessions.keys()).cloned().collect(); @@ -1520,6 +1568,16 @@ impl ValidatorManagerImpl { accelerated_consensus_enabled, ); + let local_id_option = self.find_us(&subset.validators); + + if local_id_option.is_some() { + // Track owned current subsets independently of whether this specific session + // is temporarily suppressed by the destroyed-session blacklist. + // Otherwise health classification can mislabel a missing owned current session + // as benign `NoCurrentSubsetOwned`. + our_current_shards.insert(ident.clone(), vsubset.clone()); + } + // C++ parity: skip sessions in the destroyed set if self.destroyed_sessions.contains(&session_id) { log::trace!( @@ -1529,11 +1587,7 @@ impl ValidatorManagerImpl { continue; } - let local_id_option = self.find_us(&subset.validators); - if let Some(local_id) = &local_id_option { - our_current_shards.insert(ident.clone(), vsubset.clone()); - log::debug!( target: "validator_manager", "subset for session: shard {}, cc_seqno {}, keyblock_seqno {}, \ @@ -1943,12 +1997,14 @@ impl ValidatorManagerImpl { log::trace!(target: "validator_manager", "Missing sessions started. 
Current shards:"); // Iterate over future shards and create all future sessions + let mut owned_future_shards = 0usize; for (ident, (wc, next_cc_seqno, next_val_list_id)) in our_future_shards.iter() { if ident.is_masterchain() { mc_validators.append(&mut wc.validators.clone()); } if let Some(local_id) = self.find_us_for_list(&wc.validators, next_val_list_id) { + owned_future_shards += 1; let max_vertical_seqno = self.engine.hardforks().len() as u32; let new_session_info = Arc::new(GeneralSessionInfo { shard: ident.clone(), @@ -2044,6 +2100,9 @@ impl ValidatorManagerImpl { } } + self.owned_current_shards = our_current_shards.len(); + self.owned_future_shards = owned_future_shards; + // Stale-future culling: remove future entries whose shard already has a current // group with equal or higher cc_seqno, or whose shard is an ancestor/descendant // of a current shard with strictly higher cc_seqno. @@ -2188,11 +2247,24 @@ impl ValidatorManagerImpl { log::warn!(target: "validator_manager", "HEALTH_CHECK: {} session(s) stalled (validation queue inactive)", stalled_count); } - if in_current_set && self.current_sessions.is_empty() && validation_status.allows_validate() - { - log::warn!(target: "validator_manager", - "HEALTH_CHECK: node is in current validator set but has no current sessions \ - (validation_status={:?})", validation_status); + if let Some(reason) = classify_no_current_session_health( + in_current_set, + validation_status, + self.current_sessions.len(), + self.owned_current_shards, + self.owned_future_shards, + ) { + if reason.should_warn() { + log::warn!(target: "validator_manager", + "HEALTH_CHECK: node is in current validator set but has no current sessions \ + (reason={}, owned_current_shards={}, owned_future_shards={}, \ + future_sessions={}, validation_status={:?})", + reason.label(), + self.owned_current_shards, + self.owned_future_shards, + self.future_sessions.len(), + validation_status); + } } if validation_status.allows_validate() { let sync_count = 
state_counts.get("sync").copied().unwrap_or(0); @@ -2283,12 +2355,15 @@ impl ValidatorManagerImpl { "=== VALIDATOR MANAGER METRICS (once/min) ===\n\ \x20 validation_status={:?} sync_complete={}\n\ \x20 in_current_set={} in_next_set={}\n\ + \x20 owned_subsets: current={} future={}\n\ \x20 sessions: current={} future={} total={} (simplex={} catchain={})\n\ \x20 by_state: [{}] stalled={}", validation_status, self.sync_complete, in_current_set, in_next_set, + self.owned_current_shards, + self.owned_future_shards, self.current_sessions.len(), self.future_sessions.len(), self.current_sessions.len() + self.future_sessions.len(), @@ -2298,6 +2373,24 @@ impl ValidatorManagerImpl { stalled_count, )); + if let Some(reason) = classify_no_current_session_health( + in_current_set, + validation_status, + self.current_sessions.len(), + self.owned_current_shards, + self.owned_future_shards, + ) { + lines.push(format!( + " NO CURRENT SESSIONS: reason={} owned_current_shards={} \ + owned_future_shards={} current_sessions={} future_sessions={}", + reason.label(), + self.owned_current_shards, + self.owned_future_shards, + self.current_sessions.len(), + self.future_sessions.len(), + )); + } + // ── Validator keys ── lines.push(String::from(" VALIDATOR KEYS:")); for (role, list_id_opt, utime_opt) in [ diff --git a/src/node/storage/src/cell_db.rs b/src/node/storage/src/cell_db.rs index 1ad3c4f..fce8f21 100644 --- a/src/node/storage/src/cell_db.rs +++ b/src/node/storage/src/cell_db.rs @@ -196,6 +196,16 @@ impl CellDb { } } + /// Check if a cell is present in the in-memory LRU cache. + pub fn is_in_cache(&self, cell_id: &UInt256) -> bool { + self.cell_cache.get(cell_id).is_some() + } + + /// Remove a cell from the in-memory LRU cache. 
+ pub fn remove_from_cache(&self, cell_id: &UInt256) { + self.cell_cache.remove(cell_id); + } + #[cfg(test)] pub fn count(&self) -> usize { if let Ok(cf) = self.cells_cf() { diff --git a/src/node/storage/src/dynamic_boc_rc_db.rs b/src/node/storage/src/dynamic_boc_rc_db.rs index 1dc497b..cea6713 100644 --- a/src/node/storage/src/dynamic_boc_rc_db.rs +++ b/src/node/storage/src/dynamic_boc_rc_db.rs @@ -352,6 +352,9 @@ impl DynamicBocDb { transaction.delete_cf(&cells_cf, id.as_slice()); // if there is no counter with the key, then it will be just ignored transaction.delete_cf(&counters_cf, id.as_slice()); + // Remove from cell_cache so that save_boc won't treat this cell + // as still persisted in DB (is:: + cache hit). + self.cell_db.remove_from_cache(id); deleted += 1; } else { transaction.put_cf(&counters_cf, id.as_slice(), counter.to_le_bytes()); @@ -432,7 +435,7 @@ impl DynamicBocDb { let cell_id = cell.repr_hash(); - if cell.is::() { + if cell.is::() && self.cell_db.is_in_cache(&cell_id) { return Ok((false, None)); } diff --git a/src/node/tests/compat_test/src/test_helpers.rs b/src/node/tests/compat_test/src/test_helpers.rs index c774c99..93e68bd 100644 --- a/src/node/tests/compat_test/src/test_helpers.rs +++ b/src/node/tests/compat_test/src/test_helpers.rs @@ -18,7 +18,8 @@ use adnl::{ TaggedTlObject, }, node::{AdnlNode, AdnlNodeConfig, AdnlSendMethod, IpAddress}, - OverlayNode, OverlayNodeInfo, OverlayParams, OverlayShortId, QuicNode, RldpNode, + OverlayNode, OverlayNodeInfo, OverlayParams, OverlayShortId, QuicNode, QuicRateLimitConfig, + RldpNode, }; use std::{ net::{Ipv4Addr, SocketAddr}, @@ -532,6 +533,7 @@ impl RustQuicTestNode { cancellation_token.clone(), None, rt.handle().clone(), + Some(QuicRateLimitConfig::disabled()), ); let bind_addr = SocketAddr::new( Ipv4Addr::from(adnl.ip_address_adnl().ip()).into(),