From 208ddcde4e6002f3b145734ae32f8533276a87dc Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Wed, 14 Jan 2026 22:54:48 +0100 Subject: [PATCH 01/10] fix: fallback date comparison --- hil/src/commands/ota/system.rs | 68 ++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 590f549df..036828f59 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -130,7 +130,7 @@ fn update_versions_json_content( /// Wait for system time to be synchronized via NTP/chrony pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { use std::time::Duration; - use tracing::info; + use tracing::{info, warn}; const MAX_ATTEMPTS: u32 = 60; // 60 attempts = 2 minutes max wait const SLEEP_DURATION: Duration = Duration::from_secs(2); @@ -167,10 +167,72 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { } } - bail!( - "Timeout waiting for system time synchronization after {} seconds", + warn!( + "timedatectl did not report sync after {} seconds, falling back to date comparison", MAX_ATTEMPTS * 2 ); + + // Fallback: Compare Orb's date with PC's date + // If difference is less than 1 month, consider it acceptable + check_time_difference_fallback(session).await +} + +/// Fallback time check: Compare Orb's time with local PC time +/// Accept if difference is less than 1 month +async fn check_time_difference_fallback(session: &SshWrapper) -> Result<()> { + use tracing::info; + + info!("Checking time difference between Orb and local PC..."); + + // Get Orb's current timestamp (Unix epoch seconds) + let orb_time_result = session + .execute_command("TERM=dumb date +%s") + .await + .wrap_err("Failed to get Orb's timestamp")?; + + ensure!( + orb_time_result.is_success(), + "Failed to get Orb timestamp: {}", + orb_time_result.stderr + ); + + let orb_timestamp: i64 = orb_time_result + .stdout + .trim() + .parse() + .wrap_err("Failed to parse Orb timestamp")?; + + // Get local PC's current timestamp + let local_timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .wrap_err("Failed to get local system time")? + .as_secs() as i64; + + let time_diff_seconds = (orb_timestamp - local_timestamp).abs(); + let time_diff_days = time_diff_seconds / 86400; // 86400 seconds in a day + + const MAX_ACCEPTABLE_DIFF_DAYS: i64 = 30; // 1 month tolerance + + info!( + "Time difference: {} days ({} seconds)", + time_diff_days, time_diff_seconds + ); + + if time_diff_seconds < MAX_ACCEPTABLE_DIFF_DAYS * 86400 { + info!( + "Time difference of {} days is within acceptable range (< {} days)", + time_diff_days, MAX_ACCEPTABLE_DIFF_DAYS + ); + + Ok(()) + } else { + bail!( + "Time difference too large: {} days (max acceptable: {} days). \ + Orb time may be significantly out of sync.", + time_diff_days, + MAX_ACCEPTABLE_DIFF_DAYS + ); + } } /// Restart the update agent service and return the start timestamp From d8872146a5a55d58e60e08e940f028217335dbfa Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Wed, 14 Jan 2026 22:54:48 +0100 Subject: [PATCH 02/10] fix: fallback date comparison --- hil/src/commands/ota/system.rs | 68 ++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 590f549df..036828f59 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -130,7 +130,7 @@ fn update_versions_json_content( /// Wait for system time to be synchronized via NTP/chrony pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { use std::time::Duration; - use tracing::info; + use tracing::{info, warn}; const MAX_ATTEMPTS: u32 = 60; // 60 attempts = 2 minutes max wait const SLEEP_DURATION: Duration = Duration::from_secs(2); @@ -167,10 +167,72 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { } } - bail!( - "Timeout waiting for system time synchronization after {} seconds", + warn!( + "timedatectl did not report sync after {} seconds, falling back to date comparison", MAX_ATTEMPTS * 2 ); + + // Fallback: Compare Orb's date with PC's date + // If difference is less than 1 month, consider it acceptable + check_time_difference_fallback(session).await +} + +/// Fallback time check: Compare Orb's time with local PC time +/// Accept if difference is less than 1 month +async fn check_time_difference_fallback(session: &SshWrapper) -> Result<()> { + use tracing::info; + + info!("Checking time difference between Orb and local PC..."); + + // Get Orb's current timestamp (Unix epoch seconds) + let orb_time_result = session + .execute_command("TERM=dumb date +%s") + .await + .wrap_err("Failed to get Orb's timestamp")?; + + ensure!( + orb_time_result.is_success(), + "Failed to get Orb timestamp: {}", + orb_time_result.stderr + ); + + let orb_timestamp: i64 = orb_time_result + .stdout + .trim() + .parse() + .wrap_err("Failed to parse Orb timestamp")?; + + // Get local PC's current timestamp + let local_timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .wrap_err("Failed to get local system time")? + .as_secs() as i64; + + let time_diff_seconds = (orb_timestamp - local_timestamp).abs(); + let time_diff_days = time_diff_seconds / 86400; // 86400 seconds in a day + + const MAX_ACCEPTABLE_DIFF_DAYS: i64 = 30; // 1 month tolerance + + info!( + "Time difference: {} days ({} seconds)", + time_diff_days, time_diff_seconds + ); + + if time_diff_seconds < MAX_ACCEPTABLE_DIFF_DAYS * 86400 { + info!( + "Time difference of {} days is within acceptable range (< {} days)", + time_diff_days, MAX_ACCEPTABLE_DIFF_DAYS + ); + + Ok(()) + } else { + bail!( + "Time difference too large: {} days (max acceptable: {} days). \ + Orb time may be significantly out of sync.", + time_diff_days, + MAX_ACCEPTABLE_DIFF_DAYS + ); + } } /// Restart the update agent service and return the start timestamp From ce1e7fec117c93ab428497e60c35d74a178f6dab Mon Sep 17 00:00:00 2001 From: chrisgalanis <50170911+chrisgalanis@users.noreply.github.com> Date: Thu, 15 Jan 2026 18:10:41 +0100 Subject: [PATCH 03/10] fix(hil-ota): Revert To Working State (#935) This pull request introduces improvements to the OTA update process and device reboot logic, with a focus on configurability, reliability, and robustness around time synchronization and FTDI device handling. The most important changes are grouped below: **OTA Update Process Improvements:** * Added a `--skip-time-sync-before-reboot` flag to the `Ota` command, allowing users to optionally skip the NTP time synchronization check before the first reboot. Time sync is still enforced after reboot and before starting the update. [[1]](diffhunk://#diff-4417a067e971532511cd378570394d24d393625d1e510ecc9475239495917805R72-R76) [[2]](diffhunk://#diff-4417a067e971532511cd378570394d24d393625d1e510ecc9475239495917805L108-R132) * Ensured that the log directory is created or verified before starting the OTA process, improving reliability of log file handling. **Device Reboot and FTDI Handling:** * Refactored the reboot logic to use named constants for timing delays and clarified the handling of the recovery pin state, making the reboot process more maintainable and readable. [[1]](diffhunk://#diff-201f44f16c361cb2807a9ba916d437dd592b40a19e0a1b9f095b37e3ea38cd7cR26-R28) [[2]](diffhunk://#diff-201f44f16c361cb2807a9ba916d437dd592b40a19e0a1b9f095b37e3ea38cd7cL40-R48) [[3]](diffhunk://#diff-201f44f16c361cb2807a9ba916d437dd592b40a19e0a1b9f095b37e3ea38cd7cL49-R68) [[4]](diffhunk://#diff-201f44f16c361cb2807a9ba916d437dd592b40a19e0a1b9f095b37e3ea38cd7cL77-R85) **Time Synchronization Robustness:** * Enhanced the time synchronization check to detect and use either `chronyc` or `timedatectl` (preferring `chronyc` if available), with robust error handling and command timeouts. This ensures the system waits for accurate time sync using the best available tool. --- hil/src/boot.rs | 18 +- hil/src/commands/ota/mod.rs | 26 +- hil/src/commands/ota/reboot.rs | 429 +++++++++++++++++++++++++++++---- hil/src/commands/ota/system.rs | 111 +++++++-- 4 files changed, 519 insertions(+), 65 deletions(-) diff --git a/hil/src/boot.rs b/hil/src/boot.rs index 9116fd933..f984ab3ac 100644 --- a/hil/src/boot.rs +++ b/hil/src/boot.rs @@ -23,6 +23,9 @@ pub async fn is_recovery_mode_detected() -> Result { /// If `device` is `None`, will get the first available device. #[tracing::instrument] pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { + const DEFAULT_HOLDING_DELAY: u64 = 5; + const INBETWEEN_DELAY: u64 = 4; + fn make_ftdi(device: Option) -> Result { let builder = FtdiGpio::builder(); let builder = match &device { @@ -37,7 +40,12 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { info!("Turning off"); let device_clone = device.cloned(); - let ftdi = tokio::task::spawn_blocking(|| -> Result<_, color_eyre::Report> { + let recovery_state = if recovery { + OutputState::Low + } else { + OutputState::High + }; + let ftdi = tokio::task::spawn_blocking(move || -> Result<_, color_eyre::Report> { for d in FtdiGpio::list_devices().wrap_err("failed to list ftdi devices")? { debug!( "ftdi device: desc:{}, serial:{}, vid:{}, pid:{}", @@ -46,18 +54,18 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { } let mut ftdi = make_ftdi(device_clone)?; ftdi.set_pin(BUTTON_PIN, OutputState::Low)?; - ftdi.set_pin(RECOVERY_PIN, OutputState::High)?; + ftdi.set_pin(RECOVERY_PIN, recovery_state)?; Ok(ftdi) }) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(10)).await; + tokio::time::sleep(Duration::from_secs(DEFAULT_HOLDING_DELAY)).await; info!("Resetting FTDI"); tokio::task::spawn_blocking(move || ftdi.destroy()) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(4)).await; + tokio::time::sleep(Duration::from_secs(INBETWEEN_DELAY)).await; info!("Turning on"); let device_clone = device.cloned(); @@ -74,7 +82,7 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { }) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(4)).await; + tokio::time::sleep(Duration::from_secs(DEFAULT_HOLDING_DELAY)).await; tokio::task::spawn_blocking(move || ftdi.destroy()) .await diff --git a/hil/src/commands/ota/mod.rs b/hil/src/commands/ota/mod.rs index 084ef1fe8..18511a740 100644 --- a/hil/src/commands/ota/mod.rs +++ b/hil/src/commands/ota/mod.rs @@ -69,6 +69,11 @@ pub struct Ota { /// Serial port ID for boot log capture (alternative to --serial-path) #[arg(long, group = "serial")] serial_id: Option, + + /// Skip NTP time synchronization check before the first reboot (after wipe_overlays). + /// Time sync will still be checked after reboot and before starting the update. + #[arg(long, default_value = "false")] + skip_time_sync_before_reboot: bool, } #[derive(Debug, Clone, clap::ValueEnum)] @@ -94,6 +99,13 @@ impl Ota { let _start_time = Instant::now(); info!("Starting OTA update to version: {}", self.target_version); + if let Some(log_dir) = self.log_file.parent() { + tokio::fs::create_dir_all(log_dir).await.wrap_err_with(|| { + format!("Failed to create log directory: {}", log_dir.display()) + })?; + info!("Log directory created/verified: {}", log_dir.display()); + } + let session = self.connect_ssh().await.inspect_err(|e| { println!("OTA_RESULT=FAILED"); println!("OTA_ERROR=SSH_CONNECTION_FAILED: {e}"); @@ -105,7 +117,19 @@ impl Ota { system::wipe_overlays(&session).await.inspect_err(|e| { error!("Failed to wipe overlays: {}", e); })?; - info!("Overlays wiped successfully, rebooting device"); + info!("Overlays wiped successfully"); + + if !self.skip_time_sync_before_reboot { + info!("Waiting for NTP time synchronization before reboot"); + system::wait_for_time_sync(&session) + .await + .inspect_err(|e| { + error!("Failed to sync time before reboot: {}", e); + })?; + info!("NTP time synchronized, rebooting device"); + } else { + info!("Skipping NTP time synchronization before reboot (--skip-time-sync-before-reboot flag set)"); + } system::reboot_orb(&session).await?; info!("Reboot command sent to Orb device"); diff --git a/hil/src/commands/ota/reboot.rs b/hil/src/commands/ota/reboot.rs index 2560e4ca1..82a0b457c 100644 --- a/hil/src/commands/ota/reboot.rs +++ b/hil/src/commands/ota/reboot.rs @@ -15,21 +15,32 @@ use tracing::{debug, error, info, instrument, warn}; use super::Ota; +const DELAY_CAPTURE_LOGS: u64 = 200; + impl Ota { #[instrument(skip_all)] pub(super) async fn handle_reboot(&self, log_suffix: &str) -> Result { info!("Waiting for reboot and device to come back online"); - // Set recovery pin HIGH for 5 seconds to prevent entering recovery mode - info!("Setting recovery pin HIGH to prevent recovery mode during reboot"); + // Always wait for SSH to become unreachable before holding the recovery pin. + info!("Monitoring SSH connection to detect when shutdown actually begins"); + self.wait_for_ssh_disconnection(Duration::from_secs(30)) + .await?; + info!("SSH disconnected - system is shutting down, holding recovery pin"); + + let hold_duration = 20; + + info!( + "Setting recovery pin HIGH to prevent recovery mode during reboot (hold duration: {}s)", + hold_duration + ); let set_recovery = SetRecoveryPin { state: OutputState::High, serial_num: None, desc: None, - duration: 5, + duration: hold_duration, }; - // Run recovery pin setting in background task let recovery_task = tokio::spawn(async move { set_recovery .run() @@ -37,13 +48,26 @@ impl Ota { .wrap_err("failed to set recovery pin") }); - self.capture_boot_logs(log_suffix).await?; - - // Wait for recovery pin task to complete recovery_task .await .wrap_err("recovery pin task panicked")??; + // Brief delay to allow USB device to be re-enumerated and udev rules to apply + // after FTDI GPIO is released. The FTDI device detaches/reattaches kernel + // drivers which causes /dev/ttyUSB* to be recreated. + tokio::time::sleep(Duration::from_millis(DELAY_CAPTURE_LOGS)).await; + + // Spawn boot log capture as a background task so it runs concurrently + // with SSH reconnection attempts. Extract needed values upfront. + let platform = self.platform.clone(); + let log_file = self.log_file.clone(); + let serial_path = self.get_serial_path().ok(); + let boot_log_suffix = log_suffix.to_string(); + let boot_log_task = tokio::spawn(async move { + Self::capture_boot_logs(platform, log_file, serial_path, &boot_log_suffix) + .await + }); + let start_time = Instant::now(); let timeout = Duration::from_secs(900); // 15 minutes let mut attempt_count = 0; @@ -63,7 +87,37 @@ impl Ota { Ok(session) => match session.test_connection().await { Ok(_) => { info!("Device is back online and responsive after reboot (attempt {})", attempt_count); - return Ok(session); + + info!("Waiting for NTP time synchronization after reboot"); + match super::system::wait_for_time_sync(&session).await { + Ok(_) => { + info!("NTP time synchronized successfully"); + + // Wait for boot log capture to finish + match boot_log_task.await { + Ok(Ok(())) => { + info!( + "Boot log capture completed successfully" + ); + } + Ok(Err(e)) => { + warn!("Boot log capture failed: {}", e); + } + Err(e) => { + warn!("Boot log capture task panicked: {}", e); + } + } + + return Ok(session); + } + Err(e) => { + debug!( + "Time sync failed on attempt {}: {}", + attempt_count, e + ); + last_error = Some(e); + } + } } Err(e) => { debug!( @@ -95,6 +149,21 @@ impl Ota { "No specific error captured".to_string() }; + // Try hardware button reboot as fallback + warn!("SSH reconnection failed, attempting hardware button reboot recovery"); + match self.try_hardware_reboot_recovery(log_suffix).await { + Ok(session) => { + info!("Hardware button reboot recovery succeeded!"); + return Ok(session); + } + Err(recovery_err) => { + error!( + "Hardware button reboot recovery also failed: {}", + recovery_err + ); + } + } + bail!( "Device did not come back online within {:?} (attempted {} times). {}", elapsed, @@ -103,27 +172,40 @@ impl Ota { ); } + /// Captures boot logs from serial port in the background #[instrument(skip_all)] - async fn capture_boot_logs(&self, log_suffix: &str) -> Result<()> { - let platform_name = format!("{:?}", self.platform).to_lowercase(); + async fn capture_boot_logs( + platform: super::Platform, + log_file: std::path::PathBuf, + serial_path: Option, + log_suffix: &str, + ) -> Result<()> { + let platform_name = format!("{:?}", platform).to_lowercase(); info!( "Starting boot log capture for {} ({})", log_suffix, platform_name ); - let boot_log_path = self - .log_file + let boot_log_path = log_file .parent() .unwrap_or_else(|| std::path::Path::new(".")) .join(format!("boot_log_{platform_name}_{log_suffix}.txt")); - let serial_path = match self.get_serial_path() { - Ok(path) => path, - Err(e) => { - warn!( - "Failed to get serial path: {}. Skipping boot log capture.", - e - ); + // Create parent directory if it doesn't exist + if let Some(parent) = boot_log_path.parent() + && let Err(e) = tokio::fs::create_dir_all(parent).await + { + warn!( + "Failed to create directory {}: {}. Boot log capture may fail.", + parent.display(), + e + ); + } + + let serial_path = match serial_path { + Some(path) => path, + None => { + warn!("No serial path provided. Skipping boot log capture."); return Ok(()); } }; @@ -151,21 +233,52 @@ impl Ota { spawn_serial_reader_task(serial_reader, serial_output_tx); let boot_log_fut = async { - let mut boot_log_content = Vec::new(); + use tokio::io::AsyncWriteExt; + + // Open file for writing incrementally + let mut log_file = match tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&boot_log_path) + .await + { + Ok(f) => Some(f), + Err(e) => { + warn!( + "Failed to open boot log file {}: {}. Will continue without writing to disk.", + boot_log_path.display(), + e + ); + None + } + }; + + let mut total_bytes = 0; let mut serial_stream = BroadcastStream::new(serial_output_rx); - // 3-minute timeout for flaky serial connections - let timeout = Duration::from_secs(180); let start_time = Instant::now(); let mut found_login_prompt = false; - while start_time.elapsed() < timeout { + // Wait indefinitely until login prompt is detected + loop { match tokio::time::timeout(Duration::from_secs(1), serial_stream.next()) .await { Ok(Some(Ok(bytes))) => { - boot_log_content.extend_from_slice(&bytes); + // Write to file immediately as data arrives + if let Some(ref mut file) = log_file { + if let Err(e) = file.write_all(&bytes).await { + warn!("Failed to write to boot log file: {}. Continuing capture in memory only.", e); + log_file = None; + } else { + // Flush to ensure data is written to disk immediately + let _ = file.flush().await; + total_bytes += bytes.len(); + } + } + // Stop capturing when login prompt is detected if let Ok(text) = String::from_utf8(bytes.to_vec()) && text.contains(LOGIN_PROMPT_PATTERN) { @@ -190,31 +303,27 @@ impl Ota { } } - if start_time.elapsed() >= timeout && !found_login_prompt { + if found_login_prompt { + info!( + "Boot log capture completed successfully after {:?}", + start_time.elapsed() + ); + } else { warn!( - "Boot log capture timed out after {:?} without finding login prompt. Will proceed with SSH reconnection anyway.", - timeout + "Boot log capture ended without detecting login prompt after {:?}", + start_time.elapsed() ); } - if !boot_log_content.is_empty() { - match tokio::fs::write(&boot_log_path, &boot_log_content).await { - Ok(_) => { - info!( - "Boot log saved to: {} ({} bytes)", - boot_log_path.display(), - boot_log_content.len() - ); - } - Err(e) => { - warn!( - "Failed to write boot log to {}: {}. Continuing anyway.", - boot_log_path.display(), - e - ); - } - } - } else { + if let Some(mut file) = log_file { + let _ = file.flush().await; + let _ = file.shutdown().await; + info!( + "Boot log saved to: {} ({} bytes)", + boot_log_path.display(), + total_bytes + ); + } else if total_bytes == 0 { warn!("No boot log content captured from serial"); } @@ -235,4 +344,234 @@ impl Ota { Ok(()) } + + /// Wait for SSH connection to become unreachable, indicating shutdown has started + #[instrument(skip_all)] + async fn wait_for_ssh_disconnection(&self, timeout: Duration) -> Result<()> { + let start = Instant::now(); + let mut attempt = 0; + + loop { + if start.elapsed() > timeout { + bail!("SSH did not disconnect within {:?}", timeout); + } + + attempt += 1; + + // Try to establish connection with a lightweight command + match self.connect_ssh().await { + Ok(session) => match session.execute_command("echo").await { + Ok(_) => { + debug!( + "SSH still responsive (attempt {}), waiting for shutdown...", + attempt + ); + tokio::time::sleep(Duration::from_millis(500)).await; + } + Err(_) => { + info!("SSH connection degraded, shutdown likely in progress"); + return Ok(()); + } + }, + Err(_) => { + info!( + "SSH connection lost after {} attempts, shutdown confirmed", + attempt + ); + return Ok(()); + } + } + } + } + + /// Try hardware button reboot as fallback recovery mechanism + /// Performs up to 3 boot attempts with serial log capture + #[instrument(skip_all)] + async fn try_hardware_reboot_recovery( + &self, + log_suffix: &str, + ) -> Result { + const MAX_BOOT_ATTEMPTS: u32 = 3; + + info!( + "Starting hardware button reboot recovery (max {} attempts)", + MAX_BOOT_ATTEMPTS + ); + + for boot_attempt in 1..=MAX_BOOT_ATTEMPTS { + info!( + "Hardware reboot attempt {}/{}", + boot_attempt, MAX_BOOT_ATTEMPTS + ); + + // Perform hardware button reboot + info!("Triggering hardware button reboot (recovery=false)"); + crate::boot::reboot(false, None) + .await + .wrap_err("Failed to trigger hardware button reboot")?; + + // Brief delay to allow USB device to be re-enumerated + tokio::time::sleep(Duration::from_millis(DELAY_CAPTURE_LOGS)).await; + + // Capture boot logs and wait for login prompt + let serial_path = match self.get_serial_path() { + Ok(path) => path, + Err(e) => { + warn!("Failed to get serial path for boot log capture: {}", e); + // Continue without serial logs + continue; + } + }; + + info!( + "Opening serial port for boot log capture: {}", + serial_path.display() + ); + let serial = match tokio_serial::new( + &*serial_path.to_string_lossy(), + crate::serial::ORB_BAUD_RATE, + ) + .open_native_async() + { + Ok(s) => s, + Err(e) => { + warn!( + "Failed to open serial port: {}. Continuing without logs.", + e + ); + continue; + } + }; + + let (serial_reader, _serial_writer) = tokio::io::split(serial); + let (serial_output_tx, serial_output_rx) = broadcast::channel(64); + let (reader_task, kill_tx) = + spawn_serial_reader_task(serial_reader, serial_output_tx); + + // Capture boot logs in background while waiting for login prompt + let platform = self.platform.clone(); + let log_file = self.log_file.clone(); + let serial_path_clone = serial_path.clone(); + let boot_log_suffix = + format!("{}_hardware_recovery_{}", log_suffix, boot_attempt); + let boot_log_task = tokio::spawn(async move { + Self::capture_boot_logs( + platform, + log_file, + Some(serial_path_clone), + &boot_log_suffix, + ) + .await + }); + + // Wait for login prompt with timeout + info!("Waiting for login prompt..."); + let wait_result = tokio::time::timeout( + Duration::from_secs(300), // 5 minutes timeout per boot attempt + crate::serial::wait_for_pattern( + LOGIN_PROMPT_PATTERN.to_owned().into_bytes(), + BroadcastStream::new(serial_output_rx), + ), + ) + .await; + + let _ = kill_tx.send(()); + let _ = reader_task.await; + + match wait_result { + Ok(Ok(())) => { + info!("Login prompt detected on boot attempt {}", boot_attempt); + + // Wait a bit for boot to stabilize + tokio::time::sleep(Duration::from_secs(10)).await; + + // Try to SSH connect + info!("Attempting SSH connection after hardware reboot..."); + let mut ssh_attempts = 0; + const MAX_SSH_ATTEMPTS: u32 = 30; + + while ssh_attempts < MAX_SSH_ATTEMPTS { + ssh_attempts += 1; + tokio::time::sleep(Duration::from_secs(10)).await; + + match self.connect_ssh().await { + Ok(session) => { + match session.test_connection().await { + Ok(_) => { + info!("SSH connection established after hardware reboot!"); + + // Wait for time sync + info!("Waiting for NTP time synchronization"); + match super::system::wait_for_time_sync( + &session, + ) + .await + { + Ok(_) => { + info!("Hardware reboot recovery successful!"); + + // Wait for boot log capture to finish + match boot_log_task.await { + Ok(Ok(())) => { + info!("Boot log capture completed for attempt {}", boot_attempt); + } + Ok(Err(e)) => { + warn!("Boot log capture failed for attempt {}: {}", boot_attempt, e); + } + Err(e) => { + warn!("Boot log capture task panicked for attempt {}: {}", boot_attempt, e); + } + } + + return Ok(session); + } + Err(e) => { + warn!("Time sync failed after hardware reboot: {}", e); + } + } + } + Err(e) => { + debug!("SSH connection test failed (attempt {}): {}", ssh_attempts, e); + } + } + } + Err(e) => { + debug!( + "SSH connection failed (attempt {}): {}", + ssh_attempts, e + ); + } + } + } + + warn!( + "SSH connection failed after {} attempts on boot attempt {}", + MAX_SSH_ATTEMPTS, boot_attempt + ); + + // Clean up boot log task since we're moving to next attempt + boot_log_task.abort(); + } + Ok(Err(e)) => { + warn!( + "Error waiting for login prompt on boot attempt {}: {}", + boot_attempt, e + ); + boot_log_task.abort(); + } + Err(_) => { + warn!( + "Timeout waiting for login prompt on boot attempt {}", + boot_attempt + ); + boot_log_task.abort(); + } + } + } + + bail!( + "Hardware reboot recovery failed after {} boot attempts", + MAX_BOOT_ATTEMPTS + ); + } } diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 036828f59..149c19ffb 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -134,28 +134,111 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { const MAX_ATTEMPTS: u32 = 60; // 60 attempts = 2 minutes max wait const SLEEP_DURATION: Duration = Duration::from_secs(2); + // Timeout for individual command execution (10 seconds is generous for timedatectl/chronyc) + const COMMAND_TIMEOUT: Duration = Duration::from_secs(10); info!("Waiting for system time synchronization..."); let sync_start = std::time::Instant::now(); - for attempt in 1..=MAX_ATTEMPTS { - let result = session - .execute_command("TERM=dumb timedatectl status") + // Detect which time sync tool is available (prefer chronyc over timedatectl) + let use_chronyc = session + .execute_command("TERM=dumb command -v chronyc") + .await + .map(|r| r.is_success()) + .unwrap_or(false); + + let use_timedatectl = if !use_chronyc { + session + .execute_command("TERM=dumb command -v timedatectl") .await - .wrap_err("Failed to check time synchronization status")?; + .map(|r| r.is_success()) + .unwrap_or(false) + } else { + false + }; - if result.is_success() { - // Check if "System clock synchronized: yes" appears in output - if result.stdout.contains("System clock synchronized: yes") - || result.stdout.contains("synchronized: yes") + if !use_timedatectl && !use_chronyc { + bail!("Neither chronyc nor timedatectl found on the system"); + } + + info!( + "Using {} for time sync check", + if use_chronyc { + "chronyc" + } else { + "timedatectl" + } + ); + + for attempt in 1..=MAX_ATTEMPTS { + let is_synced = if use_chronyc { + // Try chronyc tracking with timeout + match tokio::time::timeout( + COMMAND_TIMEOUT, + session.execute_command("TERM=dumb chronyc tracking"), + ) + .await { - let sync_duration = sync_start.elapsed(); - info!( - "System time synchronized successfully after {:?}", - sync_duration - ); - return Ok(()); + Ok(Ok(result)) if result.is_success() => { + // Check if chrony is synchronized + // Leap status should be "Normal" when synchronized + result.stdout.contains("Leap status : Normal") + && !result.stdout.contains("Reference ID : 0.0.0.0") + } + Ok(Ok(_)) => false, + Ok(Err(e)) => { + info!( + "Failed to check chronyc status (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + false + } + Err(_) => { + info!( + "chronyc command timed out after {:?} (attempt {}/{})", + COMMAND_TIMEOUT, attempt, MAX_ATTEMPTS + ); + false + } } + } else { + // Try timedatectl with timeout + match tokio::time::timeout( + COMMAND_TIMEOUT, + session.execute_command("TERM=dumb timedatectl"), + ) + .await + { + Ok(Ok(result)) if result.is_success() => { + // Check if "System clock synchronized: yes" appears in output + result.stdout.contains("System clock synchronized: yes") + || result.stdout.contains("synchronized: yes") + } + Ok(Ok(_)) => false, + Ok(Err(e)) => { + info!( + "Failed to check timedatectl status (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + false + } + Err(_) => { + info!( + "timedatectl command timed out after {:?} (attempt {}/{})", + COMMAND_TIMEOUT, attempt, MAX_ATTEMPTS + ); + false + } + } + }; + + if is_synced { + let sync_duration = sync_start.elapsed(); + info!( + "System time synchronized successfully after {:?}", + sync_duration + ); + return Ok(()); } if attempt < MAX_ATTEMPTS { From f97bbc40602a7b0df4c860cf9af3adb6b9cd0a58 Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Thu, 15 Jan 2026 19:30:56 +0100 Subject: [PATCH 04/10] fix: wait for attestatio token to be fetched --- hil/src/commands/ota/mod.rs | 19 ++++++++- hil/src/commands/ota/system.rs | 72 ++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/hil/src/commands/ota/mod.rs b/hil/src/commands/ota/mod.rs index 18511a740..0494ffd23 100644 --- a/hil/src/commands/ota/mod.rs +++ b/hil/src/commands/ota/mod.rs @@ -126,7 +126,15 @@ impl Ota { .inspect_err(|e| { error!("Failed to sync time before reboot: {}", e); })?; - info!("NTP time synchronized, rebooting device"); + info!("NTP time synchronized"); + + info!("Waiting for attestation token before reboot"); + system::wait_for_attestation_token(&session) + .await + .inspect_err(|e| { + error!("Failed to get attestation token before reboot: {}", e); + })?; + info!("Attestation token fetched, rebooting device"); } else { info!("Skipping NTP time synchronization before reboot (--skip-time-sync-before-reboot flag set)"); } @@ -176,6 +184,15 @@ impl Ota { })?; info!("System time synchronized"); + info!("Waiting for attestation token"); + system::wait_for_attestation_token(&session) + .await + .inspect_err(|e| { + println!("OTA_RESULT=FAILED"); + println!("OTA_ERROR=ATTESTATION_TOKEN_FAILED: {e}"); + })?; + info!("Attestation token fetched"); + info!("Restarting worldcoin-update-agent.service"); let start_timestamp = system::restart_update_agent(&session) .await diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 149c19ffb..7e559d57a 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -318,6 +318,78 @@ async fn check_time_difference_fallback(session: &SshWrapper) -> Result<()> { } } +/// Wait for worldcoin-attest service to fetch a valid token +pub async fn wait_for_attestation_token(session: &SshWrapper) -> Result<()> { + use std::time::Duration; + use tracing::{info, warn}; + + const MAX_ATTEMPTS: u32 = 90; // 90 attempts = 3 minutes max wait + const SLEEP_DURATION: Duration = Duration::from_secs(2); + + info!("Waiting for worldcoin-attest to fetch attestation token..."); + let sync_start = std::time::Instant::now(); + + for attempt in 1..=MAX_ATTEMPTS { + // Check if worldcoin-attest service has fetched a token + // Look for "got a new token" in the service logs + let result = session + .execute_command( + "TERM=dumb journalctl -u worldcoin-attest.service -n 50 --no-pager", + ) + .await; + + match result { + Ok(output) if output.is_success() => { + // Check if we can find "got a new token" in recent logs + if output.stdout.contains("got a new token") { + let sync_duration = sync_start.elapsed(); + info!( + "Attestation token fetched successfully after {:?}", + sync_duration + ); + return Ok(()); + } + + // Also check for any recent errors that might indicate a problem + if output.stdout.contains("failed to get challenge") + || output.stdout.contains("failed to fetch challenge") + { + warn!( + "Attestation service showing errors (attempt {}/{}), continuing to wait...", + attempt, MAX_ATTEMPTS + ); + } + } + Ok(_) => { + info!( + "Failed to read attestation service logs (attempt {}/{})", + attempt, MAX_ATTEMPTS + ); + } + Err(e) => { + info!( + "Error reading attestation service logs (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + } + } + + if attempt < MAX_ATTEMPTS { + info!( + "Attestation token not yet fetched (attempt {}/{}), waiting...", + attempt, MAX_ATTEMPTS + ); + tokio::time::sleep(SLEEP_DURATION).await; + } + } + + bail!( + "Attestation token was not fetched after {} seconds. \ + This may cause key retrieval to fail during boot.", + MAX_ATTEMPTS * 2 + ); +} + /// Restart the update agent service and return the start timestamp pub async fn restart_update_agent(session: &SshWrapper) -> Result { // Get current timestamp (ON THE ORB!) before restarting service From a58da5ffcb95ed6281467b4eb79d3e2c6020168b Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Thu, 15 Jan 2026 19:34:22 +0100 Subject: [PATCH 05/10] fix: logs after button reboot --- hil/src/commands/ota/reboot.rs | 105 +++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/hil/src/commands/ota/reboot.rs b/hil/src/commands/ota/reboot.rs index 82a0b457c..d75d5c17a 100644 --- a/hil/src/commands/ota/reboot.rs +++ b/hil/src/commands/ota/reboot.rs @@ -15,7 +15,7 @@ use tracing::{debug, error, info, instrument, warn}; use super::Ota; -const DELAY_CAPTURE_LOGS: u64 = 200; +const DELAY_CAPTURE_LOGS: u64 = 1000; // Increased from 200ms to 1000ms for better USB re-enumeration impl Ota { #[instrument(skip_all)] @@ -93,6 +93,18 @@ impl Ota { Ok(_) => { info!("NTP time synchronized successfully"); + info!("Waiting for attestation token after reboot"); + match super::system::wait_for_attestation_token(&session).await { + Ok(_) => { + info!("Attestation token fetched successfully"); + } + Err(e) => { + debug!("Attestation token fetch failed on attempt {}: {}", attempt_count, e); + last_error = Some(e); + continue; + } + } + // Wait for boot log capture to finish match boot_log_task.await { Ok(Ok(())) => { @@ -385,13 +397,13 @@ impl Ota { } /// Try hardware button reboot as fallback recovery mechanism - /// Performs up to 3 boot attempts with serial log capture + /// Performs a single boot attempt with serial log capture #[instrument(skip_all)] async fn try_hardware_reboot_recovery( &self, log_suffix: &str, ) -> Result { - const MAX_BOOT_ATTEMPTS: u32 = 3; + const MAX_BOOT_ATTEMPTS: u32 = 1; info!( "Starting hardware button reboot recovery (max {} attempts)", @@ -445,23 +457,57 @@ impl Ota { let (serial_reader, _serial_writer) = tokio::io::split(serial); let (serial_output_tx, serial_output_rx) = broadcast::channel(64); + + // Subscribe to channel for boot log capture BEFORE passing tx to reader task + let serial_output_rx2 = serial_output_tx.subscribe(); + let (reader_task, kill_tx) = spawn_serial_reader_task(serial_reader, serial_output_tx); - // Capture boot logs in background while waiting for login prompt - let platform = self.platform.clone(); - let log_file = self.log_file.clone(); - let serial_path_clone = serial_path.clone(); + // Prepare file for boot log capture + let platform_name = format!("{:?}", self.platform).to_lowercase(); let boot_log_suffix = format!("{}_hardware_recovery_{}", log_suffix, boot_attempt); + let boot_log_path = self + .log_file + .parent() + .unwrap_or_else(|| std::path::Path::new(".")) + .join(format!("boot_log_{platform_name}_{boot_log_suffix}.txt")); + + // Spawn task to save boot logs to file let boot_log_task = tokio::spawn(async move { - Self::capture_boot_logs( - platform, - log_file, - Some(serial_path_clone), - &boot_log_suffix, - ) - .await + use tokio::io::AsyncWriteExt; + let mut log_file = tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&boot_log_path) + .await + .ok(); + + let mut serial_stream = BroadcastStream::new(serial_output_rx2); + let mut total_bytes = 0; + + loop { + match tokio::time::timeout(Duration::from_secs(1), serial_stream.next()) + .await + { + Ok(Some(Ok(bytes))) => { + if let Some(ref mut file) = log_file { + let _ = file.write_all(&bytes).await; + let _ = file.flush().await; + total_bytes += bytes.len(); + } + } + Ok(Some(Err(_))) | Ok(None) => break, + Err(_) => continue, + } + } + + if let Some(mut file) = log_file { + let _ = file.shutdown().await; + info!("Boot log saved to: {} ({} bytes)", boot_log_path.display(), total_bytes); + } }); // Wait for login prompt with timeout @@ -477,6 +523,7 @@ impl Ota { let _ = kill_tx.send(()); let _ = reader_task.await; + boot_log_task.abort(); match wait_result { Ok(Ok(())) => { @@ -508,22 +555,23 @@ impl Ota { .await { Ok(_) => { - info!("Hardware reboot recovery successful!"); - - // Wait for boot log capture to finish - match boot_log_task.await { - Ok(Ok(())) => { - info!("Boot log capture completed for attempt {}", boot_attempt); - } - Ok(Err(e)) => { - warn!("Boot log capture failed for attempt {}: {}", boot_attempt, e); + info!("NTP time synchronized after hardware reboot"); + + info!("Waiting for attestation token"); + match super::system::wait_for_attestation_token( + &session, + ) + .await + { + Ok(_) => { + info!("Attestation token fetched successfully"); + info!("Hardware reboot recovery successful!"); + return Ok(session); } Err(e) => { - warn!("Boot log capture task panicked for attempt {}: {}", boot_attempt, e); + warn!("Attestation token fetch failed after hardware reboot: {}", e); } } - - return Ok(session); } Err(e) => { warn!("Time sync failed after hardware reboot: {}", e); @@ -548,23 +596,18 @@ impl Ota { "SSH connection failed after {} attempts on boot attempt {}", MAX_SSH_ATTEMPTS, boot_attempt ); - - // Clean up boot log task since we're moving to next attempt - boot_log_task.abort(); } Ok(Err(e)) => { warn!( "Error waiting for login prompt on boot attempt {}: {}", boot_attempt, e ); - boot_log_task.abort(); } Err(_) => { warn!( "Timeout waiting for login prompt on boot attempt {}", boot_attempt ); - boot_log_task.abort(); } } } From 46f240ceef4075655830bd7a752903d3f49311c1 Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Thu, 15 Jan 2026 19:40:02 +0100 Subject: [PATCH 06/10] fix: format --- hil/src/commands/ota/mod.rs | 5 ++++- hil/src/commands/ota/reboot.rs | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/hil/src/commands/ota/mod.rs b/hil/src/commands/ota/mod.rs index 0494ffd23..5d3ed9f12 100644 --- a/hil/src/commands/ota/mod.rs +++ b/hil/src/commands/ota/mod.rs @@ -132,7 +132,10 @@ impl Ota { system::wait_for_attestation_token(&session) .await .inspect_err(|e| { - error!("Failed to get attestation token before reboot: {}", e); + error!( + "Failed to get attestation token before reboot: {}", + e + ); })?; info!("Attestation token fetched, rebooting device"); } else { diff --git a/hil/src/commands/ota/reboot.rs b/hil/src/commands/ota/reboot.rs index d75d5c17a..aa6f5b436 100644 --- a/hil/src/commands/ota/reboot.rs +++ b/hil/src/commands/ota/reboot.rs @@ -94,7 +94,11 @@ impl Ota { info!("NTP time synchronized successfully"); info!("Waiting for attestation token after reboot"); - match super::system::wait_for_attestation_token(&session).await { + match super::system::wait_for_attestation_token( + &session, + ) + .await + { Ok(_) => { info!("Attestation token fetched successfully"); } @@ -489,8 +493,11 @@ impl Ota { let mut total_bytes = 0; loop { - match tokio::time::timeout(Duration::from_secs(1), serial_stream.next()) - .await + match tokio::time::timeout( + Duration::from_secs(1), + serial_stream.next(), + ) + .await { Ok(Some(Ok(bytes))) => { if let Some(ref mut file) = log_file { @@ -506,7 +513,11 @@ impl Ota { if let Some(mut file) = log_file { let _ = file.shutdown().await; - info!("Boot log saved to: {} ({} bytes)", boot_log_path.display(), total_bytes); + info!( + "Boot log saved to: {} ({} bytes)", + boot_log_path.display(), + total_bytes + ); } }); From 232c80a11f4eb2a6e6f2724eb4162c8326c1374c Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Thu, 15 Jan 2026 21:09:06 +0100 Subject: [PATCH 07/10] fix: NTP sync & attestation sync --- hil/src/commands/ota/system.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 7e559d57a..c1dd44f8a 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -233,12 +233,25 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { }; if is_synced { - let sync_duration = sync_start.elapsed(); - info!( - "System time synchronized successfully after {:?}", - sync_duration - ); - return Ok(()); + // Verify the actual time is correct by comparing with local PC + info!("Time sync reported by chrony/timedatectl, verifying actual time..."); + match check_time_difference_fallback(session).await { + Ok(_) => { + let sync_duration = sync_start.elapsed(); + info!( + "System time synchronized and verified after {:?}", + sync_duration + ); + return Ok(()); + } + Err(e) => { + warn!( + "Time sync reported but verification failed (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + // Continue looping to check again + } + } } if attempt < MAX_ATTEMPTS { @@ -334,7 +347,7 @@ pub async fn wait_for_attestation_token(session: &SshWrapper) -> Result<()> { // Look for "got a new token" in the service logs let result = session .execute_command( - "TERM=dumb journalctl -u worldcoin-attest.service -n 50 --no-pager", + "TERM=dumb journalctl -u worldcoin-attest.service --no-pager", ) .await; From 029d26324b91b73adfbb35f452c04a1a4aa9862f Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Thu, 15 Jan 2026 21:41:15 +0100 Subject: [PATCH 08/10] fix: remove wait for attestation token --- hil/src/commands/ota/mod.rs | 22 +---------- hil/src/commands/ota/reboot.rs | 34 +--------------- hil/src/commands/ota/system.rs | 71 ---------------------------------- 3 files changed, 3 insertions(+), 124 deletions(-) diff --git a/hil/src/commands/ota/mod.rs b/hil/src/commands/ota/mod.rs index 5d3ed9f12..18511a740 100644 --- a/hil/src/commands/ota/mod.rs +++ b/hil/src/commands/ota/mod.rs @@ -126,18 +126,7 @@ impl Ota { .inspect_err(|e| { error!("Failed to sync time before reboot: {}", e); })?; - info!("NTP time synchronized"); - - info!("Waiting for attestation token before reboot"); - system::wait_for_attestation_token(&session) - .await - .inspect_err(|e| { - error!( - "Failed to get attestation token before reboot: {}", - e - ); - })?; - info!("Attestation token fetched, rebooting device"); + info!("NTP time synchronized, rebooting device"); } else { info!("Skipping NTP time synchronization before reboot (--skip-time-sync-before-reboot flag set)"); } @@ -187,15 +176,6 @@ impl Ota { })?; info!("System time synchronized"); - info!("Waiting for attestation token"); - system::wait_for_attestation_token(&session) - .await - .inspect_err(|e| { - println!("OTA_RESULT=FAILED"); - println!("OTA_ERROR=ATTESTATION_TOKEN_FAILED: {e}"); - })?; - info!("Attestation token fetched"); - info!("Restarting worldcoin-update-agent.service"); let start_timestamp = system::restart_update_agent(&session) .await diff --git a/hil/src/commands/ota/reboot.rs b/hil/src/commands/ota/reboot.rs index aa6f5b436..ccb273cd5 100644 --- a/hil/src/commands/ota/reboot.rs +++ b/hil/src/commands/ota/reboot.rs @@ -93,22 +93,6 @@ impl Ota { Ok(_) => { info!("NTP time synchronized successfully"); - info!("Waiting for attestation token after reboot"); - match super::system::wait_for_attestation_token( - &session, - ) - .await - { - Ok(_) => { - info!("Attestation token fetched successfully"); - } - Err(e) => { - debug!("Attestation token fetch failed on attempt {}: {}", attempt_count, e); - last_error = Some(e); - continue; - } - } - // Wait for boot log capture to finish match boot_log_task.await { Ok(Ok(())) => { @@ -567,22 +551,8 @@ impl Ota { { Ok(_) => { info!("NTP time synchronized after hardware reboot"); - - info!("Waiting for attestation token"); - match super::system::wait_for_attestation_token( - &session, - ) - .await - { - Ok(_) => { - info!("Attestation token fetched successfully"); - info!("Hardware reboot recovery successful!"); - return Ok(session); - } - Err(e) => { - warn!("Attestation token fetch failed after hardware reboot: {}", e); - } - } + info!("Hardware reboot recovery successful!"); + return Ok(session); } Err(e) => { warn!("Time sync failed after hardware reboot: {}", e); diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index c1dd44f8a..5159432b3 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -331,77 +331,6 @@ async fn check_time_difference_fallback(session: &SshWrapper) -> Result<()> { } } -/// Wait for worldcoin-attest service to fetch a valid token -pub async fn wait_for_attestation_token(session: &SshWrapper) -> Result<()> { - use std::time::Duration; - use tracing::{info, warn}; - - const MAX_ATTEMPTS: u32 = 90; // 90 attempts = 3 minutes max wait - const SLEEP_DURATION: Duration = Duration::from_secs(2); - - info!("Waiting for worldcoin-attest to fetch attestation token..."); - let sync_start = std::time::Instant::now(); - - for attempt in 1..=MAX_ATTEMPTS { - // Check if worldcoin-attest service has fetched a token - // Look for "got a new token" in the service logs - let result = session - .execute_command( - "TERM=dumb journalctl -u worldcoin-attest.service --no-pager", - ) - .await; - - match result { - Ok(output) if output.is_success() => { - // Check if we can find "got a new token" in recent logs - if output.stdout.contains("got a new token") { - let sync_duration = sync_start.elapsed(); - info!( - "Attestation token fetched successfully after {:?}", - sync_duration - ); - return Ok(()); - } - - // Also check for any recent errors that might indicate a problem - if output.stdout.contains("failed to get challenge") - || output.stdout.contains("failed to fetch challenge") - { - warn!( - "Attestation service showing errors (attempt {}/{}), continuing to wait...", - attempt, MAX_ATTEMPTS - ); - } - } - Ok(_) => { - info!( - "Failed to read attestation service logs (attempt {}/{})", - attempt, MAX_ATTEMPTS - ); - } - Err(e) => { - info!( - "Error reading attestation service logs (attempt {}/{}): {}", - attempt, MAX_ATTEMPTS, e - ); - } - } - - if attempt < MAX_ATTEMPTS { - info!( - "Attestation token not yet fetched (attempt {}/{}), waiting...", - attempt, MAX_ATTEMPTS - ); - tokio::time::sleep(SLEEP_DURATION).await; - } - } - - bail!( - "Attestation token was not fetched after {} seconds. \ - This may cause key retrieval to fail during boot.", - MAX_ATTEMPTS * 2 - ); -} /// Restart the update agent service and return the start timestamp pub async fn restart_update_agent(session: &SshWrapper) -> Result { From 6edff1d22de436b7ebdf22ddd70e4151d9b66263 Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Fri, 16 Jan 2026 10:21:02 +0100 Subject: [PATCH 09/10] fix: increase ntp sync delay --- hil/src/commands/ota/system.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 5159432b3..23597717d 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -132,7 +132,7 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { use std::time::Duration; use tracing::{info, warn}; - const MAX_ATTEMPTS: u32 = 60; // 60 attempts = 2 minutes max wait + const MAX_ATTEMPTS: u32 = 300; // 300 attempts = 10 minutes max wait const SLEEP_DURATION: Duration = Duration::from_secs(2); // Timeout for individual command execution (10 seconds is generous for timedatectl/chronyc) const COMMAND_TIMEOUT: Duration = Duration::from_secs(10); From a7555a18e7ededb42b708a86c3473a406f188298 Mon Sep 17 00:00:00 2001 From: chrisgalanis Date: Fri, 16 Jan 2026 12:15:44 +0100 Subject: [PATCH 10/10] fix: format --- hil/src/commands/ota/system.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 23597717d..37c74d6c0 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -331,7 +331,6 @@ async fn check_time_difference_fallback(session: &SshWrapper) -> Result<()> { } } - /// Restart the update agent service and return the start timestamp pub async fn restart_update_agent(session: &SshWrapper) -> Result { // Get current timestamp (ON THE ORB!) before restarting service