diff --git a/hil/src/boot.rs b/hil/src/boot.rs index 9116fd933..f984ab3ac 100644 --- a/hil/src/boot.rs +++ b/hil/src/boot.rs @@ -23,6 +23,9 @@ pub async fn is_recovery_mode_detected() -> Result { /// If `device` is `None`, will get the first available device. #[tracing::instrument] pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { + const DEFAULT_HOLDING_DELAY: u64 = 5; + const INBETWEEN_DELAY: u64 = 4; + fn make_ftdi(device: Option) -> Result { let builder = FtdiGpio::builder(); let builder = match &device { @@ -37,7 +40,12 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { info!("Turning off"); let device_clone = device.cloned(); - let ftdi = tokio::task::spawn_blocking(|| -> Result<_, color_eyre::Report> { + let recovery_state = if recovery { + OutputState::Low + } else { + OutputState::High + }; + let ftdi = tokio::task::spawn_blocking(move || -> Result<_, color_eyre::Report> { for d in FtdiGpio::list_devices().wrap_err("failed to list ftdi devices")? { debug!( "ftdi device: desc:{}, serial:{}, vid:{}, pid:{}", @@ -46,18 +54,18 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { } let mut ftdi = make_ftdi(device_clone)?; ftdi.set_pin(BUTTON_PIN, OutputState::Low)?; - ftdi.set_pin(RECOVERY_PIN, OutputState::High)?; + ftdi.set_pin(RECOVERY_PIN, recovery_state)?; Ok(ftdi) }) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(10)).await; + tokio::time::sleep(Duration::from_secs(DEFAULT_HOLDING_DELAY)).await; info!("Resetting FTDI"); tokio::task::spawn_blocking(move || ftdi.destroy()) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(4)).await; + tokio::time::sleep(Duration::from_secs(INBETWEEN_DELAY)).await; info!("Turning on"); let device_clone = device.cloned(); @@ -74,7 +82,7 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { }) .await .wrap_err("task panicked")??; - tokio::time::sleep(Duration::from_secs(4)).await; + tokio::time::sleep(Duration::from_secs(DEFAULT_HOLDING_DELAY)).await; tokio::task::spawn_blocking(move || ftdi.destroy()) .await diff --git a/hil/src/commands/ota/mod.rs b/hil/src/commands/ota/mod.rs index 084ef1fe8..13ad0ec8a 100644 --- a/hil/src/commands/ota/mod.rs +++ b/hil/src/commands/ota/mod.rs @@ -30,9 +30,9 @@ pub struct Ota { #[arg(long)] target_version: String, - /// Hostname of the Orb device + /// Hostname of the Orb device (optional - will auto-discover via USB ethernet if not provided) #[arg(long)] - hostname: String, + hostname: Option, /// Username #[arg(long, default_value = "worldcoin")] @@ -69,6 +69,23 @@ pub struct Ota { /// Serial port ID for boot log capture (alternative to --serial-path) #[arg(long, group = "serial")] serial_id: Option, + + /// Skip NTP time synchronization check before the first reboot (after wipe_overlays). + /// Time sync will still be checked after reboot and before starting the update. + #[arg(long, default_value = "false")] + skip_time_sync_before_reboot: bool, + + /// IP range start for USB ethernet auto-discovery + #[arg(long, default_value = "2")] + discovery_ip_start: u8, + + /// IP range end for USB ethernet auto-discovery + #[arg(long, default_value = "10")] + discovery_ip_end: u8, + + /// Timeout for discovering Orb via USB ethernet (seconds) + #[arg(long, default_value = "30")] + discovery_timeout_secs: u64, } #[derive(Debug, Clone, clap::ValueEnum)] @@ -94,6 +111,13 @@ impl Ota { let _start_time = Instant::now(); info!("Starting OTA update to version: {}", self.target_version); + if let Some(log_dir) = self.log_file.parent() { + tokio::fs::create_dir_all(log_dir).await.wrap_err_with(|| { + format!("Failed to create log directory: {}", log_dir.display()) + })?; + info!("Log directory created/verified: {}", log_dir.display()); + } + let session = self.connect_ssh().await.inspect_err(|e| { println!("OTA_RESULT=FAILED"); println!("OTA_ERROR=SSH_CONNECTION_FAILED: {e}"); @@ -105,7 +129,19 @@ impl Ota { system::wipe_overlays(&session).await.inspect_err(|e| { error!("Failed to wipe overlays: {}", e); })?; - info!("Overlays wiped successfully, rebooting device"); + info!("Overlays wiped successfully"); + + if !self.skip_time_sync_before_reboot { + info!("Waiting for NTP time synchronization before reboot"); + system::wait_for_time_sync(&session) + .await + .inspect_err(|e| { + error!("Failed to sync time before reboot: {}", e); + })?; + info!("NTP time synchronized, rebooting device"); + } else { + info!("Skipping NTP time synchronization before reboot (--skip-time-sync-before-reboot flag set)"); + } system::reboot_orb(&session).await?; info!("Reboot command sent to Orb device"); @@ -315,24 +351,44 @@ impl Ota { } async fn connect_ssh(&self) -> Result { - info!( - "Connecting to Orb device at {}:{}", - self.hostname, self.port - ); - - let auth = match (&self.password, &self.key_path) { - (Some(password), None) => AuthMethod::Password(password.clone()), - (None, Some(key_path)) => AuthMethod::Key { - private_key_path: key_path.clone(), - }, - _ => unreachable!("Clap ensures exactly one auth method is specified"), + let hostname = match &self.hostname { + Some(h) => { + info!("Using provided hostname: {}", h); + h.clone() + } + None => { + info!("No hostname provided, starting USB ethernet auto-discovery"); + let discovery = orb_hil::NetworkDiscovery { + username: self.username.clone(), + auth: self.get_auth_method(), + port: self.port, + ip_range_start: self.discovery_ip_start, + ip_range_end: self.discovery_ip_end, + connection_timeout: std::time::Duration::from_secs( + self.discovery_timeout_secs, + ), + }; + + let discovered = discovery + .discover_orb() + .await + .wrap_err("Failed to discover Orb via USB ethernet")?; + + info!( + "Discovered Orb at {} on interface {}", + discovered.hostname, discovered.interface + ); + discovered.hostname + } }; + info!("Connecting to Orb device at {}:{}", hostname, self.port); + let connect_args = SshConnectArgs { - hostname: self.hostname.clone(), + hostname, port: self.port, username: self.username.clone(), - auth, + auth: self.get_auth_method(), }; let session = SshWrapper::connect(connect_args) @@ -342,4 +398,14 @@ impl Ota { info!("Successfully connected to Orb device"); Ok(session) } + + fn get_auth_method(&self) -> AuthMethod { + match (&self.password, &self.key_path) { + (Some(password), None) => AuthMethod::Password(password.clone()), + (None, Some(key_path)) => AuthMethod::Key { + private_key_path: key_path.clone(), + }, + _ => unreachable!("Clap ensures exactly one auth method is specified"), + } + } } diff --git a/hil/src/commands/ota/reboot.rs b/hil/src/commands/ota/reboot.rs index 2560e4ca1..82a0b457c 100644 --- a/hil/src/commands/ota/reboot.rs +++ b/hil/src/commands/ota/reboot.rs @@ -15,21 +15,32 @@ use tracing::{debug, error, info, instrument, warn}; use super::Ota; +const DELAY_CAPTURE_LOGS: u64 = 200; + impl Ota { #[instrument(skip_all)] pub(super) async fn handle_reboot(&self, log_suffix: &str) -> Result { info!("Waiting for reboot and device to come back online"); - // Set recovery pin HIGH for 5 seconds to prevent entering recovery mode - info!("Setting recovery pin HIGH to prevent recovery mode during reboot"); + // Always wait for SSH to become unreachable before holding the recovery pin. + info!("Monitoring SSH connection to detect when shutdown actually begins"); + self.wait_for_ssh_disconnection(Duration::from_secs(30)) + .await?; + info!("SSH disconnected - system is shutting down, holding recovery pin"); + + let hold_duration = 20; + + info!( + "Setting recovery pin HIGH to prevent recovery mode during reboot (hold duration: {}s)", + hold_duration + ); let set_recovery = SetRecoveryPin { state: OutputState::High, serial_num: None, desc: None, - duration: 5, + duration: hold_duration, }; - // Run recovery pin setting in background task let recovery_task = tokio::spawn(async move { set_recovery .run() @@ -37,13 +48,26 @@ impl Ota { .wrap_err("failed to set recovery pin") }); - self.capture_boot_logs(log_suffix).await?; - - // Wait for recovery pin task to complete recovery_task .await .wrap_err("recovery pin task panicked")??; + // Brief delay to allow USB device to be re-enumerated and udev rules to apply + // after FTDI GPIO is released. The FTDI device detaches/reattaches kernel + // drivers which causes /dev/ttyUSB* to be recreated. + tokio::time::sleep(Duration::from_millis(DELAY_CAPTURE_LOGS)).await; + + // Spawn boot log capture as a background task so it runs concurrently + // with SSH reconnection attempts. Extract needed values upfront. + let platform = self.platform.clone(); + let log_file = self.log_file.clone(); + let serial_path = self.get_serial_path().ok(); + let boot_log_suffix = log_suffix.to_string(); + let boot_log_task = tokio::spawn(async move { + Self::capture_boot_logs(platform, log_file, serial_path, &boot_log_suffix) + .await + }); + let start_time = Instant::now(); let timeout = Duration::from_secs(900); // 15 minutes let mut attempt_count = 0; @@ -63,7 +87,37 @@ impl Ota { Ok(session) => match session.test_connection().await { Ok(_) => { info!("Device is back online and responsive after reboot (attempt {})", attempt_count); - return Ok(session); + + info!("Waiting for NTP time synchronization after reboot"); + match super::system::wait_for_time_sync(&session).await { + Ok(_) => { + info!("NTP time synchronized successfully"); + + // Wait for boot log capture to finish + match boot_log_task.await { + Ok(Ok(())) => { + info!( + "Boot log capture completed successfully" + ); + } + Ok(Err(e)) => { + warn!("Boot log capture failed: {}", e); + } + Err(e) => { + warn!("Boot log capture task panicked: {}", e); + } + } + + return Ok(session); + } + Err(e) => { + debug!( + "Time sync failed on attempt {}: {}", + attempt_count, e + ); + last_error = Some(e); + } + } } Err(e) => { debug!( @@ -95,6 +149,21 @@ impl Ota { "No specific error captured".to_string() }; + // Try hardware button reboot as fallback + warn!("SSH reconnection failed, attempting hardware button reboot recovery"); + match self.try_hardware_reboot_recovery(log_suffix).await { + Ok(session) => { + info!("Hardware button reboot recovery succeeded!"); + return Ok(session); + } + Err(recovery_err) => { + error!( + "Hardware button reboot recovery also failed: {}", + recovery_err + ); + } + } + bail!( "Device did not come back online within {:?} (attempted {} times). {}", elapsed, @@ -103,27 +172,40 @@ impl Ota { ); } + /// Captures boot logs from serial port in the background #[instrument(skip_all)] - async fn capture_boot_logs(&self, log_suffix: &str) -> Result<()> { - let platform_name = format!("{:?}", self.platform).to_lowercase(); + async fn capture_boot_logs( + platform: super::Platform, + log_file: std::path::PathBuf, + serial_path: Option, + log_suffix: &str, + ) -> Result<()> { + let platform_name = format!("{:?}", platform).to_lowercase(); info!( "Starting boot log capture for {} ({})", log_suffix, platform_name ); - let boot_log_path = self - .log_file + let boot_log_path = log_file .parent() .unwrap_or_else(|| std::path::Path::new(".")) .join(format!("boot_log_{platform_name}_{log_suffix}.txt")); - let serial_path = match self.get_serial_path() { - Ok(path) => path, - Err(e) => { - warn!( - "Failed to get serial path: {}. Skipping boot log capture.", - e - ); + // Create parent directory if it doesn't exist + if let Some(parent) = boot_log_path.parent() + && let Err(e) = tokio::fs::create_dir_all(parent).await + { + warn!( + "Failed to create directory {}: {}. Boot log capture may fail.", + parent.display(), + e + ); + } + + let serial_path = match serial_path { + Some(path) => path, + None => { + warn!("No serial path provided. Skipping boot log capture."); return Ok(()); } }; @@ -151,21 +233,52 @@ impl Ota { spawn_serial_reader_task(serial_reader, serial_output_tx); let boot_log_fut = async { - let mut boot_log_content = Vec::new(); + use tokio::io::AsyncWriteExt; + + // Open file for writing incrementally + let mut log_file = match tokio::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(&boot_log_path) + .await + { + Ok(f) => Some(f), + Err(e) => { + warn!( + "Failed to open boot log file {}: {}. Will continue without writing to disk.", + boot_log_path.display(), + e + ); + None + } + }; + + let mut total_bytes = 0; let mut serial_stream = BroadcastStream::new(serial_output_rx); - // 3-minute timeout for flaky serial connections - let timeout = Duration::from_secs(180); let start_time = Instant::now(); let mut found_login_prompt = false; - while start_time.elapsed() < timeout { + // Wait indefinitely until login prompt is detected + loop { match tokio::time::timeout(Duration::from_secs(1), serial_stream.next()) .await { Ok(Some(Ok(bytes))) => { - boot_log_content.extend_from_slice(&bytes); + // Write to file immediately as data arrives + if let Some(ref mut file) = log_file { + if let Err(e) = file.write_all(&bytes).await { + warn!("Failed to write to boot log file: {}. Continuing capture in memory only.", e); + log_file = None; + } else { + // Flush to ensure data is written to disk immediately + let _ = file.flush().await; + total_bytes += bytes.len(); + } + } + // Stop capturing when login prompt is detected if let Ok(text) = String::from_utf8(bytes.to_vec()) && text.contains(LOGIN_PROMPT_PATTERN) { @@ -190,31 +303,27 @@ impl Ota { } } - if start_time.elapsed() >= timeout && !found_login_prompt { + if found_login_prompt { + info!( + "Boot log capture completed successfully after {:?}", + start_time.elapsed() + ); + } else { warn!( - "Boot log capture timed out after {:?} without finding login prompt. Will proceed with SSH reconnection anyway.", - timeout + "Boot log capture ended without detecting login prompt after {:?}", + start_time.elapsed() ); } - if !boot_log_content.is_empty() { - match tokio::fs::write(&boot_log_path, &boot_log_content).await { - Ok(_) => { - info!( - "Boot log saved to: {} ({} bytes)", - boot_log_path.display(), - boot_log_content.len() - ); - } - Err(e) => { - warn!( - "Failed to write boot log to {}: {}. Continuing anyway.", - boot_log_path.display(), - e - ); - } - } - } else { + if let Some(mut file) = log_file { + let _ = file.flush().await; + let _ = file.shutdown().await; + info!( + "Boot log saved to: {} ({} bytes)", + boot_log_path.display(), + total_bytes + ); + } else if total_bytes == 0 { warn!("No boot log content captured from serial"); } @@ -235,4 +344,234 @@ impl Ota { Ok(()) } + + /// Wait for SSH connection to become unreachable, indicating shutdown has started + #[instrument(skip_all)] + async fn wait_for_ssh_disconnection(&self, timeout: Duration) -> Result<()> { + let start = Instant::now(); + let mut attempt = 0; + + loop { + if start.elapsed() > timeout { + bail!("SSH did not disconnect within {:?}", timeout); + } + + attempt += 1; + + // Try to establish connection with a lightweight command + match self.connect_ssh().await { + Ok(session) => match session.execute_command("echo").await { + Ok(_) => { + debug!( + "SSH still responsive (attempt {}), waiting for shutdown...", + attempt + ); + tokio::time::sleep(Duration::from_millis(500)).await; + } + Err(_) => { + info!("SSH connection degraded, shutdown likely in progress"); + return Ok(()); + } + }, + Err(_) => { + info!( + "SSH connection lost after {} attempts, shutdown confirmed", + attempt + ); + return Ok(()); + } + } + } + } + + /// Try hardware button reboot as fallback recovery mechanism + /// Performs up to 3 boot attempts with serial log capture + #[instrument(skip_all)] + async fn try_hardware_reboot_recovery( + &self, + log_suffix: &str, + ) -> Result { + const MAX_BOOT_ATTEMPTS: u32 = 3; + + info!( + "Starting hardware button reboot recovery (max {} attempts)", + MAX_BOOT_ATTEMPTS + ); + + for boot_attempt in 1..=MAX_BOOT_ATTEMPTS { + info!( + "Hardware reboot attempt {}/{}", + boot_attempt, MAX_BOOT_ATTEMPTS + ); + + // Perform hardware button reboot + info!("Triggering hardware button reboot (recovery=false)"); + crate::boot::reboot(false, None) + .await + .wrap_err("Failed to trigger hardware button reboot")?; + + // Brief delay to allow USB device to be re-enumerated + tokio::time::sleep(Duration::from_millis(DELAY_CAPTURE_LOGS)).await; + + // Capture boot logs and wait for login prompt + let serial_path = match self.get_serial_path() { + Ok(path) => path, + Err(e) => { + warn!("Failed to get serial path for boot log capture: {}", e); + // Continue without serial logs + continue; + } + }; + + info!( + "Opening serial port for boot log capture: {}", + serial_path.display() + ); + let serial = match tokio_serial::new( + &*serial_path.to_string_lossy(), + crate::serial::ORB_BAUD_RATE, + ) + .open_native_async() + { + Ok(s) => s, + Err(e) => { + warn!( + "Failed to open serial port: {}. Continuing without logs.", + e + ); + continue; + } + }; + + let (serial_reader, _serial_writer) = tokio::io::split(serial); + let (serial_output_tx, serial_output_rx) = broadcast::channel(64); + let (reader_task, kill_tx) = + spawn_serial_reader_task(serial_reader, serial_output_tx); + + // Capture boot logs in background while waiting for login prompt + let platform = self.platform.clone(); + let log_file = self.log_file.clone(); + let serial_path_clone = serial_path.clone(); + let boot_log_suffix = + format!("{}_hardware_recovery_{}", log_suffix, boot_attempt); + let boot_log_task = tokio::spawn(async move { + Self::capture_boot_logs( + platform, + log_file, + Some(serial_path_clone), + &boot_log_suffix, + ) + .await + }); + + // Wait for login prompt with timeout + info!("Waiting for login prompt..."); + let wait_result = tokio::time::timeout( + Duration::from_secs(300), // 5 minutes timeout per boot attempt + crate::serial::wait_for_pattern( + LOGIN_PROMPT_PATTERN.to_owned().into_bytes(), + BroadcastStream::new(serial_output_rx), + ), + ) + .await; + + let _ = kill_tx.send(()); + let _ = reader_task.await; + + match wait_result { + Ok(Ok(())) => { + info!("Login prompt detected on boot attempt {}", boot_attempt); + + // Wait a bit for boot to stabilize + tokio::time::sleep(Duration::from_secs(10)).await; + + // Try to SSH connect + info!("Attempting SSH connection after hardware reboot..."); + let mut ssh_attempts = 0; + const MAX_SSH_ATTEMPTS: u32 = 30; + + while ssh_attempts < MAX_SSH_ATTEMPTS { + ssh_attempts += 1; + tokio::time::sleep(Duration::from_secs(10)).await; + + match self.connect_ssh().await { + Ok(session) => { + match session.test_connection().await { + Ok(_) => { + info!("SSH connection established after hardware reboot!"); + + // Wait for time sync + info!("Waiting for NTP time synchronization"); + match super::system::wait_for_time_sync( + &session, + ) + .await + { + Ok(_) => { + info!("Hardware reboot recovery successful!"); + + // Wait for boot log capture to finish + match boot_log_task.await { + Ok(Ok(())) => { + info!("Boot log capture completed for attempt {}", boot_attempt); + } + Ok(Err(e)) => { + warn!("Boot log capture failed for attempt {}: {}", boot_attempt, e); + } + Err(e) => { + warn!("Boot log capture task panicked for attempt {}: {}", boot_attempt, e); + } + } + + return Ok(session); + } + Err(e) => { + warn!("Time sync failed after hardware reboot: {}", e); + } + } + } + Err(e) => { + debug!("SSH connection test failed (attempt {}): {}", ssh_attempts, e); + } + } + } + Err(e) => { + debug!( + "SSH connection failed (attempt {}): {}", + ssh_attempts, e + ); + } + } + } + + warn!( + "SSH connection failed after {} attempts on boot attempt {}", + MAX_SSH_ATTEMPTS, boot_attempt + ); + + // Clean up boot log task since we're moving to next attempt + boot_log_task.abort(); + } + Ok(Err(e)) => { + warn!( + "Error waiting for login prompt on boot attempt {}: {}", + boot_attempt, e + ); + boot_log_task.abort(); + } + Err(_) => { + warn!( + "Timeout waiting for login prompt on boot attempt {}", + boot_attempt + ); + boot_log_task.abort(); + } + } + } + + bail!( + "Hardware reboot recovery failed after {} boot attempts", + MAX_BOOT_ATTEMPTS + ); + } } diff --git a/hil/src/commands/ota/system.rs b/hil/src/commands/ota/system.rs index 590f549df..5341d7d5b 100644 --- a/hil/src/commands/ota/system.rs +++ b/hil/src/commands/ota/system.rs @@ -134,28 +134,111 @@ pub async fn wait_for_time_sync(session: &SshWrapper) -> Result<()> { const MAX_ATTEMPTS: u32 = 60; // 60 attempts = 2 minutes max wait const SLEEP_DURATION: Duration = Duration::from_secs(2); + // Timeout for individual command execution (10 seconds is generous for timedatectl/chronyc) + const COMMAND_TIMEOUT: Duration = Duration::from_secs(10); info!("Waiting for system time synchronization..."); let sync_start = std::time::Instant::now(); - for attempt in 1..=MAX_ATTEMPTS { - let result = session - .execute_command("TERM=dumb timedatectl status") + // Detect which time sync tool is available (prefer chronyc over timedatectl) + let use_chronyc = session + .execute_command("TERM=dumb command -v chronyc") + .await + .map(|r| r.is_success()) + .unwrap_or(false); + + let use_timedatectl = if !use_chronyc { + session + .execute_command("TERM=dumb command -v timedatectl") .await - .wrap_err("Failed to check time synchronization status")?; + .map(|r| r.is_success()) + .unwrap_or(false) + } else { + false + }; + + if !use_timedatectl && !use_chronyc { + bail!("Neither chronyc nor timedatectl found on the system"); + } + + info!( + "Using {} for time sync check", + if use_chronyc { + "chronyc" + } else { + "timedatectl" + } + ); - if result.is_success() { - // Check if "System clock synchronized: yes" appears in output - if result.stdout.contains("System clock synchronized: yes") - || result.stdout.contains("synchronized: yes") + for attempt in 1..=MAX_ATTEMPTS { + let is_synced = if use_chronyc { + // Try chronyc tracking with timeout + match tokio::time::timeout( + COMMAND_TIMEOUT, + session.execute_command("TERM=dumb chronyc tracking"), + ) + .await { - let sync_duration = sync_start.elapsed(); - info!( - "System time synchronized successfully after {:?}", - sync_duration - ); - return Ok(()); + Ok(Ok(result)) if result.is_success() => { + // Check if chrony is synchronized + // Leap status should be "Normal" when synchronized + result.stdout.contains("Leap status : Normal") + && !result.stdout.contains("Reference ID : 0.0.0.0") + } + Ok(Ok(_)) => false, + Ok(Err(e)) => { + info!( + "Failed to check chronyc status (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + false + } + Err(_) => { + info!( + "chronyc command timed out after {:?} (attempt {}/{})", + COMMAND_TIMEOUT, attempt, MAX_ATTEMPTS + ); + false + } } + } else { + // Try timedatectl with timeout + match tokio::time::timeout( + COMMAND_TIMEOUT, + session.execute_command("TERM=dumb timedatectl"), + ) + .await + { + Ok(Ok(result)) if result.is_success() => { + // Check if "System clock synchronized: yes" appears in output + result.stdout.contains("System clock synchronized: yes") + || result.stdout.contains("synchronized: yes") + } + Ok(Ok(_)) => false, + Ok(Err(e)) => { + info!( + "Failed to check timedatectl status (attempt {}/{}): {}", + attempt, MAX_ATTEMPTS, e + ); + false + } + Err(_) => { + info!( + "timedatectl command timed out after {:?} (attempt {}/{})", + COMMAND_TIMEOUT, attempt, MAX_ATTEMPTS + ); + false + } + } + }; + + if is_synced { + let sync_duration = sync_start.elapsed(); + info!( + "System time synchronized successfully after {:?}", + sync_duration + ); + return Ok(()); } if attempt < MAX_ATTEMPTS { diff --git a/hil/src/lib.rs b/hil/src/lib.rs index a02448d3e..a8535d710 100644 --- a/hil/src/lib.rs +++ b/hil/src/lib.rs @@ -1,5 +1,6 @@ #![forbid(unsafe_code)] +mod network_discovery; mod ssh_wrapper; #[path = "commands/ota/verify.rs"] @@ -8,4 +9,5 @@ pub mod verify; #[path = "commands/ota/mcu_util.rs"] pub mod mcu_util; +pub use network_discovery::{DiscoveredOrb, NetworkDiscovery}; pub use ssh_wrapper::{AuthMethod, CommandResult, SshConnectArgs, SshWrapper}; diff --git a/hil/src/network_discovery.rs b/hil/src/network_discovery.rs new file mode 100644 index 000000000..ceb2af06c --- /dev/null +++ b/hil/src/network_discovery.rs @@ -0,0 +1,206 @@ +use crate::{AuthMethod, SshConnectArgs, SshWrapper}; +use color_eyre::{eyre::bail, Result}; +use std::time::Duration; +use tokio::time::timeout; +use tracing::{debug, info, warn}; + +/// Configuration for network discovery on USB ethernet interfaces +#[derive(Debug, Clone)] +pub struct NetworkDiscovery { + pub username: String, + pub auth: AuthMethod, + pub port: u16, + pub ip_range_start: u8, + pub ip_range_end: u8, + pub connection_timeout: Duration, +} + +/// Information about a discovered Orb device +#[derive(Debug, Clone)] +pub struct DiscoveredOrb { + pub hostname: String, + pub interface: String, +} + +impl NetworkDiscovery { + /// Discovers an Orb device on USB ethernet interfaces (orbeth0-3) + pub async fn discover_orb(&self) -> Result { + info!("Starting Orb discovery on USB ethernet interfaces"); + + let interfaces = enumerate_orbeth_interfaces().await?; + + if interfaces.is_empty() { + bail!( + "No USB ethernet interfaces (orbeth0-3) found.\n\ + Ensure Orb is connected via USB and udev rules are configured." + ); + } + + info!("Found USB ethernet interfaces: {:?}", interfaces); + + let mut tasks = Vec::new(); + for interface in interfaces.iter() { + let interface = interface.clone(); + let discovery = self.clone(); + let task = + tokio::spawn(async move { discovery.scan_interface(&interface).await }); + tasks.push(task); + } + + let discovery_result = timeout(self.connection_timeout, async { + loop { + for task in &mut tasks { + if task.is_finished() { + match task.await { + Ok(Ok(discovered)) => return Ok(discovered), + Ok(Err(e)) => debug!("Interface scan failed: {}", e), + Err(e) => warn!("Task panicked: {}", e), + } + } + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + }) + .await; + + match discovery_result { + Ok(Ok(discovered)) => Ok(discovered), + Ok(Err(e)) => Err(e), + Err(_) => bail!( + "Failed to discover Orb on USB ethernet after {}s.\n\ + Scanned interfaces: {}\n\ + IP range: 10.42.0.{}-{}\n\ + Suggestion: Verify Orb is powered on and SSH is running.\n\ + Or use --hostname to specify manually.", + self.connection_timeout.as_secs(), + interfaces.join(", "), + self.ip_range_start, + self.ip_range_end + ), + } + } + + /// Scans a specific interface for responsive Orb devices + async fn scan_interface(&self, interface: &str) -> Result { + debug!("Scanning interface {} for Orb devices", interface); + + let mut tasks = Vec::new(); + for ip_suffix in self.ip_range_start..=self.ip_range_end { + let ip = format!("10.42.0.{}", ip_suffix); + let interface = interface.to_string(); + let discovery = self.clone(); + + let task = tokio::spawn(async move { + discovery.test_ssh_connection(&ip, &interface).await + }); + tasks.push(task); + } + + loop { + for task in &mut tasks { + if task.is_finished() { + match task.await { + Ok(Ok(discovered)) => { + info!( + "Successfully connected to Orb at {} on {}", + discovered.hostname, discovered.interface + ); + return Ok(discovered); + } + Ok(Err(e)) => debug!("SSH test failed: {}", e), + Err(e) => warn!("Task panicked: {}", e), + } + } + } + + tokio::time::sleep(Duration::from_millis(100)).await; + + if tasks.iter().all(|t| t.is_finished()) { + break; + } + } + + bail!("No responsive Orb found on interface {}", interface) + } + + /// Tests SSH connection to a specific IP address + async fn test_ssh_connection( + &self, + ip: &str, + interface: &str, + ) -> Result { + debug!("Testing SSH connection to {} on {}", ip, interface); + + let connect_args = SshConnectArgs { + hostname: ip.to_string(), + port: self.port, + username: self.username.clone(), + auth: self.auth.clone(), + }; + + let test_result = timeout(Duration::from_secs(3), async { + SshWrapper::connect(connect_args).await + }) + .await; + + match test_result { + Ok(Ok(_wrapper)) => { + debug!("SSH connection successful to {} on {}", ip, interface); + + Ok(DiscoveredOrb { + hostname: ip.to_string(), + interface: interface.to_string(), + }) + } + Ok(Err(e)) => { + debug!("SSH connection failed to {}: {}", ip, e); + Err(e) + } + Err(_) => { + debug!("SSH connection timed out to {}", ip); + bail!("Connection timeout") + } + } + } +} + +/// Enumerates USB ethernet interfaces (orbeth0-3) that are currently UP +async fn enumerate_orbeth_interfaces() -> Result> { + let sys_net_path = "/sys/class/net"; + let mut interfaces = Vec::new(); + + let mut entries = tokio::fs::read_dir(sys_net_path).await?; + + while let Some(entry) = entries.next_entry().await? { + let interface_name = entry.file_name(); + let interface_str = interface_name.to_string_lossy(); + + if interface_str.starts_with("orbeth") && interface_str.len() == 7 { + let operstate_path = + format!("{}/{}/operstate", sys_net_path, interface_str); + + if let Ok(state) = tokio::fs::read_to_string(&operstate_path).await + && state.trim() == "up" + { + interfaces.push(interface_str.to_string()); + debug!("Found active USB ethernet interface: {}", interface_str); + } + } + } + + interfaces.sort(); + + Ok(interfaces) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + #[cfg(target_os = "linux")] + async fn test_enumerate_interfaces() { + let result = enumerate_orbeth_interfaces().await; + assert!(result.is_ok()); + } +}