From 401151bdb2a79431f55b6492061aeb0b4a34827a Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 01:04:46 +0000 Subject: [PATCH 01/13] fix: skip pasta port probe during snapshot restore During snapshot restore, post_start() runs BEFORE the VM snapshot is loaded into Firecracker. wait_for_port_forwarding() probes ports via TcpStream::connect(), which pasta accepts (it's listening), but pasta can't forward to the non-existent guest. This forces pasta to attempt L2 forwarding that fails, potentially poisoning its internal connection tracking state and causing subsequent data connections to return 0 bytes. Add with_restore_mode() to PastaNetwork that skips the premature port probe in post_start(). Port forwarding is verified later via verify_port_forwarding() which runs after the VM is resumed and fc-agent has sent its gratuitous ARP. --- src/commands/snapshot.rs | 3 ++- src/network/pasta.rs | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 3b7756cf..55907217 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -724,7 +724,8 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { // With bridge mode, guest IP is always 10.0.2.100 on pasta network // Each clone runs in its own namespace, so no IP conflict let net = PastaNetwork::new(vm_id.clone(), tap_device.clone(), port_mappings.clone()) - .with_loopback_ip(loopback_ip); + .with_loopback_ip(loopback_ip) + .with_restore_mode(); Box::new(net) } }; diff --git a/src/network/pasta.rs b/src/network/pasta.rs index b246e141..d360854e 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -75,6 +75,7 @@ pub struct PastaNetwork { pid_file: Option, loopback_ip: Option, // Unique loopback IP for port forwarding (127.x.y.z) holder_pid: Option, // Namespace PID (set in post_start) + restore_mode: bool, // Skip port probe in post_start (VM not loaded yet) } impl PastaNetwork { @@ -90,6 +91,7 @@ impl PastaNetwork { pid_file: None, loopback_ip: None, holder_pid: None, + restore_mode: false, } } @@ -106,6 +108,19 @@ impl PastaNetwork { self } + /// Skip port forwarding probe in post_start() for snapshot restore. + /// + /// During snapshot restore, post_start() runs BEFORE the VM snapshot is loaded + /// into Firecracker. Probing ports at that point forces pasta to attempt L2 + /// forwarding to a non-existent guest, which can poison pasta's internal + /// connection tracking and cause subsequent connections to return 0 bytes. + /// The proper verification happens later via verify_port_forwarding() after + /// the VM is resumed and fc-agent has sent its gratuitous ARP. + pub fn with_restore_mode(mut self) -> Self { + self.restore_mode = true; + self + } + /// Get the loopback IP assigned to this VM for port forwarding pub fn loopback_ip(&self) -> Option<&str> { self.loopback_ip.as_deref() @@ -604,7 +619,13 @@ impl NetworkManager for PastaNetwork { // The PID file only means pasta spawned, not that ports are bound. // Health checks use nsenter (bridge path), so without this check // "healthy" doesn't mean port forwarding works. - if !self.port_mappings.is_empty() { + // + // Skip in restore mode: during snapshot restore, post_start() runs BEFORE + // the VM snapshot is loaded. Probing ports now forces pasta to attempt L2 + // forwarding to a non-existent guest, poisoning its connection state and + // causing subsequent connections to return 0 bytes. The port check happens + // later via verify_port_forwarding() after the VM is actually running. + if !self.restore_mode && !self.port_mappings.is_empty() { self.wait_for_port_forwarding().await?; } From ffa6facb6b1b0fe60830285cf27ff9ef7fc32dde Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 01:04:52 +0000 Subject: [PATCH 02/13] test: add clone port forward stress test Stress test that spawns 3 clones from a snapshot with port forwarding and makes 20 rapid HTTP requests to each clone. Catches the "connect succeeded but 0 bytes" pattern seen in CI bench-vm failures. Specifically detects pasta connection tracking poisoning where TCP connect succeeds but no data flows through the forwarded port. --- tests/test_clone_port_forward_stress.rs | 364 ++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 tests/test_clone_port_forward_stress.rs diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs new file mode 100644 index 00000000..b59df624 --- /dev/null +++ b/tests/test_clone_port_forward_stress.rs @@ -0,0 +1,364 @@ +//! Stress test for clone port forwarding. +//! +//! This test exercises the pasta port forwarding path under stress: +//! - Multiple clones spawned from the same snapshot +//! - Rapid repeated HTTP requests to each clone +//! - Checks for 0-byte responses (pasta connection tracking poisoning) +//! +//! Background: During snapshot restore, `post_start()` calls +//! `wait_for_port_forwarding()` BEFORE the VM snapshot is loaded. +//! pasta accepts the TCP connection (it's listening on loopback), +//! but can't forward to the non-existent guest. This may poison +//! pasta's internal forwarding state, causing subsequent connections +//! to return 0 bytes instead of actual data. + +#![cfg(feature = "integration-slow")] + +mod common; + +use anyhow::{Context, Result}; +use std::time::{Duration, Instant}; + +/// Number of clones to spawn +const NUM_CLONES: usize = 3; + +/// Number of HTTP requests per clone +const REQUESTS_PER_CLONE: usize = 20; + +/// Stress test: multiple clones with port forwarding, rapid HTTP requests +/// +/// Reproduces the "connect succeeded but 0 bytes" pattern seen in CI bench-vm +/// failures. The hypothesis: pasta's `wait_for_port_forwarding()` probe during +/// restore (before guest exists) poisons pasta's internal forwarding state. +#[tokio::test] +async fn test_clone_port_forward_stress_rootless() -> Result<()> { + let (baseline_name, _, snapshot_name, _) = common::unique_names("pf-stress"); + + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ Clone Port Forward Stress Test (rootless) ║"); + println!( + "║ {} clones × {} requests each ║", + NUM_CLONES, REQUESTS_PER_CLONE + ); + println!("╚═══════════════════════════════════════════════════════════════╝\n"); + + let fcvm_path = common::find_fcvm_binary()?; + + // Allocate port before baseline so it's baked into the snapshot + let host_port = common::find_available_high_port().context("finding available port")?; + let publish_arg = format!("{}:80", host_port); + + // Step 1: Start baseline VM with nginx + port forwarding + println!( + "Step 1: Starting baseline VM (nginx, --publish {})...", + publish_arg + ); + let (_baseline_child, baseline_pid) = common::spawn_fcvm_with_logs( + &[ + "podman", + "run", + "--name", + &baseline_name, + "--network", + "rootless", + "--publish", + &publish_arg, + "--health-check", + "http://localhost:80", + common::TEST_IMAGE, + ], + &baseline_name, + ) + .await + .context("spawning baseline VM")?; + + println!(" Waiting for baseline VM to become healthy..."); + common::poll_health_by_pid(baseline_pid, 90).await?; + println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); + + // Verify baseline port forwarding works before snapshotting + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &baseline_pid.to_string()]) + .output() + .await + .context("getting baseline state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let baseline_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("loopback_ip")? + .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Baseline loopback IP: {}", baseline_ip); + + // Verify baseline nginx responds + let baseline_check = curl_check(&baseline_ip, host_port, 5).await; + println!( + " Baseline HTTP check: {} ({} bytes)", + if baseline_check.success { + "✓" + } else { + "✗ FAIL" + }, + baseline_check.body_len + ); + assert!( + baseline_check.success && baseline_check.body_len > 0, + "Baseline nginx must respond with data before snapshot" + ); + + // Step 2: Create snapshot + println!("\nStep 2: Creating snapshot..."); + let output = tokio::process::Command::new(&fcvm_path) + .args([ + "snapshot", + "create", + "--pid", + &baseline_pid.to_string(), + "--tag", + &snapshot_name, + ]) + .output() + .await + .context("running snapshot create")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("Snapshot creation failed: {}", stderr); + } + println!(" ✓ Snapshot created"); + + // Kill baseline - only need snapshot for clones + common::kill_process(baseline_pid).await; + println!(" Killed baseline VM"); + + // Step 3: Start memory server + println!("\nStep 3: Starting memory server..."); + let (_serve_child, serve_pid) = + common::spawn_fcvm_with_logs(&["snapshot", "serve", &snapshot_name], "uffd-pf-stress") + .await + .context("spawning memory server")?; + + common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; + println!(" ✓ Memory server ready (PID: {})", serve_pid); + + // Step 4: Spawn clones sequentially (each gets unique loopback IP) + println!("\nStep 4: Spawning {} clones...", NUM_CLONES); + let serve_pid_str = serve_pid.to_string(); + + struct CloneInfo { + name: String, + pid: u32, + loopback_ip: String, + _child: tokio::process::Child, + } + + let mut clones: Vec = Vec::new(); + + for i in 0..NUM_CLONES { + let clone_name = format!("pf-stress-clone-{}-{}", i, std::process::id()); + println!(" Spawning clone {} ({})...", i, clone_name); + + let (child, clone_pid) = common::spawn_fcvm_with_logs( + &[ + "snapshot", + "run", + "--pid", + &serve_pid_str, + "--name", + &clone_name, + ], + &clone_name, + ) + .await + .context(format!("spawning clone {}", i))?; + + // Wait for healthy + common::poll_health_by_pid(clone_pid, 120).await?; + println!(" ✓ Clone {} healthy (PID: {})", i, clone_pid); + + // Get loopback IP + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .await + .context("getting clone state")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + let loopback_ip: String = serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("loopback_ip")? + .as_str() + .map(|s| s.to_string()) + }) + .unwrap_or_default(); + + println!(" Loopback IP: {}", loopback_ip); + + clones.push(CloneInfo { + name: clone_name, + pid: clone_pid, + loopback_ip, + _child: child, + }); + } + + // Step 5: Rapid HTTP requests to all clones + println!( + "\nStep 5: Sending {} HTTP requests to each of {} clones...", + REQUESTS_PER_CLONE, NUM_CLONES + ); + + let mut total_requests = 0u32; + let mut total_success = 0u32; + let mut total_zero_bytes = 0u32; + let mut total_errors = 0u32; + + for clone in &clones { + let mut clone_success = 0u32; + let mut clone_zero = 0u32; + let mut clone_error = 0u32; + + let start = Instant::now(); + + for req in 0..REQUESTS_PER_CLONE { + let result = curl_check(&clone.loopback_ip, host_port, 5).await; + total_requests += 1; + + if result.success && result.body_len > 0 { + clone_success += 1; + total_success += 1; + } else if result.success && result.body_len == 0 { + // This is the pasta poisoning pattern: connect succeeds but 0 bytes + clone_zero += 1; + total_zero_bytes += 1; + println!( + " ⚠ Clone {} request {}: 0-byte response!", + clone.name, req + ); + } else { + clone_error += 1; + total_errors += 1; + if clone_error <= 3 { + println!( + " ✗ Clone {} request {}: error ({})", + clone.name, req, result.error + ); + } + } + + // Brief pause between requests (not too long - we want stress) + tokio::time::sleep(Duration::from_millis(50)).await; + } + + let elapsed = start.elapsed(); + println!( + " Clone {} ({}): {}/{} OK, {} zero-byte, {} errors ({:.1}s)", + clone.name, + clone.loopback_ip, + clone_success, + REQUESTS_PER_CLONE, + clone_zero, + clone_error, + elapsed.as_secs_f64() + ); + } + + // Cleanup + println!("\nCleaning up..."); + for clone in &clones { + common::kill_process(clone.pid).await; + } + common::kill_process(serve_pid).await; + println!(" Killed all clones and memory server"); + + // Results + println!("\n╔═══════════════════════════════════════════════════════════════╗"); + println!("║ RESULTS ║"); + println!("╠═══════════════════════════════════════════════════════════════╣"); + println!( + "║ Total requests: {:4} ║", + total_requests + ); + println!( + "║ Successful: {:4} ║", + total_success + ); + println!( + "║ Zero-byte: {:4} (pasta poisoning pattern) ║", + total_zero_bytes + ); + println!( + "║ Errors: {:4} ║", + total_errors + ); + println!("╚═══════════════════════════════════════════════════════════════╝"); + + if total_zero_bytes > 0 { + anyhow::bail!( + "PASTA POISONING DETECTED: {} out of {} requests returned 0 bytes. \ + This confirms that wait_for_port_forwarding() during restore \ + (before guest exists) corrupts pasta's forwarding state.", + total_zero_bytes, + total_requests + ); + } + + if total_errors > 0 { + anyhow::bail!( + "Port forwarding errors: {} out of {} requests failed", + total_errors, + total_requests + ); + } + + println!("\n✅ CLONE PORT FORWARD STRESS TEST PASSED!"); + println!( + " All {} requests across {} clones returned valid data", + total_requests, NUM_CLONES + ); + Ok(()) +} + +/// Result of a single HTTP check +struct CurlResult { + success: bool, + body_len: usize, + error: String, +} + +/// Make a single HTTP request and return the result +async fn curl_check(ip: &str, port: u16, timeout_secs: u32) -> CurlResult { + let url = format!("http://{}:{}", ip, port); + match tokio::process::Command::new("curl") + .args(["-s", "--max-time", &timeout_secs.to_string(), &url]) + .output() + .await + { + Ok(output) => CurlResult { + success: output.status.success(), + body_len: output.stdout.len(), + error: if output.status.success() { + String::new() + } else { + String::from_utf8_lossy(&output.stderr).to_string() + }, + }, + Err(e) => CurlResult { + success: false, + body_len: 0, + error: format!("curl failed: {}", e), + }, + } +} From 2e1f1e161d252e1897172889f643d9bce04684ce Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 01:20:42 +0000 Subject: [PATCH 03/13] refactor: extract get_loopback_ip/curl_check to common, concurrent clone spawning - Extract get_loopback_ip() and curl_check()/CurlResult to tests/common (was duplicated 4x across test files) - Replace inline loopback IP parsing in test_snapshot_clone.rs (2 instances) - Spawn clones concurrently via JoinSet instead of sequential loop - Send HTTP requests concurrently across all clones (true stress test) - Remove 50ms sleep between requests and local CurlResult/curl_check --- tests/common/mod.rs | 55 ++++++ tests/test_clone_port_forward_stress.rs | 253 +++++++++--------------- tests/test_snapshot_clone.rs | 38 +--- 3 files changed, 154 insertions(+), 192 deletions(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index f1bc76ec..faba1df8 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1526,6 +1526,61 @@ pub fn find_available_high_port() -> anyhow::Result { find_available_port(10000 + offset, 50000 - offset) } +/// Get the loopback IP for a VM by PID (from `fcvm ls --json`). +pub async fn get_loopback_ip(pid: u32) -> anyhow::Result { + let fcvm_path = find_fcvm_binary()?; + let output = tokio::process::Command::new(&fcvm_path) + .args(["ls", "--json", "--pid", &pid.to_string()]) + .output() + .await + .context("getting VM state for loopback IP")?; + + let stdout = String::from_utf8_lossy(&output.stdout); + serde_json::from_str::>(&stdout) + .ok() + .and_then(|v| v.first().cloned()) + .and_then(|v| { + v.get("config")? + .get("network")? + .get("loopback_ip")? + .as_str() + .map(|s| s.to_string()) + }) + .ok_or_else(|| anyhow::anyhow!("loopback_ip not found for VM PID {}", pid)) +} + +/// Result of a single HTTP check via curl. +pub struct CurlResult { + pub success: bool, + pub body_len: usize, + pub error: String, +} + +/// Make a single HTTP request via curl and return the result. +pub async fn curl_check(ip: &str, port: u16, timeout_secs: u32) -> CurlResult { + let url = format!("http://{}:{}", ip, port); + match tokio::process::Command::new("curl") + .args(["-s", "--max-time", &timeout_secs.to_string(), &url]) + .output() + .await + { + Ok(output) => CurlResult { + success: output.status.success(), + body_len: output.stdout.len(), + error: if output.status.success() { + String::new() + } else { + String::from_utf8_lossy(&output.stderr).to_string() + }, + }, + Err(e) => CurlResult { + success: false, + body_len: 0, + error: format!("curl failed: {}", e), + }, + } +} + /// Wait for a TCP server to be ready by attempting to connect. /// /// # Arguments diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs index b59df624..d1cecc44 100644 --- a/tests/test_clone_port_forward_stress.rs +++ b/tests/test_clone_port_forward_stress.rs @@ -2,7 +2,7 @@ //! //! This test exercises the pasta port forwarding path under stress: //! - Multiple clones spawned from the same snapshot -//! - Rapid repeated HTTP requests to each clone +//! - Concurrent rapid HTTP requests to all clones simultaneously //! - Checks for 0-byte responses (pasta connection tracking poisoning) //! //! Background: During snapshot restore, `post_start()` calls @@ -17,7 +17,9 @@ mod common; use anyhow::{Context, Result}; -use std::time::{Duration, Instant}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; +use std::time::Instant; /// Number of clones to spawn const NUM_CLONES: usize = 3; @@ -25,7 +27,7 @@ const NUM_CLONES: usize = 3; /// Number of HTTP requests per clone const REQUESTS_PER_CLONE: usize = 20; -/// Stress test: multiple clones with port forwarding, rapid HTTP requests +/// Stress test: multiple clones with port forwarding, concurrent HTTP requests /// /// Reproduces the "connect succeeded but 0 bytes" pattern seen in CI bench-vm /// failures. The hypothesis: pasta's `wait_for_port_forwarding()` probe during @@ -37,7 +39,7 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { println!("\n╔═══════════════════════════════════════════════════════════════╗"); println!("║ Clone Port Forward Stress Test (rootless) ║"); println!( - "║ {} clones × {} requests each ║", + "║ {} clones × {} requests each (concurrent) ║", NUM_CLONES, REQUESTS_PER_CLONE ); println!("╚═══════════════════════════════════════════════════════════════╝\n"); @@ -77,29 +79,10 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { println!(" ✓ Baseline VM healthy (PID: {})", baseline_pid); // Verify baseline port forwarding works before snapshotting - let output = tokio::process::Command::new(&fcvm_path) - .args(["ls", "--json", "--pid", &baseline_pid.to_string()]) - .output() - .await - .context("getting baseline state")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - let baseline_ip: String = serde_json::from_str::>(&stdout) - .ok() - .and_then(|v| v.first().cloned()) - .and_then(|v| { - v.get("config")? - .get("network")? - .get("loopback_ip")? - .as_str() - .map(|s| s.to_string()) - }) - .unwrap_or_default(); - + let baseline_ip = common::get_loopback_ip(baseline_pid).await?; println!(" Baseline loopback IP: {}", baseline_ip); - // Verify baseline nginx responds - let baseline_check = curl_check(&baseline_ip, host_port, 5).await; + let baseline_check = common::curl_check(&baseline_ip, host_port, 5).await; println!( " Baseline HTTP check: {} ({} bytes)", if baseline_check.success { @@ -149,8 +132,8 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { common::poll_serve_ready(&snapshot_name, serve_pid, 30).await?; println!(" ✓ Memory server ready (PID: {})", serve_pid); - // Step 4: Spawn clones sequentially (each gets unique loopback IP) - println!("\nStep 4: Spawning {} clones...", NUM_CLONES); + // Step 4: Spawn all clones concurrently + println!("\nStep 4: Spawning {} clones concurrently...", NUM_CLONES); let serve_pid_str = serve_pid.to_string(); struct CloneInfo { @@ -160,121 +143,110 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { _child: tokio::process::Child, } - let mut clones: Vec = Vec::new(); + // Spawn all clones concurrently using JoinSet + let mut spawn_set = tokio::task::JoinSet::new(); for i in 0..NUM_CLONES { let clone_name = format!("pf-stress-clone-{}-{}", i, std::process::id()); println!(" Spawning clone {} ({})...", i, clone_name); - - let (child, clone_pid) = common::spawn_fcvm_with_logs( - &[ - "snapshot", - "run", - "--pid", - &serve_pid_str, - "--name", + let pid_str = serve_pid_str.clone(); + spawn_set.spawn(async move { + let (child, clone_pid) = common::spawn_fcvm_with_logs( + &["snapshot", "run", "--pid", &pid_str, "--name", &clone_name], &clone_name, - ], - &clone_name, - ) - .await - .context(format!("spawning clone {}", i))?; - - // Wait for healthy - common::poll_health_by_pid(clone_pid, 120).await?; - println!(" ✓ Clone {} healthy (PID: {})", i, clone_pid); - - // Get loopback IP - let output = tokio::process::Command::new(&fcvm_path) - .args(["ls", "--json", "--pid", &clone_pid.to_string()]) - .output() + ) .await - .context("getting clone state")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - let loopback_ip: String = serde_json::from_str::>(&stdout) - .ok() - .and_then(|v| v.first().cloned()) - .and_then(|v| { - v.get("config")? - .get("network")? - .get("loopback_ip")? - .as_str() - .map(|s| s.to_string()) - }) - .unwrap_or_default(); - - println!(" Loopback IP: {}", loopback_ip); + .context(format!("spawning clone {}", clone_name))?; + common::poll_health_by_pid(clone_pid, 120).await?; + let loopback_ip = common::get_loopback_ip(clone_pid).await?; + Ok::<_, anyhow::Error>((clone_name, clone_pid, loopback_ip, child)) + }); + } + + let mut clones: Vec = Vec::new(); + while let Some(result) = spawn_set.join_next().await { + let (name, pid, loopback_ip, child) = result??; + println!( + " ✓ Clone {} healthy (PID: {}, IP: {})", + name, pid, loopback_ip + ); clones.push(CloneInfo { - name: clone_name, - pid: clone_pid, + name, + pid, loopback_ip, _child: child, }); } - // Step 5: Rapid HTTP requests to all clones + // Step 5: Concurrent HTTP requests to all clones simultaneously println!( - "\nStep 5: Sending {} HTTP requests to each of {} clones...", + "\nStep 5: Sending {} HTTP requests to each of {} clones (concurrently)...", REQUESTS_PER_CLONE, NUM_CLONES ); - let mut total_requests = 0u32; - let mut total_success = 0u32; - let mut total_zero_bytes = 0u32; - let mut total_errors = 0u32; + let total_success = Arc::new(AtomicU32::new(0)); + let total_zero_bytes = Arc::new(AtomicU32::new(0)); + let total_errors = Arc::new(AtomicU32::new(0)); + + let start = Instant::now(); + // Spawn concurrent tasks for all clones + let mut handles = Vec::new(); for clone in &clones { - let mut clone_success = 0u32; - let mut clone_zero = 0u32; - let mut clone_error = 0u32; - - let start = Instant::now(); - - for req in 0..REQUESTS_PER_CLONE { - let result = curl_check(&clone.loopback_ip, host_port, 5).await; - total_requests += 1; - - if result.success && result.body_len > 0 { - clone_success += 1; - total_success += 1; - } else if result.success && result.body_len == 0 { - // This is the pasta poisoning pattern: connect succeeds but 0 bytes - clone_zero += 1; - total_zero_bytes += 1; - println!( - " ⚠ Clone {} request {}: 0-byte response!", - clone.name, req - ); - } else { - clone_error += 1; - total_errors += 1; - if clone_error <= 3 { - println!( - " ✗ Clone {} request {}: error ({})", - clone.name, req, result.error - ); + let ip = clone.loopback_ip.clone(); + let name = clone.name.clone(); + let success = Arc::clone(&total_success); + let zero = Arc::clone(&total_zero_bytes); + let errors = Arc::clone(&total_errors); + + let handle = tokio::spawn(async move { + let mut clone_success = 0u32; + let mut clone_zero = 0u32; + let mut clone_error = 0u32; + + for req in 0..REQUESTS_PER_CLONE { + let result = common::curl_check(&ip, host_port, 5).await; + + if result.success && result.body_len > 0 { + clone_success += 1; + success.fetch_add(1, Ordering::Relaxed); + } else if result.success && result.body_len == 0 { + clone_zero += 1; + zero.fetch_add(1, Ordering::Relaxed); + println!(" ⚠ Clone {} request {}: 0-byte response!", name, req); + } else { + clone_error += 1; + errors.fetch_add(1, Ordering::Relaxed); + if clone_error <= 3 { + println!( + " ✗ Clone {} request {}: error ({})", + name, req, result.error + ); + } } } - // Brief pause between requests (not too long - we want stress) - tokio::time::sleep(Duration::from_millis(50)).await; - } + (name, clone_success, clone_zero, clone_error) + }); + handles.push(handle); + } - let elapsed = start.elapsed(); + // Wait for all concurrent tasks to complete + for handle in handles { + let (name, ok, zero, err) = handle.await?; println!( - " Clone {} ({}): {}/{} OK, {} zero-byte, {} errors ({:.1}s)", - clone.name, - clone.loopback_ip, - clone_success, - REQUESTS_PER_CLONE, - clone_zero, - clone_error, - elapsed.as_secs_f64() + " Clone {}: {}/{} OK, {} zero-byte, {} errors", + name, ok, REQUESTS_PER_CLONE, zero, err ); } + let elapsed = start.elapsed(); + let total_requests = (NUM_CLONES * REQUESTS_PER_CLONE) as u32; + let success = total_success.load(Ordering::Relaxed); + let zero_bytes = total_zero_bytes.load(Ordering::Relaxed); + let errs = total_errors.load(Ordering::Relaxed); + // Cleanup println!("\nCleaning up..."); for clone in &clones { @@ -288,37 +260,38 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { println!("║ RESULTS ║"); println!("╠═══════════════════════════════════════════════════════════════╣"); println!( - "║ Total requests: {:4} ║", - total_requests + "║ Total requests: {:4} ({:.1}s concurrent) ║", + total_requests, + elapsed.as_secs_f64() ); println!( "║ Successful: {:4} ║", - total_success + success ); println!( "║ Zero-byte: {:4} (pasta poisoning pattern) ║", - total_zero_bytes + zero_bytes ); println!( "║ Errors: {:4} ║", - total_errors + errs ); println!("╚═══════════════════════════════════════════════════════════════╝"); - if total_zero_bytes > 0 { + if zero_bytes > 0 { anyhow::bail!( "PASTA POISONING DETECTED: {} out of {} requests returned 0 bytes. \ This confirms that wait_for_port_forwarding() during restore \ (before guest exists) corrupts pasta's forwarding state.", - total_zero_bytes, + zero_bytes, total_requests ); } - if total_errors > 0 { + if errs > 0 { anyhow::bail!( "Port forwarding errors: {} out of {} requests failed", - total_errors, + errs, total_requests ); } @@ -330,35 +303,3 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { ); Ok(()) } - -/// Result of a single HTTP check -struct CurlResult { - success: bool, - body_len: usize, - error: String, -} - -/// Make a single HTTP request and return the result -async fn curl_check(ip: &str, port: u16, timeout_secs: u32) -> CurlResult { - let url = format!("http://{}:{}", ip, port); - match tokio::process::Command::new("curl") - .args(["-s", "--max-time", &timeout_secs.to_string(), &url]) - .output() - .await - { - Ok(output) => CurlResult { - success: output.status.success(), - body_len: output.stdout.len(), - error: if output.status.success() { - String::new() - } else { - String::from_utf8_lossy(&output.stderr).to_string() - }, - }, - Err(e) => CurlResult { - success: false, - body_len: 0, - error: format!("curl failed: {}", e), - }, - } -} diff --git a/tests/test_snapshot_clone.rs b/tests/test_snapshot_clone.rs index e2bfb6fe..8b8e6149 100644 --- a/tests/test_snapshot_clone.rs +++ b/tests/test_snapshot_clone.rs @@ -1503,24 +1503,7 @@ async fn test_clone_port_forward_rootless() -> Result<()> { println!("\nStep 5: Testing port forwarding..."); // Get clone's loopback IP from state (rootless uses 127.x.y.z) - let output = tokio::process::Command::new(&fcvm_path) - .args(["ls", "--json", "--pid", &clone_pid.to_string()]) - .output() - .await - .context("getting clone state")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - let loopback_ip: String = serde_json::from_str::>(&stdout) - .ok() - .and_then(|v| v.first().cloned()) - .and_then(|v| { - v.get("config")? - .get("network")? - .get("loopback_ip")? - .as_str() - .map(|s| s.to_string()) - }) - .unwrap_or_default(); + let loopback_ip = common::get_loopback_ip(clone_pid).await?; println!(" Clone loopback IP: {}", loopback_ip); @@ -1691,24 +1674,7 @@ async fn test_clone_port_forward_routed() -> Result<()> { println!("\nStep 5: Testing port forwarding..."); // Get clone's loopback IP from state (routed uses TCP proxy + loopback like rootless) - let output = tokio::process::Command::new(&fcvm_path) - .args(["ls", "--json", "--pid", &clone_pid.to_string()]) - .output() - .await - .context("getting clone state")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - let loopback_ip: String = serde_json::from_str::>(&stdout) - .ok() - .and_then(|v| v.first().cloned()) - .and_then(|v| { - v.get("config")? - .get("network")? - .get("loopback_ip")? - .as_str() - .map(|s| s.to_string()) - }) - .unwrap_or_default(); + let loopback_ip = common::get_loopback_ip(clone_pid).await?; println!(" Clone loopback IP: {}", loopback_ip); From 1ceb20890026c2ea4bb76f0f4a49b40f34271133 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 01:23:43 +0000 Subject: [PATCH 04/13] fix: propagate verify_port_forwarding error instead of warn-only The caller in snapshot.rs wrapped verify_port_forwarding() in if-let-Err that only logged a warning, allowing clones to come up with broken port forwarding. Now that post_start() skips the port probe in restore mode, verify_port_forwarding() is the only gate. Propagate its error so clones fail fast on broken forwarding. --- src/commands/snapshot.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 55907217..5ac0dda7 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -1086,9 +1086,10 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { // After snapshot restore, pasta may not have learned the guest's MAC yet. // This probes each forwarded port to trigger and verify ARP resolution — // no guest service needs to be running, just the guest's kernel. - if let Err(e) = network.verify_port_forwarding().await { - warn!(vm_id = %vm_id, error = %e, "port forwarding verification failed"); - } + network + .verify_port_forwarding() + .await + .context("port forwarding verification failed after snapshot restore")?; // Spawn health monitor task with startup snapshot trigger support let health_monitor_handle = crate::health::spawn_health_monitor_full( From 139f0957212fdbbfbf012acf4c2efaa97fa7e469 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 03:15:58 +0000 Subject: [PATCH 05/13] fix: use active ping instead of passive ARP check in verify_port_forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With arp_accept=0 (Linux default), gratuitous ARP only updates existing neighbor entries — never creates new ones. The old code checked `ip neigh show` which would never find an entry because fc-agent's gratuitous arping can't create one. Active ping forces a normal ARP exchange (request+reply) that always creates a REACHABLE entry. Changes: - Replace `ip neigh show` with `ping -c 1 -W 0.2` to the guest - Successful ping proves both ARP resolution and guest reachability - 200ms ping timeout allows ~16 retries within 5s deadline - Remove 10ms sleep (ping provides natural 200ms backoff) Tested: make test-root FILTER=clone_port_forward_stress — 60/60 OK, 0 errors --- src/network/pasta.rs | 47 +++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/src/network/pasta.rs b/src/network/pasta.rs index d360854e..fd82c542 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -662,14 +662,14 @@ impl NetworkManager for PastaNetwork { /// Verify pasta's L2 forwarding path is ready after snapshot restore. /// /// After snapshot restore, pasta needs the guest's MAC address to forward - /// L2 frames. fc-agent sends a gratuitous ARP (ping to gateway) during - /// restore, which broadcasts the guest's MAC to all bridge ports including - /// pasta0. We verify this by checking the namespace's ARP table — if the - /// namespace kernel learned the guest's MAC, pasta received the same - /// broadcast frame. + /// L2 frames. We actively ping the guest from the namespace to trigger a + /// normal ARP exchange. With arp_accept=0 (Linux default), the guest's + /// gratuitous arping does NOT create neighbor entries — only updates + /// existing ones. The active ping forces the namespace kernel to send an + /// ARP request that the guest replies to, creating a REACHABLE entry. /// - /// This runs after fc-agent's output vsock reconnects, so the gratuitous - /// ARP has already been sent. Typically resolves on the first check. + /// Once ARP is resolved, we probe each forwarded port to confirm pasta's + /// loopback port forwarding is end-to-end functional. async fn verify_port_forwarding(&self) -> Result<()> { if self.port_mappings.is_empty() { return Ok(()); @@ -684,25 +684,23 @@ impl NetworkManager for PastaNetwork { let nsenter_prefix = self.build_nsenter_prefix(holder_pid); loop { - let output = Command::new(&nsenter_prefix[0]) + // Ping the guest to trigger ARP resolution. A successful ping (exit 0) + // proves ARP resolved AND the guest is reachable — skip the ip neigh check. + // Use 200ms timeout for ~16 retries within the 5s deadline. + let ping_result = Command::new(&nsenter_prefix[0]) .args(&nsenter_prefix[1..]) - .args(["ip", "neigh", "show", GUEST_IP, "dev", BRIDGE_DEVICE]) - .stdout(Stdio::piped()) + .args(["ping", "-c", "1", "-W", "0.2", GUEST_IP]) + .stdout(Stdio::null()) .stderr(Stdio::null()) .output() - .await - .context("checking ARP table in namespace")?; - - let stdout = String::from_utf8_lossy(&output.stdout); - // Entry looks like: "10.0.2.100 lladdr aa:bb:cc:dd:ee:ff REACHABLE" - // If lladdr is present, the guest's MAC is known. - if stdout.contains("lladdr") { - info!(guest_ip = GUEST_IP, arp = %stdout.trim(), "ARP resolved"); - // ARP is resolved but pasta's loopback port forwarding may not be - // ready yet. Probe each mapped port on the loopback IP to confirm - // end-to-end forwarding works before declaring ready. - self.wait_for_port_forwarding().await?; - return Ok(()); + .await; + + if let Ok(ref out) = ping_result { + if out.status.success() { + info!(guest_ip = GUEST_IP, "guest reachable via ping, ARP resolved"); + self.wait_for_port_forwarding().await?; + return Ok(()); + } } if std::time::Instant::now() > deadline { @@ -713,8 +711,7 @@ impl NetworkManager for PastaNetwork { ); } - debug!(guest_ip = GUEST_IP, "ARP not yet resolved, waiting"); - tokio::time::sleep(std::time::Duration::from_millis(10)).await; + debug!(guest_ip = GUEST_IP, "ping to guest failed, retrying"); } } From 3a8fbd1c9bc246aa69e6c33429945ec68d7cf8bc Mon Sep 17 00:00:00 2001 From: "claude[bot]" Date: Wed, 4 Mar 2026 03:22:13 +0000 Subject: [PATCH 06/13] fix: propagate ping spawn errors in verify_port_forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new ping-based ARP check used `if let Ok(...)` which silently swallowed command spawn failures (nsenter/ping not found, permission denied). This could cause a 5-second busy loop with a misleading "ARP not resolved" error. Propagate spawn errors immediately with `.context()?` to match the old code's behavior. Also update stale comment in snapshot.rs to reflect the ping-based ARP resolution approach. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/commands/snapshot.rs | 7 ++++--- src/network/pasta.rs | 18 ++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 5ac0dda7..7c044c3e 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -1082,10 +1082,11 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { } } - // Verify pasta's L2 forwarding path has ARP resolved before starting health monitor. + // Verify pasta's L2 forwarding path is ready before starting health monitor. // After snapshot restore, pasta may not have learned the guest's MAC yet. - // This probes each forwarded port to trigger and verify ARP resolution — - // no guest service needs to be running, just the guest's kernel. + // This pings the guest to trigger ARP resolution, then probes each forwarded + // port to confirm end-to-end forwarding works. Only the guest kernel needs + // to be running (no guest services required for the ARP/ping step). network .verify_port_forwarding() .await diff --git a/src/network/pasta.rs b/src/network/pasta.rs index fd82c542..77430602 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -687,20 +687,22 @@ impl NetworkManager for PastaNetwork { // Ping the guest to trigger ARP resolution. A successful ping (exit 0) // proves ARP resolved AND the guest is reachable — skip the ip neigh check. // Use 200ms timeout for ~16 retries within the 5s deadline. - let ping_result = Command::new(&nsenter_prefix[0]) + let output = Command::new(&nsenter_prefix[0]) .args(&nsenter_prefix[1..]) .args(["ping", "-c", "1", "-W", "0.2", GUEST_IP]) .stdout(Stdio::null()) .stderr(Stdio::null()) .output() - .await; + .await + .context("running ping via nsenter in namespace")?; - if let Ok(ref out) = ping_result { - if out.status.success() { - info!(guest_ip = GUEST_IP, "guest reachable via ping, ARP resolved"); - self.wait_for_port_forwarding().await?; - return Ok(()); - } + if output.status.success() { + info!( + guest_ip = GUEST_IP, + "guest reachable via ping, ARP resolved" + ); + self.wait_for_port_forwarding().await?; + return Ok(()); } if std::time::Instant::now() > deadline { From 651e73ec84c9e10730ce7bb51522c99f1263a399 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 03:50:27 +0000 Subject: [PATCH 07/13] test: add diagnostics for clone port forward stress failures - Fix curl_check to use -sS (show errors) instead of -s (silent) - Add pre-storm verification curl for each clone before stress test - On first curl error per clone, dump ss -tlnp and verbose curl to capture the actual error (connection refused, timeout, etc.) --- tests/common/mod.rs | 2 +- tests/test_clone_port_forward_stress.rs | 59 +++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/tests/common/mod.rs b/tests/common/mod.rs index faba1df8..27141367 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1560,7 +1560,7 @@ pub struct CurlResult { pub async fn curl_check(ip: &str, port: u16, timeout_secs: u32) -> CurlResult { let url = format!("http://{}:{}", ip, port); match tokio::process::Command::new("curl") - .args(["-s", "--max-time", &timeout_secs.to_string(), &url]) + .args(["-sS", "--max-time", &timeout_secs.to_string(), &url]) .output() .await { diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs index d1cecc44..238c1e51 100644 --- a/tests/test_clone_port_forward_stress.rs +++ b/tests/test_clone_port_forward_stress.rs @@ -179,9 +179,26 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { }); } - // Step 5: Concurrent HTTP requests to all clones simultaneously + // Step 5: Verify each clone's port forwarding before the stress storm + println!("\nStep 5: Pre-storm verification of each clone..."); + for clone in &clones { + let check = common::curl_check(&clone.loopback_ip, host_port, 5).await; + println!( + " Clone {} ({}:{}): {} ({} bytes, err={})", + clone.name, clone.loopback_ip, host_port, + if check.success { "OK" } else { "FAIL" }, + check.body_len, check.error + ); + assert!( + check.success && check.body_len > 0, + "Pre-storm curl to clone {} failed: {}", + clone.name, check.error + ); + } + + // Step 6: Concurrent HTTP requests to all clones simultaneously println!( - "\nStep 5: Sending {} HTTP requests to each of {} clones (concurrently)...", + "\nStep 6: Sending {} HTTP requests to each of {} clones (concurrently)...", REQUESTS_PER_CLONE, NUM_CLONES ); @@ -220,10 +237,44 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { errors.fetch_add(1, Ordering::Relaxed); if clone_error <= 3 { println!( - " ✗ Clone {} request {}: error ({})", - name, req, result.error + " ✗ Clone {} request {} to {}:{}: error ({})", + name, req, ip, host_port, result.error ); } + // On first error, dump diagnostics + if clone_error == 1 { + let ss = tokio::process::Command::new("ss") + .args(["-tlnp"]) + .output() + .await; + if let Ok(out) = ss { + let stdout = String::from_utf8_lossy(&out.stdout); + // Filter for our port + let port_str = host_port.to_string(); + let matching: Vec<&str> = stdout + .lines() + .filter(|l| l.contains(&port_str) || l.starts_with("State")) + .collect(); + println!( + " DIAG clone {} port {} ss output: {:?}", + name, host_port, matching + ); + } + // Also try a verbose curl + let verbose = tokio::process::Command::new("curl") + .args(["-v", "--max-time", "2", + &format!("http://{}:{}", ip, host_port)]) + .output() + .await; + if let Ok(out) = verbose { + let stderr = String::from_utf8_lossy(&out.stderr); + println!( + " DIAG clone {} verbose curl stderr: {}", + name, + stderr.chars().take(500).collect::() + ); + } + } } } From 874f89172d3f454c8b653da56b2d202419dffa1b Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 04:07:11 +0000 Subject: [PATCH 08/13] fix: add iputils-ping to nested container, simplify verify comments - Add iputils-ping to Containerfile.nested so ping-based ARP probe works in container CI (was missing, causing all container clones to fail with "ARP not resolved within 5s") - Move loop comment above the loop in verify_port_forwarding - Trim snapshot.rs comment to match actual behavior --- Containerfile.nested | 2 +- src/commands/snapshot.rs | 3 +-- src/network/pasta.rs | 11 ++++------- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Containerfile.nested b/Containerfile.nested index 79213d99..4f5084d0 100644 --- a/Containerfile.nested +++ b/Containerfile.nested @@ -17,7 +17,7 @@ FROM ubuntu:24.04 RUN apt-get update && apt-get install -y --no-install-recommends \ iproute2 iptables podman python3 kmod procps fuse3 curl nginx \ - fuse-overlayfs sudo iperf3 rsync btrfs-progs \ + fuse-overlayfs sudo iperf3 rsync btrfs-progs iputils-ping \ && rm -rf /var/lib/apt/lists/* # Configure podman to use fuse-overlayfs (required for nested containers) diff --git a/src/commands/snapshot.rs b/src/commands/snapshot.rs index 7c044c3e..d12cb817 100644 --- a/src/commands/snapshot.rs +++ b/src/commands/snapshot.rs @@ -1085,8 +1085,7 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> { // Verify pasta's L2 forwarding path is ready before starting health monitor. // After snapshot restore, pasta may not have learned the guest's MAC yet. // This pings the guest to trigger ARP resolution, then probes each forwarded - // port to confirm end-to-end forwarding works. Only the guest kernel needs - // to be running (no guest services required for the ARP/ping step). + // port to confirm end-to-end forwarding works. network .verify_port_forwarding() .await diff --git a/src/network/pasta.rs b/src/network/pasta.rs index 77430602..30032b75 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -683,10 +683,10 @@ impl NetworkManager for PastaNetwork { let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); let nsenter_prefix = self.build_nsenter_prefix(holder_pid); + // Ping the guest from inside the namespace to trigger ARP resolution. + // A successful ping proves ARP resolved AND the guest is reachable. + // Use 200ms timeout for ~16 retries within the 5s deadline. loop { - // Ping the guest to trigger ARP resolution. A successful ping (exit 0) - // proves ARP resolved AND the guest is reachable — skip the ip neigh check. - // Use 200ms timeout for ~16 retries within the 5s deadline. let output = Command::new(&nsenter_prefix[0]) .args(&nsenter_prefix[1..]) .args(["ping", "-c", "1", "-W", "0.2", GUEST_IP]) @@ -697,10 +697,7 @@ impl NetworkManager for PastaNetwork { .context("running ping via nsenter in namespace")?; if output.status.success() { - info!( - guest_ip = GUEST_IP, - "guest reachable via ping, ARP resolved" - ); + info!(guest_ip = GUEST_IP, "guest reachable via ping, ARP resolved"); self.wait_for_port_forwarding().await?; return Ok(()); } From 59cce3fcb81a428115b90405025adb4d95ec20ca Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 04:10:04 +0000 Subject: [PATCH 09/13] style: fix cargo fmt formatting --- src/network/pasta.rs | 5 ++++- tests/test_clone_port_forward_stress.rs | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/network/pasta.rs b/src/network/pasta.rs index 30032b75..654bee90 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -697,7 +697,10 @@ impl NetworkManager for PastaNetwork { .context("running ping via nsenter in namespace")?; if output.status.success() { - info!(guest_ip = GUEST_IP, "guest reachable via ping, ARP resolved"); + info!( + guest_ip = GUEST_IP, + "guest reachable via ping, ARP resolved" + ); self.wait_for_port_forwarding().await?; return Ok(()); } diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs index 238c1e51..1ad3bc74 100644 --- a/tests/test_clone_port_forward_stress.rs +++ b/tests/test_clone_port_forward_stress.rs @@ -185,14 +185,18 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { let check = common::curl_check(&clone.loopback_ip, host_port, 5).await; println!( " Clone {} ({}:{}): {} ({} bytes, err={})", - clone.name, clone.loopback_ip, host_port, + clone.name, + clone.loopback_ip, + host_port, if check.success { "OK" } else { "FAIL" }, - check.body_len, check.error + check.body_len, + check.error ); assert!( check.success && check.body_len > 0, "Pre-storm curl to clone {} failed: {}", - clone.name, check.error + clone.name, + check.error ); } @@ -262,8 +266,12 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { } // Also try a verbose curl let verbose = tokio::process::Command::new("curl") - .args(["-v", "--max-time", "2", - &format!("http://{}:{}", ip, host_port)]) + .args([ + "-v", + "--max-time", + "2", + &format!("http://{}:{}", ip, host_port), + ]) .output() .await; if let Ok(out) = verbose { From 17b796bbc25e5d33bc6facb463d3c1f1d7b279c9 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 04:15:22 +0000 Subject: [PATCH 10/13] test: add pasta process and connection diagnostics for stress failures Add more diagnostic dumps on first curl error per clone: - pgrep -a pasta: check if pasta is alive, find matching process - ss -tnp: show active connections (not just listening sockets) - All pasta processes listed for cross-clone comparison --- tests/test_clone_port_forward_stress.rs | 45 +++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs index 1ad3bc74..d0661c1e 100644 --- a/tests/test_clone_port_forward_stress.rs +++ b/tests/test_clone_port_forward_stress.rs @@ -247,14 +247,15 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { } // On first error, dump diagnostics if clone_error == 1 { + let port_str = host_port.to_string(); + + // 1. Check listening sockets for our port let ss = tokio::process::Command::new("ss") .args(["-tlnp"]) .output() .await; if let Ok(out) = ss { let stdout = String::from_utf8_lossy(&out.stdout); - // Filter for our port - let port_str = host_port.to_string(); let matching: Vec<&str> = stdout .lines() .filter(|l| l.contains(&port_str) || l.starts_with("State")) @@ -264,7 +265,8 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { name, host_port, matching ); } - // Also try a verbose curl + + // 2. Verbose curl to see connection details let verbose = tokio::process::Command::new("curl") .args([ "-v", @@ -282,6 +284,43 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { stderr.chars().take(500).collect::() ); } + + // 3. Check if pasta is still alive (look for pasta process) + let pgrep = tokio::process::Command::new("pgrep") + .args(["-a", "pasta"]) + .output() + .await; + if let Ok(out) = pgrep { + let stdout = String::from_utf8_lossy(&out.stdout); + let matching: Vec<&str> = stdout + .lines() + .filter(|l| l.contains(&ip) || l.contains(&port_str)) + .collect(); + println!( + " DIAG clone {} pasta processes matching {}:{}: {:?}", + name, ip, host_port, matching + ); + // Also show all pasta processes for context + let all: Vec<&str> = stdout.lines().collect(); + println!(" DIAG clone {} all pasta processes: {:?}", name, all); + } + + // 4. Check connections (not just listening) + let ss_all = tokio::process::Command::new("ss") + .args(["-tnp"]) + .output() + .await; + if let Ok(out) = ss_all { + let stdout = String::from_utf8_lossy(&out.stdout); + let matching: Vec<&str> = stdout + .lines() + .filter(|l| l.contains(&ip) || l.starts_with("State")) + .collect(); + println!( + " DIAG clone {} connections to {}: {:?}", + name, ip, matching + ); + } } } } From 829d04ef64689b8c4a5bf5ddd326e7ed329f8c14 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 07:16:57 +0000 Subject: [PATCH 11/13] fix: add curl retry with diagnostics, capture ping stderr - Add curl_check_retry() to tests/common: retries HTTP with 500ms backoff and dumps network diagnostics on failure (ARP cache, namespace sockets, bridge links, VM listening sockets, curl -v) - Extract dump_clone_network_diagnostics() as reusable test helper - Capture ping stderr in verify_port_forwarding for better error messages when ARP resolution fails - Use curl_check_retry in clone port forward tests (rootless, routed, stress) instead of single-shot curl - Revert wait_for_port_forwarding to TCP-connect-only (appropriate for initial boot when guest app isn't ready yet) --- src/network/pasta.rs | 9 +- tests/common/mod.rs | 127 ++++++++++++++++++++++++ tests/test_clone_port_forward_stress.rs | 12 ++- tests/test_snapshot_clone.rs | 59 ++++------- 4 files changed, 159 insertions(+), 48 deletions(-) diff --git a/src/network/pasta.rs b/src/network/pasta.rs index 654bee90..a3ed3fdb 100644 --- a/src/network/pasta.rs +++ b/src/network/pasta.rs @@ -691,7 +691,7 @@ impl NetworkManager for PastaNetwork { .args(&nsenter_prefix[1..]) .args(["ping", "-c", "1", "-W", "0.2", GUEST_IP]) .stdout(Stdio::null()) - .stderr(Stdio::null()) + .stderr(Stdio::piped()) .output() .await .context("running ping via nsenter in namespace")?; @@ -706,10 +706,13 @@ impl NetworkManager for PastaNetwork { } if std::time::Instant::now() > deadline { + let stderr = String::from_utf8_lossy(&output.stderr); + let stderr = stderr.trim(); anyhow::bail!( - "ARP for guest {} not resolved within 5s on {}", + "ARP for guest {} not resolved within 5s on {}: ping stderr: {}", GUEST_IP, - BRIDGE_DEVICE + BRIDGE_DEVICE, + if stderr.is_empty() { "(empty)" } else { stderr } ); } diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 27141367..da6ed882 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -1581,6 +1581,133 @@ pub async fn curl_check(ip: &str, port: u16, timeout_secs: u32) -> CurlResult { } } +/// Make HTTP requests via curl with retries until success or deadline. +/// +/// After snapshot restore, the network channel (L2 + pasta) is ready but the +/// guest application may need a moment. This retries with 500ms backoff +/// and prints extensive diagnostics on failure. +pub async fn curl_check_retry( + ip: &str, + port: u16, + timeout_secs: u32, + clone_pid: Option, +) -> CurlResult { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs as u64); + let mut attempt = 0u32; + let mut last_result = CurlResult { + success: false, + body_len: 0, + error: "no attempt made".to_string(), + }; + + while std::time::Instant::now() < deadline { + attempt += 1; + last_result = curl_check(ip, port, 1).await; + if last_result.success && last_result.body_len > 0 { + if attempt > 1 { + println!( + " curl_check_retry: succeeded on attempt {} after retries", + attempt + ); + } + return last_result; + } + println!( + " curl_check_retry attempt {}: {}:{} -> success={} body={} err={}", + attempt, ip, port, last_result.success, last_result.body_len, last_result.error + ); + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + } + + // All retries exhausted — dump diagnostics + println!( + "\n curl_check_retry FAILED after {} attempts to {}:{}", + attempt, ip, port + ); + println!(" last error: {}", last_result.error); + + // Verbose curl to see exactly what happens at TCP level + let url = format!("http://{}:{}", ip, port); + if let Ok(verbose) = tokio::process::Command::new("curl") + .args(["-v", "--max-time", "2", &url]) + .output() + .await + { + let stderr = String::from_utf8_lossy(&verbose.stderr); + println!(" curl -v output:"); + for line in stderr.lines() { + println!(" {}", line); + } + } + + if let Some(pid) = clone_pid { + dump_clone_network_diagnostics(pid).await; + } + + last_result +} + +/// Dump network diagnostics for a clone VM: pasta processes, ARP cache, +/// namespace sockets, bridge state, and VM listening sockets. +pub async fn dump_clone_network_diagnostics(pid: u32) { + println!(" --- Network diagnostics for clone PID {} ---", pid); + + // Check if pasta is still running + run_cmd_diagnostic("pasta processes", "pgrep", &["-a", "pasta"]).await; + + // Get holder PID from state for nsenter + let fcvm_path = find_fcvm_binary().unwrap_or_default(); + let holder_pid = get_holder_pid(&fcvm_path, pid).await; + + if let Some(hp) = holder_pid { + let hp_str = hp.to_string(); + run_nsenter_diagnostic(&hp_str, &["ip", "neigh", "show"], "ARP cache").await; + run_nsenter_diagnostic(&hp_str, &["ss", "-tnp"], "namespace sockets").await; + run_nsenter_diagnostic(&hp_str, &["bridge", "link"], "bridge links").await; + } + + // Check what the VM sees (listening sockets) + let pid_str = pid.to_string(); + run_cmd_diagnostic( + "VM listening sockets", + &fcvm_path.to_string_lossy(), + &["exec", "--pid", &pid_str, "--", "ss", "-tnl"], + ) + .await; +} + +async fn get_holder_pid(fcvm_path: &std::path::PathBuf, pid: u32) -> Option { + let out = tokio::process::Command::new(fcvm_path) + .args(["ls", "--json"]) + .output() + .await + .ok()?; + let json_str = String::from_utf8_lossy(&out.stdout); + let vms: Vec = serde_json::from_str(&json_str).ok()?; + let vm = vms.iter().find(|v| v["pid"].as_u64() == Some(pid as u64))?; + vm["holder_pid"].as_u64() +} + +async fn run_nsenter_diagnostic(holder_pid: &str, cmd: &[&str], label: &str) { + let mut args = vec!["-t", holder_pid, "--net"]; + args.extend(cmd); + run_cmd_diagnostic(label, "nsenter", &args).await; +} + +async fn run_cmd_diagnostic(label: &str, program: &str, args: &[&str]) { + if let Ok(out) = tokio::process::Command::new(program) + .args(args) + .output() + .await + { + let stdout = String::from_utf8_lossy(&out.stdout); + println!(" {}:", label); + for line in stdout.lines() { + println!(" {}", line); + } + } +} + /// Wait for a TCP server to be ready by attempting to connect. /// /// # Arguments diff --git a/tests/test_clone_port_forward_stress.rs b/tests/test_clone_port_forward_stress.rs index d0661c1e..383bb4a5 100644 --- a/tests/test_clone_port_forward_stress.rs +++ b/tests/test_clone_port_forward_stress.rs @@ -180,21 +180,23 @@ async fn test_clone_port_forward_stress_rootless() -> Result<()> { } // Step 5: Verify each clone's port forwarding before the stress storm - println!("\nStep 5: Pre-storm verification of each clone..."); + // Use curl_check_retry: the L2 channel is ready (verify_port_forwarding passed) + // but the guest application may need a moment after snapshot restore. + println!("\nStep 5: Pre-storm verification of each clone (with retries)..."); for clone in &clones { - let check = common::curl_check(&clone.loopback_ip, host_port, 5).await; + let check = + common::curl_check_retry(&clone.loopback_ip, host_port, 10, Some(clone.pid)).await; println!( - " Clone {} ({}:{}): {} ({} bytes, err={})", + " Clone {} ({}:{}): {} ({} bytes)", clone.name, clone.loopback_ip, host_port, if check.success { "OK" } else { "FAIL" }, check.body_len, - check.error ); assert!( check.success && check.body_len > 0, - "Pre-storm curl to clone {} failed: {}", + "Pre-storm curl to clone {} failed after retries: {}", clone.name, check.error ); diff --git a/tests/test_snapshot_clone.rs b/tests/test_snapshot_clone.rs index 8b8e6149..b31ed4ee 100644 --- a/tests/test_snapshot_clone.rs +++ b/tests/test_snapshot_clone.rs @@ -1508,33 +1508,22 @@ async fn test_clone_port_forward_rootless() -> Result<()> { println!(" Clone loopback IP: {}", loopback_ip); // Test: Access via loopback IP and forwarded port - // verify_port_forwarding() runs after snapshot restore and confirms end-to-end - // data flow through pasta's loopback → bridge → guest path before health monitor. + // verify_port_forwarding() confirmed the L2 channel is ready (ping + TCP connect). + // Use retry because the guest application may need a moment after restore. println!( - " Testing access via loopback {}:{}...", + " Testing access via loopback {}:{} (with retries)...", loopback_ip, host_port ); - let loopback_result = tokio::process::Command::new("curl") - .args([ - "-s", - "--max-time", - "5", - &format!("http://{}:{}", loopback_ip, host_port), - ]) - .output() - .await - .context("curl loopback port forward")?; - - let loopback_works = loopback_result.status.success() && !loopback_result.stdout.is_empty(); + let loopback_check = + common::curl_check_retry(&loopback_ip, host_port, 10, Some(clone_pid)).await; + let loopback_works = loopback_check.success && loopback_check.body_len > 0; if loopback_works { - let response = String::from_utf8_lossy(&loopback_result.stdout); - println!(" Loopback access: ✓ OK ({} bytes)", response.len()); - } else { - println!(" Loopback access: ✗ FAIL"); println!( - " stderr: {}", - String::from_utf8_lossy(&loopback_result.stderr) + " Loopback access: ✓ OK ({} bytes)", + loopback_check.body_len ); + } else { + println!(" Loopback access: ✗ FAIL ({})", loopback_check.error); } // Cleanup @@ -1679,31 +1668,21 @@ async fn test_clone_port_forward_routed() -> Result<()> { println!(" Clone loopback IP: {}", loopback_ip); // Test: Access via loopback IP and forwarded port + // Use retry because the guest application may need a moment after restore. println!( - " Testing access via loopback {}:{}...", + " Testing access via loopback {}:{} (with retries)...", loopback_ip, host_port ); - let loopback_result = tokio::process::Command::new("curl") - .args([ - "-s", - "--max-time", - "5", - &format!("http://{}:{}", loopback_ip, host_port), - ]) - .output() - .await - .context("curl loopback port forward")?; - - let loopback_works = loopback_result.status.success() && !loopback_result.stdout.is_empty(); + let loopback_check = + common::curl_check_retry(&loopback_ip, host_port, 10, Some(clone_pid)).await; + let loopback_works = loopback_check.success && loopback_check.body_len > 0; if loopback_works { - let response = String::from_utf8_lossy(&loopback_result.stdout); - println!(" Loopback access: ✓ OK ({} bytes)", response.len()); - } else { - println!(" Loopback access: ✗ FAIL"); println!( - " stderr: {}", - String::from_utf8_lossy(&loopback_result.stderr) + " Loopback access: ✓ OK ({} bytes)", + loopback_check.body_len ); + } else { + println!(" Loopback access: ✗ FAIL ({})", loopback_check.error); } // Cleanup From 2113b9d6a7c7eaffc34299dc75f56d06b9018217 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 07:52:21 +0000 Subject: [PATCH 12/13] fix: add iputils-ping to Containerfile for container tests verify_port_forwarding uses nsenter + ping to check ARP resolution after snapshot restore. In container tests, nsenter runs in the container's mount namespace, so ping must be installed there. Was added to Containerfile.nested but not Containerfile, causing: nsenter: failed to execute ping: No such file or directory --- Containerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Containerfile b/Containerfile index 798d77ea..d813c543 100644 --- a/Containerfile +++ b/Containerfile @@ -15,7 +15,7 @@ RUN cargo install cargo-nextest cargo-audit cargo-deny --locked RUN apt-get update && apt-get install -y \ fuse3 libfuse3-dev autoconf automake libtool perl libclang-dev clang cmake \ musl-tools iproute2 iptables passt dnsmasq qemu-utils e2fsprogs btrfs-progs \ - parted fdisk podman skopeo git curl sudo procps zstd busybox-static cpio uidmap \ + parted fdisk podman skopeo git curl sudo procps zstd busybox-static cpio uidmap iputils-ping \ flex bison bc libelf-dev libssl-dev libseccomp-dev \ && rm -rf /var/lib/apt/lists/* From 4ec4b494123a791a80126fb391243d2891d44868 Mon Sep 17 00:00:00 2001 From: ejc3 Date: Wed, 4 Mar 2026 14:34:11 +0000 Subject: [PATCH 13/13] test: capture full logs and namespace diagnostics in bench failures - Dump full clone log instead of last 30 lines on clone_http failure - Add namespace diagnostics: ARP cache, namespace sockets, bridge links via nsenter using holder PID from fcvm ls - Add VM listening sockets via fcvm exec ss -tnl - Dump full serve log instead of last 30 lines on clone_exec failure These diagnostics match what tests/common/mod.rs already captures, giving visibility into pasta splice state when bench fails. --- benches/exec.rs | 102 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/benches/exec.rs b/benches/exec.rs index 549751a4..79ab4d4b 100644 --- a/benches/exec.rs +++ b/benches/exec.rs @@ -501,20 +501,18 @@ impl CloneFixture { if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); let stdout = String::from_utf8_lossy(&output.stdout); - // Dump serve log for diagnostics - let serve_log = "/tmp/fcvm-bench-serve-clone-exec.log"; - if let Ok(logs) = std::fs::read_to_string(serve_log) { - let tail: Vec<&str> = logs.lines().rev().take(30).collect(); - eprintln!("=== Last 30 lines of serve log ==="); - for line in tail.into_iter().rev() { - eprintln!("{}", line); - } - } + let serve_log_path = "/tmp/fcvm-bench-serve-clone-exec.log"; + let serve_log_content = std::fs::read_to_string(serve_log_path).unwrap_or_default(); panic!( - "clone exec failed after {:.1}s:\nstderr: {}\nstdout: {}", + "clone exec failed after {:.1}s:\n\ + stderr: {}\n\ + stdout: {}\n\ + \n=== full serve log ({}) ===\n{}", elapsed.as_secs_f64(), stderr, - stdout + stdout, + serve_log_path, + serve_log_content, ); } @@ -637,16 +635,69 @@ impl CloneFixture { .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) .unwrap_or_default(); - // Last 30 lines of clone log (full, not filtered) - let log_tail: String = clone_log - .lines() - .rev() - .take(30) - .collect::>() - .into_iter() - .rev() - .collect::>() - .join("\n"); + // Get holder PID for namespace diagnostics + let holder_diag = Command::new(&fcvm) + .args(["ls", "--json", "--pid", &clone_pid.to_string()]) + .output() + .ok() + .and_then(|o| { + let stdout = String::from_utf8_lossy(&o.stdout); + serde_json::from_str::>(&stdout).ok() + }) + .and_then(|vms| { + vms.first() + .and_then(|v| v["holder_pid"].as_u64()) + .map(|hp| { + let hp_str = hp.to_string(); + let mut diag = String::new(); + + // ARP cache in namespace + if let Ok(o) = Command::new("nsenter") + .args(["-t", &hp_str, "-n", "ip", "neigh", "show"]) + .output() + { + diag.push_str(&format!( + "\n=== ARP cache (ns {}) ===\n{}", + hp, + String::from_utf8_lossy(&o.stdout) + )); + } + + // Namespace sockets + if let Ok(o) = Command::new("nsenter") + .args(["-t", &hp_str, "-n", "ss", "-tnp"]) + .output() + { + diag.push_str(&format!( + "\n=== namespace sockets (ns {}) ===\n{}", + hp, + String::from_utf8_lossy(&o.stdout) + )); + } + + // Bridge links + if let Ok(o) = Command::new("nsenter") + .args(["-t", &hp_str, "-n", "bridge", "link"]) + .output() + { + diag.push_str(&format!( + "\n=== bridge links (ns {}) ===\n{}", + hp, + String::from_utf8_lossy(&o.stdout) + )); + } + + diag + }) + }) + .unwrap_or_default(); + + // VM listening sockets + let vm_ss = Command::new(&fcvm) + .args(["exec", "--pid", &clone_pid.to_string(), "--", "ss", "-tnl"]) + .output() + .map(|o| String::from_utf8_lossy(&o.stdout).to_string()) + .unwrap_or_else(|e| format!("exec ss failed: {}", e)); panic!( "clone HTTP failed after 10 attempts\n\ @@ -657,7 +708,9 @@ impl CloneFixture { \n=== listening sockets on {} ===\n{}\ \n=== pasta processes ===\n{}\ \n=== stale process counts ===\n{}\ - \n=== clone log (last 30 lines) ===\n{}", + {}\ + \n=== VM listening sockets ===\n{}\ + \n=== full clone log ({}) ===\n{}", loopback_ip, health_port, last_response.len(), @@ -667,7 +720,10 @@ impl CloneFixture { ss_check, pasta_check, stale_check, - log_tail, + holder_diag, + vm_ss, + clone_log_path, + clone_log, ); }