diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 9a1d4d04ef7..22622670fe2 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -24,6 +24,9 @@ httparse = "1.0" httpdate = "1.0" once_cell = "1.5.2" +[target.'cfg(target_os = "linux")'.dev-dependencies] +libc = "0.2" + [target.'cfg(all(tokio_unstable, target_os = "linux"))'.dev-dependencies] tokio = { version = "1.0.0", path = "../tokio", features = ["full", "tracing", "taskdump"] } @@ -102,5 +105,9 @@ path = "named-pipe-multi-client.rs" name = "dump" path = "dump.rs" +[[example]] +name = "prewarm-fd-table" +path = "prewarm-fd-table.rs" + [lints] workspace = true diff --git a/examples/prewarm-fd-table.rs b/examples/prewarm-fd-table.rs new file mode 100644 index 00000000000..d8a595ea918 --- /dev/null +++ b/examples/prewarm-fd-table.rs @@ -0,0 +1,80 @@ +//! Demonstrates pre-warming the Linux file descriptor table to avoid latency +//! spikes caused by file descriptor table growth in multi-threaded processes. +//! +//! On Linux, the kernel's FD table is grown lazily and protected by RCU +//! synchronization. In multi-threaded processes, when a syscall like `socket()` +//! triggers a table resize, the calling thread blocks until all RCU readers +//! quiesce. This can cause stalls of tens of milliseconds on tokio worker threads, +//! blocking the entire event loop (not just one task). +//! +//! The workaround is to force the kernel to expand the FD table once per process +//! (before any runtime starts), by duplicating an FD to a high slot and then +//! closing it. The kernel never shrinks the FD table during a process's lifetime, +//! so the capacity persists. +//! +//! This is most relevant for services that open many connections concurrently +//! (e.g. HTTP servers, connection pools). The pre-warm target should be at least +//! your expected peak FD count. +//! +//! See: +//! +//! Usage: +//! +//! 
cargo run --example prewarm-fd-table
+
+#![warn(rust_2018_idioms)]
+
+/// Grows the kernel FD table so that it can hold at least `target` descriptors.
+///
+/// # Errors
+///
+/// Fails if `/dev/null` cannot be opened, if `target` does not fit in a C `int`,
+/// or if `fcntl` fails (`EINVAL` when `target` is not below the soft `RLIMIT_NOFILE`).
+#[cfg(target_os = "linux")]
+fn prewarm_fd_table(target: u32) -> std::io::Result<()> {
+    use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd};
+
+    // `fcntl` reads its variadic argument as a C `int`; reject targets that would
+    // wrap to a negative value rather than passing a raw `u32`.
+    let min_fd = libc::c_int::try_from(target)
+        .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidInput, err))?;
+    let dev_null = std::fs::File::open("/dev/null")?;
+
+    // SAFETY: `dev_null` stays open for the call and F_DUPFD_CLOEXEC takes an
+    // integer, not a pointer. Duplicating to a slot >= `min_fd` grows the table
+    // in one syscall without clobbering FDs; CLOEXEC avoids leaking to children.
+    let raw = unsafe { libc::fcntl(dev_null.as_raw_fd(), libc::F_DUPFD_CLOEXEC, min_fd) };
+    if raw < 0 {
+        return Err(std::io::Error::last_os_error());
+    }
+    // SAFETY: `raw` is a fresh, valid FD that nothing else owns. Closing it (and
+    // `dev_null` on return) does not shrink the table.
+    drop(unsafe { OwnedFd::from_raw_fd(raw) });
+    Ok(())
+}
+
+/// No-op: FD table pre-warming is Linux-specific.
+#[cfg(not(target_os = "linux"))]
+fn prewarm_fd_table(_target: u32) -> std::io::Result<()> {
+    Ok(())
+}
+
+fn main() {
+    const FD_TARGET: u32 = 10_000;
+
+    println!("Pre-warming FD table to {FD_TARGET} entries...");
+    match prewarm_fd_table(FD_TARGET) {
+        Ok(()) => println!("FD table pre-warmed successfully."),
+        Err(e) => eprintln!("Warning: failed to pre-warm FD table: {e}"),
+    }
+
+    // Build the runtime *after* pre-warming.
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .expect("failed to build the Tokio runtime");
+
+    rt.block_on(async {
+        println!("Runtime started. Worker threads will not stall on FD table growth up to {FD_TARGET} FDs.");
+    });
+}
diff --git a/spellcheck.dic b/spellcheck.dic
index d05aa25774a..857962d1a4e 100644
--- a/spellcheck.dic
+++ b/spellcheck.dic
@@ -1,4 +1,4 @@
-312
+313
 &
 +
 <
@@ -209,6 +209,7 @@ POSIX
 proxied
 qos
 RAII
+RCU
 reallocations
 recv's
 refactors
diff --git a/tokio/src/runtime/mod.rs b/tokio/src/runtime/mod.rs
index 4ab79e88346..bd3c381bf7f 100644
--- a/tokio/src/runtime/mod.rs
+++ b/tokio/src/runtime/mod.rs
@@ -373,6 +373,13 @@
 //! 
When a task is woken from a thread that is not a worker thread, then the //! task is placed in the global queue. //! +//! # Performance tuning +//! +//! ## File descriptor table pre-warming +//! +//! On Linux, file descriptor table growth can stall worker threads. See the +//! [`prewarm-fd-table`] example. +//! //! [`poll`]: std::future::Future::poll //! [`wake`]: std::task::Waker::wake //! [`yield_now`]: crate::task::yield_now @@ -385,6 +392,7 @@ //! [the lifo slot optimization]: crate::runtime::Builder::disable_lifo_slot //! [coop budget]: crate::task::coop#cooperative-scheduling //! [`worker_mean_poll_time`]: crate::runtime::RuntimeMetrics::worker_mean_poll_time +//! [`prewarm-fd-table`]: https://github.com/tokio-rs/tokio/blob/master/examples/prewarm-fd-table.rs // At the top due to macros #[cfg(test)]