From 1ade0c9bbcfbb13d664798d514088761adbbf674 Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 18:42:52 +0000 Subject: [PATCH 01/12] Update agave --- agave | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agave b/agave index 7fe1b415d86..ee7b51e1c2a 160000 --- a/agave +++ b/agave @@ -1 +1 @@ -Subproject commit 7fe1b415d8642a3621a725a382575f9685c4696c +Subproject commit ee7b51e1c2a110a696f9622dfeab92e68d140d05 From 233c128c4736d29248c1dab90b43e674bd0ab810 Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 19:44:20 +0000 Subject: [PATCH 02/12] prefbusy mode config options and sysfs linux config options --- src/app/fdctl/config/default.toml | 40 +++++++++ src/app/fdctl/main.c | 1 + src/app/fddev/main.h | 2 + src/app/firedancer/config/default.toml | 40 +++++++++ src/app/firedancer/main.c | 1 + src/app/shared/Local.mk | 1 + src/app/shared/commands/configure/configure.h | 1 + .../shared/commands/configure/sysfs-poll.c | 83 +++++++++++++++++++ src/app/shared/fd_config.h | 6 ++ src/app/shared/fd_config_parse.c | 5 ++ src/app/shared_dev/commands/pktgen/pktgen.c | 1 + src/app/shared_dev/commands/udpecho/udpecho.c | 1 + src/disco/net/fd_net_tile_topo.c | 8 ++ src/disco/topo/fd_topo.h | 6 ++ src/waltz/xdp/fd_xsk.h | 20 ++++- 15 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 src/app/shared/commands/configure/sysfs-poll.c diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index abe1306c7f4..0805b1923ca 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -983,6 +983,46 @@ dynamic_port_range = "8900-9000" # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. 
The fallback is "softirq" mode, + # which relies much more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a seperate + # ksoftirqd thread which linux creates and manages. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). A warning will be emitted if this fallback is made + # alongside suggestions of why this might have happened. + poll_mode = "prefbusy" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario inwhich Firedancer is not given enough time in + # userspace to do work. + # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. + lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. 
# This setting defines the number of entries in these metadata diff --git a/src/app/fdctl/main.c b/src/app/fdctl/main.c index 0589b27490f..3ca58b82e47 100644 --- a/src/app/fdctl/main.c +++ b/src/app/fdctl/main.c @@ -38,6 +38,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_offloads, &fd_cfg_stage_ethtool_loopback, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/fddev/main.h b/src/app/fddev/main.h index 5028caf57e6..3f8beb20ba7 100644 --- a/src/app/fddev/main.h +++ b/src/app/fddev/main.h @@ -33,6 +33,7 @@ extern configure_stage_t fd_cfg_stage_kill; extern configure_stage_t fd_cfg_stage_genesis; extern configure_stage_t fd_cfg_stage_keys; extern configure_stage_t fd_cfg_stage_blockstore; +extern configure_stage_t fd_cfg_stage_sysfs_poll; configure_stage_t * STAGES[] = { &fd_cfg_stage_kill, @@ -46,6 +47,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_keys, &fd_cfg_stage_genesis, &fd_cfg_stage_blockstore, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index f6a2dd320ed..6b4f985763e 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -1052,6 +1052,46 @@ telemetry = true # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. The fallback is "softirq" mode, + # which relies much more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a seperate + # ksoftirqd thread which linux creates and manages. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). 
A warning will be emitted if this fallback is made + # alongside suggestions of why this might have happened. + poll_mode = "prefbusy" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario inwhich Firedancer is not given enough time in + # userspace to do work. + # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. + lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. 
# This setting defines the number of entries in these metadata diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index ce398101c7f..5835cb720e8 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -70,6 +70,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_snapshots, &fd_cfg_stage_accdb, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/shared/Local.mk b/src/app/shared/Local.mk index 561be32d419..bc114679e03 100644 --- a/src/app/shared/Local.mk +++ b/src/app/shared/Local.mk @@ -31,6 +31,7 @@ $(call add-objs,commands/configure/fd_ethtool_ioctl,fdctl_shared) $(call add-objs,commands/configure/hugetlbfs,fdctl_shared) $(call add-objs,commands/configure/hyperthreads,fdctl_shared) $(call add-objs,commands/configure/sysctl,fdctl_shared) +$(call add-objs,commands/configure/sysfs-poll,fdctl_shared) $(call add-objs,commands/configure/snapshots,fdctl_shared) $(call add-objs,commands/monitor/monitor commands/monitor/helper,fdctl_shared) $(call add-objs,commands/watch/watch,fdctl_shared) diff --git a/src/app/shared/commands/configure/configure.h b/src/app/shared/commands/configure/configure.h index fc5eea3777e..713de669404 100644 --- a/src/app/shared/commands/configure/configure.h +++ b/src/app/shared/commands/configure/configure.h @@ -84,6 +84,7 @@ extern configure_stage_t fd_cfg_stage_bonding; extern configure_stage_t fd_cfg_stage_ethtool_channels; extern configure_stage_t fd_cfg_stage_ethtool_offloads; extern configure_stage_t fd_cfg_stage_ethtool_loopback; +extern configure_stage_t fd_cfg_stage_sysfs_poll; extern configure_stage_t fd_cfg_stage_snapshots; extern configure_stage_t * STAGES[]; diff --git a/src/app/shared/commands/configure/sysfs-poll.c b/src/app/shared/commands/configure/sysfs-poll.c new file mode 100644 index 00000000000..ff2436ed9f1 --- /dev/null +++ b/src/app/shared/commands/configure/sysfs-poll.c @@ -0,0 +1,83 @@ +/* This stage configures the OS to support effective preferred 
busy + polling, allowing for significantly improved network stack (XDP) + performance if enabled. */ + +#include "configure.h" + +#define NAME "sysfs-poll" + +#include "../../../platform/fd_file_util.h" + +#include +#include +#include /* access */ +#include + +#define VERY_HIGH_VAL 1000000U + +static char const setting_napi_defer_hard_irqs[] = "napi_defer_hard_irqs"; + +static char const setting_gro_flush_timeout[] = "gro_flush_timeout"; + +static int +enabled( config_t const * config ) { + return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); +} + +static void +init_perm ( fd_cap_chk_t * chk, + config_t const * config FD_PARAM_UNUSED ) { + fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); +} + +static void +sysfs_net_set( char const * device, + char const * setting, + ulong value ) { + char path[ PATH_MAX ]; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); + FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); + fd_file_util_write_uint( path, (uint)value ); +} + +static void +init( config_t const * config ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); +} + +static int +fini( config_t const * config, + int pre_init FD_PARAM_UNUSED ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, 0U ); + return 1; +} + +static configure_result_t +check( config_t const * config, + int check_type FD_PARAM_UNUSED ) { + char path[ PATH_MAX ]; + uint value; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); + if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { + NOT_CONFIGURED("Setting napi_defer_hard_irqs 
failed."); + } + + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); + if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { + NOT_CONFIGURED("Setting gro_flush_timeout failed."); + } + + CONFIGURE_OK(); +} + +configure_stage_t fd_cfg_stage_sysfs_poll = { + .name = NAME, + .enabled = enabled, + .init_perm = init_perm, + .fini_perm = init_perm, + .init = init, + .fini = fini, + .check = check, +}; diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index 37a2b1dd154..43a38d53cca 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -201,6 +201,12 @@ struct fd_config_net { struct { char xdp_mode[ 8 ]; int xdp_zero_copy; + + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + uint lwr_prefbusy_poll_timeout_micros; + uint upr_prefbusy_poll_timeout_micros; uint xdp_rx_queue_size; uint xdp_tx_queue_size; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index 61efd89df09..331cda0b3a9 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -183,6 +183,11 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( uint, net.ingress_buffer_size ); CFG_POP ( cstr, net.xdp.xdp_mode ); CFG_POP ( bool, net.xdp.xdp_zero_copy ); + CFG_POP ( cstr, net.xdp.poll_mode ); + CFG_POP ( uint, net.xdp.busy_poll_usecs ); + CFG_POP ( ulong, net.xdp.gro_flush_timeout_nanos ); + CFG_POP ( uint, net.xdp.lwr_prefbusy_poll_timeout_micros ); + CFG_POP ( uint, net.xdp.upr_prefbusy_poll_timeout_micros ); CFG_POP ( uint, net.xdp.xdp_rx_queue_size ); CFG_POP ( uint, net.xdp.xdp_tx_queue_size ); CFG_POP ( uint, net.xdp.flush_timeout_micros ); diff --git a/src/app/shared_dev/commands/pktgen/pktgen.c b/src/app/shared_dev/commands/pktgen/pktgen.c index 944201ce6f1..b076917c62c 100644 --- a/src/app/shared_dev/commands/pktgen/pktgen.c +++ 
b/src/app/shared_dev/commands/pktgen/pktgen.c @@ -208,6 +208,7 @@ pktgen_cmd_fn( args_t * args FD_PARAM_UNUSED, configure_stage( &fd_cfg_stage_bonding, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/app/shared_dev/commands/udpecho/udpecho.c b/src/app/shared_dev/commands/udpecho/udpecho.c index 725263d8024..8b8bebf7273 100644 --- a/src/app/shared_dev/commands/udpecho/udpecho.c +++ b/src/app/shared_dev/commands/udpecho/udpecho.c @@ -100,6 +100,7 @@ udpecho_cmd_fn( args_t * args, configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_loopback, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/disco/net/fd_net_tile_topo.c b/src/disco/net/fd_net_tile_topo.c index c48ee1eda60..e0ba64735c7 100644 --- a/src/disco/net/fd_net_tile_topo.c +++ b/src/disco/net/fd_net_tile_topo.c @@ -44,6 +44,14 @@ setup_xdp_tile( fd_topo_t * topo, tile->xdp.zero_copy = net_cfg->xdp.xdp_zero_copy; fd_cstr_ncpy( tile->xdp.xdp_mode, net_cfg->xdp.xdp_mode, sizeof(tile->xdp.xdp_mode) ); + fd_cstr_ncpy( tile->xdp.poll_mode, net_cfg->xdp.poll_mode, sizeof(tile->xdp.poll_mode) ); + + tile->xdp.busy_poll_usecs = net_cfg->xdp.busy_poll_usecs; + tile->xdp.gro_flush_timeout_nanos = net_cfg->xdp.gro_flush_timeout_nanos; + + tile->xdp.lwr_prefbusy_poll_timeout_ns = (long)net_cfg->xdp.lwr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.upr_prefbusy_poll_timeout_ns = 
(long)net_cfg->xdp.upr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.net.umem_dcache_obj_id = umem_obj->id; tile->xdp.netdev_dbl_buf_obj_id = netlink_tile->netlink.netdev_dbl_buf_obj_id; tile->xdp.fib4_main_obj_id = netlink_tile->netlink.fib4_main_obj_id; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index e2cedba5868..caa02d3f58b 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -182,6 +182,12 @@ struct fd_topo_tile { char xdp_mode[8]; int zero_copy; + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + long lwr_prefbusy_poll_timeout_ns; + long upr_prefbusy_poll_timeout_ns; + ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */ ulong fib4_main_obj_id; /* fib4 containing main route table */ ulong fib4_local_obj_id; /* fib4 containing local route table */ diff --git a/src/waltz/xdp/fd_xsk.h b/src/waltz/xdp/fd_xsk.h index 55cfc1a4774..22d5a3e0f22 100644 --- a/src/waltz/xdp/fd_xsk.h +++ b/src/waltz/xdp/fd_xsk.h @@ -187,8 +187,8 @@ fd_xdp_ring_full( fd_xdp_ring_t * ring ) { return ring->cached_prod - ring->cached_cons >= ring->depth; } -/* fd_xsk_params_t: Memory layout parameters of XSK. - Can be retrieved using fd_xsk_get_params() */ +/* fd_xsk_params_t: XSK poll configuration and memory layout + parameters. Can be retrieved using fd_xsk_get_params() */ struct fd_xsk_params { /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX, @@ -217,6 +217,15 @@ struct fd_xsk_params { /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */ uint bind_flags; + char * poll_mode; + + /* max time waiting for work during prefbusy napi poll. */ + uint busy_poll_usecs; + + /* max time linux waits for userspace to poll napi before + calling a softirq. 
*/ + ulong gro_flush_timeout_nanos; + /* whether the xsk memory should be included in core dumps */ int core_dump; }; @@ -236,6 +245,13 @@ struct fd_xsk { /* AF_XDP socket file descriptor */ int xsk_fd; + /* Whether preferred busy polling was successfully enabled + during XSK socket setup. */ + int prefbusy_poll_enabled; + + /* napi_id: ID of this specific NAPI instance */ + uint napi_id; + /* ring_{rx,tx,fr,cr}: XSK ring descriptors */ fd_xdp_ring_t ring_rx; From f2d92073c47f0bc0ae709a6325734d7101cc7d11 Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 19:55:23 +0000 Subject: [PATCH 03/12] XSK socket configuration for prefbusy polling if requested --- src/waltz/xdp/fd_xsk.c | 99 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index 89efba4a737..5347c28475d 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -7,6 +7,7 @@ #include #include /* snprintf */ #include +#include #include /* mmap */ #include #include /* sendto */ @@ -15,6 +16,24 @@ #include "../../util/log/fd_log.h" #include "fd_xsk.h" +/* Support for older kernels */ + +#ifndef SO_BUSY_POLL +#define SO_BUSY_POLL 46 +#endif + +#ifndef SO_INCOMING_NAPI_ID +#define SO_INCOMING_NAPI_ID 56 +#endif + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + /* Join/leave *********************************************************/ /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset @@ -187,6 +206,60 @@ fd_xsk_setup_umem( fd_xsk_t * xsk, return 0; } +/* fd_xsk_setup_poll: Setup preferred busy polling if the user has + set that to be their preferred polling method */ + +static void +fd_xsk_setup_poll( fd_xsk_t * xsk, + fd_xsk_params_t const * params ) { + xsk->prefbusy_poll_enabled = 0; + if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; + + /* Configure socket options for preferred busy 
polling */ + + int prefbusy_poll = 1; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { + int err = errno; + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); + if( err==EINVAL ) { + FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); + } + return; + } + + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", + params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); + return; + } + + /* The greater busy_poll_budget is, the greater the bias towards max RX pps + over max TX pps in a max network load scenario. */ + uint busy_poll_budget = 64U; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", + busy_poll_budget, errno, fd_io_strerror( errno ) )); + return; + } + + /* Set socket non blocking */ + + int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); + if( FD_UNLIKELY( sk_flags == -1 ) ) { + FD_LOG_WARNING(( "fcntl(xsk->xsf_fd, F_GETFL, 0) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + + /* Successfully finished setting up prefbusy polling */ + xsk->prefbusy_poll_enabled = 1U; +} + /* fd_xsk_init: Creates and configures an XSK socket object, and attaches to a preinstalled XDP program. The various steps are implemented in fd_xsk_setup_{...}. 
*/ @@ -289,6 +362,32 @@ fd_xsk_init( fd_xsk_t * xsk, FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success", xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags )); + /* Check if the XSK is aware of the driver's NAPI ID for the + associated RX queue. Without it, preferred busy polling is not + going to work correctly. Note it's not always associated straight + away so xsk->napi_id can sometimes be set to 0 when it shouldn't be. + This is not an issue currently as the napi_id is not used yet. */ + + socklen_t napi_id_sz = sizeof(uint); + if( FD_UNLIKELY( 0!=getsockopt( xsk->xsk_fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &xsk->napi_id, &napi_id_sz ) ) ) { + if( errno==ENOPROTOOPT ) { + xsk->napi_id = 0; + } else { + FD_LOG_WARNING(( "getsockopt(SOL_SOCKET,SO_INCOMING_NAPI_ID) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + goto fail; + } + } + + if( xsk->napi_id ) { + FD_LOG_DEBUG(( "Interface %u Queue %u has NAPI ID %u", xsk->if_idx, xsk->if_queue_id, xsk->napi_id )); + } else { + FD_LOG_DEBUG(( "Interface %u Queue %u has unknown NAPI ID", xsk->if_idx, xsk->if_queue_id )); + } + + /* If requested, enable preferred busy polling */ + + fd_xsk_setup_poll( xsk, params ); + return xsk; fail: From 842c9f9c8aca59a6183488e74b154c1cce937259 Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 20:38:11 +0000 Subject: [PATCH 04/12] prefbusy poll mode runtime code --- src/disco/net/xdp/fd_xdp_tile.c | 133 +++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 10 deletions(-) diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index 400f13ad166..c8f67133b68 100644 --- a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -95,10 +95,13 @@ struct fd_net_flusher { wakeup. This can result in the tail of a burst getting delayed or overrun. 
If more than tail_flush_backoff ticks pass since the last sendto() wakeup and there are still unacknowledged packets in the - TX ring, issues another wakeup. */ + TX ring, issues another wakeup. Only used by "softirq" poll mode. */ long next_tail_flush_ticks; long tail_flush_backoff; + long last_prefbusy_poll_ticks; + long lwr_prefbusy_poll_ticks; + long upr_prefbusy_poll_ticks; }; typedef struct fd_net_flusher fd_net_flusher_t; @@ -213,6 +216,10 @@ typedef struct { ushort repair_serve_listen_port; ushort txsend_src_port; + char poll_mode[ 16 ]; + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + ulong in_cnt; fd_net_in_ctx_t in[ MAX_NET_INS ]; @@ -1115,6 +1122,105 @@ net_rx_event( fd_net_ctx_t * ctx, fill_ring->cached_prod = fill_prod+1U; } +/* before_credit_softirq is called every loop iteration if net tile + is in softirq polling mode (fallback if prefbusy polling mode + is not possible). */ + +static void +before_credit_softirq( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); + + /* Fire RX event if we have RX desc avail */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } else { + net_rx_wakeup( ctx, rr_xsk, charge_busy ); + + /* Iterate onto the next NAPI queue. 
*/ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + } + + +} + +static int +net_prefbusy_poll_ready( fd_xsk_t * rr_xsk, + fd_net_flusher_t * flusher ) { + + if( FD_UNLIKELY( fd_tickcount() < ( flusher->last_prefbusy_poll_ticks + flusher->lwr_prefbusy_poll_ticks ) ) ) return 0; + if( FD_UNLIKELY( fd_tickcount() > ( flusher->last_prefbusy_poll_ticks + flusher->upr_prefbusy_poll_ticks ) ) ) return 1; + + if( FD_UNLIKELY( fd_xdp_ring_empty( &rr_xsk->ring_tx, FD_XDP_RING_ROLE_PROD ) ) ) { + flusher->pending_cnt = 0UL; + } + + int rx_empty = fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ); + + return rx_empty; +} + +static void +net_prefbusy_poll_flush( fd_net_flusher_t * flusher, + long now ) { + flusher->pending_cnt = 0UL; + flusher->last_prefbusy_poll_ticks = now; +} + +/* before_credit_prefbusy is called every loop iteration if net + tile is in preferred busy (often referred to as "prefbusy" in + Firedancer) polling mode. */ + +static void +before_credit_prefbusy( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + fd_net_flusher_t * flusher = ctx->tx_flusher+rr_idx; + if( FD_UNLIKELY( net_prefbusy_poll_ready( rr_xsk, flusher ) ) ) { + /* NAPI needs to be polled to process new TX from + Firedancer's net tile and process new RX from the NIC. 
*/ + + FD_VOLATILE( *rr_xsk->ring_tx.prod ) = rr_xsk->ring_tx.cached_prod; /* write-back local copies to fseqs */ + FD_VOLATILE( *rr_xsk->ring_cr.cons ) = rr_xsk->ring_cr.cached_cons; + FD_VOLATILE( *rr_xsk->ring_rx.cons ) = rr_xsk->ring_rx.cached_cons; + FD_VOLATILE( *rr_xsk->ring_fr.prod ) = rr_xsk->ring_fr.cached_prod; + + if( FD_UNLIKELY( -1==sendto( rr_xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) { + if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) { + FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + } + if( FD_UNLIKELY( errno!=EAGAIN ) ) { + long ts = fd_log_wallclock(); + if( ts > rr_xsk->log_suppress_until_ns ) { + FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + rr_xsk->log_suppress_until_ns = ts + (long)1e9; + } + } + } + net_prefbusy_poll_flush( flusher, fd_tickcount() ); + + /* Since xsk sendmsg in prefbusy mode drives both rx and tx, both are incremented */ + ctx->metrics.xsk_tx_wakeup_cnt++; + ctx->metrics.xsk_rx_wakeup_cnt++; + } + + /* Process new RX from kernel driver if there is any. */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } + /* Iterate onto the next NAPI queue. */ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); +} + /* before_credit is called every loop iteration. 
*/ static void @@ -1141,16 +1247,11 @@ before_credit( fd_net_ctx_t * ctx, uint rr_idx = ctx->rr_idx; fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ]; - net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); - - /* Fire RX event if we have RX desc avail */ - if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { - *charge_busy = 1; - net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + if( FD_LIKELY( rr_xsk->prefbusy_poll_enabled ) ) { + before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); } else { - net_rx_wakeup( ctx, rr_xsk, charge_busy ); - ctx->rr_idx++; - ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + /* Fallback poll mode which relies on linux irqs and wakeups */ + before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); } /* Fire comp event if we have comp desc avail */ @@ -1281,6 +1382,10 @@ privileged_init( fd_topo_t * topo, (e.g. 5.14.0-503.23.1.el9_5 with i40e) */ .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY, + .poll_mode = tile->xdp.poll_mode, + .busy_poll_usecs = tile->xdp.busy_poll_usecs, + .gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos, + .fr_depth = tile->xdp.xdp_rx_queue_size*2, .rx_depth = tile->xdp.xdp_rx_queue_size, .cr_depth = tile->xdp.xdp_tx_queue_size, @@ -1415,6 +1520,10 @@ unprivileged_init( fd_topo_t * topo, ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port; ctx->txsend_src_port = tile->net.txsend_src_port; + strcpy( ctx->poll_mode, tile->xdp.poll_mode ); + ctx->busy_poll_usecs = tile->xdp.busy_poll_usecs; + ctx->gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos; + /* Put a bound on chunks we read from the input, to make sure they are within in the data region of the workspace. 
*/ @@ -1495,6 +1604,10 @@ unprivileged_init( fd_topo_t * topo, ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 ); ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX; + + ctx->tx_flusher[ j ].last_prefbusy_poll_ticks = 0UL; + ctx->tx_flusher[ j ].lwr_prefbusy_poll_ticks = (long)( (double)tile->xdp.lwr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); + ctx->tx_flusher[ j ].upr_prefbusy_poll_ticks = (long)( (double)tile->xdp.upr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); } /* Join netbase objects */ From 6df64dcb4504a6e473c21f7bc1130a98fab55185 Mon Sep 17 00:00:00 2001 From: Tristan <91004717+tristan-carter@users.noreply.github.com> Date: Tue, 24 Feb 2026 23:17:33 +0000 Subject: [PATCH 05/12] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/waltz/xdp/fd_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index 5347c28475d..c31e892a670 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -246,7 +246,7 @@ fd_xsk_setup_poll( fd_xsk_t * xsk, int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); if( FD_UNLIKELY( sk_flags == -1 ) ) { - FD_LOG_WARNING(( "fcntl(xsk->xsf_fd, F_GETFL, 0) failed (%i-%s)", + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", errno, fd_io_strerror( errno ) )); return; } From 58ccca6999244efbdce42a56069466a8b8378abc Mon Sep 17 00:00:00 2001 From: Tristan Date: Wed, 25 Feb 2026 19:05:44 +0000 Subject: [PATCH 06/12] Change indentation from 4 spaces to 2 --- .../shared/commands/configure/sysfs-poll.c | 64 +++++++-------- src/disco/net/xdp/fd_xdp_tile.c | 10 +-- src/waltz/xdp/fd_xsk.c | 80 +++++++++---------- 3 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/app/shared/commands/configure/sysfs-poll.c 
b/src/app/shared/commands/configure/sysfs-poll.c index ff2436ed9f1..20f87c45b5e 100644 --- a/src/app/shared/commands/configure/sysfs-poll.c +++ b/src/app/shared/commands/configure/sysfs-poll.c @@ -21,63 +21,63 @@ static char const setting_gro_flush_timeout[] = "gro_flush_timeout"; static int enabled( config_t const * config ) { - return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); + return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); } static void init_perm ( fd_cap_chk_t * chk, config_t const * config FD_PARAM_UNUSED ) { - fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); + fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); } static void sysfs_net_set( char const * device, char const * setting, - ulong value ) { - char path[ PATH_MAX ]; - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); - FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); - fd_file_util_write_uint( path, (uint)value ); + ulong value ) { + char path[ PATH_MAX ]; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); + FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); + fd_file_util_write_uint( path, (uint)value ); } static void init( config_t const * config ) { - sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); - sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); } static int fini( config_t const * config, int pre_init FD_PARAM_UNUSED ) { - sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); - sysfs_net_set( 
config->net.interface, setting_gro_flush_timeout, 0U ); - return 1; + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, 0U ); + return 1; } static configure_result_t check( config_t const * config, int check_type FD_PARAM_UNUSED ) { - char path[ PATH_MAX ]; - uint value; - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); - if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { - NOT_CONFIGURED("Setting napi_defer_hard_irqs failed."); - } - - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); - if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { - NOT_CONFIGURED("Setting gro_flush_timeout failed."); - } - - CONFIGURE_OK(); + char path[ PATH_MAX ]; + uint value; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); + if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { + NOT_CONFIGURED("Setting napi_defer_hard_irqs failed."); + } + + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); + if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { + NOT_CONFIGURED("Setting gro_flush_timeout failed."); + } + + CONFIGURE_OK(); } configure_stage_t fd_cfg_stage_sysfs_poll = { - .name = NAME, - .enabled = enabled, - .init_perm = init_perm, - .fini_perm = init_perm, - .init = init, - .fini = fini, - .check = check, + .name = NAME, + .enabled = enabled, + .init_perm = init_perm, + .fini_perm = init_perm, + .init = init, + .fini = fini, + .check = check, }; diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index c8f67133b68..72b5eecef97 100644 --- 
a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -1145,8 +1145,6 @@ before_credit_softirq( fd_net_ctx_t * ctx, ctx->rr_idx++; ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); } - - } static int @@ -1168,7 +1166,7 @@ net_prefbusy_poll_ready( fd_xsk_t * rr_xsk, static void net_prefbusy_poll_flush( fd_net_flusher_t * flusher, long now ) { - flusher->pending_cnt = 0UL; + flusher->pending_cnt = 0UL; flusher->last_prefbusy_poll_ticks = now; } @@ -1248,10 +1246,10 @@ before_credit( fd_net_ctx_t * ctx, fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ]; if( FD_LIKELY( rr_xsk->prefbusy_poll_enabled ) ) { - before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); + before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); } else { - /* Fallback poll mode which relies on linux irqs and wakeups */ - before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); + /* Fallback poll mode which relies on linux irqs and wakeups */ + before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); } /* Fire comp event if we have comp desc avail */ diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index c31e892a670..c1345c3597e 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -212,52 +212,52 @@ fd_xsk_setup_umem( fd_xsk_t * xsk, static void fd_xsk_setup_poll( fd_xsk_t * xsk, fd_xsk_params_t const * params ) { - xsk->prefbusy_poll_enabled = 0; - if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; - - /* Configure socket options for preferred busy polling */ - - int prefbusy_poll = 1; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { - int err = errno; - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); - if( err==EINVAL ) { - FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? 
SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); - } - return; - } + xsk->prefbusy_poll_enabled = 0; + if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", - params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); - return; - } + /* Configure socket options for preferred busy polling */ - /* The greater busy_poll_budget is, the greater the bias towards max RX pps - over max TX pps in a max network load scenario. */ - uint busy_poll_budget = 64U; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", - busy_poll_budget, errno, fd_io_strerror( errno ) )); - return; + int prefbusy_poll = 1; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { + int err = errno; + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); + if( err==EINVAL ) { + FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? 
SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); } + return; + } - /* Set socket non blocking */ + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", + params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); + return; + } - int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); - if( FD_UNLIKELY( sk_flags == -1 ) ) { - FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - return; - } - if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { - FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - return; - } + /* The greater busy_poll_budget is, the greater the bias towards max RX pps + over max TX pps in a max network load scenario. */ + uint busy_poll_budget = 64U; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", + busy_poll_budget, errno, fd_io_strerror( errno ) )); + return; + } + + /* Set socket non blocking */ + + int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); + if( FD_UNLIKELY( sk_flags == -1 ) ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } - /* Successfully finished setting up prefbusy polling */ - xsk->prefbusy_poll_enabled = 1U; + /* Successfully finished setting up prefbusy polling */ + xsk->prefbusy_poll_enabled = 1U; } /* fd_xsk_init: Creates and configures an XSK socket object, and From 
189e6ad9ef7a356ffc0fda0ab5a170520d1b77ac Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 19:44:20 +0000 Subject: [PATCH 07/12] prefbusy mode config options and sysfs linux config options --- src/app/fdctl/config/default.toml | 40 +++++++++ src/app/fdctl/main.c | 1 + src/app/fddev/main.h | 2 + src/app/firedancer/config/default.toml | 40 +++++++++ src/app/firedancer/main.c | 1 + src/app/shared/Local.mk | 1 + src/app/shared/commands/configure/configure.h | 1 + .../shared/commands/configure/sysfs-poll.c | 83 +++++++++++++++++++ src/app/shared/fd_config.h | 6 ++ src/app/shared/fd_config_parse.c | 5 ++ src/app/shared_dev/commands/pktgen/pktgen.c | 1 + src/app/shared_dev/commands/udpecho/udpecho.c | 1 + src/disco/net/fd_net_tile_topo.c | 8 ++ src/disco/topo/fd_topo.h | 6 ++ src/waltz/xdp/fd_xsk.h | 20 ++++- 15 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 src/app/shared/commands/configure/sysfs-poll.c diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index abe1306c7f4..0805b1923ca 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -983,6 +983,46 @@ dynamic_port_range = "8900-9000" # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. The fallback is "softirq" mode, + # which relies much more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a seperate + # ksoftirqd thread which linux creates and manages. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). A warning will be emitted if this fallback is made + # alongside suggestions of why this might have happened. 
+ poll_mode = "prefbusy" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario inwhich Firedancer is not given enough time in + # userspace to do work. + # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. + lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. 
# This setting defines the number of entries in these metadata diff --git a/src/app/fdctl/main.c b/src/app/fdctl/main.c index 0589b27490f..3ca58b82e47 100644 --- a/src/app/fdctl/main.c +++ b/src/app/fdctl/main.c @@ -38,6 +38,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_offloads, &fd_cfg_stage_ethtool_loopback, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/fddev/main.h b/src/app/fddev/main.h index 5028caf57e6..3f8beb20ba7 100644 --- a/src/app/fddev/main.h +++ b/src/app/fddev/main.h @@ -33,6 +33,7 @@ extern configure_stage_t fd_cfg_stage_kill; extern configure_stage_t fd_cfg_stage_genesis; extern configure_stage_t fd_cfg_stage_keys; extern configure_stage_t fd_cfg_stage_blockstore; +extern configure_stage_t fd_cfg_stage_sysfs_poll; configure_stage_t * STAGES[] = { &fd_cfg_stage_kill, @@ -46,6 +47,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_keys, &fd_cfg_stage_genesis, &fd_cfg_stage_blockstore, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index d6d1b8bb1ee..34344109643 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -1065,6 +1065,46 @@ telemetry = true # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. The fallback is "softirq" mode, + # which relies much more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a seperate + # ksoftirqd thread which linux creates and manages. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). 
A warning will be emitted if this fallback is made + # alongside suggestions of why this might have happened. + poll_mode = "prefbusy" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario inwhich Firedancer is not given enough time in + # userspace to do work. + # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. + lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. 
# This setting defines the number of entries in these metadata diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index 33e84f4bc74..1976e4498d7 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -74,6 +74,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_snapshots, &fd_cfg_stage_accdb, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/shared/Local.mk b/src/app/shared/Local.mk index 561be32d419..bc114679e03 100644 --- a/src/app/shared/Local.mk +++ b/src/app/shared/Local.mk @@ -31,6 +31,7 @@ $(call add-objs,commands/configure/fd_ethtool_ioctl,fdctl_shared) $(call add-objs,commands/configure/hugetlbfs,fdctl_shared) $(call add-objs,commands/configure/hyperthreads,fdctl_shared) $(call add-objs,commands/configure/sysctl,fdctl_shared) +$(call add-objs,commands/configure/sysfs-poll,fdctl_shared) $(call add-objs,commands/configure/snapshots,fdctl_shared) $(call add-objs,commands/monitor/monitor commands/monitor/helper,fdctl_shared) $(call add-objs,commands/watch/watch,fdctl_shared) diff --git a/src/app/shared/commands/configure/configure.h b/src/app/shared/commands/configure/configure.h index fc5eea3777e..713de669404 100644 --- a/src/app/shared/commands/configure/configure.h +++ b/src/app/shared/commands/configure/configure.h @@ -84,6 +84,7 @@ extern configure_stage_t fd_cfg_stage_bonding; extern configure_stage_t fd_cfg_stage_ethtool_channels; extern configure_stage_t fd_cfg_stage_ethtool_offloads; extern configure_stage_t fd_cfg_stage_ethtool_loopback; +extern configure_stage_t fd_cfg_stage_sysfs_poll; extern configure_stage_t fd_cfg_stage_snapshots; extern configure_stage_t * STAGES[]; diff --git a/src/app/shared/commands/configure/sysfs-poll.c b/src/app/shared/commands/configure/sysfs-poll.c new file mode 100644 index 00000000000..ff2436ed9f1 --- /dev/null +++ b/src/app/shared/commands/configure/sysfs-poll.c @@ -0,0 +1,83 @@ +/* This stage configures the OS to support effective preferred 
busy + polling, allowing for significantly improved network stack (XDP) + performance if enabled. */ + +#include "configure.h" + +#define NAME "sysfs-poll" + +#include "../../../platform/fd_file_util.h" + +#include +#include +#include /* access */ +#include + +#define VERY_HIGH_VAL 1000000U + +static char const setting_napi_defer_hard_irqs[] = "napi_defer_hard_irqs"; + +static char const setting_gro_flush_timeout[] = "gro_flush_timeout"; + +static int +enabled( config_t const * config ) { + return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); +} + +static void +init_perm ( fd_cap_chk_t * chk, + config_t const * config FD_PARAM_UNUSED ) { + fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); +} + +static void +sysfs_net_set( char const * device, + char const * setting, + ulong value ) { + char path[ PATH_MAX ]; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); + FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); + fd_file_util_write_uint( path, (uint)value ); +} + +static void +init( config_t const * config ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); +} + +static int +fini( config_t const * config, + int pre_init FD_PARAM_UNUSED ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, 0U ); + return 1; +} + +static configure_result_t +check( config_t const * config, + int check_type FD_PARAM_UNUSED ) { + char path[ PATH_MAX ]; + uint value; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); + if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { + NOT_CONFIGURED("Setting napi_defer_hard_irqs 
failed."); + } + + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); + if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { + NOT_CONFIGURED("Setting gro_flush_timeout failed."); + } + + CONFIGURE_OK(); +} + +configure_stage_t fd_cfg_stage_sysfs_poll = { + .name = NAME, + .enabled = enabled, + .init_perm = init_perm, + .fini_perm = init_perm, + .init = init, + .fini = fini, + .check = check, +}; diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index 50e2194d373..b7ab58062c3 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -205,6 +205,12 @@ struct fd_config_net { struct { char xdp_mode[ 8 ]; int xdp_zero_copy; + + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + uint lwr_prefbusy_poll_timeout_micros; + uint upr_prefbusy_poll_timeout_micros; uint xdp_rx_queue_size; uint xdp_tx_queue_size; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index 6881308c680..9ce24ba726c 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -184,6 +184,11 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( uint, net.ingress_buffer_size ); CFG_POP ( cstr, net.xdp.xdp_mode ); CFG_POP ( bool, net.xdp.xdp_zero_copy ); + CFG_POP ( cstr, net.xdp.poll_mode ); + CFG_POP ( uint, net.xdp.busy_poll_usecs ); + CFG_POP ( ulong, net.xdp.gro_flush_timeout_nanos ); + CFG_POP ( uint, net.xdp.lwr_prefbusy_poll_timeout_micros ); + CFG_POP ( uint, net.xdp.upr_prefbusy_poll_timeout_micros ); CFG_POP ( uint, net.xdp.xdp_rx_queue_size ); CFG_POP ( uint, net.xdp.xdp_tx_queue_size ); CFG_POP ( uint, net.xdp.flush_timeout_micros ); diff --git a/src/app/shared_dev/commands/pktgen/pktgen.c b/src/app/shared_dev/commands/pktgen/pktgen.c index 0360362ddba..6b191c41189 100644 --- a/src/app/shared_dev/commands/pktgen/pktgen.c +++ 
b/src/app/shared_dev/commands/pktgen/pktgen.c @@ -208,6 +208,7 @@ pktgen_cmd_fn( args_t * args FD_PARAM_UNUSED, configure_stage( &fd_cfg_stage_bonding, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/app/shared_dev/commands/udpecho/udpecho.c b/src/app/shared_dev/commands/udpecho/udpecho.c index 725263d8024..8b8bebf7273 100644 --- a/src/app/shared_dev/commands/udpecho/udpecho.c +++ b/src/app/shared_dev/commands/udpecho/udpecho.c @@ -100,6 +100,7 @@ udpecho_cmd_fn( args_t * args, configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_loopback, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/disco/net/fd_net_tile_topo.c b/src/disco/net/fd_net_tile_topo.c index c48ee1eda60..e0ba64735c7 100644 --- a/src/disco/net/fd_net_tile_topo.c +++ b/src/disco/net/fd_net_tile_topo.c @@ -44,6 +44,14 @@ setup_xdp_tile( fd_topo_t * topo, tile->xdp.zero_copy = net_cfg->xdp.xdp_zero_copy; fd_cstr_ncpy( tile->xdp.xdp_mode, net_cfg->xdp.xdp_mode, sizeof(tile->xdp.xdp_mode) ); + fd_cstr_ncpy( tile->xdp.poll_mode, net_cfg->xdp.poll_mode, sizeof(tile->xdp.poll_mode) ); + + tile->xdp.busy_poll_usecs = net_cfg->xdp.busy_poll_usecs; + tile->xdp.gro_flush_timeout_nanos = net_cfg->xdp.gro_flush_timeout_nanos; + + tile->xdp.lwr_prefbusy_poll_timeout_ns = (long)net_cfg->xdp.lwr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.upr_prefbusy_poll_timeout_ns = 
(long)net_cfg->xdp.upr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.net.umem_dcache_obj_id = umem_obj->id; tile->xdp.netdev_dbl_buf_obj_id = netlink_tile->netlink.netdev_dbl_buf_obj_id; tile->xdp.fib4_main_obj_id = netlink_tile->netlink.fib4_main_obj_id; diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index dcb433adb1d..2f9279bfca5 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -182,6 +182,12 @@ struct fd_topo_tile { char xdp_mode[8]; int zero_copy; + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + long lwr_prefbusy_poll_timeout_ns; + long upr_prefbusy_poll_timeout_ns; + ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */ ulong fib4_main_obj_id; /* fib4 containing main route table */ ulong fib4_local_obj_id; /* fib4 containing local route table */ diff --git a/src/waltz/xdp/fd_xsk.h b/src/waltz/xdp/fd_xsk.h index 55cfc1a4774..22d5a3e0f22 100644 --- a/src/waltz/xdp/fd_xsk.h +++ b/src/waltz/xdp/fd_xsk.h @@ -187,8 +187,8 @@ fd_xdp_ring_full( fd_xdp_ring_t * ring ) { return ring->cached_prod - ring->cached_cons >= ring->depth; } -/* fd_xsk_params_t: Memory layout parameters of XSK. - Can be retrieved using fd_xsk_get_params() */ +/* fd_xsk_params_t: XSK poll configuration and memory layout + parameters. Can be retrieved using fd_xsk_get_params() */ struct fd_xsk_params { /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX, @@ -217,6 +217,15 @@ struct fd_xsk_params { /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */ uint bind_flags; + char * poll_mode; + + /* max time waiting for work during prefbusy napi poll. */ + uint busy_poll_usecs; + + /* max time linux waits for userspace to poll napi before + calling a softirq. 
*/ + ulong gro_flush_timeout_nanos; + /* whether the xsk memory should be included in core dumps */ int core_dump; }; @@ -236,6 +245,13 @@ struct fd_xsk { /* AF_XDP socket file descriptor */ int xsk_fd; + /* Whether preferred busy polling was successfully enabled + during XSK socket setup. */ + int prefbusy_poll_enabled; + + /* napi_id: ID of this specific NAPI instance */ + uint napi_id; + /* ring_{rx,tx,fr,cr}: XSK ring descriptors */ fd_xdp_ring_t ring_rx; From fed91e11fcbf0aa0f5ed3e9b1b4ead5ab8f6d717 Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 19:55:23 +0000 Subject: [PATCH 08/12] XSK socket configuration for prefbusy polling if requested --- src/waltz/xdp/fd_xsk.c | 99 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index 89efba4a737..5347c28475d 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -7,6 +7,7 @@ #include #include /* snprintf */ #include +#include #include /* mmap */ #include #include /* sendto */ @@ -15,6 +16,24 @@ #include "../../util/log/fd_log.h" #include "fd_xsk.h" +/* Support for older kernels */ + +#ifndef SO_BUSY_POLL +#define SO_BUSY_POLL 46 +#endif + +#ifndef SO_INCOMING_NAPI_ID +#define SO_INCOMING_NAPI_ID 56 +#endif + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + /* Join/leave *********************************************************/ /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset @@ -187,6 +206,60 @@ fd_xsk_setup_umem( fd_xsk_t * xsk, return 0; } +/* fd_xsk_setup_poll: Setup preferred busy polling if the user has + set that to be their preferred polling method */ + +static void +fd_xsk_setup_poll( fd_xsk_t * xsk, + fd_xsk_params_t const * params ) { + xsk->prefbusy_poll_enabled = 0; + if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; + + /* Configure socket options for preferred busy 
polling */ + + int prefbusy_poll = 1; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { + int err = errno; + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); + if( err==EINVAL ) { + FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); + } + return; + } + + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", + params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); + return; + } + + /* The greater busy_poll_budget is, the greater the bias towards max RX pps + over max TX pps in a max network load scenario. */ + uint busy_poll_budget = 64U; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", + busy_poll_budget, errno, fd_io_strerror( errno ) )); + return; + } + + /* Set socket non blocking */ + + int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); + if( FD_UNLIKELY( sk_flags == -1 ) ) { + FD_LOG_WARNING(( "fcntl(xsk->xsf_fd, F_GETFL, 0) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + + /* Successfully finished setting up prefbusy polling */ + xsk->prefbusy_poll_enabled = 1U; +} + /* fd_xsk_init: Creates and configures an XSK socket object, and attaches to a preinstalled XDP program. The various steps are implemented in fd_xsk_setup_{...}. 
*/ @@ -289,6 +362,32 @@ fd_xsk_init( fd_xsk_t * xsk, FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success", xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags )); + /* Check if the XSK is aware of the driver's NAPI ID for the + associated RX queue. Without it, preferred busy polling is not + going to work correctly. Note it's not always associated straight + away so xsk->napi_id can sometimes be set to 0 when it shouldn't be. + This is not an issue currently as the napi_id is not used yet. */ + + socklen_t napi_id_sz = sizeof(uint); + if( FD_UNLIKELY( 0!=getsockopt( xsk->xsk_fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &xsk->napi_id, &napi_id_sz ) ) ) { + if( errno==ENOPROTOOPT ) { + xsk->napi_id = 0; + } else { + FD_LOG_WARNING(( "getsockopt(SOL_SOCKET,SO_INCOMING_NAPI_ID) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + goto fail; + } + } + + if( xsk->napi_id ) { + FD_LOG_DEBUG(( "Interface %u Queue %u has NAPI ID %u", xsk->if_idx, xsk->if_queue_id, xsk->napi_id )); + } else { + FD_LOG_DEBUG(( "Interface %u Queue %u has unknown NAPI ID", xsk->if_idx, xsk->if_queue_id )); + } + + /* If requested, enable preferred busy polling */ + + fd_xsk_setup_poll( xsk, params ); + return xsk; fail: From c48a572890f85b7dac6ba03f05b88ecbef59cd7d Mon Sep 17 00:00:00 2001 From: Tristan Date: Tue, 24 Feb 2026 20:38:11 +0000 Subject: [PATCH 09/12] prefbusy poll mode runtime code --- src/disco/net/xdp/fd_xdp_tile.c | 133 +++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 10 deletions(-) diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index c6955ce5972..8fcdd72d989 100644 --- a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -95,10 +95,13 @@ struct fd_net_flusher { wakeup. This can result in the tail of a burst getting delayed or overrun. 
If more than tail_flush_backoff ticks pass since the last sendto() wakeup and there are still unacknowledged packets in the - TX ring, issues another wakeup. */ + TX ring, issues another wakeup. Only used by "softirq" poll mode. */ long next_tail_flush_ticks; long tail_flush_backoff; + long last_prefbusy_poll_ticks; + long lwr_prefbusy_poll_ticks; + long upr_prefbusy_poll_ticks; }; typedef struct fd_net_flusher fd_net_flusher_t; @@ -213,6 +216,10 @@ typedef struct { ushort repair_serve_listen_port; ushort txsend_src_port; + char poll_mode[ 16 ]; + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + ulong in_cnt; fd_net_in_ctx_t in[ MAX_NET_INS ]; @@ -1122,6 +1129,105 @@ net_rx_event( fd_net_ctx_t * ctx, fill_ring->cached_prod = fill_prod+1U; } +/* before_credit_softirq is called every loop iteration if net tile + is in softirq polling mode (fallback if prefbusy polling mode + is not possible). */ + +static void +before_credit_softirq( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); + + /* Fire RX event if we have RX desc avail */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } else { + net_rx_wakeup( ctx, rr_xsk, charge_busy ); + + /* Iterate onto the next NAPI queue. 
*/ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + } + + +} + +static int +net_prefbusy_poll_ready( fd_xsk_t * rr_xsk, + fd_net_flusher_t * flusher ) { + + if( FD_UNLIKELY( fd_tickcount() < ( flusher->last_prefbusy_poll_ticks + flusher->lwr_prefbusy_poll_ticks ) ) ) return 0; + if( FD_UNLIKELY( fd_tickcount() > ( flusher->last_prefbusy_poll_ticks + flusher->upr_prefbusy_poll_ticks ) ) ) return 1; + + if( FD_UNLIKELY( fd_xdp_ring_empty( &rr_xsk->ring_tx, FD_XDP_RING_ROLE_PROD ) ) ) { + flusher->pending_cnt = 0UL; + } + + int rx_empty = fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ); + + return rx_empty; +} + +static void +net_prefbusy_poll_flush( fd_net_flusher_t * flusher, + long now ) { + flusher->pending_cnt = 0UL; + flusher->last_prefbusy_poll_ticks = now; +} + +/* before_credit_prefbusy is called every loop iteration if net + tile is in preferred busy (often referred to as "prefbusy" in + Firedancer) polling mode. */ + +static void +before_credit_prefbusy( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + fd_net_flusher_t * flusher = ctx->tx_flusher+rr_idx; + if( FD_UNLIKELY( net_prefbusy_poll_ready( rr_xsk, flusher ) ) ) { + /* NAPI needs to be polled to process new TX from + Firedancer's net tile and process new RX from the NIC. 
*/ + + FD_VOLATILE( *rr_xsk->ring_tx.prod ) = rr_xsk->ring_tx.cached_prod; /* write-back local copies to fseqs */ + FD_VOLATILE( *rr_xsk->ring_cr.cons ) = rr_xsk->ring_cr.cached_cons; + FD_VOLATILE( *rr_xsk->ring_rx.cons ) = rr_xsk->ring_rx.cached_cons; + FD_VOLATILE( *rr_xsk->ring_fr.prod ) = rr_xsk->ring_fr.cached_prod; + + if( FD_UNLIKELY( -1==sendto( rr_xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) { + if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) { + FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + } + if( FD_UNLIKELY( errno!=EAGAIN ) ) { + long ts = fd_log_wallclock(); + if( ts > rr_xsk->log_suppress_until_ns ) { + FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + rr_xsk->log_suppress_until_ns = ts + (long)1e9; + } + } + } + net_prefbusy_poll_flush( flusher, fd_tickcount() ); + + /* Since xsk sendmsg in prefbusy mode drives both rx and tx, both are incremented */ + ctx->metrics.xsk_tx_wakeup_cnt++; + ctx->metrics.xsk_rx_wakeup_cnt++; + } + + /* Process new RX from kernel driver if there is any. */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } + /* Iterate onto the next NAPI queue. */ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); +} + /* before_credit is called every loop iteration. 
*/ static void @@ -1148,16 +1254,11 @@ before_credit( fd_net_ctx_t * ctx, uint rr_idx = ctx->rr_idx; fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ]; - net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); - - /* Fire RX event if we have RX desc avail */ - if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { - *charge_busy = 1; - net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + if( FD_LIKELY( rr_xsk->prefbusy_poll_enabled ) ) { + before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); } else { - net_rx_wakeup( ctx, rr_xsk, charge_busy ); - ctx->rr_idx++; - ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + /* Fallback poll mode which relies on linux irqs and wakeups */ + before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); } /* Fire comp event if we have comp desc avail */ @@ -1288,6 +1389,10 @@ privileged_init( fd_topo_t * topo, (e.g. 5.14.0-503.23.1.el9_5 with i40e) */ .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY, + .poll_mode = tile->xdp.poll_mode, + .busy_poll_usecs = tile->xdp.busy_poll_usecs, + .gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos, + .fr_depth = tile->xdp.xdp_rx_queue_size*2, .rx_depth = tile->xdp.xdp_rx_queue_size, .cr_depth = tile->xdp.xdp_tx_queue_size, @@ -1422,6 +1527,10 @@ unprivileged_init( fd_topo_t * topo, ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port; ctx->txsend_src_port = tile->net.txsend_src_port; + strcpy( ctx->poll_mode, tile->xdp.poll_mode ); + ctx->busy_poll_usecs = tile->xdp.busy_poll_usecs; + ctx->gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos; + /* Put a bound on chunks we read from the input, to make sure they are within in the data region of the workspace. 
*/ @@ -1502,6 +1611,10 @@ unprivileged_init( fd_topo_t * topo, ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 ); ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX; + + ctx->tx_flusher[ j ].last_prefbusy_poll_ticks = 0UL; + ctx->tx_flusher[ j ].lwr_prefbusy_poll_ticks = (long)( (double)tile->xdp.lwr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); + ctx->tx_flusher[ j ].upr_prefbusy_poll_ticks = (long)( (double)tile->xdp.upr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); } /* Join netbase objects */ From 11fc180893197592456eb433d42f00e74e56f796 Mon Sep 17 00:00:00 2001 From: Tristan <91004717+tristan-carter@users.noreply.github.com> Date: Tue, 24 Feb 2026 23:17:33 +0000 Subject: [PATCH 10/12] Fix typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/waltz/xdp/fd_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index 5347c28475d..c31e892a670 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -246,7 +246,7 @@ fd_xsk_setup_poll( fd_xsk_t * xsk, int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); if( FD_UNLIKELY( sk_flags == -1 ) ) { - FD_LOG_WARNING(( "fcntl(xsk->xsf_fd, F_GETFL, 0) failed (%i-%s)", + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", errno, fd_io_strerror( errno ) )); return; } From ef976127d69a775688b41bd1a622d54534ec0579 Mon Sep 17 00:00:00 2001 From: Tristan Date: Wed, 25 Feb 2026 19:05:44 +0000 Subject: [PATCH 11/12] Change indentation from 4 spaces to 2 --- .../shared/commands/configure/sysfs-poll.c | 64 +++++++-------- src/disco/net/xdp/fd_xdp_tile.c | 10 +-- src/waltz/xdp/fd_xsk.c | 80 +++++++++---------- 3 files changed, 76 insertions(+), 78 deletions(-) diff --git a/src/app/shared/commands/configure/sysfs-poll.c 
b/src/app/shared/commands/configure/sysfs-poll.c index ff2436ed9f1..20f87c45b5e 100644 --- a/src/app/shared/commands/configure/sysfs-poll.c +++ b/src/app/shared/commands/configure/sysfs-poll.c @@ -21,63 +21,63 @@ static char const setting_gro_flush_timeout[] = "gro_flush_timeout"; static int enabled( config_t const * config ) { - return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); + return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); } static void init_perm ( fd_cap_chk_t * chk, config_t const * config FD_PARAM_UNUSED ) { - fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); + fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); } static void sysfs_net_set( char const * device, char const * setting, - ulong value ) { - char path[ PATH_MAX ]; - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); - FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); - fd_file_util_write_uint( path, (uint)value ); + ulong value ) { + char path[ PATH_MAX ]; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); + FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); + fd_file_util_write_uint( path, (uint)value ); } static void init( config_t const * config ) { - sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); - sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); } static int fini( config_t const * config, int pre_init FD_PARAM_UNUSED ) { - sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); - sysfs_net_set( 
config->net.interface, setting_gro_flush_timeout, 0U ); - return 1; + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 0U ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, 0U ); + return 1; } static configure_result_t check( config_t const * config, int check_type FD_PARAM_UNUSED ) { - char path[ PATH_MAX ]; - uint value; - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); - if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { - NOT_CONFIGURED("Setting napi_defer_hard_irqs failed."); - } - - fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); - if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { - NOT_CONFIGURED("Setting gro_flush_timeout failed."); - } - - CONFIGURE_OK(); + char path[ PATH_MAX ]; + uint value; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); + if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { + NOT_CONFIGURED("Setting napi_defer_hard_irqs failed."); + } + + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); + if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { + NOT_CONFIGURED("Setting gro_flush_timeout failed."); + } + + CONFIGURE_OK(); } configure_stage_t fd_cfg_stage_sysfs_poll = { - .name = NAME, - .enabled = enabled, - .init_perm = init_perm, - .fini_perm = init_perm, - .init = init, - .fini = fini, - .check = check, + .name = NAME, + .enabled = enabled, + .init_perm = init_perm, + .fini_perm = init_perm, + .init = init, + .fini = fini, + .check = check, }; diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index 8fcdd72d989..1e8c37c08c1 100644 --- 
a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -1152,8 +1152,6 @@ before_credit_softirq( fd_net_ctx_t * ctx, ctx->rr_idx++; ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); } - - } static int @@ -1175,7 +1173,7 @@ net_prefbusy_poll_ready( fd_xsk_t * rr_xsk, static void net_prefbusy_poll_flush( fd_net_flusher_t * flusher, long now ) { - flusher->pending_cnt = 0UL; + flusher->pending_cnt = 0UL; flusher->last_prefbusy_poll_ticks = now; } @@ -1255,10 +1253,10 @@ before_credit( fd_net_ctx_t * ctx, fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ]; if( FD_LIKELY( rr_xsk->prefbusy_poll_enabled ) ) { - before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); + before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); } else { - /* Fallback poll mode which relies on linux irqs and wakeups */ - before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); + /* Fallback poll mode which relies on linux irqs and wakeups */ + before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); } /* Fire comp event if we have comp desc avail */ diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index c31e892a670..c1345c3597e 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -212,52 +212,52 @@ fd_xsk_setup_umem( fd_xsk_t * xsk, static void fd_xsk_setup_poll( fd_xsk_t * xsk, fd_xsk_params_t const * params ) { - xsk->prefbusy_poll_enabled = 0; - if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; - - /* Configure socket options for preferred busy polling */ - - int prefbusy_poll = 1; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { - int err = errno; - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); - if( err==EINVAL ) { - FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? 
SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); - } - return; - } + xsk->prefbusy_poll_enabled = 0; + if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", - params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); - return; - } + /* Configure socket options for preferred busy polling */ - /* The greater busy_poll_budget is, the greater the bias towards max RX pps - over max TX pps in a max network load scenario. */ - uint busy_poll_budget = 64U; - if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { - FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", - busy_poll_budget, errno, fd_io_strerror( errno ) )); - return; + int prefbusy_poll = 1; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { + int err = errno; + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); + if( err==EINVAL ) { + FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? 
SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); } + return; + } - /* Set socket non blocking */ + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, ¶ms->busy_poll_usecs, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", + params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); + return; + } - int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); - if( FD_UNLIKELY( sk_flags == -1 ) ) { - FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - return; - } - if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { - FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", - errno, fd_io_strerror( errno ) )); - return; - } + /* The greater busy_poll_budget is, the greater the bias towards max RX pps + over max TX pps in a max network load scenario. */ + uint busy_poll_budget = 64U; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", + busy_poll_budget, errno, fd_io_strerror( errno ) )); + return; + } + + /* Set socket non blocking */ + + int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); + if( FD_UNLIKELY( sk_flags == -1 ) ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } - /* Successfully finished setting up prefbusy polling */ - xsk->prefbusy_poll_enabled = 1U; + /* Successfully finished setting up prefbusy polling */ + xsk->prefbusy_poll_enabled = 1U; } /* fd_xsk_init: Creates and configures an XSK socket object, and From 
d5c0a991c53c9c712bc13bd561edd418f5b85912 Mon Sep 17 00:00:00 2001 From: Tristan Date: Sun, 1 Mar 2026 20:04:19 +0000 Subject: [PATCH 12/12] Make softirq poll mode the default --- src/app/fdctl/config/default.toml | 14 ++++++++++---- src/app/firedancer/config/default.toml | 14 ++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index 0805b1923ca..0209cbad67f 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -986,17 +986,23 @@ dynamic_port_range = "8900-9000" # This option moves the management of napi including # when to poll as well as the poll budget, into userspace # if in "prefbusy" mode. The fallback is "softirq" mode, - # which relies much more on linux to manage napi, through + # which relies significantly more on linux to manage napi, through # wakeups, softirqs and under higher network load, a seperate # ksoftirqd thread which linux creates and manages. # + # Please note that even in SKB mode or copy mode, "prefbusy" + # poll mode should work and be effective. + # + # "prefbusy" mode is the recommended choice of mode, # as this will also automatically fallback to "softirq" # mode if preferred busy polling is not available or # the right choice for whatever reason (e.g. on an older - # kernel). A warning will be emitted if this fallback is made - # alongside suggestions of why this might have happened. - poll_mode = "prefbusy" + # kernel). A warning will be emitted if this fallback is made. + # + # On Intel's 100Gbps NIC ice driver it is recommended to use + # "softirq" mode due to it not being able to support "prefbusy" + # mode, however on Mellanox's mlx5 it's well supported.
+ poll_mode = "softirq" # AF_XDP socket configuration options which will eventually # be moved to being fixed constants prior to the merge of diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index 34344109643..8347cb899e9 100644 --- a/src/app/firedancer/config/default.toml +++ b/src/app/firedancer/config/default.toml @@ -1068,17 +1068,23 @@ telemetry = true # This option moves the management of napi including # when to poll as well as the poll budget, into userspace # if in "prefbusy" mode. The fallback is "softirq" mode, - # which relies much more on linux to manage napi, through + # which relies significantly more on linux to manage napi, through # wakeups, softirqs and under higher network load, a seperate # ksoftirqd thread which linux creates and manages. # + # Please note that even in SKB mode or copy mode, "prefbusy" + # poll mode should work and be effective. + # + # "prefbusy" mode is the recommended choice of mode, # as this will also automatically fallback to "softirq" # mode if preferred busy polling is not available or # the right choice for whatever reason (e.g. on an older - # kernel). A warning will be emitted if this fallback is made - # alongside suggestions of why this might have happened. - poll_mode = "prefbusy" + # kernel). A warning will be emitted if this fallback is made. + # + # On Intel's 100Gbps NIC ice driver it is recommended to use + # "softirq" mode due to it not being able to support "prefbusy" + # mode, however on Mellanox's mlx5 it's well supported. + poll_mode = "softirq" # AF_XDP socket configuration options which will eventually # be moved to being fixed constants prior to the merge of