diff --git a/agave b/agave index 7fe1b415d86..ee7b51e1c2a 160000 --- a/agave +++ b/agave @@ -1 +1 @@ -Subproject commit 7fe1b415d8642a3621a725a382575f9685c4696c +Subproject commit ee7b51e1c2a110a696f9622dfeab92e68d140d05 diff --git a/src/app/fdctl/config/default.toml b/src/app/fdctl/config/default.toml index abe1306c7f4..0209cbad67f 100644 --- a/src/app/fdctl/config/default.toml +++ b/src/app/fdctl/config/default.toml @@ -983,6 +983,52 @@ dynamic_port_range = "8900-9000" # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. The fallback is "softirq" mode, + # which relies significantly more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a separate + # ksoftirqd thread which linux creates and manages. + # + # Please note that even in SKB mode or copy mode, "prefbusy" + # poll mode should work and be effective. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). A warning will be emitted if this fallback is made. + # + # On Intel's 100Gbps NIC ice driver it is recommended to use + # "softirq" mode due to it not being able to support "prefbusy" + # mode, however on Mellanox's mlx5 it's well supported. + poll_mode = "softirq" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario in which Firedancer is not given enough time in + # userspace to do work. 
+ # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. + lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. # This setting defines the number of entries in these metadata diff --git a/src/app/fdctl/main.c b/src/app/fdctl/main.c index 0589b27490f..3ca58b82e47 100644 --- a/src/app/fdctl/main.c +++ b/src/app/fdctl/main.c @@ -38,6 +38,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_channels, &fd_cfg_stage_ethtool_offloads, &fd_cfg_stage_ethtool_loopback, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/fddev/main.h b/src/app/fddev/main.h index 5028caf57e6..3f8beb20ba7 100644 --- a/src/app/fddev/main.h +++ b/src/app/fddev/main.h @@ -33,6 +33,7 @@ extern configure_stage_t fd_cfg_stage_kill; extern configure_stage_t fd_cfg_stage_genesis; extern configure_stage_t fd_cfg_stage_keys; extern configure_stage_t fd_cfg_stage_blockstore; +extern configure_stage_t fd_cfg_stage_sysfs_poll; configure_stage_t * STAGES[] = { &fd_cfg_stage_kill, @@ -46,6 +47,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_keys, &fd_cfg_stage_genesis, &fd_cfg_stage_blockstore, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/firedancer/config/default.toml b/src/app/firedancer/config/default.toml index d6d1b8bb1ee..8347cb899e9 100644 --- a/src/app/firedancer/config/default.toml +++ 
b/src/app/firedancer/config/default.toml @@ -1065,6 +1065,52 @@ telemetry = true # "operation not supported". xdp_zero_copy = false + # This option moves the management of napi including + # when to poll as well as the poll budget, into userspace + # if in "prefbusy" mode. The fallback is "softirq" mode, + # which relies significantly more on linux to manage napi, through + # wakeups, softirqs and under higher network load, a separate + # ksoftirqd thread which linux creates and manages. + # + # Please note that even in SKB mode or copy mode, "prefbusy" + # poll mode should work and be effective. + # + # "prefbusy" mode is the recommended choice of mode, + # as this will also automatically fallback to "softirq" + # mode if preferred busy polling is not available or + # the right choice for whatever reason (e.g. on an older + # kernel). A warning will be emitted if this fallback is made. + # + # On Intel's 100Gbps NIC ice driver it is recommended to use + # "softirq" mode due to it not being able to support "prefbusy" + # mode, however on Mellanox's mlx5 it's well supported. + poll_mode = "softirq" + + # AF_XDP socket configuration options which will eventually + # be moved to being fixed constants prior to the merge of + # prefbusy-poll-mode into main. + busy_poll_usecs = 100 + gro_flush_timeout_nanos = 5000000 + + # This is the minimum time between napi polls if in prefbusy + # mode. This is important for protecting against a livelock + # scenario in which Firedancer is not given enough time in + # userspace to do work. + # + # This is a protective mechanism against bugs as well + # as to ensure even in a low RX but high TX traffic scenario, + # TX is still given enough time to do work or else + # napi is polled whenever the xsk RX queue is empty + # which could starve userspace TX work in the edge + # case there is significantly more TX than RX traffic. 
+ lwr_prefbusy_poll_timeout_micros = 5 + + # This is the maximum time between napi polls if in prefbusy + # mode. This is to call a napi poll in the case the normal + # prefbusy napi poll scheduling has stalled, napi polls + # can often resolve queue stalls so this increases robustness. + upr_prefbusy_poll_timeout_micros = 150 + # XDP uses metadata queues shared across the kernel and # userspace to relay events about incoming and outgoing packets. # This setting defines the number of entries in these metadata diff --git a/src/app/firedancer/main.c b/src/app/firedancer/main.c index 33e84f4bc74..1976e4498d7 100644 --- a/src/app/firedancer/main.c +++ b/src/app/firedancer/main.c @@ -74,6 +74,7 @@ configure_stage_t * STAGES[] = { &fd_cfg_stage_ethtool_loopback, &fd_cfg_stage_snapshots, &fd_cfg_stage_accdb, + &fd_cfg_stage_sysfs_poll, NULL, }; diff --git a/src/app/shared/Local.mk b/src/app/shared/Local.mk index 561be32d419..bc114679e03 100644 --- a/src/app/shared/Local.mk +++ b/src/app/shared/Local.mk @@ -31,6 +31,7 @@ $(call add-objs,commands/configure/fd_ethtool_ioctl,fdctl_shared) $(call add-objs,commands/configure/hugetlbfs,fdctl_shared) $(call add-objs,commands/configure/hyperthreads,fdctl_shared) $(call add-objs,commands/configure/sysctl,fdctl_shared) +$(call add-objs,commands/configure/sysfs-poll,fdctl_shared) $(call add-objs,commands/configure/snapshots,fdctl_shared) $(call add-objs,commands/monitor/monitor commands/monitor/helper,fdctl_shared) $(call add-objs,commands/watch/watch,fdctl_shared) diff --git a/src/app/shared/commands/configure/configure.h b/src/app/shared/commands/configure/configure.h index fc5eea3777e..713de669404 100644 --- a/src/app/shared/commands/configure/configure.h +++ b/src/app/shared/commands/configure/configure.h @@ -84,6 +84,7 @@ extern configure_stage_t fd_cfg_stage_bonding; extern configure_stage_t fd_cfg_stage_ethtool_channels; extern configure_stage_t fd_cfg_stage_ethtool_offloads; extern configure_stage_t 
fd_cfg_stage_ethtool_loopback; +extern configure_stage_t fd_cfg_stage_sysfs_poll; extern configure_stage_t fd_cfg_stage_snapshots; extern configure_stage_t * STAGES[]; diff --git a/src/app/shared/commands/configure/sysfs-poll.c b/src/app/shared/commands/configure/sysfs-poll.c new file mode 100644 index 00000000000..20f87c45b5e --- /dev/null +++ b/src/app/shared/commands/configure/sysfs-poll.c @@ -0,0 +1,83 @@ +/* This stage configures the OS to support effective preferred busy + polling, allowing for significantly improved network stack (XDP) + performance if enabled. */ + +#include "configure.h" + +#define NAME "sysfs-poll" + +#include "../../../platform/fd_file_util.h" + +#include +#include +#include /* access */ +#include + +#define VERY_HIGH_VAL 1000000U + +static char const setting_napi_defer_hard_irqs[] = "napi_defer_hard_irqs"; + +static char const setting_gro_flush_timeout[] = "gro_flush_timeout"; + +static int +enabled( config_t const * config ) { + return !strcmp( config->net.xdp.poll_mode, "prefbusy" ); +} + +static void +init_perm ( fd_cap_chk_t * chk, + config_t const * config FD_PARAM_UNUSED ) { + fd_cap_chk_cap( chk, NAME, CAP_NET_ADMIN, "configure preferred busy polling via `/sys/class/net/*/{napi_defer_hard_irqs, gro_flush_timeout}`" ); +} + +static void +sysfs_net_set( char const * device, + char const * setting, + ulong value ) { + char path[ PATH_MAX ]; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", device, setting ); + FD_LOG_NOTICE(( "RUN: `echo \"%lu\" > %s`", value, path )); + fd_file_util_write_uint( path, (uint)value ); +} + +static void +init( config_t const * config ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, VERY_HIGH_VAL ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, config->net.xdp.gro_flush_timeout_nanos ); +} + +static int +fini( config_t const * config, + int pre_init FD_PARAM_UNUSED ) { + sysfs_net_set( config->net.interface, setting_napi_defer_hard_irqs, 
0U ); + sysfs_net_set( config->net.interface, setting_gro_flush_timeout, 0U ); + return 1; +} + +static configure_result_t +check( config_t const * config, + int check_type FD_PARAM_UNUSED ) { + char path[ PATH_MAX ]; + uint value; + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_napi_defer_hard_irqs ); + if( fd_file_util_read_uint( path, &value ) || value < VERY_HIGH_VAL ) { + NOT_CONFIGURED("Setting napi_defer_hard_irqs failed."); + } + + fd_cstr_printf_check( path, PATH_MAX, NULL, "/sys/class/net/%s/%s", config->net.interface, setting_gro_flush_timeout ); + if( fd_file_util_read_uint( path, &value ) || value != config->net.xdp.gro_flush_timeout_nanos ) { + NOT_CONFIGURED("Setting gro_flush_timeout failed."); + } + + CONFIGURE_OK(); +} + +configure_stage_t fd_cfg_stage_sysfs_poll = { + .name = NAME, + .enabled = enabled, + .init_perm = init_perm, + .fini_perm = init_perm, + .init = init, + .fini = fini, + .check = check, +}; diff --git a/src/app/shared/fd_config.h b/src/app/shared/fd_config.h index 50e2194d373..b7ab58062c3 100644 --- a/src/app/shared/fd_config.h +++ b/src/app/shared/fd_config.h @@ -205,6 +205,12 @@ struct fd_config_net { struct { char xdp_mode[ 8 ]; int xdp_zero_copy; + + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + uint lwr_prefbusy_poll_timeout_micros; + uint upr_prefbusy_poll_timeout_micros; uint xdp_rx_queue_size; uint xdp_tx_queue_size; diff --git a/src/app/shared/fd_config_parse.c b/src/app/shared/fd_config_parse.c index 6881308c680..9ce24ba726c 100644 --- a/src/app/shared/fd_config_parse.c +++ b/src/app/shared/fd_config_parse.c @@ -184,6 +184,11 @@ fd_config_extract_pod( uchar * pod, CFG_POP ( uint, net.ingress_buffer_size ); CFG_POP ( cstr, net.xdp.xdp_mode ); CFG_POP ( bool, net.xdp.xdp_zero_copy ); + CFG_POP ( cstr, net.xdp.poll_mode ); + CFG_POP ( uint, net.xdp.busy_poll_usecs ); + CFG_POP ( ulong, 
net.xdp.gro_flush_timeout_nanos ); + CFG_POP ( uint, net.xdp.lwr_prefbusy_poll_timeout_micros ); + CFG_POP ( uint, net.xdp.upr_prefbusy_poll_timeout_micros ); CFG_POP ( uint, net.xdp.xdp_rx_queue_size ); CFG_POP ( uint, net.xdp.xdp_tx_queue_size ); CFG_POP ( uint, net.xdp.flush_timeout_micros ); diff --git a/src/app/shared_dev/commands/pktgen/pktgen.c b/src/app/shared_dev/commands/pktgen/pktgen.c index 0360362ddba..6b191c41189 100644 --- a/src/app/shared_dev/commands/pktgen/pktgen.c +++ b/src/app/shared_dev/commands/pktgen/pktgen.c @@ -208,6 +208,7 @@ pktgen_cmd_fn( args_t * args FD_PARAM_UNUSED, configure_stage( &fd_cfg_stage_bonding, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/app/shared_dev/commands/udpecho/udpecho.c b/src/app/shared_dev/commands/udpecho/udpecho.c index 725263d8024..8b8bebf7273 100644 --- a/src/app/shared_dev/commands/udpecho/udpecho.c +++ b/src/app/shared_dev/commands/udpecho/udpecho.c @@ -100,6 +100,7 @@ udpecho_cmd_fn( args_t * args, configure_stage( &fd_cfg_stage_ethtool_channels, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_offloads, CONFIGURE_CMD_INIT, config ); configure_stage( &fd_cfg_stage_ethtool_loopback, CONFIGURE_CMD_INIT, config ); + configure_stage( &fd_cfg_stage_sysfs_poll, CONFIGURE_CMD_INIT, config ); fdctl_check_configure( config ); /* FIXME this allocates lots of memory unnecessarily */ diff --git a/src/disco/net/fd_net_tile_topo.c b/src/disco/net/fd_net_tile_topo.c index c48ee1eda60..e0ba64735c7 100644 --- a/src/disco/net/fd_net_tile_topo.c +++ b/src/disco/net/fd_net_tile_topo.c @@ -44,6 +44,14 @@ setup_xdp_tile( fd_topo_t * topo, tile->xdp.zero_copy = net_cfg->xdp.xdp_zero_copy; 
fd_cstr_ncpy( tile->xdp.xdp_mode, net_cfg->xdp.xdp_mode, sizeof(tile->xdp.xdp_mode) ); + fd_cstr_ncpy( tile->xdp.poll_mode, net_cfg->xdp.poll_mode, sizeof(tile->xdp.poll_mode) ); + + tile->xdp.busy_poll_usecs = net_cfg->xdp.busy_poll_usecs; + tile->xdp.gro_flush_timeout_nanos = net_cfg->xdp.gro_flush_timeout_nanos; + + tile->xdp.lwr_prefbusy_poll_timeout_ns = (long)net_cfg->xdp.lwr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.upr_prefbusy_poll_timeout_ns = (long)net_cfg->xdp.upr_prefbusy_poll_timeout_micros * 1000L; + tile->xdp.net.umem_dcache_obj_id = umem_obj->id; tile->xdp.netdev_dbl_buf_obj_id = netlink_tile->netlink.netdev_dbl_buf_obj_id; tile->xdp.fib4_main_obj_id = netlink_tile->netlink.fib4_main_obj_id; diff --git a/src/disco/net/xdp/fd_xdp_tile.c b/src/disco/net/xdp/fd_xdp_tile.c index c6955ce5972..1e8c37c08c1 100644 --- a/src/disco/net/xdp/fd_xdp_tile.c +++ b/src/disco/net/xdp/fd_xdp_tile.c @@ -95,10 +95,13 @@ struct fd_net_flusher { wakeup. This can result in the tail of a burst getting delayed or overrun. If more than tail_flush_backoff ticks pass since the last sendto() wakeup and there are still unacknowledged packets in the - TX ring, issues another wakeup. */ + TX ring, issues another wakeup. Only used by "softirq" poll mode. */ long next_tail_flush_ticks; long tail_flush_backoff; + long last_prefbusy_poll_ticks; + long lwr_prefbusy_poll_ticks; + long upr_prefbusy_poll_ticks; }; typedef struct fd_net_flusher fd_net_flusher_t; @@ -213,6 +216,10 @@ typedef struct { ushort repair_serve_listen_port; ushort txsend_src_port; + char poll_mode[ 16 ]; + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + ulong in_cnt; fd_net_in_ctx_t in[ MAX_NET_INS ]; @@ -1122,6 +1129,103 @@ net_rx_event( fd_net_ctx_t * ctx, fill_ring->cached_prod = fill_prod+1U; } +/* before_credit_softirq is called every loop iteration if net tile + is in softirq polling mode (fallback if prefbusy polling mode + is not possible). 
*/ + +static void +before_credit_softirq( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); + + /* Fire RX event if we have RX desc avail */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } else { + net_rx_wakeup( ctx, rr_xsk, charge_busy ); + + /* Iterate onto the next NAPI queue. */ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + } +} + +static int +net_prefbusy_poll_ready( fd_xsk_t * rr_xsk, + fd_net_flusher_t * flusher ) { + + if( FD_UNLIKELY( fd_tickcount() < ( flusher->last_prefbusy_poll_ticks + flusher->lwr_prefbusy_poll_ticks ) ) ) return 0; + if( FD_UNLIKELY( fd_tickcount() > ( flusher->last_prefbusy_poll_ticks + flusher->upr_prefbusy_poll_ticks ) ) ) return 1; + + if( FD_UNLIKELY( fd_xdp_ring_empty( &rr_xsk->ring_tx, FD_XDP_RING_ROLE_PROD ) ) ) { + flusher->pending_cnt = 0UL; + } + + int rx_empty = fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ); + + return rx_empty; +} + +static void +net_prefbusy_poll_flush( fd_net_flusher_t * flusher, + long now ) { + flusher->pending_cnt = 0UL; + flusher->last_prefbusy_poll_ticks = now; +} + +/* before_credit_prefbusy is called every loop iteration if net + tile is in preferred busy (often referred to as "prefbusy" in + Firedancer) polling mode. */ + +static void +before_credit_prefbusy( fd_net_ctx_t * ctx, + int * charge_busy, + uint rr_idx, + fd_xsk_t * rr_xsk ) { + + fd_net_flusher_t * flusher = ctx->tx_flusher+rr_idx; + if( FD_UNLIKELY( net_prefbusy_poll_ready( rr_xsk, flusher ) ) ) { + /* NAPI needs to be polled to process new TX from + Firedancer's net tile and process new RX from the NIC. 
*/ + + FD_VOLATILE( *rr_xsk->ring_tx.prod ) = rr_xsk->ring_tx.cached_prod; /* write-back local copies to fseqs */ + FD_VOLATILE( *rr_xsk->ring_cr.cons ) = rr_xsk->ring_cr.cached_cons; + FD_VOLATILE( *rr_xsk->ring_rx.cons ) = rr_xsk->ring_rx.cached_cons; + FD_VOLATILE( *rr_xsk->ring_fr.prod ) = rr_xsk->ring_fr.cached_prod; + + if( FD_UNLIKELY( -1==sendto( rr_xsk->xsk_fd, NULL, 0, MSG_DONTWAIT, NULL, 0 ) ) ) { + if( FD_UNLIKELY( net_is_fatal_xdp_error( errno ) ) ) { + FD_LOG_ERR(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + } + if( FD_UNLIKELY( errno!=EAGAIN ) ) { + long ts = fd_log_wallclock(); + if( ts > rr_xsk->log_suppress_until_ns ) { + FD_LOG_WARNING(( "xsk sendto failed xsk_fd=%d (%i-%s)", rr_xsk->xsk_fd, errno, fd_io_strerror( errno ) )); + rr_xsk->log_suppress_until_ns = ts + (long)1e9; + } + } + } + net_prefbusy_poll_flush( flusher, fd_tickcount() ); + + /* Since xsk sendmsg in prefbusy mode drives both rx and tx, both are incremented */ + ctx->metrics.xsk_tx_wakeup_cnt++; + ctx->metrics.xsk_rx_wakeup_cnt++; + } + + /* Process new RX from kernel driver if there is any. */ + if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { + *charge_busy = 1; + net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + } + /* Iterate onto the next NAPI queue. */ + ctx->rr_idx++; + ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); +} + /* before_credit is called every loop iteration. 
*/ static void @@ -1148,16 +1252,11 @@ before_credit( fd_net_ctx_t * ctx, uint rr_idx = ctx->rr_idx; fd_xsk_t * rr_xsk = &ctx->xsk[ rr_idx ]; - net_tx_periodic_wakeup( ctx, rr_idx, fd_tickcount(), charge_busy ); - - /* Fire RX event if we have RX desc avail */ - if( !fd_xdp_ring_empty( &rr_xsk->ring_rx, FD_XDP_RING_ROLE_CONS ) ) { - *charge_busy = 1; - net_rx_event( ctx, rr_xsk, rr_xsk->ring_rx.cached_cons ); + if( FD_LIKELY( rr_xsk->prefbusy_poll_enabled ) ) { + before_credit_prefbusy( ctx, charge_busy, rr_idx, rr_xsk ); } else { - net_rx_wakeup( ctx, rr_xsk, charge_busy ); - ctx->rr_idx++; - ctx->rr_idx = fd_uint_if( ctx->rr_idx>=ctx->xsk_cnt, 0, ctx->rr_idx ); + /* Fallback poll mode which relies on linux irqs and wakeups */ + before_credit_softirq( ctx, charge_busy, rr_idx, rr_xsk ); } /* Fire comp event if we have comp desc avail */ @@ -1288,6 +1387,10 @@ privileged_init( fd_topo_t * topo, (e.g. 5.14.0-503.23.1.el9_5 with i40e) */ .bind_flags = tile->xdp.zero_copy ? XDP_ZEROCOPY : XDP_COPY, + .poll_mode = tile->xdp.poll_mode, + .busy_poll_usecs = tile->xdp.busy_poll_usecs, + .gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos, + .fr_depth = tile->xdp.xdp_rx_queue_size*2, .rx_depth = tile->xdp.xdp_rx_queue_size, .cr_depth = tile->xdp.xdp_tx_queue_size, @@ -1422,6 +1525,10 @@ unprivileged_init( fd_topo_t * topo, ctx->repair_serve_listen_port = tile->net.repair_serve_listen_port; ctx->txsend_src_port = tile->net.txsend_src_port; + strcpy( ctx->poll_mode, tile->xdp.poll_mode ); + ctx->busy_poll_usecs = tile->xdp.busy_poll_usecs; + ctx->gro_flush_timeout_nanos = tile->xdp.gro_flush_timeout_nanos; + /* Put a bound on chunks we read from the input, to make sure they are within in the data region of the workspace. 
*/ @@ -1502,6 +1609,10 @@ unprivileged_init( fd_topo_t * topo, ctx->tx_flusher[ j ].pending_wmark = (ulong)( (double)tile->xdp.xdp_tx_queue_size * 0.7 ); ctx->tx_flusher[ j ].tail_flush_backoff = (long)( (double)tile->xdp.tx_flush_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); ctx->tx_flusher[ j ].next_tail_flush_ticks = LONG_MAX; + + ctx->tx_flusher[ j ].last_prefbusy_poll_ticks = 0UL; + ctx->tx_flusher[ j ].lwr_prefbusy_poll_ticks = (long)( (double)tile->xdp.lwr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); + ctx->tx_flusher[ j ].upr_prefbusy_poll_ticks = (long)( (double)tile->xdp.upr_prefbusy_poll_timeout_ns * fd_tempo_tick_per_ns( NULL ) ); } /* Join netbase objects */ diff --git a/src/disco/topo/fd_topo.h b/src/disco/topo/fd_topo.h index dcb433adb1d..2f9279bfca5 100644 --- a/src/disco/topo/fd_topo.h +++ b/src/disco/topo/fd_topo.h @@ -182,6 +182,12 @@ struct fd_topo_tile { char xdp_mode[8]; int zero_copy; + char poll_mode[ 16 ]; /* "prefbusy" or "softirq" */ + uint busy_poll_usecs; + ulong gro_flush_timeout_nanos; + long lwr_prefbusy_poll_timeout_ns; + long upr_prefbusy_poll_timeout_ns; + ulong netdev_dbl_buf_obj_id; /* dbl_buf containing netdev_tbl */ ulong fib4_main_obj_id; /* fib4 containing main route table */ ulong fib4_local_obj_id; /* fib4 containing local route table */ diff --git a/src/waltz/xdp/fd_xsk.c b/src/waltz/xdp/fd_xsk.c index 89efba4a737..c1345c3597e 100644 --- a/src/waltz/xdp/fd_xsk.c +++ b/src/waltz/xdp/fd_xsk.c @@ -7,6 +7,7 @@ #include #include /* snprintf */ #include +#include #include /* mmap */ #include #include /* sendto */ @@ -15,6 +16,24 @@ #include "../../util/log/fd_log.h" #include "fd_xsk.h" +/* Support for older kernels */ + +#ifndef SO_BUSY_POLL +#define SO_BUSY_POLL 46 +#endif + +#ifndef SO_INCOMING_NAPI_ID +#define SO_INCOMING_NAPI_ID 56 +#endif + +#ifndef SO_PREFER_BUSY_POLL +#define SO_PREFER_BUSY_POLL 69 +#endif + +#ifndef SO_BUSY_POLL_BUDGET +#define SO_BUSY_POLL_BUDGET 70 +#endif + /* Join/leave 
*********************************************************/ /* fd_xsk_mmap_offset_cstr: Returns a cstr describing the given offset @@ -187,6 +206,60 @@ fd_xsk_setup_umem( fd_xsk_t * xsk, return 0; } +/* fd_xsk_setup_poll: Setup preferred busy polling if the user has + set that to be their preferred polling method */ + +static void +fd_xsk_setup_poll( fd_xsk_t * xsk, + fd_xsk_params_t const * params ) { + xsk->prefbusy_poll_enabled = 0; + if( 0!=strcmp( params->poll_mode, "prefbusy" ) ) return; + + /* Configure socket options for preferred busy polling */ + + int prefbusy_poll = 1; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefbusy_poll, sizeof(int) ) ) ) { + int err = errno; + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_PREFER_BUSY_POLL,1) failed (%i-%s)", err, fd_io_strerror( err ) )); + if( err==EINVAL ) { + FD_LOG_WARNING(( "Hint: Does your kernel support preferred busy polling? SO_PREFER_BUSY_POLL is available from Linux 5.11 onwards" )); + } + return; + } + + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL, &params->busy_poll_usecs, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL,%u) failed (%i-%s)", + params->busy_poll_usecs, errno, fd_io_strerror( errno ) )); + return; + } + + /* The greater busy_poll_budget is, the greater the bias towards max RX pps + over max TX pps in a max network load scenario. 
*/ + uint busy_poll_budget = 64U; + if( FD_UNLIKELY( 0!=setsockopt( xsk->xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &busy_poll_budget, sizeof(uint) ) ) ) { + FD_LOG_WARNING(( "setsockopt(xsk_fd,SOL_SOCKET,SO_BUSY_POLL_BUDGET,%u) failed (%i-%s)", + busy_poll_budget, errno, fd_io_strerror( errno ) )); + return; + } + + /* Set socket non blocking */ + + int sk_flags = fcntl( xsk->xsk_fd, F_GETFL, 0 ); + if( FD_UNLIKELY( sk_flags == -1 ) ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_GETFL, 0) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + if( FD_UNLIKELY( fcntl( xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK ) ) == -1 ) { + FD_LOG_WARNING(( "fcntl(xsk->xsk_fd, F_SETFL, sk_flags | O_NONBLOCK) failed (%i-%s)", + errno, fd_io_strerror( errno ) )); + return; + } + + /* Successfully finished setting up prefbusy polling */ + xsk->prefbusy_poll_enabled = 1U; +} + /* fd_xsk_init: Creates and configures an XSK socket object, and attaches to a preinstalled XDP program. The various steps are implemented in fd_xsk_setup_{...}. */ @@ -289,6 +362,32 @@ fd_xsk_init( fd_xsk_t * xsk, FD_LOG_INFO(( "AF_XDP socket initialized: bind( PF_XDP, ifindex=%u (%s), queue_id=%u, flags=%x ) success", xsk->if_idx, if_indextoname( xsk->if_idx, if_name ), xsk->if_queue_id, flags )); + /* Check if the XSK is aware of the driver's NAPI ID for the + associated RX queue. Without it, preferred busy polling is not + going to work correctly. Note it's not always associated straight + away so xsk->napi_id can sometimes be set to 0 when it shouldn't be. + This is not an issue currently as the napi_id is not used yet. 
*/ + + socklen_t napi_id_sz = sizeof(uint); + if( FD_UNLIKELY( 0!=getsockopt( xsk->xsk_fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &xsk->napi_id, &napi_id_sz ) ) ) { + if( errno==ENOPROTOOPT ) { + xsk->napi_id = 0; + } else { + FD_LOG_WARNING(( "getsockopt(SOL_SOCKET,SO_INCOMING_NAPI_ID) failed (%i-%s)", errno, fd_io_strerror( errno ) )); + goto fail; + } + } + + if( xsk->napi_id ) { + FD_LOG_DEBUG(( "Interface %u Queue %u has NAPI ID %u", xsk->if_idx, xsk->if_queue_id, xsk->napi_id )); + } else { + FD_LOG_DEBUG(( "Interface %u Queue %u has unknown NAPI ID", xsk->if_idx, xsk->if_queue_id )); + } + + /* If requested, enable preferred busy polling */ + + fd_xsk_setup_poll( xsk, params ); + return xsk; fail: diff --git a/src/waltz/xdp/fd_xsk.h b/src/waltz/xdp/fd_xsk.h index 55cfc1a4774..22d5a3e0f22 100644 --- a/src/waltz/xdp/fd_xsk.h +++ b/src/waltz/xdp/fd_xsk.h @@ -187,8 +187,8 @@ fd_xdp_ring_full( fd_xdp_ring_t * ring ) { return ring->cached_prod - ring->cached_cons >= ring->depth; } -/* fd_xsk_params_t: Memory layout parameters of XSK. - Can be retrieved using fd_xsk_get_params() */ +/* fd_xsk_params_t: XSK poll configuration and memory layout + parameters. Can be retrieved using fd_xsk_get_params() */ struct fd_xsk_params { /* {fr,rx,tx,cr}_depth: Number of frames allocated for the Fill, RX, @@ -217,6 +217,15 @@ struct fd_xsk_params { /* sockaddr_xdp.sxdp_flags additional params, e.g. XDP_ZEROCOPY */ uint bind_flags; + char * poll_mode; + + /* max time waiting for work during prefbusy napi poll. */ + uint busy_poll_usecs; + + /* max time linux waits for userspace to poll napi before + calling a softirq. */ + ulong gro_flush_timeout_nanos; + /* whether the xsk memory should be included in core dumps */ int core_dump; }; @@ -236,6 +245,13 @@ struct fd_xsk { /* AF_XDP socket file descriptor */ int xsk_fd; + /* Whether preferred busy polling was successfully enabled + during XSK socket setup. 
*/ + int prefbusy_poll_enabled; + + /* napi_id: ID of this specific NAPI instance */ + uint napi_id; + /* ring_{rx,tx,fr,cr}: XSK ring descriptors */ fd_xdp_ring_t ring_rx;