From 189d11ceabd16a011bdf961dd6eab97d43a9c291 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 16 Feb 2026 17:19:55 +0000 Subject: [PATCH] libibverbs: Introduce Completion Counters verbs Extend verbs interface to support Completion Counters that can be seen as a light-weight alternative to polling CQ. A completion counter object separately counts successful and error completions, can be attached to multiple QPs and be configured to count completions of a subset of operation types. This is especially useful for batch or credit based workloads running on accelerators but can serve many other types of applications as well. Expose supported number of completion counters through query device extended verb. Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin --- libibverbs/examples/devinfo.c | 1 + libibverbs/man/CMakeLists.txt | 7 + libibverbs/man/ibv_create_comp_cntr.3.md | 245 ++++++++++++++++++++ libibverbs/man/ibv_qp_attach_comp_cntr.3.md | 118 ++++++++++ libibverbs/man/ibv_query_device_ex.3 | 1 + libibverbs/verbs.h | 103 ++++++++ 6 files changed, 475 insertions(+) create mode 100644 libibverbs/man/ibv_create_comp_cntr.3.md create mode 100644 libibverbs/man/ibv_qp_attach_comp_cntr.3.md diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c index c245b1f28..f44dd18b8 100644 --- a/libibverbs/examples/devinfo.c +++ b/libibverbs/examples/devinfo.c @@ -585,6 +585,7 @@ static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) printf("\tmax_srq_sge:\t\t\t%d\n", device_attr.orig_attr.max_srq_sge); } printf("\tmax_pkeys:\t\t\t%d\n", device_attr.orig_attr.max_pkeys); + printf("\tmax_comp_cntr:\t\t\t%u\n", device_attr.max_comp_cntr); printf("\tlocal_ca_ack_delay:\t\t%d\n", device_attr.orig_attr.local_ca_ack_delay); print_odp_caps(&device_attr); diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt index f498c1532..158a5127f 100644 --- a/libibverbs/man/CMakeLists.txt +++ b/libibverbs/man/CMakeLists.txt @@ 
-14,7 +14,9 @@ rdma_man_pages( ibv_create_ah.3 ibv_create_ah_from_wc.3 ibv_create_comp_channel.3 + ibv_create_comp_cntr.3.md ibv_create_counters.3.md + ibv_qp_attach_comp_cntr.3.md ibv_create_cq.3 ibv_create_cq_ex.3 ibv_modify_cq.3 @@ -98,6 +100,11 @@ rdma_alias_man_pages( ibv_create_ah.3 ibv_destroy_ah.3 ibv_create_ah_from_wc.3 ibv_init_ah_from_wc.3 ibv_create_comp_channel.3 ibv_destroy_comp_channel.3 + ibv_create_comp_cntr.3 ibv_destroy_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_set_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_set_err_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_inc_comp_cntr.3 + ibv_create_comp_cntr.3 ibv_inc_err_comp_cntr.3 ibv_create_counters.3 ibv_destroy_counters.3 ibv_create_cq.3 ibv_destroy_cq.3 ibv_create_flow.3 ibv_destroy_flow.3 diff --git a/libibverbs/man/ibv_create_comp_cntr.3.md b/libibverbs/man/ibv_create_comp_cntr.3.md new file mode 100644 index 000000000..80fca5315 --- /dev/null +++ b/libibverbs/man/ibv_create_comp_cntr.3.md @@ -0,0 +1,245 @@ +--- +date: 2026-02-09 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_create_comp_cntr +tagline: Verbs +--- + +# NAME + +**ibv_create_comp_cntr**, **ibv_destroy_comp_cntr** - Create or destroy a +completion counter + +**ibv_set_comp_cntr**, **ibv_set_err_comp_cntr** - Set the value of a +completion or error counter + +**ibv_inc_comp_cntr**, **ibv_inc_err_comp_cntr** - Increment a completion or +error counter + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +struct ibv_comp_cntr *ibv_create_comp_cntr(struct ibv_context *context, + struct ibv_comp_cntr_init_attr *cc_attr); + +int ibv_destroy_comp_cntr(struct ibv_comp_cntr *comp_cntr); + +int ibv_set_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); +int ibv_set_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); +int ibv_inc_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); +int 
ibv_inc_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); +``` + +# DESCRIPTION + +Completion counters provide a lightweight completion mechanism as an +alternative or extension to completion queues (CQs). Rather than generating +individual completion queue entries, a completion counter tracks the aggregate +number of completed operations. This makes them well suited for applications +that need to know how many requests have completed without requiring +per-request details, such as credit based flow control or tracking responses +from remote peers. + +Each completion counter maintains two distinct 64-bit values: a completion +count that is incremented on successful completions, and an error count that +is incremented when operations complete in error. + +**ibv_create_comp_cntr**() allocates a new completion counter for the RDMA +device context *context*. The properties of the counter are defined by +*cc_attr*. On success, the returned **ibv_comp_cntr** structure contains +pointers to the completion and error count values. The maximum number of +completion counters a device supports is reported by the *max_comp_cntr* +field of **ibv_device_attr_ex**. + +**ibv_destroy_comp_cntr**() releases all resources associated with the +completion counter *comp_cntr*. The counter must not be attached to any QP +when destroyed. + +**ibv_set_comp_cntr**() sets the completion count of *comp_cntr* to *value*. + +**ibv_set_err_comp_cntr**() sets the error count of *comp_cntr* to *value*. + +**ibv_inc_comp_cntr**() increments the completion count of *comp_cntr* by +*amount*. + +**ibv_inc_err_comp_cntr**() increments the error count of *comp_cntr* by +*amount*. + +## External memory + +By default, the memory backing the counter values is allocated internally. 
+When the **IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM** flag is set in +*ibv_comp_cntr_init_attr.flags*, the application provides its own memory for +the completion and error counts via the *comp_cntr_ext_mem* and +*err_cntr_ext_mem* fields. The external memory is described by an +**ibv_memory_location** structure which supports two modes: a virtual address +(**IBV_MEMORY_LOCATION_VA**), where the application supplies a direct pointer, or +a DMA-BUF reference (**IBV_MEMORY_LOCATION_DMABUF**), where the application +supplies a file descriptor and offset into an exported DMA-BUF. When using +DMA-BUF, the *ptr* field may also be set to provide a process-accessible +mapping of the memory; if provided, the *comp_count* and *err_count* pointers +in the returned **ibv_comp_cntr** will point to it. Using external memory +allows the counter values to reside in application-managed buffers or in +memory exported through DMA-BUF, enabling zero-copy observation of completion +progress by co-located processes or devices. + +# ARGUMENTS + +## ibv_comp_cntr + +```c +struct ibv_comp_cntr { + struct ibv_context *context; + uint32_t handle; + uint64_t *comp_count; + uint64_t *err_count; + uint64_t comp_count_max_value; + uint64_t err_count_max_value; +}; +``` + +*context* +: Device context associated with the completion counter. + +*handle* +: Kernel object handle for the completion counter. + +*comp_count* +: Pointer to the current successful completion count. When the counter + is backed by CPU-accessible memory, this pointer may be read directly + by the application. + +*err_count* +: Pointer to the current error completion count. When the counter is + backed by CPU-accessible memory, this pointer may be read directly + by the application. + +*comp_count_max_value* +: The maximum value the completion count can hold. A subsequent + increment that would exceed this value wraps the counter to zero. + +*err_count_max_value* +: The maximum value the error count can hold. 
A subsequent increment + that would exceed this value wraps the counter to zero. + +## ibv_comp_cntr_init_attr + +```c +struct ibv_comp_cntr_init_attr { + uint32_t comp_mask; + uint32_t flags; + struct ibv_memory_location comp_cntr_ext_mem; + struct ibv_memory_location err_cntr_ext_mem; +}; +``` + +*comp_mask* +: Bitmask specifying what fields in the structure are valid. + +*flags* +: Creation flags. The following flags are supported: + + **IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM** - Use application-provided + memory for the counter values, as specified by *comp_cntr_ext_mem* + and *err_cntr_ext_mem*. + +*comp_cntr_ext_mem* +: Memory location for the completion count when using external memory. + +*err_cntr_ext_mem* +: Memory location for the error count when using external memory. + +## ibv_memory_location + +```c +enum ibv_memory_location_type { + IBV_MEMORY_LOCATION_VA, + IBV_MEMORY_LOCATION_DMABUF, +}; + +struct ibv_memory_location { + uint8_t *ptr; + struct { + uint64_t offset; + int32_t fd; + uint32_t reserved; + } dmabuf; + uint8_t type; + uint8_t reserved[7]; +}; +``` + +*type* +: The type of memory location. **IBV_MEMORY_LOCATION_VA** for a virtual + address, or **IBV_MEMORY_LOCATION_DMABUF** for a DMA-BUF reference. + +*ptr* +: Virtual address pointer. Required when type is + **IBV_MEMORY_LOCATION_VA**. When type is + **IBV_MEMORY_LOCATION_DMABUF**, may optionally be set to provide a + process-accessible mapping of the DMA-BUF memory. + +*dmabuf.fd* +: DMA-BUF file descriptor (used when type is + **IBV_MEMORY_LOCATION_DMABUF**). + +*dmabuf.offset* +: Offset within the DMA-BUF. + +# RETURN VALUE + +**ibv_create_comp_cntr**() returns a pointer to the allocated ibv_comp_cntr +object, or NULL if the request fails (and sets errno to indicate the failure +reason). 
+ +**ibv_destroy_comp_cntr**(), **ibv_set_comp_cntr**(), +**ibv_set_err_comp_cntr**(), **ibv_inc_comp_cntr**(), and +**ibv_inc_err_comp_cntr**() return 0 on success, or the value of errno on +failure (which indicates the failure reason). + +# ERRORS + +ENOTSUP +: Completion counters are not supported on this device. + +ENOMEM +: Not enough resources to create the completion counter. + +EINVAL +: Invalid argument(s) passed. + +EBUSY +: The completion counter is still attached to a QP + (**ibv_destroy_comp_cntr**() only). + +# NOTES + +Counter values should not be modified directly by writing to the memory +pointed to by *comp_count* or *err_count*. Applications must use the provided +API functions (**ibv_set_comp_cntr**(), **ibv_set_err_comp_cntr**(), +**ibv_inc_comp_cntr**(), **ibv_inc_err_comp_cntr**()) to update counter +values. + +Updates made to counter values (e.g. via **ibv_set_comp_cntr**() or +**ibv_inc_comp_cntr**()) may not be immediately visible when reading the +counter. A small delay may occur between the update and the observed value. +However, the final updated value will eventually be reflected. + +Applications should ensure that the counter value is stable before calling +**ibv_set_comp_cntr**() or **ibv_set_err_comp_cntr**(). Otherwise, concurrent +updates may be lost. 
+ +# SEE ALSO + +**ibv_qp_attach_comp_cntr**(3), **ibv_create_cq**(3), +**ibv_create_cq_ex**(3), **ibv_create_qp**(3) + +# AUTHORS + +Michael Margolin diff --git a/libibverbs/man/ibv_qp_attach_comp_cntr.3.md b/libibverbs/man/ibv_qp_attach_comp_cntr.3.md new file mode 100644 index 000000000..3f82ffa51 --- /dev/null +++ b/libibverbs/man/ibv_qp_attach_comp_cntr.3.md @@ -0,0 +1,118 @@ +--- +date: 2026-02-09 +footer: libibverbs +header: "Libibverbs Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: ibv_qp_attach_comp_cntr +tagline: Verbs +--- + +# NAME + +**ibv_qp_attach_comp_cntr** - Attach a completion counter to a QP + +# SYNOPSIS + +```c +#include <infiniband/verbs.h> + +int ibv_qp_attach_comp_cntr(struct ibv_qp *qp, + struct ibv_comp_cntr *comp_cntr, + struct ibv_comp_cntr_attach_attr *attr); +``` + +# DESCRIPTION + +**ibv_qp_attach_comp_cntr**() attaches the completion counter *comp_cntr* to +the queue pair *qp*. The *attr* argument specifies which operation types +should update the counter. + +The QP must be in **IBV_QPS_RESET** or **IBV_QPS_INIT** state when attaching +a completion counter. Attempting to attach a counter to a QP in any other +state will fail with EINVAL. + +The completion counter starts collecting values for the specified QP once +attached. Attaching the same completion counter to multiple QPs will +accumulate values from all attached QPs into the same counter. + +Multiple completion counters can be attached to the same QP, provided their +*op_mask* values do not overlap. Attempting to attach a counter with an +*op_mask* that conflicts with an already attached counter will fail. + +The *op_mask* field controls which operation completions are counted. Local +operations (**IBV_COMP_CNTR_ATTACH_OP_SEND**, **IBV_COMP_CNTR_ATTACH_OP_RECV**, +**IBV_COMP_CNTR_ATTACH_OP_RDMA_READ**, **IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE**) +count completions initiated by the local QP. 
Remote operations +(**IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ**, +**IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE**) count completions of incoming +RDMA operations initiated by the remote side. Supported *op_mask* values may +vary by device; unsupported values will result in an ENOTSUP error. + +There is no explicit detach operation. A completion counter is implicitly +detached when the QP it is attached to is destroyed. A completion counter +cannot be destroyed while it is still attached to any QP; the QP must be +destroyed first. + +# ARGUMENTS + +*qp* +: The queue pair to attach the completion counter to. + +*comp_cntr* +: The completion counter to attach, previously created with + **ibv_create_comp_cntr**(). + +*attr* +: Attach attributes specifying which operation types update the counter. + +## ibv_comp_cntr_attach_attr + +```c +enum ibv_comp_cntr_attach_op { + IBV_COMP_CNTR_ATTACH_OP_SEND = 1 << 0, + IBV_COMP_CNTR_ATTACH_OP_RECV = 1 << 1, + IBV_COMP_CNTR_ATTACH_OP_RDMA_READ = 1 << 2, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ = 1 << 3, + IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE = 1 << 4, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE = 1 << 5, +}; + +struct ibv_comp_cntr_attach_attr { + uint32_t comp_mask; + uint32_t op_mask; +}; +``` + +*comp_mask* +: Bitmask specifying what fields in the structure are valid. + +*op_mask* +: Bitmask of **ibv_comp_cntr_attach_op** values specifying which + operation types should update the counter. + +# RETURN VALUE + +**ibv_qp_attach_comp_cntr**() returns 0 on success, or the value of errno on +failure (which indicates the failure reason). + +# ERRORS + +EINVAL +: Invalid argument(s) passed. + +ENOTSUP +: Requested operation is not supported on this device. + +EBUSY +: The *op_mask* overlaps with a completion counter already attached + to this QP. 
+ +# SEE ALSO + +**ibv_create_comp_cntr**(3), **ibv_create_qp**(3) + +# AUTHORS + +Michael Margolin diff --git a/libibverbs/man/ibv_query_device_ex.3 b/libibverbs/man/ibv_query_device_ex.3 index c77e8b4f8..2d502e6ac 100644 --- a/libibverbs/man/ibv_query_device_ex.3 +++ b/libibverbs/man/ibv_query_device_ex.3 @@ -44,6 +44,7 @@ uint64_t max_dm_size; /* Max Device Memory size (in bytes) avail struct ibv_pci_atomic_caps atomic_caps; /* PCI atomic operations capabilities, use enum ibv_pci_atomic_op_size */ uint32_t xrc_odp_caps; /* Mask with enum ibv_odp_transport_cap_bits to know which operations are supported. */ uint32_t phys_port_cnt_ex /* Extended number of physical port count, allows exposing more than 255 ports device */ +uint32_t max_comp_cntr; /* Maximum number of completion counters supported (0 = unsupported) */ .in -8 }; diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 36d120eec..f9ebc49a5 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -361,6 +361,7 @@ struct ibv_device_attr_ex { struct ibv_pci_atomic_caps pci_atomic_caps; uint32_t xrc_odp_caps; uint32_t phys_port_cnt_ex; + uint32_t max_comp_cntr; }; enum ibv_mtu { @@ -485,6 +486,22 @@ struct ibv_async_event { enum ibv_event_type event_type; }; +enum ibv_memory_location_type { + IBV_MEMORY_LOCATION_VA, + IBV_MEMORY_LOCATION_DMABUF, +}; + +struct ibv_memory_location { + uint8_t *ptr; + struct { + uint64_t offset; + int32_t fd; + uint32_t reserved; + } dmabuf; + uint8_t type; /* Use ibv_memory_location_type */ + uint8_t reserved[7]; +}; + enum ibv_wc_status { IBV_WC_SUCCESS, IBV_WC_LOC_LEN_ERR, @@ -3018,6 +3035,69 @@ static inline int ibv_modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *at return vctx->modify_cq(cq, attr); } + +struct ibv_comp_cntr { + struct ibv_context *context; + uint32_t handle; + uint64_t *comp_count; + uint64_t *err_count; + uint64_t comp_count_max_value; + uint64_t err_count_max_value; +}; + +enum { + IBV_COMP_CNTR_INIT_WITH_EXTERNAL_MEM = 1 << 0, /* ibv_comp_cntr_init_attr.flags bit */ +}; + +struct 
ibv_comp_cntr_init_attr { + uint32_t comp_mask; /* Compatibility mask */ + uint32_t flags; + struct ibv_memory_location comp_cntr_ext_mem; + struct ibv_memory_location err_cntr_ext_mem; +}; + +/** + * ibv_create_comp_cntr - Create a completion counter + * @context: Device context to create the counter on. + * @cc_attr: Attributes for the completion counter. + */ +struct ibv_comp_cntr *ibv_create_comp_cntr(struct ibv_context *context, + struct ibv_comp_cntr_init_attr *cc_attr); + +/** + * ibv_destroy_comp_cntr - Destroy a completion counter + * @comp_cntr: The completion counter to destroy. + */ +int ibv_destroy_comp_cntr(struct ibv_comp_cntr *comp_cntr); + +/** + * ibv_set_comp_cntr - Set the completion count value + * @comp_cntr: The completion counter to update. + * @value: The value to set. + */ +int ibv_set_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); + +/** + * ibv_set_err_comp_cntr - Set the error count value + * @comp_cntr: The completion counter to update. + * @value: The value to set. + */ +int ibv_set_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t value); + +/** + * ibv_inc_comp_cntr - Increment the completion count + * @comp_cntr: The completion counter to increment. + * @amount: The amount to increment by. + */ +int ibv_inc_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); + +/** + * ibv_inc_err_comp_cntr - Increment the error count + * @comp_cntr: The completion counter to increment. + * @amount: The amount to increment by. + */ +int ibv_inc_err_comp_cntr(struct ibv_comp_cntr *comp_cntr, uint64_t amount); + /** * ibv_create_srq - Creates a SRQ associated with the specified protection * domain. 
@@ -3293,6 +3373,29 @@ ibv_modify_qp_rate_limit(struct ibv_qp *qp, return vctx->modify_qp_rate_limit(qp, attr); } +enum ibv_comp_cntr_attach_op { + IBV_COMP_CNTR_ATTACH_OP_SEND = 1 << 0, + IBV_COMP_CNTR_ATTACH_OP_RECV = 1 << 1, + IBV_COMP_CNTR_ATTACH_OP_RDMA_READ = 1 << 2, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_READ = 1 << 3, + IBV_COMP_CNTR_ATTACH_OP_RDMA_WRITE = 1 << 4, + IBV_COMP_CNTR_ATTACH_OP_REMOTE_RDMA_WRITE = 1 << 5, +}; + +struct ibv_comp_cntr_attach_attr { + uint32_t comp_mask; /* Compatibility mask */ + uint32_t op_mask; /* Use ibv_comp_cntr_attach_op */ +}; + +/** + * ibv_qp_attach_comp_cntr - Attach a completion counter to a QP + * @qp: The queue pair to attach the counter to. + * @comp_cntr: The completion counter to attach. + * @attr: Attach attributes. + */ +int ibv_qp_attach_comp_cntr(struct ibv_qp *qp, struct ibv_comp_cntr *comp_cntr, + struct ibv_comp_cntr_attach_attr *attr); + /** * ibv_query_qp_data_in_order - Checks whether the data is guaranteed to be * written in-order.