From 99c169a071d1e248cfc75c883c9d19fd508b0da6 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 8 Feb 2026 09:56:35 +0000 Subject: [PATCH 01/11] Update kernel headers To commit: e736a223ab15 ("RDMA/efa: Expose new extended max inline buff size"). Signed-off-by: Yonatan Nachum --- kernel-headers/rdma/efa-abi.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h index 98b71b997..13225b038 100644 --- a/kernel-headers/rdma/efa-abi.h +++ b/kernel-headers/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -44,7 +44,8 @@ struct efa_ibv_alloc_ucontext_resp { __u32 max_llq_size; /* bytes */ __u16 max_tx_batch; /* units of 64 bytes */ __u16 min_sq_wr; - __u8 reserved_a0[4]; + __u16 inline_buf_size_ex; + __u8 reserved_b0[2]; }; struct efa_ibv_alloc_pd_resp { From c33bb33f00cfff692716dac82fe7d3c6ec72f8e3 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 8 Feb 2026 10:20:18 +0000 Subject: [PATCH 02/11] efa: Store the new extended inline buf size in EFA ctx Store the newly added max inline buffer size in EFA context. For compatibility, if the driver doesn't support the new extended inline size, set it to the legacy one. Signed-off-by: Yonatan Nachum --- providers/efa/efa.c | 6 +++++- providers/efa/efa.h | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/providers/efa/efa.c b/providers/efa/efa.c index a0a95beb8..94a4126ba 100644 --- a/providers/efa/efa.c +++ b/providers/efa/efa.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include @@ -83,6 +83,10 @@ static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, ctx->cqe_size = sizeof(struct efa_io_rx_cdesc); ctx->ex_cqe_size = sizeof(struct efa_io_rx_cdesc_ex); ctx->inline_buf_size = resp.inline_buf_size; + ctx->inline_buf_size_ex = resp.inline_buf_size_ex; + if (ctx->inline_buf_size_ex == 0) + ctx->inline_buf_size_ex = ctx->inline_buf_size; + ctx->max_llq_size = resp.max_llq_size; ctx->max_tx_batch = resp.max_tx_batch; ctx->min_sq_wr = resp.min_sq_wr; diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 25b5e8f99..906cb330b 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef __EFA_H__ @@ -31,6 +31,7 @@ struct efa_context { uint32_t cmds_supp_udata_mask; uint16_t sub_cqs_per_cq; uint16_t inline_buf_size; + uint16_t inline_buf_size_ex; uint32_t max_llq_size; uint32_t device_caps; uint32_t max_sq_wr; From 0fcf2b03c178b4ae8499826cafb5161ca1363d0e Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 8 Dec 2025 11:52:45 +0000 Subject: [PATCH 03/11] efa: Add max inline validation on QP creation As preparation for 128-byte WQE, add a validation for the requested inline size compared to the device's max supported inline size. We must do this validation prior to calculating the WQE size. Use the newly extended inline size, which falls back to the legacy value if the driver doesn't support querying it. 
Signed-off-by: Yonatan Nachum --- providers/efa/verbs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index c2cc31fdd..0091d4d65 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1768,6 +1768,13 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } + if (attr->cap.max_inline_data > ctx->inline_buf_size_ex) { + verbs_err(&ctx->ibvctx, + "Max inline data %u > %u\n", attr->cap.max_inline_data, + ctx->inline_buf_size_ex); + return EINVAL; + } + return 0; } From 77253fce177ef8a975439a9e4d87abe470c9f5a9 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 8 Dec 2025 16:27:15 +0000 Subject: [PATCH 04/11] efa: Use IO structs in WR setters instead of TX WQE As preparation for supporting 128-byte WQE, remove direct usage of struct efa_io_tx_wqe wherever possible in WQE setters. This gives the ability to reuse those structs when needed for different WQE formats. Signed-off-by: Yonatan Nachum --- providers/efa/verbs.c | 48 ++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 0091d4d65..14f2cc364 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -2090,20 +2090,23 @@ static void efa_set_tx_buf(struct efa_io_tx_buf_desc *tx_buf, } static void efa_post_send_sgl(struct efa_io_tx_buf_desc *tx_bufs, + struct efa_io_tx_meta_desc *md, const struct ibv_sge *sg_list, int num_sge) { const struct ibv_sge *sge; size_t i; + md->length = num_sge; + for (i = 0; i < num_sge; i++) { sge = &sg_list[i]; efa_set_tx_buf(&tx_bufs[i], sge->addr, sge->lkey, sge->length); } } -static void efa_post_send_inline_data(const struct ibv_send_wr *wr, - struct efa_io_tx_wqe *tx_wqe) +static void efa_post_send_inline_data(const struct ibv_send_wr *wr, struct efa_io_tx_meta_desc *md, + uint8_t *inline_data) { const struct ibv_sge *sgl = wr->sg_list; uint32_t total_length = 0; @@ -2113,13 
+2116,13 @@ static void efa_post_send_inline_data(const struct ibv_send_wr *wr, for (i = 0; i < wr->num_sge; i++) { length = sgl[i].length; - memcpy(tx_wqe->data.inline_data + total_length, + memcpy(inline_data + total_length, (void *)(uintptr_t)sgl[i].addr, length); total_length += length; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - tx_wqe->meta.length = total_length; + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + md->length = total_length; } static size_t efa_sge_total_bytes(const struct ibv_sge *sg_list, int num_sge) @@ -2314,11 +2317,9 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, ah = to_efa_ah(wr->wr.ud.ah); if (wr->send_flags & IBV_SEND_INLINE) { - efa_post_send_inline_data(wr, &tx_wqe); + efa_post_send_inline_data(wr, &tx_wqe.meta, tx_wqe.data.inline_data); } else { - meta_desc->length = wr->num_sge; - efa_post_send_sgl(tx_wqe.data.sgl, wr->sg_list, - wr->num_sge); + efa_post_send_sgl(tx_wqe.data.sgl, &tx_wqe.meta, wr->sg_list, wr->num_sge); } if (wr->opcode == IBV_WR_SEND_WITH_IMM) { @@ -2406,21 +2407,15 @@ static struct efa_io_tx_wqe *efa_send_wr_common(struct ibv_qp_ex *ibvqpx, return sq->curr_tx_wqe; } -static void efa_send_wr_set_imm_data(struct efa_io_tx_wqe *tx_wqe, __be32 imm_data) +static void efa_send_wr_set_imm_data(struct efa_io_tx_meta_desc *meta_desc, __be32 imm_data) { - struct efa_io_tx_meta_desc *meta_desc; - - meta_desc = &tx_wqe->meta; meta_desc->immediate_data = be32toh(imm_data); EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); } -static void efa_send_wr_set_rdma_addr(struct efa_io_tx_wqe *tx_wqe, uint32_t rkey, +static void efa_send_wr_set_rdma_addr(struct efa_io_remote_mem_addr *remote_mem, uint32_t rkey, uint64_t remote_addr) { - struct efa_io_remote_mem_addr *remote_mem; - - remote_mem = &tx_wqe->data.rdma_req.remote_mem; remote_mem->rkey = rkey; remote_mem->buf_addr_lo = remote_addr & 0xFFFFFFFF; remote_mem->buf_addr_hi = remote_addr >> 32; @@ -2439,7 +2434,7 
@@ static void efa_send_wr_send_imm(struct ibv_qp_ex *ibvqpx, __be32 imm_data) if (unlikely(!tx_wqe)) return; - efa_send_wr_set_imm_data(tx_wqe, imm_data); + efa_send_wr_set_imm_data(&tx_wqe->meta, imm_data); } static void efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, @@ -2451,7 +2446,7 @@ static void efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); + efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); } static void efa_send_wr_rdma_write(struct ibv_qp_ex *ibvqpx, uint32_t rkey, @@ -2463,7 +2458,7 @@ static void efa_send_wr_rdma_write(struct ibv_qp_ex *ibvqpx, uint32_t rkey, if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); + efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); } static void efa_send_wr_rdma_write_imm(struct ibv_qp_ex *ibvqpx, uint32_t rkey, @@ -2475,8 +2470,8 @@ static void efa_send_wr_rdma_write_imm(struct ibv_qp_ex *ibvqpx, uint32_t rkey, if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); - efa_send_wr_set_imm_data(tx_wqe, imm_data); + efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); + efa_send_wr_set_imm_data(&tx_wqe->meta, imm_data); } static void efa_send_wr_set_sge(struct ibv_qp_ex *ibvqpx, uint32_t lkey, @@ -2534,7 +2529,7 @@ static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, qp->wr_session_err = EINVAL; return; } - efa_post_send_sgl(tx_wqe->data.sgl, sg_list, num_sge); + efa_post_send_sgl(tx_wqe->data.sgl, &tx_wqe->meta, sg_list, num_sge); break; case EFA_IO_RDMA_READ: case EFA_IO_RDMA_WRITE: @@ -2547,15 +2542,12 @@ static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, return; } rdma_req = &tx_wqe->data.rdma_req; - rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, - num_sge); - 
efa_post_send_sgl(rdma_req->local_mem, sg_list, num_sge); + rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, num_sge); + efa_post_send_sgl(rdma_req->local_mem, &tx_wqe->meta, sg_list, num_sge); break; default: return; } - - tx_wqe->meta.length = num_sge; } static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, From ce85344b4197f3fba70b25c76d122e9db9b8c2d0 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 8 Dec 2025 16:33:00 +0000 Subject: [PATCH 05/11] efa: Store SQ WQE size for dynamic usage As preparation for 128-byte WQE support, add a new SQ WQE size field to the SQ struct to store the dynamically configured WQE size. Change static usage of sizeof of the 64-byte WQE size with the new field. Signed-off-by: Yonatan Nachum --- providers/efa/efa.h | 1 + providers/efa/verbs.c | 44 +++++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 906cb330b..58ee5b80b 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -142,6 +142,7 @@ struct efa_sq { size_t max_inline_data; size_t max_wr_rdma_sge; uint16_t max_batch_wr; + uint16_t wqe_size; /* Buffer for pending WR entries in the current session */ uint8_t *local_queue; diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 14f2cc364..920a9120d 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1488,7 +1488,7 @@ static int efa_sq_initialize(struct efa_qp *qp, } sq->desc_offset = resp->llq_desc_offset; - desc_ring_size = sq->wq.wqe_cnt * sizeof(struct efa_io_tx_wqe); + desc_ring_size = sq->wq.wqe_cnt * sq->wqe_size; sq->desc_ring_mmap_size = align(desc_ring_size + sq->desc_offset, qp->page_size); sq->max_inline_data = attr->cap.max_inline_data; @@ -1512,7 +1512,7 @@ static int efa_sq_initialize(struct efa_qp *qp, sq->max_wr_rdma_sge = min_t(uint16_t, ctx->max_wr_rdma_sge, EFA_IO_TX_DESC_NUM_RDMA_BUFS); sq->max_batch_wr = ctx->max_tx_batch ? 
- (ctx->max_tx_batch * 64) / sizeof(struct efa_io_tx_wqe) : + (ctx->max_tx_batch * 64) / sq->wqe_size : UINT16_MAX; if (ctx->min_sq_wr) { /* The device can't accept a doorbell for the whole SQ at once, @@ -1603,6 +1603,11 @@ static void efa_qp_init_indices(struct efa_qp *qp) qp->rq.wq.wrid_idx_pool_next = 0; } +static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap) +{ + return sizeof(struct efa_io_tx_wqe); +} + static void efa_setup_qp(struct efa_context *ctx, struct efa_qp *qp, struct ibv_qp_cap *cap, @@ -1612,6 +1617,7 @@ static void efa_setup_qp(struct efa_context *ctx, efa_qp_init_indices(qp); + qp->sq.wqe_size = efa_calc_sq_wqe_size(cap); qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, cap->max_send_wr, ctx->min_sq_wr)); qp->sq.wq.max_sge = cap->max_send_sge; @@ -1740,6 +1746,8 @@ static int efa_check_qp_attr(struct efa_context *ctx, static int efa_check_qp_limits(struct efa_context *ctx, struct ibv_qp_init_attr_ex *attr) { + int sq_wqe_size; + if (attr->cap.max_send_sge > ctx->max_sq_sge) { verbs_err(&ctx->ibvctx, "Max send SGE %u > %u\n", attr->cap.max_send_sge, @@ -1754,10 +1762,11 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } - if (attr->cap.max_send_wr > ctx->max_sq_wr) { + sq_wqe_size = efa_calc_sq_wqe_size(&attr->cap); + if (attr->cap.max_send_wr * sq_wqe_size > ctx->max_llq_size) { verbs_err(&ctx->ibvctx, - "Max send WR %u > %u\n", attr->cap.max_send_wr, - ctx->max_sq_wr); + "Max Send WR %u > %u\n", attr->cap.max_send_wr, + ctx->max_llq_size / sq_wqe_size); return EINVAL; } @@ -1813,8 +1822,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, req.rq_ring_size = (qp->rq.wq.desc_mask + 1) * sizeof(struct efa_io_rx_desc); - req.sq_ring_size = (attr->cap.max_send_wr) * - sizeof(struct efa_io_tx_wqe); + req.sq_ring_size = attr->cap.max_send_wr * qp->sq.wqe_size; if (attr->qp_type == IBV_QPT_DRIVER) req.driver_qp_type = efa_attr->driver_qp_type; if (efa_attr->flags & EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV) @@ 
-2021,7 +2029,7 @@ int efadv_query_qp_wqs(struct ibv_qp *ibvqp, struct efadv_wq_attr *sq_attr, sq_attr->comp_mask = 0; sq_attr->buffer = qp->sq.desc; - sq_attr->entry_size = sizeof(struct efa_io_tx_wqe); + sq_attr->entry_size = qp->sq.wqe_size; sq_attr->num_entries = qp->sq.wq.wqe_cnt; sq_attr->doorbell = qp->sq.wq.db; sq_attr->max_batch = qp->sq.max_batch_wr; @@ -2312,7 +2320,7 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, goto ring_db; } - memset(&tx_wqe, 0, sizeof(tx_wqe)); + memset(&tx_wqe, 0, sq->wqe_size); meta_desc = &tx_wqe.meta; ah = to_efa_ah(wr->wr.ud.ah); @@ -2337,10 +2345,8 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, meta_desc->qkey = wr->wr.ud.remote_qkey; /* Copy descriptor */ - sq_desc_offset = (wq->pc & wq->desc_mask) * - sizeof(tx_wqe); - mmio_memcpy_x64(sq->desc + sq_desc_offset, &tx_wqe, - sizeof(tx_wqe)); + sq_desc_offset = (wq->pc & wq->desc_mask) * sq->wqe_size; + mmio_memcpy_x64(sq->desc + sq_desc_offset, &tx_wqe, sq->wqe_size); /* advance index and change phase */ efa_sq_advance_post_idx(sq); @@ -2391,8 +2397,8 @@ static struct efa_io_tx_wqe *efa_send_wr_common(struct ibv_qp_ex *ibvqpx, return NULL; } - sq->curr_tx_wqe = (struct efa_io_tx_wqe *)sq->local_queue + - sq->num_wqe_pending; + sq->curr_tx_wqe = (struct efa_io_tx_wqe *)(sq->local_queue + + sq->num_wqe_pending * sq->wqe_size); memset(sq->curr_tx_wqe, 0, sizeof(*sq->curr_tx_wqe)); meta_desc = &sq->curr_tx_wqe->meta; @@ -2689,11 +2695,9 @@ static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx) num_wqe_to_copy = min3(sq->num_wqe_pending, sq->wq.wqe_cnt - sq_desc_idx, max_txbatch - curbatch); - mmio_memcpy_x64((struct efa_io_tx_wqe *)sq->desc + - sq_desc_idx, - (struct efa_io_tx_wqe *)sq->local_queue + - local_idx, - num_wqe_to_copy * sizeof(struct efa_io_tx_wqe)); + mmio_memcpy_x64(sq->desc + sq_desc_idx * sq->wqe_size, + sq->local_queue + local_idx * sq->wqe_size, + num_wqe_to_copy * sq->wqe_size); sq->num_wqe_pending -= num_wqe_to_copy; 
local_idx += num_wqe_to_copy; From ab750fe1d99d3d6557a888b265f33acfe89e1d70 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 8 Dec 2025 16:51:34 +0000 Subject: [PATCH 06/11] efa: Add support for 128-byte WQE Add the new 128-byte WQE structs and add the 128-byte WQE size to the SQ WQE calculation utility. For our datapath the logic is different between old and new APIs: 1. Old API - we use a buffer large enough for both WQEs and cast the buffer to the right pointers based on the WQE size. 2. New API - To have generic setters in the QP function pointers, add a new TX WQE context struct that holds the building blocks of the WQE and set them to the right fields for the WQE being constructed. This way the setters in the flow can access the fields in a generic way without being aware of the WQE type or its size. Signed-off-by: Yonatan Nachum --- providers/efa/efa.h | 17 +- providers/efa/efa_io_defs.h | 37 +++- providers/efa/verbs.c | 412 ++++++++++++++++++++++++------------ 3 files changed, 332 insertions(+), 134 deletions(-) diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 58ee5b80b..9bc4b056a 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -134,6 +134,21 @@ struct efa_rq { size_t buf_size; }; +struct efa_tx_wqe_ctx { + /* wqe buffer */ + void *buff; + /* wqe meta descriptor */ + struct efa_io_tx_meta_desc *md; + /* wqe local memory / SGL */ + struct efa_io_tx_buf_desc *local_mem; + /* wqe remote memory - RDMA only */ + struct efa_io_remote_mem_addr *remote_mem; + /* wqe inline data buffer */ + uint8_t *inline_data; + /* max sge allowed for this wqe */ + uint8_t max_sge; +}; + struct efa_sq { struct efa_wq wq; uint8_t *desc; @@ -151,7 +166,7 @@ struct efa_sq { /* Phase before current session */ int phase_rb; /* Current wqe being built */ - struct efa_io_tx_wqe *curr_tx_wqe; + struct efa_tx_wqe_ctx curr_tx_wqe; }; struct efa_qp { diff --git a/providers/efa/efa_io_defs.h b/providers/efa/efa_io_defs.h index e4f6f78ac..fccb217b7 100644 --- 
a/providers/efa/efa_io_defs.h +++ b/providers/efa/efa_io_defs.h @@ -9,6 +9,7 @@ #define EFA_IO_TX_DESC_NUM_BUFS 2 #define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 #define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE_128 80 #define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 enum efa_io_queue_type { @@ -164,9 +165,22 @@ struct efa_io_rdma_req { struct efa_io_tx_buf_desc local_mem[1]; }; +struct efa_io_rdma_req_128 { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + union { + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; + + /* inline data for RDMA */ + uint8_t inline_data[80]; + }; +}; + /* - * Tx WQE, composed of tx meta descriptors followed by either tx buffer - * descriptors or inline data + * 64-byte Tx WQE, composed of tx meta descriptors followed by either tx + * buffer descriptors or inline data */ struct efa_io_tx_wqe { /* TX meta */ @@ -183,6 +197,25 @@ struct efa_io_tx_wqe { } data; }; +/* + * 128-byte Tx WQE, composed of tx meta descriptors followed by either tx + * buffer descriptors or inline data + */ +struct efa_io_tx_wqe_128 { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + uint8_t inline_data[80]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req_128 rdma_req; + } data; +}; + /* * Rx buffer descriptor; RX WQE is composed of one or more RX buffer * descriptors. 
diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 920a9120d..6bd87dbbf 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -27,6 +27,9 @@ #define EFA_DEV_CAP(ctx, cap) \ ((ctx)->device_caps & EFA_QUERY_DEVICE_CAPS_##cap) +#define EFA_IO_TX_DESC_SIZE_64 (sizeof(struct efa_io_tx_wqe)) +#define EFA_IO_TX_DESC_SIZE_128 (sizeof(struct efa_io_tx_wqe_128)) + static bool is_buf_cleared(void *buf, size_t len) { int i; @@ -90,7 +93,7 @@ int efa_query_device_ex(struct ibv_context *context, } a->max_qp_wr = min_t(int, a->max_qp_wr, - ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); + ctx->max_llq_size / EFA_IO_TX_DESC_SIZE_64); memcpy(fw_ver, &resp.ibv_resp.base.fw_ver, sizeof(resp.ibv_resp.base.fw_ver)); snprintf(a->fw_ver, sizeof(a->fw_ver), "%u.%u.%u.%u", @@ -1605,7 +1608,10 @@ static void efa_qp_init_indices(struct efa_qp *qp) static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap) { - return sizeof(struct efa_io_tx_wqe); + if (cap->max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE) + return EFA_IO_TX_DESC_SIZE_128; + + return EFA_IO_TX_DESC_SIZE_64; } static void efa_setup_qp(struct efa_context *ctx, @@ -1658,7 +1664,8 @@ static void efa_unlock_cqs(struct ibv_qp *ibvqp) } static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, - struct ibv_qp_init_attr_ex *attr_ex); + struct ibv_qp_init_attr_ex *attr_ex, + uint16_t wqe_size); static int efa_check_qp_attr(struct efa_context *ctx, struct ibv_qp_init_attr_ex *attr, @@ -1854,7 +1861,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, pthread_spin_unlock(&ctx->qp_table_lock); if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { - efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr); + efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr, qp->sq.wqe_size); qp->verbs_qp.comp_mask |= VERBS_QP_EX; } @@ -2191,25 +2198,26 @@ static void efa_set_common_ctrl_flags(struct efa_io_tx_meta_desc *desc, } #if defined(LTTNG_ENABLED) || defined(USDT_ENABLED) -static uint32_t efa_get_wqe_length(struct 
efa_io_tx_wqe *tx_wqe) +static uint32_t efa_wqe_get_data_length(struct efa_sq *sq) { + struct efa_io_tx_meta_desc *md = sq->curr_tx_wqe.md; enum efa_io_send_op_type op_type; uint32_t length = 0; size_t i; - op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); switch (op_type) { case EFA_IO_SEND: - if (EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG)) - return tx_wqe->meta.length; + if (EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG)) + return md->length; - for (i = 0; i < tx_wqe->meta.length; i++) - length += tx_wqe->data.sgl[i].length; + for (i = 0; i < md->length; i++) + length += sq->curr_tx_wqe.local_mem[i].length; return length; case EFA_IO_RDMA_READ: case EFA_IO_RDMA_WRITE: - return tx_wqe->data.rdma_req.remote_mem.length; + return sq->curr_tx_wqe.remote_mem->length; } return 0; @@ -2298,16 +2306,38 @@ static int efa_post_send_validate_wr(struct efa_qp *qp, int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad) { - struct efa_io_tx_meta_desc *meta_desc; + uint8_t wqe_buf[EFA_IO_TX_DESC_SIZE_128]; struct efa_qp *qp = to_efa_qp(ibvqp); - struct efa_io_tx_wqe tx_wqe; + struct efa_io_tx_wqe_128 *tx_wqe_128; + struct efa_io_tx_meta_desc *md; + struct efa_io_tx_buf_desc *sgl; + struct efa_io_tx_wqe *tx_wqe; struct efa_sq *sq = &qp->sq; struct efa_wq *wq = &sq->wq; uint32_t sq_desc_offset; uint32_t curbatch = 0; + uint8_t *inline_data; struct efa_ah *ah; int err = 0; + switch (sq->wqe_size) { + case EFA_IO_TX_DESC_SIZE_64: + tx_wqe = (struct efa_io_tx_wqe *)wqe_buf; + md = &tx_wqe->meta; + sgl = tx_wqe->data.sgl; + inline_data = tx_wqe->data.inline_data; + break; + case EFA_IO_TX_DESC_SIZE_128: + tx_wqe_128 = (struct efa_io_tx_wqe_128 *)wqe_buf; + md = &tx_wqe_128->meta; + sgl = tx_wqe_128->data.sgl; + inline_data = tx_wqe_128->data.inline_data; + break; + + default: + return EINVAL; + } + if (wq->need_lock) mmio_wc_spinlock(&wq->wqlock); 
else @@ -2320,33 +2350,30 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, goto ring_db; } - memset(&tx_wqe, 0, sq->wqe_size); - meta_desc = &tx_wqe.meta; + memset(wqe_buf, 0, sq->wqe_size); ah = to_efa_ah(wr->wr.ud.ah); if (wr->send_flags & IBV_SEND_INLINE) { - efa_post_send_inline_data(wr, &tx_wqe.meta, tx_wqe.data.inline_data); + efa_post_send_inline_data(wr, md, inline_data); } else { - efa_post_send_sgl(tx_wqe.data.sgl, &tx_wqe.meta, wr->sg_list, wr->num_sge); + efa_post_send_sgl(sgl, md, wr->sg_list, wr->num_sge); } if (wr->opcode == IBV_WR_SEND_WITH_IMM) { - meta_desc->immediate_data = be32toh(wr->imm_data); - EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, - 1); + md->immediate_data = be32toh(wr->imm_data); + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); } /* Set rest of the descriptor fields */ - efa_set_common_ctrl_flags(meta_desc, sq, EFA_IO_SEND); - meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(wq, - wr->wr_id); - meta_desc->dest_qp_num = wr->wr.ud.remote_qpn; - meta_desc->ah = ah->efa_ah; - meta_desc->qkey = wr->wr.ud.remote_qkey; + efa_set_common_ctrl_flags(md, sq, EFA_IO_SEND); + md->req_id = efa_wq_get_next_wrid_idx_locked(wq, wr->wr_id); + md->dest_qp_num = wr->wr.ud.remote_qpn; + md->ah = ah->efa_ah; + md->qkey = wr->wr.ud.remote_qkey; /* Copy descriptor */ sq_desc_offset = (wq->pc & wq->desc_mask) * sq->wqe_size; - mmio_memcpy_x64(sq->desc + sq_desc_offset, &tx_wqe, sq->wqe_size); + mmio_memcpy_x64(sq->desc + sq_desc_offset, wqe_buf, sq->wqe_size); /* advance index and change phase */ efa_sq_advance_post_idx(sq); @@ -2359,8 +2386,8 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, mmio_wc_start(); } rdma_tracepoint(rdma_core_efa, post_send, qp->dev->name, wr->wr_id, - EFA_IO_SEND, ibvqp->qp_num, meta_desc->dest_qp_num, - ah->efa_ah, efa_get_wqe_length(&tx_wqe)); + EFA_IO_SEND, ibvqp->qp_num, md->dest_qp_num, + ah->efa_ah, efa_wqe_get_data_length(sq)); wr = wr->next; } @@ -2380,12 +2407,9 @@ 
int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, return err; } -static struct efa_io_tx_wqe *efa_send_wr_common(struct ibv_qp_ex *ibvqpx, - enum efa_io_send_op_type op_type) +static void *efa_send_wr_alloc(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx) { - struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_sq *sq = &qp->sq; - struct efa_io_tx_meta_desc *meta_desc; int err; if (unlikely(qp->wr_session_err)) @@ -2397,20 +2421,33 @@ static struct efa_io_tx_wqe *efa_send_wr_common(struct ibv_qp_ex *ibvqpx, return NULL; } - sq->curr_tx_wqe = (struct efa_io_tx_wqe *)(sq->local_queue + - sq->num_wqe_pending * sq->wqe_size); - memset(sq->curr_tx_wqe, 0, sizeof(*sq->curr_tx_wqe)); + sq->curr_tx_wqe.buff = sq->local_queue + sq->num_wqe_pending * sq->wqe_size; + memset(sq->curr_tx_wqe.buff, 0, sq->wqe_size); + + return sq->curr_tx_wqe.buff; +} + +static void efa_send_wr_init(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + enum efa_io_send_op_type op_type, uint8_t max_sge, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) +{ + struct efa_sq *sq = &qp->sq; - meta_desc = &sq->curr_tx_wqe->meta; - efa_set_common_ctrl_flags(meta_desc, sq, op_type); - meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(&sq->wq, - ibvqpx->wr_id); + sq->curr_tx_wqe.md = md; + efa_set_common_ctrl_flags(sq->curr_tx_wqe.md, sq, op_type); + sq->curr_tx_wqe.md->req_id = efa_wq_get_next_wrid_idx_locked(&sq->wq, ibvqpx->wr_id); /* advance index and change phase */ efa_sq_advance_post_idx(sq); sq->num_wqe_pending++; - return sq->curr_tx_wqe; + sq->curr_tx_wqe.local_mem = local_mem; + sq->curr_tx_wqe.remote_mem = remote_mem; + sq->curr_tx_wqe.inline_data = inline_data; + sq->curr_tx_wqe.max_sge = max_sge; } static void efa_send_wr_set_imm_data(struct efa_io_tx_meta_desc *meta_desc, __be32 imm_data) @@ -2427,140 +2464,228 @@ static void efa_send_wr_set_rdma_addr(struct efa_io_remote_mem_addr *remote_mem, 
remote_mem->buf_addr_hi = remote_addr >> 32; } -static void efa_send_wr_send(struct ibv_qp_ex *ibvqpx) +static void efa_send_wr_send_64(struct ibv_qp_ex *ibvqpx) { - efa_send_wr_common(ibvqpx, EFA_IO_SEND); + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); +} + +static void efa_send_wr_send_128(struct ibv_qp_ex *ibvqpx) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); } -static void efa_send_wr_send_imm(struct ibv_qp_ex *ibvqpx, __be32 imm_data) +static void efa_send_wr_send_imm_64(struct ibv_qp_ex *ibvqpx, __be32 imm_data) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_SEND); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_imm_data(&tx_wqe->meta, imm_data); + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); } -static void efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr) +static void efa_send_wr_send_imm_128(struct ibv_qp_ex *ibvqpx, __be32 imm_data) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + 
tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); +} + +static inline void efa_send_wr_rdma_common(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + uint32_t rkey, uint64_t remote_addr, + enum efa_io_send_op_type op_type, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) ALWAYS_INLINE; +static inline void efa_send_wr_rdma_common(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + uint32_t rkey, uint64_t remote_addr, + enum efa_io_send_op_type op_type, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) +{ + efa_send_wr_init(qp, ibvqpx, op_type, qp->sq.max_wr_rdma_sge, md, + local_mem, remote_mem, inline_data); + + efa_send_wr_set_rdma_addr(remote_mem, rkey, remote_addr); +} + +static void efa_send_wr_rdma_read_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_READ); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_READ, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); +} + +static void efa_send_wr_rdma_read_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_READ, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); } -static void 
efa_send_wr_rdma_write(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr) +static void efa_send_wr_rdma_write_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_WRITE); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); } -static void efa_send_wr_rdma_write_imm(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr, __be32 imm_data) +static void efa_send_wr_rdma_write_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); +} + +static void efa_send_wr_rdma_write_imm_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_WRITE); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(&tx_wqe->data.rdma_req.remote_mem, rkey, remote_addr); - efa_send_wr_set_imm_data(&tx_wqe->meta, imm_data); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); +} + 
+static void efa_send_wr_rdma_write_imm_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); } static void efa_send_wr_set_sge(struct ibv_qp_ex *ibvqpx, uint32_t lkey, uint64_t addr, uint32_t length) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_buf_desc *buf; - struct efa_io_tx_wqe *tx_wqe; + struct efa_io_tx_meta_desc *md; uint8_t op_type; if (unlikely(qp->wr_session_err)) return; - tx_wqe = qp->sq.curr_tx_wqe; - tx_wqe->meta.length = 1; + md = qp->sq.curr_tx_wqe.md; + md->length = 1; - op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); - switch (op_type) { - case EFA_IO_SEND: - buf = &tx_wqe->data.sgl[0]; - break; - case EFA_IO_RDMA_READ: - case EFA_IO_RDMA_WRITE: - tx_wqe->data.rdma_req.remote_mem.length = length; - buf = &tx_wqe->data.rdma_req.local_mem[0]; - break; - default: - return; - } + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + if (op_type == EFA_IO_RDMA_READ || op_type == EFA_IO_RDMA_WRITE) + qp->sq.curr_tx_wqe.remote_mem->length = length; - efa_set_tx_buf(buf, addr, lkey, length); + efa_set_tx_buf(qp->sq.curr_tx_wqe.local_mem, addr, lkey, length); } static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, const struct ibv_sge *sg_list) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_rdma_req *rdma_req; - struct efa_io_tx_wqe *tx_wqe; + struct efa_io_tx_meta_desc *md; struct efa_sq *sq = &qp->sq; uint8_t op_type; if (unlikely(qp->wr_session_err)) return; - tx_wqe = sq->curr_tx_wqe; - op_type = 
EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); - switch (op_type) { - case EFA_IO_SEND: - if (unlikely(num_sge > sq->wq.max_sge)) { - verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), - "SQ[%u] num_sge[%zu] > max_sge[%u]\n", - ibvqpx->qp_base.qp_num, num_sge, - sq->wq.max_sge); - qp->wr_session_err = EINVAL; - return; - } - efa_post_send_sgl(tx_wqe->data.sgl, &tx_wqe->meta, sg_list, num_sge); - break; - case EFA_IO_RDMA_READ: - case EFA_IO_RDMA_WRITE: - if (unlikely(num_sge > sq->max_wr_rdma_sge)) { - verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), - "SQ[%u] num_sge[%zu] > max_rdma_sge[%zu]\n", - ibvqpx->qp_base.qp_num, num_sge, - sq->max_wr_rdma_sge); - qp->wr_session_err = EINVAL; - return; - } - rdma_req = &tx_wqe->data.rdma_req; - rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, num_sge); - efa_post_send_sgl(rdma_req->local_mem, &tx_wqe->meta, sg_list, num_sge); - break; - default: + md = sq->curr_tx_wqe.md; + + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + if (unlikely(num_sge > sq->curr_tx_wqe.max_sge)) { + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] op_type[%u] num_sge[%zu] > max_sge[%u]\n", + ibvqpx->qp_base.qp_num, op_type, num_sge, + sq->curr_tx_wqe.max_sge); + qp->wr_session_err = EINVAL; return; } + + if (op_type == EFA_IO_RDMA_READ || op_type == EFA_IO_RDMA_WRITE) + sq->curr_tx_wqe.remote_mem->length = efa_sge_total_bytes(sg_list, num_sge); + + efa_post_send_sgl(sq->curr_tx_wqe.local_mem, md, sg_list, num_sge); } static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, size_t length) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; + struct efa_io_tx_meta_desc *md = qp->sq.curr_tx_wqe.md; + uint8_t op_type; if (unlikely(qp->wr_session_err)) return; @@ -2574,9 +2699,18 @@ static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, return; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - 
memcpy(tx_wqe->data.inline_data, addr, length); - tx_wqe->meta.length = length; + if (unlikely(!qp->sq.curr_tx_wqe.inline_data)) { + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] inline op_type[%u] isn't supported\n", + ibvqpx->qp_base.qp_num, op_type); + qp->wr_session_err = EINVAL; + return; + } + + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + memcpy(qp->sq.curr_tx_wqe.inline_data, addr, length); + md->length = length; } static void @@ -2585,9 +2719,9 @@ efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, const struct ibv_data_buf *buf_list) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; - uint32_t total_length = 0; - uint32_t length; + struct efa_io_tx_meta_desc *md = qp->sq.curr_tx_wqe.md; + uint32_t length, total_length = 0; + uint8_t op_type; size_t i; if (unlikely(qp->wr_session_err)) @@ -2604,16 +2738,25 @@ efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, return; } + if (unlikely(!qp->sq.curr_tx_wqe.inline_data)) { + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] inline op_type[%u] isn't supported\n", + ibvqpx->qp_base.qp_num, op_type); + qp->wr_session_err = EINVAL; + return; + } + for (i = 0; i < num_buf; i++) { length = buf_list[i].length; - memcpy(tx_wqe->data.inline_data + total_length, + memcpy(qp->sq.curr_tx_wqe.inline_data + total_length, buf_list[i].addr, length); total_length += length; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - tx_wqe->meta.length = total_length; + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + md->length = total_length; } static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, @@ -2622,20 +2765,21 @@ static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_ah *ah = to_efa_ah(ibvah); - struct efa_io_tx_wqe 
*tx_wqe; + struct efa_io_tx_meta_desc *md; if (unlikely(qp->wr_session_err)) return; - tx_wqe = qp->sq.curr_tx_wqe; + md = qp->sq.curr_tx_wqe.md; - tx_wqe->meta.dest_qp_num = remote_qpn; - tx_wqe->meta.ah = ah->efa_ah; - tx_wqe->meta.qkey = remote_qkey; + md->dest_qp_num = remote_qpn; + md->ah = ah->efa_ah; + md->qkey = remote_qkey; rdma_tracepoint(rdma_core_efa, post_send, qp->dev->name, ibvqpx->wr_id, - EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE), - ibvqpx->qp_base.qp_num, remote_qpn, ah->efa_ah, efa_get_wqe_length(tx_wqe)); + EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE), + ibvqpx->qp_base.qp_num, remote_qpn, ah->efa_ah, + efa_wqe_get_data_length(qp->sq)); } static void efa_send_wr_start(struct ibv_qp_ex *ibvqpx) @@ -2739,26 +2883,32 @@ static void efa_send_wr_abort(struct ibv_qp_ex *ibvqpx) } static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, - struct ibv_qp_init_attr_ex *attr_ex) + struct ibv_qp_init_attr_ex *attr_ex, + uint16_t wqe_size) { + bool use_64 = wqe_size == EFA_IO_TX_DESC_SIZE_64; + ibvqpx->wr_start = efa_send_wr_start; ibvqpx->wr_complete = efa_send_wr_complete; ibvqpx->wr_abort = efa_send_wr_abort; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND) - ibvqpx->wr_send = efa_send_wr_send; + ibvqpx->wr_send = use_64 ? efa_send_wr_send_64 : efa_send_wr_send_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND_WITH_IMM) - ibvqpx->wr_send_imm = efa_send_wr_send_imm; + ibvqpx->wr_send_imm = use_64 ? efa_send_wr_send_imm_64 : efa_send_wr_send_imm_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_READ) - ibvqpx->wr_rdma_read = efa_send_wr_rdma_read; + ibvqpx->wr_rdma_read = use_64 ? efa_send_wr_rdma_read_64 : + efa_send_wr_rdma_read_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_WRITE) - ibvqpx->wr_rdma_write = efa_send_wr_rdma_write; + ibvqpx->wr_rdma_write = use_64 ? 
efa_send_wr_rdma_write_64 : + efa_send_wr_rdma_write_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) - ibvqpx->wr_rdma_write_imm = efa_send_wr_rdma_write_imm; + ibvqpx->wr_rdma_write_imm = use_64 ? efa_send_wr_rdma_write_imm_64 : + efa_send_wr_rdma_write_imm_128; ibvqpx->wr_set_inline_data = efa_send_wr_set_inline_data; ibvqpx->wr_set_inline_data_list = efa_send_wr_set_inline_data_list; From 6ec0c63f9dade922b9afed79481b49ad650fcafd Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Mon, 8 Dec 2025 16:53:55 +0000 Subject: [PATCH 07/11] efa: Add inline support for RDMA write Add inline support for RDMA write with 128-byte WQE. To support write with inline ULP must provide a new flag in EFA create QP DV since EFA support write with inline only with 128-byte WQE. When the flag is provided 128-byte WQE is used for any requested inline size. Assign the inline buffer address in the SQ field to enable inline data setters for 128-byte WQE and set the remote length the same as local length. Signed-off-by: Yonatan Nachum --- providers/efa/efa.h | 1 + providers/efa/efadv.h | 3 +- providers/efa/man/efadv_create_qp_ex.3.md | 3 ++ providers/efa/verbs.c | 38 ++++++++++++++++------- pyverbs/providers/efa/efa_enums.pxd | 1 + 5 files changed, 34 insertions(+), 12 deletions(-) diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 9bc4b056a..4abdbd6f8 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -158,6 +158,7 @@ struct efa_sq { size_t max_wr_rdma_sge; uint16_t max_batch_wr; uint16_t wqe_size; + bool inline_write_enabled; /* Buffer for pending WR entries in the current session */ uint8_t *local_queue; diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index 7c034f881..5e42c15ef 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. 
or its affiliates. All rights reserved. */ #ifndef __EFADV_H__ @@ -61,6 +61,7 @@ struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd, enum { EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV = 1 << 0, + EFADV_QP_FLAGS_INLINE_WRITE = 1 << 1, }; struct efadv_qp_init_attr { diff --git a/providers/efa/man/efadv_create_qp_ex.3.md b/providers/efa/man/efadv_create_qp_ex.3.md index aaeedfdee..8617fe363 100644 --- a/providers/efa/man/efadv_create_qp_ex.3.md +++ b/providers/efa/man/efadv_create_qp_ex.3.md @@ -68,6 +68,9 @@ struct efadv_qp_init_attr { EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV: Receive WRs will not be consumed for RDMA write with imm. + EFADV_QP_FLAGS_INLINE_WRITE: + QP supports RDMA write with inline operations. + *sl* : Service Level - 0 value implies default level. diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 6bd87dbbf..257c6ec16 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1606,9 +1606,9 @@ static void efa_qp_init_indices(struct efa_qp *qp) qp->rq.wq.wrid_idx_pool_next = 0; } -static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap) +static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap, bool inline_write_enabled) { - if (cap->max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE) + if (cap->max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE || inline_write_enabled) return EFA_IO_TX_DESC_SIZE_128; return EFA_IO_TX_DESC_SIZE_64; @@ -1616,18 +1616,22 @@ static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap) static void efa_setup_qp(struct efa_context *ctx, struct efa_qp *qp, - struct ibv_qp_cap *cap, + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr, size_t page_size) { + bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); + struct ibv_qp_cap *cap = &attr->cap; uint16_t rq_desc_cnt; efa_qp_init_indices(qp); - qp->sq.wqe_size = efa_calc_sq_wqe_size(cap); + qp->sq.wqe_size = efa_calc_sq_wqe_size(cap, inline_write_enabled); qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, 
cap->max_send_wr, ctx->min_sq_wr)); qp->sq.wq.max_sge = cap->max_send_sge; qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1; + qp->sq.inline_write_enabled = inline_write_enabled; qp->rq.wq.max_sge = cap->max_recv_sge; rq_desc_cnt = roundup_pow_of_two(cap->max_recv_sge * cap->max_recv_wr); @@ -1680,9 +1684,11 @@ static int efa_check_qp_attr(struct efa_context *ctx, if (EFA_DEV_CAP(ctx, RDMA_READ)) supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_READ; - if (EFA_DEV_CAP(ctx, RDMA_WRITE)) + if (EFA_DEV_CAP(ctx, RDMA_WRITE)) { + supp_efa_flags |= EFADV_QP_FLAGS_INLINE_WRITE; supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + } if (EFA_DEV_CAP(ctx, UNSOLICITED_WRITE_RECV)) supp_efa_flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; @@ -1751,8 +1757,10 @@ static int efa_check_qp_attr(struct efa_context *ctx, } static int efa_check_qp_limits(struct efa_context *ctx, - struct ibv_qp_init_attr_ex *attr) + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr) { + bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); int sq_wqe_size; if (attr->cap.max_send_sge > ctx->max_sq_sge) { @@ -1769,7 +1777,7 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } - sq_wqe_size = efa_calc_sq_wqe_size(&attr->cap); + sq_wqe_size = efa_calc_sq_wqe_size(&attr->cap, inline_write_enabled); if (attr->cap.max_send_wr * sq_wqe_size > ctx->max_llq_size) { verbs_err(&ctx->ibvctx, "Max Send WR %u > %u\n", attr->cap.max_send_wr, @@ -1812,7 +1820,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, if (err) goto err_out; - err = efa_check_qp_limits(ctx, attr); + err = efa_check_qp_limits(ctx, attr, efa_attr); if (err) goto err_out; @@ -1822,7 +1830,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, goto err_out; } - efa_setup_qp(ctx, qp, &attr->cap, dev->pg_sz); + efa_setup_qp(ctx, qp, attr, efa_attr, dev->pg_sz); attr->cap.max_send_wr = qp->sq.wq.wqe_cnt; attr->cap.max_recv_wr = 
qp->rq.wq.wqe_cnt; @@ -2596,7 +2604,9 @@ static void efa_send_wr_rdma_write_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, - &tx_wqe->data.rdma_req.remote_mem, NULL); + &tx_wqe->data.rdma_req.remote_mem, + qp->sq.inline_write_enabled ? tx_wqe->data.rdma_req.inline_data : + NULL); } static void efa_send_wr_rdma_write_imm_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, @@ -2627,7 +2637,9 @@ static void efa_send_wr_rdma_write_imm_128(struct ibv_qp_ex *ibvqpx, uint32_t rk efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, - &tx_wqe->data.rdma_req.remote_mem, NULL); + &tx_wqe->data.rdma_req.remote_mem, + qp->sq.inline_write_enabled ? tx_wqe->data.rdma_req.inline_data : + NULL); efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); } @@ -2711,6 +2723,8 @@ static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); memcpy(qp->sq.curr_tx_wqe.inline_data, addr, length); md->length = length; + if (qp->sq.curr_tx_wqe.remote_mem) + qp->sq.curr_tx_wqe.remote_mem->length = length; } static void @@ -2757,6 +2771,8 @@ efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); md->length = total_length; + if (qp->sq.curr_tx_wqe.remote_mem) + qp->sq.curr_tx_wqe.remote_mem->length = total_length; } static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, diff --git a/pyverbs/providers/efa/efa_enums.pxd b/pyverbs/providers/efa/efa_enums.pxd index 11c85d62f..cfb73d979 100644 --- a/pyverbs/providers/efa/efa_enums.pxd +++ b/pyverbs/providers/efa/efa_enums.pxd @@ -17,6 +17,7 @@ cdef extern from 'infiniband/efadv.h': cpdef enum: EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV + EFADV_QP_FLAGS_INLINE_WRITE cpdef enum: EFADV_WC_EX_WITH_SGID From 
f33793efe873b481358ef6895477b3b1e5d24327 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 8 Feb 2026 10:32:07 +0000 Subject: [PATCH 08/11] efa: Add support to query new extended inline size in query DV Report the new extended max inline size in EFA query device DV. The new inline size deprecates the legacy one and should be used from now. Signed-off-by: Yonatan Nachum --- providers/efa/efadv.h | 2 +- providers/efa/man/efadv_query_device.3.md | 5 ++++- providers/efa/verbs.c | 3 ++- pyverbs/providers/efa/efadv.pyx | 7 ++++++- pyverbs/providers/efa/libefa.pxd | 4 ++-- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index 5e42c15ef..c47e5f8f2 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -32,7 +32,7 @@ struct efadv_device_attr { uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; }; diff --git a/providers/efa/man/efadv_query_device.3.md b/providers/efa/man/efadv_query_device.3.md index c41bc3d9d..f46d362fc 100644 --- a/providers/efa/man/efadv_query_device.3.md +++ b/providers/efa/man/efadv_query_device.3.md @@ -36,7 +36,7 @@ struct efadv_device_attr { uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; }; @@ -61,6 +61,9 @@ struct efadv_device_attr { : Maximum Receive Queue (RQ) Scatter Gather Elements (SGEs). *inline_buf_size* +: Maximum inline buffer size (deprecated by inline_buf_size_ex). + +*inline_buf_size_ex* : Maximum inline buffer size. 
*device_caps* diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 257c6ec16..67ea94d2c 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -158,7 +158,7 @@ int efadv_query_device(struct ibv_context *ibvctx, return EOPNOTSUPP; } - if (!vext_field_avail(typeof(*attr), inline_buf_size, inlen)) { + if (!vext_field_avail(typeof(*attr), inline_buf_size_ex, inlen)) { verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); return EINVAL; } @@ -169,6 +169,7 @@ int efadv_query_device(struct ibv_context *ibvctx, attr->max_sq_sge = ctx->max_sq_sge; attr->max_rq_sge = ctx->max_rq_sge; attr->inline_buf_size = ctx->inline_buf_size; + attr->inline_buf_size_ex = ctx->inline_buf_size_ex; if (vext_field_avail(typeof(*attr), device_caps, inlen)) { if (EFA_DEV_CAP(ctx, RNR_RETRY)) diff --git a/pyverbs/providers/efa/efadv.pyx b/pyverbs/providers/efa/efadv.pyx index ec21225aa..ab2a092ca 100644 --- a/pyverbs/providers/efa/efadv.pyx +++ b/pyverbs/providers/efa/efadv.pyx @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
cimport pyverbs.providers.efa.efa_enums as dve cimport pyverbs.providers.efa.libefa as dv @@ -92,6 +92,10 @@ cdef class EfaDVDeviceAttr(PyverbsObject): def inline_buf_size(self): return self.device_attr.inline_buf_size + @property + def inline_buf_size_ex(self): + return self.device_attr.inline_buf_size_ex + @property def device_caps(self): return self.device_attr.device_caps @@ -108,6 +112,7 @@ cdef class EfaDVDeviceAttr(PyverbsObject): print_format.format('Max SQ SQE', self.device_attr.max_sq_sge) + \ print_format.format('Max RQ SQE', self.device_attr.max_rq_sge) + \ print_format.format('Inline buffer size', self.device_attr.inline_buf_size) + \ + print_format.format('Inline buffer size ex', self.device_attr.inline_buf_size_ex) + \ print_format.format('Device Capabilities', dev_cap_to_str(self.device_attr.device_caps)) + \ print_format.format('Max RDMA Size', self.device_attr.max_rdma_size) diff --git a/pyverbs/providers/efa/libefa.pxd b/pyverbs/providers/efa/libefa.pxd index 265868ac0..a58de9661 100644 --- a/pyverbs/providers/efa/libefa.pxd +++ b/pyverbs/providers/efa/libefa.pxd @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. #cython: language_level=3 @@ -17,7 +17,7 @@ cdef extern from 'infiniband/efadv.h': uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; From b24d316dd6e3906164c4a3f96c7c7b6ea4315433 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 8 Feb 2026 12:15:34 +0000 Subject: [PATCH 09/11] efa: Add EFA DV to get max SQ depth based on init params With 128-byte WQE support, the max SQ depth is half the size compared to when using 64-byte WQE. 
Add an EFA DV that given SQ init parameters, calculates the max SQ depth so ULPs can use it to create QPs without failing on SQ depth. Signed-off-by: Yonatan Nachum --- debian/ibverbs-providers.symbols | 2 + providers/efa/CMakeLists.txt | 2 +- providers/efa/efadv.h | 15 ++++ providers/efa/libefa.map | 5 ++ providers/efa/man/efadv_get_sq_max_depth.3.md | 72 +++++++++++++++++++ providers/efa/verbs.c | 60 +++++++++++++--- pyverbs/providers/efa/efa_enums.pxd | 3 + pyverbs/providers/efa/efadv.pxd | 6 +- pyverbs/providers/efa/efadv.pyx | 45 ++++++++++++ pyverbs/providers/efa/libefa.pxd | 9 +++ util/util.h | 5 ++ 11 files changed, 214 insertions(+), 10 deletions(-) create mode 100644 providers/efa/man/efadv_get_sq_max_depth.3.md diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols index e280809bd..2cc3079f9 100644 --- a/debian/ibverbs-providers.symbols +++ b/debian/ibverbs-providers.symbols @@ -173,6 +173,7 @@ libefa.so.1 ibverbs-providers #MINVER# EFA_1.2@EFA_1.2 43 EFA_1.3@EFA_1.3 50 EFA_1.4@EFA_1.4 59 + EFA_1.5@EFA_1.5 63 efadv_create_driver_qp@EFA_1.0 24 efadv_create_qp_ex@EFA_1.1 26 efadv_query_device@EFA_1.1 26 @@ -182,6 +183,7 @@ libefa.so.1 ibverbs-providers #MINVER# efadv_query_mr@EFA_1.3 50 efadv_query_qp_wqs@EFA_1.4 59 efadv_query_cq@EFA_1.4 59 + efadv_get_max_sq_depth@EFA_1.5 63 libhns.so.1 ibverbs-providers #MINVER# * Build-Depends-Package: libibverbs-dev HNS_1.0@HNS_1.0 51 diff --git a/providers/efa/CMakeLists.txt b/providers/efa/CMakeLists.txt index c4ce3c0fe..ea082f0cf 100644 --- a/providers/efa/CMakeLists.txt +++ b/providers/efa/CMakeLists.txt @@ -3,7 +3,7 @@ if (ENABLE_LTTNG AND LTTNGUST_FOUND) endif() rdma_shared_provider(efa libefa.map - 1 1.4.${PACKAGE_VERSION} + 1 1.5.${PACKAGE_VERSION} ${TRACE_FILE} efa.c verbs.c diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index c47e5f8f2..2d7f45c30 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -47,6 +47,21 @@ struct efadv_ah_attr { uint8_t reserved[6]; 
}; +enum { + EFADV_SQ_DEPTH_ATTR_INLINE_WRITE = 1 << 0, +}; + +struct efadv_sq_depth_attr { + uint64_t comp_mask; + uint32_t flags; + uint32_t max_send_sge; + uint32_t max_rdma_sge; + uint32_t max_inline_data; +}; + +int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, + uint32_t inlen); + int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr, uint32_t inlen); diff --git a/providers/efa/libefa.map b/providers/efa/libefa.map index 13fac76a3..4e5ffaca4 100644 --- a/providers/efa/libefa.map +++ b/providers/efa/libefa.map @@ -29,3 +29,8 @@ EFA_1.4 { efadv_query_qp_wqs; efadv_query_cq; } EFA_1.3; + +EFA_1.5 { + global: + efadv_get_max_sq_depth; +} EFA_1.4; diff --git a/providers/efa/man/efadv_get_sq_max_depth.3.md b/providers/efa/man/efadv_get_sq_max_depth.3.md new file mode 100644 index 000000000..fd5c2d07d --- /dev/null +++ b/providers/efa/man/efadv_get_sq_max_depth.3.md @@ -0,0 +1,72 @@ +--- +layout: page +title: EFADV_GET_MAX_SQ_DEPTH +section: 3 +tagline: Verbs +date: 2026-02-17 +header: "EFA Direct Verbs Manual" +footer: efa +--- + +# NAME + +efadv_get_max_sq_depth - Get EFA send queue max depth based on send queue attributes + +# SYNOPSIS + +```c +#include <infiniband/efadv.h> + +int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, + uint32_t inlen); +``` + +# DESCRIPTION + +**efadv_get_max_sq_depth()** gets the device-specific send queue max depth based on SQ attributes. + +Compatibility is handled using the comp_mask and inlen fields. + +```c +struct efadv_sq_depth_attr { + uint64_t comp_mask; + uint32_t flags; + uint32_t max_send_sge; + uint32_t max_rdma_sge; + uint32_t max_inline_data; +}; +``` + +*inlen* +: In: Size of struct efadv_sq_depth_attr. + +*comp_mask* +: Compatibility mask. + +*flags* +: A bitwise OR of the values described below. + + EFADV_SQ_DEPTH_ATTR_INLINE_WRITE: + Inline RDMA write operation support is required.
+ +*max_send_sge* +: Requested max number of scatter/gather (s/g) elements in a send WR in the send queue. + +*max_rdma_sge* +: Requested max number of scatter/gather (s/g) elements in a RDMA WR in the send queue. + +*max_inline_data* +: Requested max number of data (bytes) that can be posted inline to the send queue. + +# RETURN VALUE + +**efadv_get_max_sq_depth()** returns max send queue depth on success, or the negative value of errno on failure +(which indicates the failure reason). + +# SEE ALSO + +**efadv**(7) + +# AUTHORS + +Yonatan Nachum diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 67ea94d2c..a4bdfa8ba 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1607,14 +1607,59 @@ static void efa_qp_init_indices(struct efa_qp *qp) qp->rq.wq.wrid_idx_pool_next = 0; } -static int efa_calc_sq_wqe_size(struct ibv_qp_cap *cap, bool inline_write_enabled) +static int efa_calc_sq_wqe_size(uint32_t max_inline_data, bool inline_write_enabled) { - if (cap->max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE || inline_write_enabled) + if (max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE || inline_write_enabled) return EFA_IO_TX_DESC_SIZE_128; return EFA_IO_TX_DESC_SIZE_64; } +static int efa_calc_sq_max_depth(struct efa_context *ctx, uint32_t max_inline_data, + bool write_with_inline) +{ + int sq_wqe_size = efa_calc_sq_wqe_size(max_inline_data, write_with_inline); + + return rounddown_pow_of_two(ctx->max_llq_size / sq_wqe_size); +} + +int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, + uint32_t inlen) +{ + bool write_with_inline = !!(attr->flags & EFADV_SQ_DEPTH_ATTR_INLINE_WRITE); + struct efa_context *ctx = to_efa_context(ibvctx); + + if (!is_efa_dev(ibvctx->device)) { + verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n"); + return -EOPNOTSUPP; + } + + if (!vext_field_avail(typeof(*attr), max_inline_data, inlen) || attr->comp_mask) { + verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); + 
return -EINVAL; + } + + if (attr->max_send_sge > ctx->max_sq_sge) { + verbs_err(verbs_get_ctx(ibvctx), "Max send SGE %u > %u\n", attr->max_send_sge, + ctx->max_sq_sge); + return -EINVAL; + } + + if (attr->max_rdma_sge > ctx->max_wr_rdma_sge) { + verbs_err(verbs_get_ctx(ibvctx), "Max RDMA SGE %u > %u\n", attr->max_rdma_sge, + ctx->max_wr_rdma_sge); + return -EINVAL; + } + + if (attr->max_inline_data > ctx->inline_buf_size_ex) { + verbs_err(verbs_get_ctx(ibvctx), "Max inline data %u > %u\n", attr->max_inline_data, + ctx->inline_buf_size_ex); + return -EINVAL; + } + + return efa_calc_sq_max_depth(ctx, attr->max_inline_data, write_with_inline); +} + static void efa_setup_qp(struct efa_context *ctx, struct efa_qp *qp, struct ibv_qp_init_attr_ex *attr, @@ -1627,7 +1672,7 @@ static void efa_setup_qp(struct efa_context *ctx, efa_qp_init_indices(qp); - qp->sq.wqe_size = efa_calc_sq_wqe_size(cap, inline_write_enabled); + qp->sq.wqe_size = efa_calc_sq_wqe_size(cap->max_inline_data, inline_write_enabled); qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, cap->max_send_wr, ctx->min_sq_wr)); qp->sq.wq.max_sge = cap->max_send_sge; @@ -1762,7 +1807,7 @@ static int efa_check_qp_limits(struct efa_context *ctx, struct efadv_qp_init_attr *efa_attr) { bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); - int sq_wqe_size; + int sq_max_depth; if (attr->cap.max_send_sge > ctx->max_sq_sge) { verbs_err(&ctx->ibvctx, @@ -1778,11 +1823,10 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } - sq_wqe_size = efa_calc_sq_wqe_size(&attr->cap, inline_write_enabled); - if (attr->cap.max_send_wr * sq_wqe_size > ctx->max_llq_size) { + sq_max_depth = efa_calc_sq_max_depth(ctx, attr->cap.max_inline_data, inline_write_enabled); + if (attr->cap.max_send_wr > sq_max_depth) { verbs_err(&ctx->ibvctx, - "Max Send WR %u > %u\n", attr->cap.max_send_wr, - ctx->max_llq_size / sq_wqe_size); + "Max Send WR %u > %u\n", attr->cap.max_send_wr, sq_max_depth); 
return EINVAL; } diff --git a/pyverbs/providers/efa/efa_enums.pxd b/pyverbs/providers/efa/efa_enums.pxd index cfb73d979..258b97866 100644 --- a/pyverbs/providers/efa/efa_enums.pxd +++ b/pyverbs/providers/efa/efa_enums.pxd @@ -27,3 +27,6 @@ cdef extern from 'infiniband/efadv.h': EFADV_MR_ATTR_VALIDITY_RECV_IC_ID EFADV_MR_ATTR_VALIDITY_RDMA_READ_IC_ID EFADV_MR_ATTR_VALIDITY_RDMA_RECV_IC_ID + + cpdef enum: + EFADV_SQ_DEPTH_ATTR_INLINE_WRITE diff --git a/pyverbs/providers/efa/efadv.pxd b/pyverbs/providers/efa/efadv.pxd index 12e11f8ce..74d0ad448 100644 --- a/pyverbs/providers/efa/efadv.pxd +++ b/pyverbs/providers/efa/efadv.pxd @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. #cython: language_level=3 @@ -50,3 +50,7 @@ cdef class EfaDVCQInitAttr(PyverbsObject): cdef class EfaDVMRAttr(PyverbsObject): cdef dv.efadv_mr_attr mr_attr + + +cdef class EfaDVSQDepthAttr(PyverbsObject): + cdef dv.efadv_sq_depth_attr sq_depth_attr diff --git a/pyverbs/providers/efa/efadv.pyx b/pyverbs/providers/efa/efadv.pyx index ab2a092ca..bc94c788f 100644 --- a/pyverbs/providers/efa/efadv.pyx +++ b/pyverbs/providers/efa/efadv.pyx @@ -330,3 +330,48 @@ cdef class EfaMR(MR): raise PyverbsRDMAError(f'Failed to query EFA MR', rc) return mr_attr + + +cdef class EfaDVSQDepthAttr(PyverbsObject): + """ + Represents efadv_sq_depth_attr struct + """ + @property + def comp_mask(self): + return self.sq_depth_attr.comp_mask + + @comp_mask.setter + def comp_mask(self, val): + self.sq_depth_attr.comp_mask = val + + @property + def flags(self): + return self.sq_depth_attr.flags + + @flags.setter + def flags(self, val): + self.sq_depth_attr.flags = val + + @property + def max_send_sge(self): + return self.sq_depth_attr.max_send_sge + + @max_send_sge.setter + def max_send_sge(self, val): + self.sq_depth_attr.max_send_sge = val + + 
@property + def max_rdma_sge(self): + return self.sq_depth_attr.max_rdma_sge + + @max_rdma_sge.setter + def max_rdma_sge(self, val): + self.sq_depth_attr.max_rdma_sge = val + + @property + def max_inline_data(self): + return self.sq_depth_attr.max_inline_data + + @max_inline_data.setter + def max_inline_data(self, val): + self.sq_depth_attr.max_inline_data = val diff --git a/pyverbs/providers/efa/libefa.pxd b/pyverbs/providers/efa/libefa.pxd index a58de9661..ca5a09c23 100644 --- a/pyverbs/providers/efa/libefa.pxd +++ b/pyverbs/providers/efa/libefa.pxd @@ -47,6 +47,13 @@ cdef extern from 'infiniband/efadv.h': uint16_t rdma_read_ic_id; uint16_t rdma_recv_ic_id; + cdef struct efadv_sq_depth_attr: + uint64_t comp_mask; + uint32_t flags; + uint32_t max_send_sge; + uint32_t max_rdma_sge; + uint32_t max_inline_data; + int efadv_query_device(v.ibv_context *ibvctx, efadv_device_attr *attrs, uint32_t inlen) int efadv_query_ah(v.ibv_ah *ibvah, efadv_ah_attr *attr, @@ -65,3 +72,5 @@ cdef extern from 'infiniband/efadv.h': int efadv_wc_read_sgid(efadv_cq *efadv_cq, v.ibv_gid *sgid) bool efadv_wc_is_unsolicited(efadv_cq *efadv_cq) int efadv_query_mr(v.ibv_mr *ibvmr, efadv_mr_attr *attr, uint32_t inlen) + int efadv_get_max_sq_depth(v.ibv_context *ibvctx, efadv_sq_depth_attr *attr, + uint32_t inlen) diff --git a/util/util.h b/util/util.h index 92b674067..ffccd1d92 100644 --- a/util/util.h +++ b/util/util.h @@ -81,6 +81,11 @@ static inline uint64_t roundup_pow_of_two(uint64_t n) return n == 1 ? 1 : 1ULL << ilog64(n - 1); } +static inline uint64_t rounddown_pow_of_two(uint64_t n) +{ + return n == 0 ? 
0 : 1ULL << (ilog64(n) - 1); +} + static inline unsigned long DIV_ROUND_UP(unsigned long n, unsigned long d) { return (n + d - 1) / d; From 807dfe7850c14ce49f01ee76a7c62aabb72b607b Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 8 Feb 2026 13:31:31 +0000 Subject: [PATCH 10/11] efa: Add EFA DV to get max RQ depth based on init params EFA RQ max depth reported is the max descriptors a single RQ supports. For RQ EFA support multiple SGEs which means a single recv WR consumes multiple descriptors. This means that if the ULP requests more than 1 descriptor per recv WR, it can't use the max RQ depth reported. To help ULPs, add a new EFA DV that given the max recv SGEs requested, reports back the max RQ depth. Signed-off-by: Yonatan Nachum --- debian/ibverbs-providers.symbols | 1 + providers/efa/efadv.h | 8 +++ providers/efa/libefa.map | 1 + providers/efa/man/efadv_get_rq_max_depth.3.md | 57 +++++++++++++++++++ providers/efa/verbs.c | 24 ++++++++ pyverbs/providers/efa/efadv.pxd | 4 ++ pyverbs/providers/efa/efadv.pyx | 21 +++++++ pyverbs/providers/efa/libefa.pxd | 6 ++ 8 files changed, 122 insertions(+) create mode 100644 providers/efa/man/efadv_get_rq_max_depth.3.md diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols index 2cc3079f9..7dfe785f7 100644 --- a/debian/ibverbs-providers.symbols +++ b/debian/ibverbs-providers.symbols @@ -184,6 +184,7 @@ libefa.so.1 ibverbs-providers #MINVER# efadv_query_qp_wqs@EFA_1.4 59 efadv_query_cq@EFA_1.4 59 efadv_get_max_sq_depth@EFA_1.5 63 + efadv_get_max_rq_depth@EFA_1.5 63 libhns.so.1 ibverbs-providers #MINVER# * Build-Depends-Package: libibverbs-dev HNS_1.0@HNS_1.0 51 diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index 2d7f45c30..bb2f9282b 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -62,6 +62,14 @@ struct efadv_sq_depth_attr { int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, uint32_t inlen); +struct efadv_rq_depth_attr { + 
uint64_t comp_mask; + uint32_t max_recv_sge; +}; + +int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr, + uint32_t inlen); + int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr, uint32_t inlen); diff --git a/providers/efa/libefa.map b/providers/efa/libefa.map index 4e5ffaca4..03a6d8e23 100644 --- a/providers/efa/libefa.map +++ b/providers/efa/libefa.map @@ -33,4 +33,5 @@ EFA_1.4 { EFA_1.5 { global: efadv_get_max_sq_depth; + efadv_get_max_rq_depth; } EFA_1.4; diff --git a/providers/efa/man/efadv_get_rq_max_depth.3.md b/providers/efa/man/efadv_get_rq_max_depth.3.md new file mode 100644 index 000000000..ffaaf2c0a --- /dev/null +++ b/providers/efa/man/efadv_get_rq_max_depth.3.md @@ -0,0 +1,57 @@ +--- +layout: page +title: EFADV_GET_MAX_RQ_DEPTH +section: 3 +tagline: Verbs +date: 2026-02-17 +header: "EFA Direct Verbs Manual" +footer: efa +--- + +# NAME + +efadv_get_max_rq_depth - Get EFA receive queue max depth based on receive queue attributes + +# SYNOPSIS + +```c +#include + +int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr, + uint32_t inlen); +``` + +# DESCRIPTION + +**efadv_get_max_rq_depth()** get device-specific receive queue max depth based on RQ attributes. + +Compatibility is handled using the comp_mask and inlen fields. + +```c +struct efadv_rq_depth_attr { + uint64_t comp_mask; + uint32_t max_recv_sge; +}; +``` + +*inlen* +: In: Size of struct efadv_rq_depth_attr. + +*comp_mask* +: Compatibility mask. + +*max_recv_sge* +: Requested max number of scatter/gather (s/g) elements in a WR in the receive queue. + +# RETURN VALUE + +**efadv_get_max_rq_depth()** returns max receive queue depth on success, or the negative value of errno on failure +(which indicates the failure reason). 
+
+# SEE ALSO
+
+**efadv**(7)
+
+# AUTHORS
+
+Yonatan Nachum
diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c
index a4bdfa8ba..8ae3912e7 100644
--- a/providers/efa/verbs.c
+++ b/providers/efa/verbs.c
@@ -1660,6 +1660,30 @@ int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_att
 	return efa_calc_sq_max_depth(ctx, attr->max_inline_data, write_with_inline);
 }
 
+int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr,
+			   uint32_t inlen)
+{
+	struct efa_context *ctx = to_efa_context(ibvctx);
+
+	if (!is_efa_dev(ibvctx->device)) {
+		verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!vext_field_avail(typeof(*attr), max_recv_sge, inlen) || attr->comp_mask) {
+		verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n");
+		return -EINVAL;
+	}
+
+	if (!attr->max_recv_sge || attr->max_recv_sge > ctx->max_rq_sge) {
+		verbs_err(verbs_get_ctx(ibvctx), "Max receive SGE %u > %u\n", attr->max_recv_sge,
+			  ctx->max_rq_sge);
+		return -EINVAL;
+	}
+
+	return rounddown_pow_of_two(ctx->max_rq_wr / attr->max_recv_sge);
+}
+
 static void efa_setup_qp(struct efa_context *ctx,
 			 struct efa_qp *qp,
 			 struct ibv_qp_init_attr_ex *attr,
diff --git a/pyverbs/providers/efa/efadv.pxd b/pyverbs/providers/efa/efadv.pxd
index 74d0ad448..249ce7570 100644
--- a/pyverbs/providers/efa/efadv.pxd
+++ b/pyverbs/providers/efa/efadv.pxd
@@ -54,3 +54,7 @@ cdef class EfaDVMRAttr(PyverbsObject):
 
 cdef class EfaDVSQDepthAttr(PyverbsObject):
     cdef dv.efadv_sq_depth_attr sq_depth_attr
+
+
+cdef class EfaDVRQDepthAttr(PyverbsObject):
+    cdef dv.efadv_rq_depth_attr rq_depth_attr
diff --git a/pyverbs/providers/efa/efadv.pyx b/pyverbs/providers/efa/efadv.pyx
index bc94c788f..2f3e04cba 100644
--- a/pyverbs/providers/efa/efadv.pyx
+++ b/pyverbs/providers/efa/efadv.pyx
@@ -375,3 +375,24 @@ cdef class EfaDVSQDepthAttr(PyverbsObject):
     @max_inline_data.setter
     def max_inline_data(self, val):
         self.sq_depth_attr.max_inline_data = val
+
+
+cdef class
EfaDVRQDepthAttr(PyverbsObject):
+    """
+    Represents efadv_rq_depth_attr struct
+    """
+    @property
+    def comp_mask(self):
+        return self.rq_depth_attr.comp_mask
+
+    @comp_mask.setter
+    def comp_mask(self, val):
+        self.rq_depth_attr.comp_mask = val
+
+    @property
+    def max_recv_sge(self):
+        return self.rq_depth_attr.max_recv_sge
+
+    @max_recv_sge.setter
+    def max_recv_sge(self, val):
+        self.rq_depth_attr.max_recv_sge = val
diff --git a/pyverbs/providers/efa/libefa.pxd b/pyverbs/providers/efa/libefa.pxd
index ca5a09c23..bf043434f 100644
--- a/pyverbs/providers/efa/libefa.pxd
+++ b/pyverbs/providers/efa/libefa.pxd
@@ -54,6 +54,10 @@ cdef extern from 'infiniband/efadv.h':
         uint32_t max_rdma_sge;
         uint32_t max_inline_data;
 
+    cdef struct efadv_rq_depth_attr:
+        uint64_t comp_mask;
+        uint32_t max_recv_sge;
+
     int efadv_query_device(v.ibv_context *ibvctx, efadv_device_attr *attrs,
                            uint32_t inlen)
     int efadv_query_ah(v.ibv_ah *ibvah, efadv_ah_attr *attr,
@@ -74,3 +78,5 @@ cdef extern from 'infiniband/efadv.h':
     int efadv_query_mr(v.ibv_mr *ibvmr, efadv_mr_attr *attr, uint32_t inlen)
     int efadv_get_max_sq_depth(v.ibv_context *ibvctx, efadv_sq_depth_attr *attr,
                                uint32_t inlen)
+    int efadv_get_max_rq_depth(v.ibv_context *ibvctx, efadv_rq_depth_attr *attr,
+                               uint32_t inlen)

From 547f89476c2568875733cbfdced5d9757a1d0ceb Mon Sep 17 00:00:00 2001
From: Yonatan Nachum
Date: Mon, 2 Mar 2026 18:15:31 +0000
Subject: [PATCH 11/11] efa: Validate RQ depth relatively to max WR SGEs

EFA RQ max depth reported is the max descriptors a single RQ supports.
For RQ EFA support multiple SGEs which means a single recv WR consumes
multiple descriptors. This means that if the ULP requests more than 1
descriptor per recv WR, it can't use the max RQ depth reported.
Validate the requested RQ depth relatively to the supported max RQ
depth with the max SGEs requested.
Signed-off-by: Yonatan Nachum --- providers/efa/verbs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 8ae3912e7..b83b9d2a8 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1660,6 +1660,11 @@ int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_att return efa_calc_sq_max_depth(ctx, attr->max_inline_data, write_with_inline); } +static int efa_calc_rq_max_depth(struct efa_context *ctx, uint32_t max_recv_sge) +{ + return rounddown_pow_of_two(ctx->max_rq_wr / max_recv_sge); +} + int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr, uint32_t inlen) { @@ -1681,7 +1686,7 @@ int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_att return -EINVAL; } - return rounddown_pow_of_two(ctx->max_rq_wr / attr->max_recv_sge); + return efa_calc_rq_max_depth(ctx, attr->max_recv_sge); } static void efa_setup_qp(struct efa_context *ctx, @@ -1831,7 +1836,7 @@ static int efa_check_qp_limits(struct efa_context *ctx, struct efadv_qp_init_attr *efa_attr) { bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); - int sq_max_depth; + int sq_max_depth, rq_max_depth; if (attr->cap.max_send_sge > ctx->max_sq_sge) { verbs_err(&ctx->ibvctx, @@ -1854,10 +1859,11 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } - if (attr->cap.max_recv_wr > ctx->max_rq_wr) { + rq_max_depth = efa_calc_rq_max_depth(ctx, attr->cap.max_recv_sge); + if (attr->cap.max_recv_wr > rq_max_depth) { verbs_err(&ctx->ibvctx, - "Max receive WR %u > %u\n", attr->cap.max_recv_wr, - ctx->max_rq_wr); + "Requested max SGE %u, max receive WR %u > %u\n", attr->cap.max_recv_sge, + attr->cap.max_recv_wr, rq_max_depth); return EINVAL; }