diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols index e280809bd..7dfe785f7 100644 --- a/debian/ibverbs-providers.symbols +++ b/debian/ibverbs-providers.symbols @@ -173,6 +173,7 @@ libefa.so.1 ibverbs-providers #MINVER# EFA_1.2@EFA_1.2 43 EFA_1.3@EFA_1.3 50 EFA_1.4@EFA_1.4 59 + EFA_1.5@EFA_1.5 63 efadv_create_driver_qp@EFA_1.0 24 efadv_create_qp_ex@EFA_1.1 26 efadv_query_device@EFA_1.1 26 @@ -182,6 +183,8 @@ libefa.so.1 ibverbs-providers #MINVER# efadv_query_mr@EFA_1.3 50 efadv_query_qp_wqs@EFA_1.4 59 efadv_query_cq@EFA_1.4 59 + efadv_get_max_sq_depth@EFA_1.5 63 + efadv_get_max_rq_depth@EFA_1.5 63 libhns.so.1 ibverbs-providers #MINVER# * Build-Depends-Package: libibverbs-dev HNS_1.0@HNS_1.0 51 diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h index 98b71b997..13225b038 100644 --- a/kernel-headers/rdma/efa-abi.h +++ b/kernel-headers/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_ABI_USER_H @@ -44,7 +44,8 @@ struct efa_ibv_alloc_ucontext_resp { __u32 max_llq_size; /* bytes */ __u16 max_tx_batch; /* units of 64 bytes */ __u16 min_sq_wr; - __u8 reserved_a0[4]; + __u16 inline_buf_size_ex; + __u8 reserved_b0[2]; }; struct efa_ibv_alloc_pd_resp { diff --git a/providers/efa/CMakeLists.txt b/providers/efa/CMakeLists.txt index c4ce3c0fe..ea082f0cf 100644 --- a/providers/efa/CMakeLists.txt +++ b/providers/efa/CMakeLists.txt @@ -3,7 +3,7 @@ if (ENABLE_LTTNG AND LTTNGUST_FOUND) endif() rdma_shared_provider(efa libefa.map - 1 1.4.${PACKAGE_VERSION} + 1 1.5.${PACKAGE_VERSION} ${TRACE_FILE} efa.c verbs.c diff --git a/providers/efa/efa.c b/providers/efa/efa.c index a0a95beb8..94a4126ba 100644 --- a/providers/efa/efa.c +++ b/providers/efa/efa.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include @@ -83,6 +83,10 @@ static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, ctx->cqe_size = sizeof(struct efa_io_rx_cdesc); ctx->ex_cqe_size = sizeof(struct efa_io_rx_cdesc_ex); ctx->inline_buf_size = resp.inline_buf_size; + ctx->inline_buf_size_ex = resp.inline_buf_size_ex; + if (ctx->inline_buf_size_ex == 0) + ctx->inline_buf_size_ex = ctx->inline_buf_size; + ctx->max_llq_size = resp.max_llq_size; ctx->max_tx_batch = resp.max_tx_batch; ctx->min_sq_wr = resp.min_sq_wr; diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 25b5e8f99..4abdbd6f8 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef __EFA_H__ @@ -31,6 +31,7 @@ struct efa_context { uint32_t cmds_supp_udata_mask; uint16_t sub_cqs_per_cq; uint16_t inline_buf_size; + uint16_t inline_buf_size_ex; uint32_t max_llq_size; uint32_t device_caps; uint32_t max_sq_wr; @@ -133,6 +134,21 @@ struct efa_rq { size_t buf_size; }; +struct efa_tx_wqe_ctx { + /* wqe buffer */ + void *buff; + /* wqe meta descriptor */ + struct efa_io_tx_meta_desc *md; + /* wqe local memory / SGL */ + struct efa_io_tx_buf_desc *local_mem; + /* wqe remote memory - RDMA only */ + struct efa_io_remote_mem_addr *remote_mem; + /* wqe inline data buffer */ + uint8_t *inline_data; + /* max sge allowed for this wqe */ + uint8_t max_sge; +}; + struct efa_sq { struct efa_wq wq; uint8_t *desc; @@ -141,6 +157,8 @@ struct efa_sq { size_t max_inline_data; size_t max_wr_rdma_sge; uint16_t max_batch_wr; + uint16_t wqe_size; + bool inline_write_enabled; /* Buffer for pending WR entries in the current session */ uint8_t *local_queue; @@ -149,7 +167,7 @@ struct efa_sq { /* Phase before current session */ int phase_rb; /* Current wqe being built */ - struct efa_io_tx_wqe *curr_tx_wqe; + struct efa_tx_wqe_ctx curr_tx_wqe; }; struct efa_qp { diff --git a/providers/efa/efa_io_defs.h b/providers/efa/efa_io_defs.h index e4f6f78ac..fccb217b7 100644 --- a/providers/efa/efa_io_defs.h +++ b/providers/efa/efa_io_defs.h @@ -9,6 +9,7 @@ #define EFA_IO_TX_DESC_NUM_BUFS 2 #define EFA_IO_TX_DESC_NUM_RDMA_BUFS 1 #define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32 +#define EFA_IO_TX_DESC_INLINE_MAX_SIZE_128 80 #define EFA_IO_TX_DESC_IMM_DATA_SIZE 4 enum efa_io_queue_type { @@ -164,9 +165,22 @@ struct efa_io_rdma_req { struct efa_io_tx_buf_desc local_mem[1]; }; +struct efa_io_rdma_req_128 { + /* Remote memory address */ + struct efa_io_remote_mem_addr remote_mem; + + union { + /* Local memory address */ + struct efa_io_tx_buf_desc local_mem[1]; + + /* inline data for RDMA */ + uint8_t inline_data[80]; + }; +}; + /* - * Tx WQE, composed of tx meta descriptors followed by 
either tx buffer - * descriptors or inline data + * 64-byte Tx WQE, composed of tx meta descriptors followed by either tx + * buffer descriptors or inline data */ struct efa_io_tx_wqe { /* TX meta */ @@ -183,6 +197,25 @@ struct efa_io_tx_wqe { } data; }; +/* + * 128-byte Tx WQE, composed of tx meta descriptors followed by either tx + * buffer descriptors or inline data + */ +struct efa_io_tx_wqe_128 { + /* TX meta */ + struct efa_io_tx_meta_desc meta; + + union { + /* Send buffer descriptors */ + struct efa_io_tx_buf_desc sgl[2]; + + uint8_t inline_data[80]; + + /* RDMA local and remote memory addresses */ + struct efa_io_rdma_req_128 rdma_req; + } data; +}; + /* * Rx buffer descriptor; RX WQE is composed of one or more RX buffer * descriptors. diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index 7c034f881..bb2f9282b 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019-2025 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef __EFADV_H__ @@ -32,7 +32,7 @@ struct efadv_device_attr { uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; }; @@ -47,6 +47,29 @@ struct efadv_ah_attr { uint8_t reserved[6]; }; +enum { + EFADV_SQ_DEPTH_ATTR_INLINE_WRITE = 1 << 0, +}; + +struct efadv_sq_depth_attr { + uint64_t comp_mask; + uint32_t flags; + uint32_t max_send_sge; + uint32_t max_rdma_sge; + uint32_t max_inline_data; +}; + +int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, + uint32_t inlen); + +struct efadv_rq_depth_attr { + uint64_t comp_mask; + uint32_t max_recv_sge; +}; + +int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr, + uint32_t inlen); + int efadv_query_ah(struct ibv_ah *ibvah, struct efadv_ah_attr *attr, uint32_t inlen); @@ -61,6 +84,7 @@ struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd, enum { EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV = 1 << 0, + EFADV_QP_FLAGS_INLINE_WRITE = 1 << 1, }; struct efadv_qp_init_attr { diff --git a/providers/efa/libefa.map b/providers/efa/libefa.map index 13fac76a3..03a6d8e23 100644 --- a/providers/efa/libefa.map +++ b/providers/efa/libefa.map @@ -29,3 +29,9 @@ EFA_1.4 { efadv_query_qp_wqs; efadv_query_cq; } EFA_1.3; + +EFA_1.5 { + global: + efadv_get_max_sq_depth; + efadv_get_max_rq_depth; +} EFA_1.4; diff --git a/providers/efa/man/efadv_create_qp_ex.3.md b/providers/efa/man/efadv_create_qp_ex.3.md index aaeedfdee..8617fe363 100644 --- a/providers/efa/man/efadv_create_qp_ex.3.md +++ b/providers/efa/man/efadv_create_qp_ex.3.md @@ -68,6 +68,9 @@ struct efadv_qp_init_attr { EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV: Receive WRs will not be consumed for RDMA write with imm. + EFADV_QP_FLAGS_INLINE_WRITE: + QP supports RDMA write with inline operations. + *sl* : Service Level - 0 value implies default level. 
diff --git a/providers/efa/man/efadv_get_rq_max_depth.3.md b/providers/efa/man/efadv_get_rq_max_depth.3.md
new file mode 100644
index 000000000..ffaaf2c0a
--- /dev/null
+++ b/providers/efa/man/efadv_get_rq_max_depth.3.md
@@ -0,0 +1,57 @@
+---
+layout: page
+title: EFADV_GET_MAX_RQ_DEPTH
+section: 3
+tagline: Verbs
+date: 2026-02-17
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_get_max_rq_depth - Get EFA receive queue max depth based on receive queue attributes
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr,
+			   uint32_t inlen);
+```
+
+# DESCRIPTION
+
+**efadv_get_max_rq_depth()** gets device-specific receive queue max depth based on RQ attributes.
+
+Compatibility is handled using the comp_mask and inlen fields.
+
+```c
+struct efadv_rq_depth_attr {
+	uint64_t comp_mask;
+	uint32_t max_recv_sge;
+};
+```
+
+*inlen*
+: In: Size of struct efadv_rq_depth_attr.
+
+*comp_mask*
+: Compatibility mask.
+
+*max_recv_sge*
+: Requested max number of scatter/gather (s/g) elements in a WR in the receive queue.
+
+# RETURN VALUE
+
+**efadv_get_max_rq_depth()** returns max receive queue depth on success, or the negative value of errno on failure
+(which indicates the failure reason).
+
+# SEE ALSO
+
+**efadv**(7)
+
+# AUTHORS
+
+Yonatan Nachum
diff --git a/providers/efa/man/efadv_get_sq_max_depth.3.md b/providers/efa/man/efadv_get_sq_max_depth.3.md
new file mode 100644
index 000000000..fd5c2d07d
--- /dev/null
+++ b/providers/efa/man/efadv_get_sq_max_depth.3.md
@@ -0,0 +1,72 @@
+---
+layout: page
+title: EFADV_GET_MAX_SQ_DEPTH
+section: 3
+tagline: Verbs
+date: 2026-02-17
+header: "EFA Direct Verbs Manual"
+footer: efa
+---
+
+# NAME
+
+efadv_get_max_sq_depth - Get EFA send queue max depth based on send queue attributes
+
+# SYNOPSIS
+
+```c
+#include <infiniband/efadv.h>
+
+int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr,
+			   uint32_t inlen);
+```
+
+# DESCRIPTION
+
+**efadv_get_max_sq_depth()** gets device-specific send queue max depth based on SQ attributes.
+
+Compatibility is handled using the comp_mask and inlen fields.
+
+```c
+struct efadv_sq_depth_attr {
+	uint64_t comp_mask;
+	uint32_t flags;
+	uint32_t max_send_sge;
+	uint32_t max_rdma_sge;
+	uint32_t max_inline_data;
+};
+```
+
+*inlen*
+: In: Size of struct efadv_sq_depth_attr.
+
+*comp_mask*
+: Compatibility mask.
+
+*flags*
+: A bitwise OR of the values described below.
+
+	EFADV_SQ_DEPTH_ATTR_INLINE_WRITE:
+	Inline RDMA write operation support is required.
+
+*max_send_sge*
+: Requested max number of scatter/gather (s/g) elements in a send WR in the send queue.
+
+*max_rdma_sge*
+: Requested max number of scatter/gather (s/g) elements in a RDMA WR in the send queue.
+
+*max_inline_data*
+: Requested max number of data (bytes) that can be posted inline to the send queue.
+
+# RETURN VALUE
+
+**efadv_get_max_sq_depth()** returns max send queue depth on success, or the negative value of errno on failure
+(which indicates the failure reason).
+ +# SEE ALSO + +**efadv**(7) + +# AUTHORS + +Yonatan Nachum diff --git a/providers/efa/man/efadv_query_device.3.md b/providers/efa/man/efadv_query_device.3.md index c41bc3d9d..f46d362fc 100644 --- a/providers/efa/man/efadv_query_device.3.md +++ b/providers/efa/man/efadv_query_device.3.md @@ -36,7 +36,7 @@ struct efadv_device_attr { uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; }; @@ -61,6 +61,9 @@ struct efadv_device_attr { : Maximum Receive Queue (RQ) Scatter Gather Elements (SGEs). *inline_buf_size* +: Maximum inline buffer size (deprecated by inline_buf_size_ex). + +*inline_buf_size_ex* : Maximum inline buffer size. *device_caps* diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index c2cc31fdd..b83b9d2a8 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -27,6 +27,9 @@ #define EFA_DEV_CAP(ctx, cap) \ ((ctx)->device_caps & EFA_QUERY_DEVICE_CAPS_##cap) +#define EFA_IO_TX_DESC_SIZE_64 (sizeof(struct efa_io_tx_wqe)) +#define EFA_IO_TX_DESC_SIZE_128 (sizeof(struct efa_io_tx_wqe_128)) + static bool is_buf_cleared(void *buf, size_t len) { int i; @@ -90,7 +93,7 @@ int efa_query_device_ex(struct ibv_context *context, } a->max_qp_wr = min_t(int, a->max_qp_wr, - ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); + ctx->max_llq_size / EFA_IO_TX_DESC_SIZE_64); memcpy(fw_ver, &resp.ibv_resp.base.fw_ver, sizeof(resp.ibv_resp.base.fw_ver)); snprintf(a->fw_ver, sizeof(a->fw_ver), "%u.%u.%u.%u", @@ -155,7 +158,7 @@ int efadv_query_device(struct ibv_context *ibvctx, return EOPNOTSUPP; } - if (!vext_field_avail(typeof(*attr), inline_buf_size, inlen)) { + if (!vext_field_avail(typeof(*attr), inline_buf_size_ex, inlen)) { verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); return EINVAL; } @@ -166,6 +169,7 @@ int efadv_query_device(struct ibv_context *ibvctx, attr->max_sq_sge = ctx->max_sq_sge; attr->max_rq_sge = ctx->max_rq_sge; 
attr->inline_buf_size = ctx->inline_buf_size; + attr->inline_buf_size_ex = ctx->inline_buf_size_ex; if (vext_field_avail(typeof(*attr), device_caps, inlen)) { if (EFA_DEV_CAP(ctx, RNR_RETRY)) @@ -1488,7 +1492,7 @@ static int efa_sq_initialize(struct efa_qp *qp, } sq->desc_offset = resp->llq_desc_offset; - desc_ring_size = sq->wq.wqe_cnt * sizeof(struct efa_io_tx_wqe); + desc_ring_size = sq->wq.wqe_cnt * sq->wqe_size; sq->desc_ring_mmap_size = align(desc_ring_size + sq->desc_offset, qp->page_size); sq->max_inline_data = attr->cap.max_inline_data; @@ -1512,7 +1516,7 @@ static int efa_sq_initialize(struct efa_qp *qp, sq->max_wr_rdma_sge = min_t(uint16_t, ctx->max_wr_rdma_sge, EFA_IO_TX_DESC_NUM_RDMA_BUFS); sq->max_batch_wr = ctx->max_tx_batch ? - (ctx->max_tx_batch * 64) / sizeof(struct efa_io_tx_wqe) : + (ctx->max_tx_batch * 64) / sq->wqe_size : UINT16_MAX; if (ctx->min_sq_wr) { /* The device can't accept a doorbell for the whole SQ at once, @@ -1603,19 +1607,106 @@ static void efa_qp_init_indices(struct efa_qp *qp) qp->rq.wq.wrid_idx_pool_next = 0; } +static int efa_calc_sq_wqe_size(uint32_t max_inline_data, bool inline_write_enabled) +{ + if (max_inline_data > EFA_IO_TX_DESC_INLINE_MAX_SIZE || inline_write_enabled) + return EFA_IO_TX_DESC_SIZE_128; + + return EFA_IO_TX_DESC_SIZE_64; +} + +static int efa_calc_sq_max_depth(struct efa_context *ctx, uint32_t max_inline_data, + bool write_with_inline) +{ + int sq_wqe_size = efa_calc_sq_wqe_size(max_inline_data, write_with_inline); + + return rounddown_pow_of_two(ctx->max_llq_size / sq_wqe_size); +} + +int efadv_get_max_sq_depth(struct ibv_context *ibvctx, struct efadv_sq_depth_attr *attr, + uint32_t inlen) +{ + bool write_with_inline = !!(attr->flags & EFADV_SQ_DEPTH_ATTR_INLINE_WRITE); + struct efa_context *ctx = to_efa_context(ibvctx); + + if (!is_efa_dev(ibvctx->device)) { + verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n"); + return -EOPNOTSUPP; + } + + if (!vext_field_avail(typeof(*attr), max_inline_data, 
inlen) || attr->comp_mask) { + verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); + return -EINVAL; + } + + if (attr->max_send_sge > ctx->max_sq_sge) { + verbs_err(verbs_get_ctx(ibvctx), "Max send SGE %u > %u\n", attr->max_send_sge, + ctx->max_sq_sge); + return -EINVAL; + } + + if (attr->max_rdma_sge > ctx->max_wr_rdma_sge) { + verbs_err(verbs_get_ctx(ibvctx), "Max RDMA SGE %u > %u\n", attr->max_rdma_sge, + ctx->max_wr_rdma_sge); + return -EINVAL; + } + + if (attr->max_inline_data > ctx->inline_buf_size_ex) { + verbs_err(verbs_get_ctx(ibvctx), "Max inline data %u > %u\n", attr->max_inline_data, + ctx->inline_buf_size_ex); + return -EINVAL; + } + + return efa_calc_sq_max_depth(ctx, attr->max_inline_data, write_with_inline); +} + +static int efa_calc_rq_max_depth(struct efa_context *ctx, uint32_t max_recv_sge) +{ + return rounddown_pow_of_two(ctx->max_rq_wr / max_recv_sge); +} + +int efadv_get_max_rq_depth(struct ibv_context *ibvctx, struct efadv_rq_depth_attr *attr, + uint32_t inlen) +{ + struct efa_context *ctx = to_efa_context(ibvctx); + + if (!is_efa_dev(ibvctx->device)) { + verbs_err(verbs_get_ctx(ibvctx), "Not an EFA device\n"); + return -EOPNOTSUPP; + } + + if (!vext_field_avail(typeof(*attr), max_recv_sge, inlen) || attr->comp_mask) { + verbs_err(verbs_get_ctx(ibvctx), "Compatibility issues\n"); + return -EINVAL; + } + + if (attr->max_recv_sge > ctx->max_rq_sge) { + verbs_err(verbs_get_ctx(ibvctx), "Max receive SGE %u > %u\n", attr->max_recv_sge, + ctx->max_rq_sge); + return -EINVAL; + } + + return efa_calc_rq_max_depth(ctx, attr->max_recv_sge); +} + static void efa_setup_qp(struct efa_context *ctx, struct efa_qp *qp, - struct ibv_qp_cap *cap, + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr, size_t page_size) { + bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); + struct ibv_qp_cap *cap = &attr->cap; uint16_t rq_desc_cnt; efa_qp_init_indices(qp); + qp->sq.wqe_size = 
efa_calc_sq_wqe_size(cap->max_inline_data, inline_write_enabled); qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, cap->max_send_wr, ctx->min_sq_wr)); qp->sq.wq.max_sge = cap->max_send_sge; qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1; + qp->sq.inline_write_enabled = inline_write_enabled; qp->rq.wq.max_sge = cap->max_recv_sge; rq_desc_cnt = roundup_pow_of_two(cap->max_recv_sge * cap->max_recv_wr); @@ -1652,7 +1743,8 @@ static void efa_unlock_cqs(struct ibv_qp *ibvqp) } static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, - struct ibv_qp_init_attr_ex *attr_ex); + struct ibv_qp_init_attr_ex *attr_ex, + uint16_t wqe_size); static int efa_check_qp_attr(struct efa_context *ctx, struct ibv_qp_init_attr_ex *attr, @@ -1667,9 +1759,11 @@ static int efa_check_qp_attr(struct efa_context *ctx, if (EFA_DEV_CAP(ctx, RDMA_READ)) supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_READ; - if (EFA_DEV_CAP(ctx, RDMA_WRITE)) + if (EFA_DEV_CAP(ctx, RDMA_WRITE)) { + supp_efa_flags |= EFADV_QP_FLAGS_INLINE_WRITE; supp_srd_send_ops_mask |= IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; + } if (EFA_DEV_CAP(ctx, UNSOLICITED_WRITE_RECV)) supp_efa_flags |= EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV; @@ -1738,8 +1832,12 @@ static int efa_check_qp_attr(struct efa_context *ctx, } static int efa_check_qp_limits(struct efa_context *ctx, - struct ibv_qp_init_attr_ex *attr) + struct ibv_qp_init_attr_ex *attr, + struct efadv_qp_init_attr *efa_attr) { + bool inline_write_enabled = !!(efa_attr->flags & EFADV_QP_FLAGS_INLINE_WRITE); + int sq_max_depth, rq_max_depth; + if (attr->cap.max_send_sge > ctx->max_sq_sge) { verbs_err(&ctx->ibvctx, "Max send SGE %u > %u\n", attr->cap.max_send_sge, @@ -1754,17 +1852,25 @@ static int efa_check_qp_limits(struct efa_context *ctx, return EINVAL; } - if (attr->cap.max_send_wr > ctx->max_sq_wr) { + sq_max_depth = efa_calc_sq_max_depth(ctx, attr->cap.max_inline_data, inline_write_enabled); + if (attr->cap.max_send_wr > sq_max_depth) { 
verbs_err(&ctx->ibvctx, - "Max send WR %u > %u\n", attr->cap.max_send_wr, - ctx->max_sq_wr); + "Max Send WR %u > %u\n", attr->cap.max_send_wr, sq_max_depth); return EINVAL; } - if (attr->cap.max_recv_wr > ctx->max_rq_wr) { + rq_max_depth = efa_calc_rq_max_depth(ctx, attr->cap.max_recv_sge); + if (attr->cap.max_recv_wr > rq_max_depth) { verbs_err(&ctx->ibvctx, - "Max receive WR %u > %u\n", attr->cap.max_recv_wr, - ctx->max_rq_wr); + "Requested max SGE %u, max receive WR %u > %u\n", attr->cap.max_recv_sge, + attr->cap.max_recv_wr, rq_max_depth); + return EINVAL; + } + + if (attr->cap.max_inline_data > ctx->inline_buf_size_ex) { + verbs_err(&ctx->ibvctx, + "Max inline data %u > %u\n", attr->cap.max_inline_data, + ctx->inline_buf_size_ex); return EINVAL; } @@ -1789,7 +1895,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, if (err) goto err_out; - err = efa_check_qp_limits(ctx, attr); + err = efa_check_qp_limits(ctx, attr, efa_attr); if (err) goto err_out; @@ -1799,15 +1905,14 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, goto err_out; } - efa_setup_qp(ctx, qp, &attr->cap, dev->pg_sz); + efa_setup_qp(ctx, qp, attr, efa_attr, dev->pg_sz); attr->cap.max_send_wr = qp->sq.wq.wqe_cnt; attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt; req.rq_ring_size = (qp->rq.wq.desc_mask + 1) * sizeof(struct efa_io_rx_desc); - req.sq_ring_size = (attr->cap.max_send_wr) * - sizeof(struct efa_io_tx_wqe); + req.sq_ring_size = attr->cap.max_send_wr * qp->sq.wqe_size; if (attr->qp_type == IBV_QPT_DRIVER) req.driver_qp_type = efa_attr->driver_qp_type; if (efa_attr->flags & EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV) @@ -1839,7 +1944,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx, pthread_spin_unlock(&ctx->qp_table_lock); if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { - efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr); + efa_qp_fill_wr_pfns(&qp->verbs_qp.qp_ex, attr, qp->sq.wqe_size); qp->verbs_qp.comp_mask |= VERBS_QP_EX; } @@ -2014,7 +2119,7 @@ int 
efadv_query_qp_wqs(struct ibv_qp *ibvqp, struct efadv_wq_attr *sq_attr, sq_attr->comp_mask = 0; sq_attr->buffer = qp->sq.desc; - sq_attr->entry_size = sizeof(struct efa_io_tx_wqe); + sq_attr->entry_size = qp->sq.wqe_size; sq_attr->num_entries = qp->sq.wq.wqe_cnt; sq_attr->doorbell = qp->sq.wq.db; sq_attr->max_batch = qp->sq.max_batch_wr; @@ -2083,20 +2188,23 @@ static void efa_set_tx_buf(struct efa_io_tx_buf_desc *tx_buf, } static void efa_post_send_sgl(struct efa_io_tx_buf_desc *tx_bufs, + struct efa_io_tx_meta_desc *md, const struct ibv_sge *sg_list, int num_sge) { const struct ibv_sge *sge; size_t i; + md->length = num_sge; + for (i = 0; i < num_sge; i++) { sge = &sg_list[i]; efa_set_tx_buf(&tx_bufs[i], sge->addr, sge->lkey, sge->length); } } -static void efa_post_send_inline_data(const struct ibv_send_wr *wr, - struct efa_io_tx_wqe *tx_wqe) +static void efa_post_send_inline_data(const struct ibv_send_wr *wr, struct efa_io_tx_meta_desc *md, + uint8_t *inline_data) { const struct ibv_sge *sgl = wr->sg_list; uint32_t total_length = 0; @@ -2106,13 +2214,13 @@ static void efa_post_send_inline_data(const struct ibv_send_wr *wr, for (i = 0; i < wr->num_sge; i++) { length = sgl[i].length; - memcpy(tx_wqe->data.inline_data + total_length, + memcpy(inline_data + total_length, (void *)(uintptr_t)sgl[i].addr, length); total_length += length; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - tx_wqe->meta.length = total_length; + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + md->length = total_length; } static size_t efa_sge_total_bytes(const struct ibv_sge *sg_list, int num_sge) @@ -2173,25 +2281,26 @@ static void efa_set_common_ctrl_flags(struct efa_io_tx_meta_desc *desc, } #if defined(LTTNG_ENABLED) || defined(USDT_ENABLED) -static uint32_t efa_get_wqe_length(struct efa_io_tx_wqe *tx_wqe) +static uint32_t efa_wqe_get_data_length(struct efa_sq *sq) { + struct efa_io_tx_meta_desc *md = sq->curr_tx_wqe.md; enum efa_io_send_op_type op_type; 
uint32_t length = 0; size_t i; - op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); switch (op_type) { case EFA_IO_SEND: - if (EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG)) - return tx_wqe->meta.length; + if (EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG)) + return md->length; - for (i = 0; i < tx_wqe->meta.length; i++) - length += tx_wqe->data.sgl[i].length; + for (i = 0; i < md->length; i++) + length += sq->curr_tx_wqe.local_mem[i].length; return length; case EFA_IO_RDMA_READ: case EFA_IO_RDMA_WRITE: - return tx_wqe->data.rdma_req.remote_mem.length; + return sq->curr_tx_wqe.remote_mem->length; } return 0; @@ -2280,16 +2389,38 @@ static int efa_post_send_validate_wr(struct efa_qp *qp, int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad) { - struct efa_io_tx_meta_desc *meta_desc; + uint8_t wqe_buf[EFA_IO_TX_DESC_SIZE_128]; struct efa_qp *qp = to_efa_qp(ibvqp); - struct efa_io_tx_wqe tx_wqe; + struct efa_io_tx_wqe_128 *tx_wqe_128; + struct efa_io_tx_meta_desc *md; + struct efa_io_tx_buf_desc *sgl; + struct efa_io_tx_wqe *tx_wqe; struct efa_sq *sq = &qp->sq; struct efa_wq *wq = &sq->wq; uint32_t sq_desc_offset; uint32_t curbatch = 0; + uint8_t *inline_data; struct efa_ah *ah; int err = 0; + switch (sq->wqe_size) { + case EFA_IO_TX_DESC_SIZE_64: + tx_wqe = (struct efa_io_tx_wqe *)wqe_buf; + md = &tx_wqe->meta; + sgl = tx_wqe->data.sgl; + inline_data = tx_wqe->data.inline_data; + break; + case EFA_IO_TX_DESC_SIZE_128: + tx_wqe_128 = (struct efa_io_tx_wqe_128 *)wqe_buf; + md = &tx_wqe_128->meta; + sgl = tx_wqe_128->data.sgl; + inline_data = tx_wqe_128->data.inline_data; + break; + + default: + return EINVAL; + } + if (wq->need_lock) mmio_wc_spinlock(&wq->wqlock); else @@ -2302,37 +2433,30 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, goto ring_db; } - memset(&tx_wqe, 0, sizeof(tx_wqe)); - meta_desc = &tx_wqe.meta; 
+ memset(wqe_buf, 0, sq->wqe_size); ah = to_efa_ah(wr->wr.ud.ah); if (wr->send_flags & IBV_SEND_INLINE) { - efa_post_send_inline_data(wr, &tx_wqe); + efa_post_send_inline_data(wr, md, inline_data); } else { - meta_desc->length = wr->num_sge; - efa_post_send_sgl(tx_wqe.data.sgl, wr->sg_list, - wr->num_sge); + efa_post_send_sgl(sgl, md, wr->sg_list, wr->num_sge); } if (wr->opcode == IBV_WR_SEND_WITH_IMM) { - meta_desc->immediate_data = be32toh(wr->imm_data); - EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, - 1); + md->immediate_data = be32toh(wr->imm_data); + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); } /* Set rest of the descriptor fields */ - efa_set_common_ctrl_flags(meta_desc, sq, EFA_IO_SEND); - meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(wq, - wr->wr_id); - meta_desc->dest_qp_num = wr->wr.ud.remote_qpn; - meta_desc->ah = ah->efa_ah; - meta_desc->qkey = wr->wr.ud.remote_qkey; + efa_set_common_ctrl_flags(md, sq, EFA_IO_SEND); + md->req_id = efa_wq_get_next_wrid_idx_locked(wq, wr->wr_id); + md->dest_qp_num = wr->wr.ud.remote_qpn; + md->ah = ah->efa_ah; + md->qkey = wr->wr.ud.remote_qkey; /* Copy descriptor */ - sq_desc_offset = (wq->pc & wq->desc_mask) * - sizeof(tx_wqe); - mmio_memcpy_x64(sq->desc + sq_desc_offset, &tx_wqe, - sizeof(tx_wqe)); + sq_desc_offset = (wq->pc & wq->desc_mask) * sq->wqe_size; + mmio_memcpy_x64(sq->desc + sq_desc_offset, wqe_buf, sq->wqe_size); /* advance index and change phase */ efa_sq_advance_post_idx(sq); @@ -2345,8 +2469,8 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, mmio_wc_start(); } rdma_tracepoint(rdma_core_efa, post_send, qp->dev->name, wr->wr_id, - EFA_IO_SEND, ibvqp->qp_num, meta_desc->dest_qp_num, - ah->efa_ah, efa_get_wqe_length(&tx_wqe)); + EFA_IO_SEND, ibvqp->qp_num, md->dest_qp_num, + ah->efa_ah, efa_wqe_get_data_length(sq)); wr = wr->next; } @@ -2366,12 +2490,9 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, return err; } -static struct efa_io_tx_wqe 
*efa_send_wr_common(struct ibv_qp_ex *ibvqpx, - enum efa_io_send_op_type op_type) +static void *efa_send_wr_alloc(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx) { - struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_sq *sq = &qp->sq; - struct efa_io_tx_meta_desc *meta_desc; int err; if (unlikely(qp->wr_session_err)) @@ -2383,179 +2504,275 @@ static struct efa_io_tx_wqe *efa_send_wr_common(struct ibv_qp_ex *ibvqpx, return NULL; } - sq->curr_tx_wqe = (struct efa_io_tx_wqe *)sq->local_queue + - sq->num_wqe_pending; - memset(sq->curr_tx_wqe, 0, sizeof(*sq->curr_tx_wqe)); + sq->curr_tx_wqe.buff = sq->local_queue + sq->num_wqe_pending * sq->wqe_size; + memset(sq->curr_tx_wqe.buff, 0, sq->wqe_size); + + return sq->curr_tx_wqe.buff; +} + +static void efa_send_wr_init(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + enum efa_io_send_op_type op_type, uint8_t max_sge, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) +{ + struct efa_sq *sq = &qp->sq; - meta_desc = &sq->curr_tx_wqe->meta; - efa_set_common_ctrl_flags(meta_desc, sq, op_type); - meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(&sq->wq, - ibvqpx->wr_id); + sq->curr_tx_wqe.md = md; + efa_set_common_ctrl_flags(sq->curr_tx_wqe.md, sq, op_type); + sq->curr_tx_wqe.md->req_id = efa_wq_get_next_wrid_idx_locked(&sq->wq, ibvqpx->wr_id); /* advance index and change phase */ efa_sq_advance_post_idx(sq); sq->num_wqe_pending++; - return sq->curr_tx_wqe; + sq->curr_tx_wqe.local_mem = local_mem; + sq->curr_tx_wqe.remote_mem = remote_mem; + sq->curr_tx_wqe.inline_data = inline_data; + sq->curr_tx_wqe.max_sge = max_sge; } -static void efa_send_wr_set_imm_data(struct efa_io_tx_wqe *tx_wqe, __be32 imm_data) +static void efa_send_wr_set_imm_data(struct efa_io_tx_meta_desc *meta_desc, __be32 imm_data) { - struct efa_io_tx_meta_desc *meta_desc; - - meta_desc = &tx_wqe->meta; meta_desc->immediate_data = be32toh(imm_data); 
EFA_SET(&meta_desc->ctrl1, EFA_IO_TX_META_DESC_HAS_IMM, 1); } -static void efa_send_wr_set_rdma_addr(struct efa_io_tx_wqe *tx_wqe, uint32_t rkey, +static void efa_send_wr_set_rdma_addr(struct efa_io_remote_mem_addr *remote_mem, uint32_t rkey, uint64_t remote_addr) { - struct efa_io_remote_mem_addr *remote_mem; - - remote_mem = &tx_wqe->data.rdma_req.remote_mem; remote_mem->rkey = rkey; remote_mem->buf_addr_lo = remote_addr & 0xFFFFFFFF; remote_mem->buf_addr_hi = remote_addr >> 32; } -static void efa_send_wr_send(struct ibv_qp_ex *ibvqpx) +static void efa_send_wr_send_64(struct ibv_qp_ex *ibvqpx) { - efa_send_wr_common(ibvqpx, EFA_IO_SEND); + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); +} + +static void efa_send_wr_send_128(struct ibv_qp_ex *ibvqpx) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); } -static void efa_send_wr_send_imm(struct ibv_qp_ex *ibvqpx, __be32 imm_data) +static void efa_send_wr_send_imm_64(struct ibv_qp_ex *ibvqpx, __be32 imm_data) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_SEND); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_imm_data(tx_wqe, imm_data); + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); } -static void 
efa_send_wr_rdma_read(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr) +static void efa_send_wr_send_imm_128(struct ibv_qp_ex *ibvqpx, __be32 imm_data) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_init(qp, ibvqpx, EFA_IO_SEND, qp->sq.wq.max_sge, &tx_wqe->meta, + tx_wqe->data.sgl, NULL, tx_wqe->data.inline_data); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); +} + +static inline void efa_send_wr_rdma_common(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + uint32_t rkey, uint64_t remote_addr, + enum efa_io_send_op_type op_type, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) ALWAYS_INLINE; +static inline void efa_send_wr_rdma_common(struct efa_qp *qp, struct ibv_qp_ex *ibvqpx, + uint32_t rkey, uint64_t remote_addr, + enum efa_io_send_op_type op_type, + struct efa_io_tx_meta_desc *md, + struct efa_io_tx_buf_desc *local_mem, + struct efa_io_remote_mem_addr *remote_mem, + uint8_t *inline_data) +{ + efa_send_wr_init(qp, ibvqpx, op_type, qp->sq.max_wr_rdma_sge, md, + local_mem, remote_mem, inline_data); + + efa_send_wr_set_rdma_addr(remote_mem, rkey, remote_addr); +} + +static void efa_send_wr_rdma_read_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_READ); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_READ, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); +} + +static void efa_send_wr_rdma_read_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_qp 
*qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_READ, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); } -static void efa_send_wr_rdma_write(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr) +static void efa_send_wr_rdma_write_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_WRITE); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); +} + +static void efa_send_wr_rdma_write_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, + qp->sq.inline_write_enabled ? 
tx_wqe->data.rdma_req.inline_data : + NULL); } -static void efa_send_wr_rdma_write_imm(struct ibv_qp_ex *ibvqpx, uint32_t rkey, - uint64_t remote_addr, __be32 imm_data) +static void efa_send_wr_rdma_write_imm_64(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) { + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_io_tx_wqe *tx_wqe; - tx_wqe = efa_send_wr_common(ibvqpx, EFA_IO_RDMA_WRITE); + tx_wqe = (struct efa_io_tx_wqe *)efa_send_wr_alloc(qp, ibvqpx); + if (unlikely(!tx_wqe)) + return; + + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, NULL); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); +} + +static void efa_send_wr_rdma_write_imm_128(struct ibv_qp_ex *ibvqpx, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data) +{ + struct efa_qp *qp = to_efa_qp_ex(ibvqpx); + struct efa_io_tx_wqe_128 *tx_wqe; + + tx_wqe = (struct efa_io_tx_wqe_128 *)efa_send_wr_alloc(qp, ibvqpx); if (unlikely(!tx_wqe)) return; - efa_send_wr_set_rdma_addr(tx_wqe, rkey, remote_addr); - efa_send_wr_set_imm_data(tx_wqe, imm_data); + efa_send_wr_rdma_common(qp, ibvqpx, rkey, remote_addr, EFA_IO_RDMA_WRITE, + &tx_wqe->meta, tx_wqe->data.rdma_req.local_mem, + &tx_wqe->data.rdma_req.remote_mem, + qp->sq.inline_write_enabled ? 
tx_wqe->data.rdma_req.inline_data : + NULL); + efa_send_wr_set_imm_data(qp->sq.curr_tx_wqe.md, imm_data); } static void efa_send_wr_set_sge(struct ibv_qp_ex *ibvqpx, uint32_t lkey, uint64_t addr, uint32_t length) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_buf_desc *buf; - struct efa_io_tx_wqe *tx_wqe; + struct efa_io_tx_meta_desc *md; uint8_t op_type; if (unlikely(qp->wr_session_err)) return; - tx_wqe = qp->sq.curr_tx_wqe; - tx_wqe->meta.length = 1; + md = qp->sq.curr_tx_wqe.md; + md->length = 1; - op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); - switch (op_type) { - case EFA_IO_SEND: - buf = &tx_wqe->data.sgl[0]; - break; - case EFA_IO_RDMA_READ: - case EFA_IO_RDMA_WRITE: - tx_wqe->data.rdma_req.remote_mem.length = length; - buf = &tx_wqe->data.rdma_req.local_mem[0]; - break; - default: - return; - } + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + if (op_type == EFA_IO_RDMA_READ || op_type == EFA_IO_RDMA_WRITE) + qp->sq.curr_tx_wqe.remote_mem->length = length; - efa_set_tx_buf(buf, addr, lkey, length); + efa_set_tx_buf(qp->sq.curr_tx_wqe.local_mem, addr, lkey, length); } static void efa_send_wr_set_sge_list(struct ibv_qp_ex *ibvqpx, size_t num_sge, const struct ibv_sge *sg_list) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_rdma_req *rdma_req; - struct efa_io_tx_wqe *tx_wqe; + struct efa_io_tx_meta_desc *md; struct efa_sq *sq = &qp->sq; uint8_t op_type; if (unlikely(qp->wr_session_err)) return; - tx_wqe = sq->curr_tx_wqe; - op_type = EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); - switch (op_type) { - case EFA_IO_SEND: - if (unlikely(num_sge > sq->wq.max_sge)) { - verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), - "SQ[%u] num_sge[%zu] > max_sge[%u]\n", - ibvqpx->qp_base.qp_num, num_sge, - sq->wq.max_sge); - qp->wr_session_err = EINVAL; - return; - } - efa_post_send_sgl(tx_wqe->data.sgl, sg_list, num_sge); - break; - case EFA_IO_RDMA_READ: - case EFA_IO_RDMA_WRITE: - if 
(unlikely(num_sge > sq->max_wr_rdma_sge)) { - verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), - "SQ[%u] num_sge[%zu] > max_rdma_sge[%zu]\n", - ibvqpx->qp_base.qp_num, num_sge, - sq->max_wr_rdma_sge); - qp->wr_session_err = EINVAL; - return; - } - rdma_req = &tx_wqe->data.rdma_req; - rdma_req->remote_mem.length = efa_sge_total_bytes(sg_list, - num_sge); - efa_post_send_sgl(rdma_req->local_mem, sg_list, num_sge); - break; - default: + md = sq->curr_tx_wqe.md; + + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + if (unlikely(num_sge > sq->curr_tx_wqe.max_sge)) { + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] op_type[%u] num_sge[%zu] > max_sge[%u]\n", + ibvqpx->qp_base.qp_num, op_type, num_sge, + sq->curr_tx_wqe.max_sge); + qp->wr_session_err = EINVAL; return; } - tx_wqe->meta.length = num_sge; + if (op_type == EFA_IO_RDMA_READ || op_type == EFA_IO_RDMA_WRITE) + sq->curr_tx_wqe.remote_mem->length = efa_sge_total_bytes(sg_list, num_sge); + + efa_post_send_sgl(sq->curr_tx_wqe.local_mem, md, sg_list, num_sge); } static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, size_t length) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; + struct efa_io_tx_meta_desc *md = qp->sq.curr_tx_wqe.md; + uint8_t op_type; if (unlikely(qp->wr_session_err)) return; @@ -2569,9 +2786,20 @@ static void efa_send_wr_set_inline_data(struct ibv_qp_ex *ibvqpx, void *addr, return; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - memcpy(tx_wqe->data.inline_data, addr, length); - tx_wqe->meta.length = length; + if (unlikely(!qp->sq.curr_tx_wqe.inline_data)) { + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] inline op_type[%u] isn't supported\n", + ibvqpx->qp_base.qp_num, op_type); + qp->wr_session_err = EINVAL; + return; + } + + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + 
memcpy(qp->sq.curr_tx_wqe.inline_data, addr, length); + md->length = length; + if (qp->sq.curr_tx_wqe.remote_mem) + qp->sq.curr_tx_wqe.remote_mem->length = length; } static void @@ -2580,9 +2808,9 @@ efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, const struct ibv_data_buf *buf_list) { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); - struct efa_io_tx_wqe *tx_wqe = qp->sq.curr_tx_wqe; - uint32_t total_length = 0; - uint32_t length; + struct efa_io_tx_meta_desc *md = qp->sq.curr_tx_wqe.md; + uint32_t length, total_length = 0; + uint8_t op_type; size_t i; if (unlikely(qp->wr_session_err)) @@ -2599,16 +2827,27 @@ efa_send_wr_set_inline_data_list(struct ibv_qp_ex *ibvqpx, return; } + if (unlikely(!qp->sq.curr_tx_wqe.inline_data)) { + op_type = EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE); + verbs_err(verbs_get_ctx(qp->verbs_qp.qp.context), + "SQ[%u] inline op_type[%u] isn't supported\n", + ibvqpx->qp_base.qp_num, op_type); + qp->wr_session_err = EINVAL; + return; + } + for (i = 0; i < num_buf; i++) { length = buf_list[i].length; - memcpy(tx_wqe->data.inline_data + total_length, + memcpy(qp->sq.curr_tx_wqe.inline_data + total_length, buf_list[i].addr, length); total_length += length; } - EFA_SET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); - tx_wqe->meta.length = total_length; + EFA_SET(&md->ctrl1, EFA_IO_TX_META_DESC_INLINE_MSG, 1); + md->length = total_length; + if (qp->sq.curr_tx_wqe.remote_mem) + qp->sq.curr_tx_wqe.remote_mem->length = total_length; } static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, @@ -2617,20 +2856,21 @@ static void efa_send_wr_set_addr(struct ibv_qp_ex *ibvqpx, { struct efa_qp *qp = to_efa_qp_ex(ibvqpx); struct efa_ah *ah = to_efa_ah(ibvah); - struct efa_io_tx_wqe *tx_wqe; + struct efa_io_tx_meta_desc *md; if (unlikely(qp->wr_session_err)) return; - tx_wqe = qp->sq.curr_tx_wqe; + md = qp->sq.curr_tx_wqe.md; - tx_wqe->meta.dest_qp_num = remote_qpn; - tx_wqe->meta.ah = ah->efa_ah; - tx_wqe->meta.qkey = remote_qkey; + 
md->dest_qp_num = remote_qpn; + md->ah = ah->efa_ah; + md->qkey = remote_qkey; rdma_tracepoint(rdma_core_efa, post_send, qp->dev->name, ibvqpx->wr_id, - EFA_GET(&tx_wqe->meta.ctrl1, EFA_IO_TX_META_DESC_OP_TYPE), - ibvqpx->qp_base.qp_num, remote_qpn, ah->efa_ah, efa_get_wqe_length(tx_wqe)); + EFA_GET(&md->ctrl1, EFA_IO_TX_META_DESC_OP_TYPE), + ibvqpx->qp_base.qp_num, remote_qpn, ah->efa_ah, + efa_wqe_get_data_length(qp->sq)); } static void efa_send_wr_start(struct ibv_qp_ex *ibvqpx) @@ -2690,11 +2930,9 @@ static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx) num_wqe_to_copy = min3(sq->num_wqe_pending, sq->wq.wqe_cnt - sq_desc_idx, max_txbatch - curbatch); - mmio_memcpy_x64((struct efa_io_tx_wqe *)sq->desc + - sq_desc_idx, - (struct efa_io_tx_wqe *)sq->local_queue + - local_idx, - num_wqe_to_copy * sizeof(struct efa_io_tx_wqe)); + mmio_memcpy_x64(sq->desc + sq_desc_idx * sq->wqe_size, + sq->local_queue + local_idx * sq->wqe_size, + num_wqe_to_copy * sq->wqe_size); sq->num_wqe_pending -= num_wqe_to_copy; local_idx += num_wqe_to_copy; @@ -2736,26 +2974,32 @@ static void efa_send_wr_abort(struct ibv_qp_ex *ibvqpx) } static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx, - struct ibv_qp_init_attr_ex *attr_ex) + struct ibv_qp_init_attr_ex *attr_ex, + uint16_t wqe_size) { + bool use_64 = wqe_size == EFA_IO_TX_DESC_SIZE_64; + ibvqpx->wr_start = efa_send_wr_start; ibvqpx->wr_complete = efa_send_wr_complete; ibvqpx->wr_abort = efa_send_wr_abort; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND) - ibvqpx->wr_send = efa_send_wr_send; + ibvqpx->wr_send = use_64 ? efa_send_wr_send_64 : efa_send_wr_send_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_SEND_WITH_IMM) - ibvqpx->wr_send_imm = efa_send_wr_send_imm; + ibvqpx->wr_send_imm = use_64 ? efa_send_wr_send_imm_64 : efa_send_wr_send_imm_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_READ) - ibvqpx->wr_rdma_read = efa_send_wr_rdma_read; + ibvqpx->wr_rdma_read = use_64 ? 
efa_send_wr_rdma_read_64 : + efa_send_wr_rdma_read_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_WRITE) - ibvqpx->wr_rdma_write = efa_send_wr_rdma_write; + ibvqpx->wr_rdma_write = use_64 ? efa_send_wr_rdma_write_64 : + efa_send_wr_rdma_write_128; if (attr_ex->send_ops_flags & IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) - ibvqpx->wr_rdma_write_imm = efa_send_wr_rdma_write_imm; + ibvqpx->wr_rdma_write_imm = use_64 ? efa_send_wr_rdma_write_imm_64 : + efa_send_wr_rdma_write_imm_128; ibvqpx->wr_set_inline_data = efa_send_wr_set_inline_data; ibvqpx->wr_set_inline_data_list = efa_send_wr_set_inline_data_list; diff --git a/pyverbs/providers/efa/efa_enums.pxd b/pyverbs/providers/efa/efa_enums.pxd index 11c85d62f..258b97866 100644 --- a/pyverbs/providers/efa/efa_enums.pxd +++ b/pyverbs/providers/efa/efa_enums.pxd @@ -17,6 +17,7 @@ cdef extern from 'infiniband/efadv.h': cpdef enum: EFADV_QP_FLAGS_UNSOLICITED_WRITE_RECV + EFADV_QP_FLAGS_INLINE_WRITE cpdef enum: EFADV_WC_EX_WITH_SGID @@ -26,3 +27,6 @@ cdef extern from 'infiniband/efadv.h': EFADV_MR_ATTR_VALIDITY_RECV_IC_ID EFADV_MR_ATTR_VALIDITY_RDMA_READ_IC_ID EFADV_MR_ATTR_VALIDITY_RDMA_RECV_IC_ID + + cpdef enum: + EFADV_SQ_DEPTH_ATTR_INLINE_WRITE diff --git a/pyverbs/providers/efa/efadv.pxd b/pyverbs/providers/efa/efadv.pxd index 12e11f8ce..249ce7570 100644 --- a/pyverbs/providers/efa/efadv.pxd +++ b/pyverbs/providers/efa/efadv.pxd @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
#cython: language_level=3 @@ -50,3 +50,11 @@ cdef class EfaDVCQInitAttr(PyverbsObject): cdef class EfaDVMRAttr(PyverbsObject): cdef dv.efadv_mr_attr mr_attr + + +cdef class EfaDVSQDepthAttr(PyverbsObject): + cdef dv.efadv_sq_depth_attr sq_depth_attr + + +cdef class EfaDVRQDepthAttr(PyverbsObject): + cdef dv.efadv_rq_depth_attr rq_depth_attr diff --git a/pyverbs/providers/efa/efadv.pyx b/pyverbs/providers/efa/efadv.pyx index ec21225aa..2f3e04cba 100644 --- a/pyverbs/providers/efa/efadv.pyx +++ b/pyverbs/providers/efa/efadv.pyx @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. cimport pyverbs.providers.efa.efa_enums as dve cimport pyverbs.providers.efa.libefa as dv @@ -92,6 +92,10 @@ cdef class EfaDVDeviceAttr(PyverbsObject): def inline_buf_size(self): return self.device_attr.inline_buf_size + @property + def inline_buf_size_ex(self): + return self.device_attr.inline_buf_size_ex + @property def device_caps(self): return self.device_attr.device_caps @@ -108,6 +112,7 @@ cdef class EfaDVDeviceAttr(PyverbsObject): print_format.format('Max SQ SQE', self.device_attr.max_sq_sge) + \ print_format.format('Max RQ SQE', self.device_attr.max_rq_sge) + \ print_format.format('Inline buffer size', self.device_attr.inline_buf_size) + \ + print_format.format('Inline buffer size ex', self.device_attr.inline_buf_size_ex) + \ print_format.format('Device Capabilities', dev_cap_to_str(self.device_attr.device_caps)) + \ print_format.format('Max RDMA Size', self.device_attr.max_rdma_size) @@ -325,3 +330,69 @@ cdef class EfaMR(MR): raise PyverbsRDMAError(f'Failed to query EFA MR', rc) return mr_attr + + +cdef class EfaDVSQDepthAttr(PyverbsObject): + """ + Represents efadv_sq_depth_attr struct + """ + @property + def comp_mask(self): + return self.sq_depth_attr.comp_mask + + @comp_mask.setter + def comp_mask(self, 
val): + self.sq_depth_attr.comp_mask = val + + @property + def flags(self): + return self.sq_depth_attr.flags + + @flags.setter + def flags(self, val): + self.sq_depth_attr.flags = val + + @property + def max_send_sge(self): + return self.sq_depth_attr.max_send_sge + + @max_send_sge.setter + def max_send_sge(self, val): + self.sq_depth_attr.max_send_sge = val + + @property + def max_rdma_sge(self): + return self.sq_depth_attr.max_rdma_sge + + @max_rdma_sge.setter + def max_rdma_sge(self, val): + self.sq_depth_attr.max_rdma_sge = val + + @property + def max_inline_data(self): + return self.sq_depth_attr.max_inline_data + + @max_inline_data.setter + def max_inline_data(self, val): + self.sq_depth_attr.max_inline_data = val + + +cdef class EfaDVRQDepthAttr(PyverbsObject): + """ + Represents efadv_rq_depth_attr struct + """ + @property + def comp_mask(self): + return self.rq_depth_attr.comp_mask + + @comp_mask.setter + def comp_mask(self, val): + self.rq_depth_attr.comp_mask = val + + @property + def max_recv_sge(self): + return self.rq_depth_attr.max_recv_sge + + @max_recv_sge.setter + def max_recv_sge(self, val): + self.rq_depth_attr.max_recv_sge = val diff --git a/pyverbs/providers/efa/libefa.pxd b/pyverbs/providers/efa/libefa.pxd index 265868ac0..bf043434f 100644 --- a/pyverbs/providers/efa/libefa.pxd +++ b/pyverbs/providers/efa/libefa.pxd @@ -1,5 +1,5 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) -# Copyright 2020-2024 Amazon.com, Inc. or its affiliates. All rights reserved. +# Copyright 2020-2026 Amazon.com, Inc. or its affiliates. All rights reserved. 
#cython: language_level=3 @@ -17,7 +17,7 @@ cdef extern from 'infiniband/efadv.h': uint16_t max_sq_sge; uint16_t max_rq_sge; uint16_t inline_buf_size; - uint8_t reserved[2]; + uint16_t inline_buf_size_ex; uint32_t device_caps; uint32_t max_rdma_size; @@ -47,6 +47,17 @@ cdef extern from 'infiniband/efadv.h': uint16_t rdma_read_ic_id; uint16_t rdma_recv_ic_id; + cdef struct efadv_sq_depth_attr: + uint64_t comp_mask; + uint32_t flags; + uint32_t max_send_sge; + uint32_t max_rdma_sge; + uint32_t max_inline_data; + + cdef struct efadv_rq_depth_attr: + uint64_t comp_mask; + uint32_t max_recv_sge; + int efadv_query_device(v.ibv_context *ibvctx, efadv_device_attr *attrs, uint32_t inlen) int efadv_query_ah(v.ibv_ah *ibvah, efadv_ah_attr *attr, @@ -65,3 +76,7 @@ cdef extern from 'infiniband/efadv.h': int efadv_wc_read_sgid(efadv_cq *efadv_cq, v.ibv_gid *sgid) bool efadv_wc_is_unsolicited(efadv_cq *efadv_cq) int efadv_query_mr(v.ibv_mr *ibvmr, efadv_mr_attr *attr, uint32_t inlen) + int efadv_get_max_sq_depth(v.ibv_context *ibvctx, efadv_sq_depth_attr *attr, + uint32_t inlen) + int efadv_get_max_rq_depth(v.ibv_context *ibvctx, efadv_rq_depth_attr *attr, + uint32_t inlen); diff --git a/util/util.h b/util/util.h index 92b674067..ffccd1d92 100644 --- a/util/util.h +++ b/util/util.h @@ -81,6 +81,11 @@ static inline uint64_t roundup_pow_of_two(uint64_t n) return n == 1 ? 1 : 1ULL << ilog64(n - 1); } +static inline uint64_t rounddown_pow_of_two(uint64_t n) +{ + return n == 0 ? 0 : 1ULL << (ilog64(n) - 1); +} + static inline unsigned long DIV_ROUND_UP(unsigned long n, unsigned long d) { return (n + d - 1) / d;