From bf50872e9456dc5bfc0f2d83dbc28c18854a2aa3 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 7 May 2024 11:54:40 -0700 Subject: [PATCH 01/11] src: Initial changes for transmission-side multiplexing --- src/runtime-mpi.c | 2 +- src/transport_ofi.c | 490 ++++++++++++++++++++++++++------------------ src/transport_ofi.h | 146 ++++++------- 3 files changed, 372 insertions(+), 266 deletions(-) diff --git a/src/runtime-mpi.c b/src/runtime-mpi.c index 3713ec3fd..7856556a9 100644 --- a/src/runtime-mpi.c +++ b/src/runtime-mpi.c @@ -28,7 +28,7 @@ /* Note: Increase MAX_KV_COUNT if more key/values are needed. MAX_KV_COUNT is * 2 * the number of key/value pairs. */ -#define MAX_KV_COUNT 20 +#define MAX_KV_COUNT 40 #define MAX_KV_LENGTH 512 static int rank = -1; diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 1fdd9fbc0..02d49264e 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -52,6 +52,9 @@ #include "runtime.h" #include "uthash.h" +struct fi_info **provider_list = NULL; +size_t shmem_transport_ofi_num_nics = 0; + struct fabric_info { struct fi_info *fabrics; struct fi_info *p_info; @@ -386,7 +389,7 @@ struct shmem_transport_ofi_stx_t { int is_private; }; typedef struct shmem_transport_ofi_stx_t shmem_transport_ofi_stx_t; -static shmem_transport_ofi_stx_t* shmem_transport_ofi_stx_pool = NULL; +static shmem_transport_ofi_stx_t** shmem_transport_ofi_stx_pool = NULL; struct shmem_transport_ofi_stx_kvs_t { int stx_idx; @@ -397,7 +400,7 @@ typedef struct shmem_transport_ofi_stx_kvs_t shmem_transport_ofi_stx_kvs_t; static shmem_transport_ofi_stx_kvs_t* shmem_transport_ofi_stx_kvs = NULL; static inline -void shmem_transport_ofi_dump_stx(void) { +void shmem_transport_ofi_dump_stx(size_t idx) { char stx_str[256]; int i, offset; @@ -407,8 +410,8 @@ void shmem_transport_ofi_dump_stx(void) { for (i = offset = 0; i < shmem_transport_ofi_stx_max; i++) offset += snprintf(stx_str+offset, 256-offset, (i == shmem_transport_ofi_stx_max-1) ? 
"%ld%s" : "%ld%s ", - shmem_transport_ofi_stx_pool[i].ref_cnt, - shmem_transport_ofi_stx_pool[i].is_private ? "P" : "S"); + shmem_transport_ofi_stx_pool[idx][i].ref_cnt, + shmem_transport_ofi_stx_pool[idx][i].is_private ? "P" : "S"); DEBUG_MSG("STX[%ld] = [ %s ]\n", shmem_transport_ofi_stx_max, stx_str); } @@ -432,13 +435,13 @@ void shmem_transport_ofi_stx_rand_init(void) { } static inline -int shmem_transport_ofi_stx_search_unused(void) +int shmem_transport_ofi_stx_search_unused(size_t idx) { int stx_idx = -1, i; for (i = 0; i < shmem_transport_ofi_stx_max; i++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt == 0) { - shmem_internal_assert(!shmem_transport_ofi_stx_pool[i].is_private); + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt == 0) { + shmem_internal_assert(!shmem_transport_ofi_stx_pool[idx][i].is_private); stx_idx = i; break; } @@ -449,7 +452,7 @@ int shmem_transport_ofi_stx_search_unused(void) static inline -int shmem_transport_ofi_stx_search_shared(long threshold) +int shmem_transport_ofi_stx_search_shared(long threshold, size_t idx) { static int rr_start_idx = 0; int stx_idx = -1, i, count; @@ -458,9 +461,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) case ROUNDROBIN: i = rr_start_idx; for (count = 0; count < shmem_transport_ofi_stx_max; count++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[i].ref_cnt <= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[i].is_private) { + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][i].ref_cnt <= threshold || threshold == -1) && + !shmem_transport_ofi_stx_pool[idx][i].is_private) { stx_idx = i; rr_start_idx = (i + 1) % shmem_transport_ofi_stx_max; break; @@ -473,9 +476,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) case RANDOM: for (i = count = 0; i < shmem_transport_ofi_stx_max; i++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[i].ref_cnt 
<= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[i].is_private) + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][i].ref_cnt <= threshold || threshold == -1) && + !shmem_transport_ofi_stx_pool[idx][i].is_private) { ++count; break; @@ -489,9 +492,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) else { do { stx_idx = (int) (rand_r(&rand_pool_seed) / (RAND_MAX + 1.0) * shmem_transport_ofi_stx_max); - } while (!(shmem_transport_ofi_stx_pool[stx_idx].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[stx_idx].ref_cnt <= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[stx_idx].is_private)); + } while (!(shmem_transport_ofi_stx_pool[idx][stx_idx].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][stx_idx].ref_cnt <= threshold || threshold == -1) && + !shmem_transport_ofi_stx_pool[idx][stx_idx].is_private)); } break; @@ -506,21 +509,23 @@ int shmem_transport_ofi_stx_search_shared(long threshold) static inline -void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) +void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx, size_t idx) { if (shmem_transport_ofi_stx_max == 0) { - ctx->stx_idx = -1; + ctx->stx_idx[idx] = -1; } else if (shmem_transport_ofi_is_private(ctx->options)) { /* SHMEM contexts that are private to the same thread (i.e. have * SHMEM_CTX_PRIVATE option set) share the same STX. */ + // TODO: Should f be an array of shmem_transport_ofi_stx_kvs_t pointers, or single pointer and + // stx_idx field is an array? 
shmem_transport_ofi_stx_kvs_t *f; HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, sizeof(struct shmem_internal_tid), f); if (f) { - shmem_transport_ofi_stx_pool[f->stx_idx].ref_cnt++; - ctx->stx_idx = f->stx_idx; + shmem_transport_ofi_stx_pool[idx][f->stx_idx].ref_cnt++; + ctx->stx_idx[idx] = f->stx_idx; } else { /* No STX allocated to the given TID, attempt to allocate one */ @@ -528,21 +533,21 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) int stx_idx; shmem_transport_ofi_stx_t *stx = NULL; - stx_idx = shmem_transport_ofi_stx_search_unused(); + stx_idx = shmem_transport_ofi_stx_search_unused(idx); /* Couldn't get new STX, assign a shared one */ /* Note: When stx_max > 0, shared STX allocation is always successful */ if (stx_idx < 0) { DEBUG_STR("private STX unavailable, falling back to STX sharing"); is_unused = 0; - stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold); + stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold, idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_shared(-1); + stx_idx = shmem_transport_ofi_stx_search_shared(-1, idx); } shmem_internal_assert(stx_idx >= 0); - stx = &shmem_transport_ofi_stx_pool[stx_idx]; - ctx->stx_idx = stx_idx; + stx = &shmem_transport_ofi_stx_pool[idx][stx_idx]; + ctx->stx_idx[idx] = stx_idx; stx->ref_cnt++; if (is_unused) { @@ -552,7 +557,7 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) RAISE_ERROR_STR("out of memory when allocating STX KVS entry"); } e->tid = ctx->tid; - e->stx_idx = ctx->stx_idx; + e->stx_idx = ctx->stx_idx[idx]; /* FIX? */ HASH_ADD(hh, shmem_transport_ofi_stx_kvs, tid, sizeof(struct shmem_internal_tid), e); } else { @@ -561,20 +566,20 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) } /* TODO: Optimize this case? 
else if (ctx->options & SHMEM_CTX_SERIALIZED) */ } else { - int stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold); + int stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold, idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_unused(); + stx_idx = shmem_transport_ofi_stx_search_unused(idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_shared(-1); + stx_idx = shmem_transport_ofi_stx_search_shared(-1, idx); shmem_internal_assert(stx_idx >= 0); - ctx->stx_idx = stx_idx; - shmem_transport_ofi_stx_pool[ctx->stx_idx].ref_cnt++; + ctx->stx_idx[idx] = stx_idx; + shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].ref_cnt++; } - shmem_transport_ofi_dump_stx(); + shmem_transport_ofi_dump_stx(idx); return; } @@ -592,24 +597,24 @@ void init_bounce_buffer(shmem_free_list_item_t *item) static inline -int bind_enable_ep_resources(shmem_transport_ctx_t *ctx) +int bind_enable_ep_resources(shmem_transport_ctx_t *ctx, size_t idx) { int ret = 0; /* If using SOS-managed STXs, bind the STX */ - if (ctx->stx_idx >= 0) { - ret = fi_ep_bind(ctx->ep, &shmem_transport_ofi_stx_pool[ctx->stx_idx].stx->fid, 0); + if (ctx->stx_idx[idx] >= 0) { + ret = fi_ep_bind(ctx->ep[idx], &shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].stx->fid, 0); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind STX to endpoint failed"); } /* Put counter captures completions for non-fetching operations (put, * atomic, etc.) */ - ret = fi_ep_bind(ctx->ep, &ctx->put_cntr->fid, FI_WRITE); + ret = fi_ep_bind(ctx->ep[idx], &ctx->put_cntr[idx]->fid, FI_WRITE); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind put CNTR to endpoint failed"); /* Get counter captures completions for fetching operations (get, * fetch-atomic, etc.) 
*/ - ret = fi_ep_bind(ctx->ep, &ctx->get_cntr->fid, FI_READ); + ret = fi_ep_bind(ctx->ep[idx], &ctx->get_cntr[idx]->fid, FI_READ); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind get CNTR to endpoint failed"); /* In addition to incrementing the put counter, bounce buffered puts and @@ -622,14 +627,14 @@ int bind_enable_ep_resources(shmem_transport_ctx_t *ctx) * removed below. However, there aren't currently any cases where removing * FI_RECV significantly improves performance or resource usage. */ - ret = fi_ep_bind(ctx->ep, &ctx->cq->fid, + ret = fi_ep_bind(ctx->ep[idx], &ctx->cq[idx]->fid, FI_SELECTIVE_COMPLETION | FI_TRANSMIT | FI_RECV); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind CQ to endpoint failed"); - ret = fi_ep_bind(ctx->ep, &shmem_transport_ofi_avfd->fid, 0); + ret = fi_ep_bind(ctx->ep[idx], &shmem_transport_ofi_avfd->fid, 0); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind AV to endpoint failed"); - ret = fi_enable(ctx->ep); + ret = fi_enable(ctx->ep[idx]); OFI_CHECK_RETURN_STR(ret, "fi_enable on endpoint failed"); return ret; @@ -872,14 +877,14 @@ int publish_external_mr_info(void) #endif static -int publish_mr_info(void) +int publish_mr_info(struct fi_info *info) { #ifndef ENABLE_MR_SCALABLE { int err; uint64_t heap_key, data_key; - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_PROV_KEY) { + if (info->domain_attr->mr_mode & FI_MR_PROV_KEY) { heap_key = fi_mr_key(shmem_transport_ofi_target_heap_mrfd); data_key = fi_mr_key(shmem_transport_ofi_target_data_mrfd); } else { @@ -901,7 +906,7 @@ int publish_mr_info(void) } #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) + if (info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) shmem_transport_ofi_use_absolute_address = 1; else shmem_transport_ofi_use_absolute_address = 0; @@ -910,7 +915,7 @@ int publish_mr_info(void) int err; void *heap_base, *data_base; - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) { + if 
(info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) { heap_base = shmem_internal_heap_base; data_base = shmem_internal_data_base; } else { @@ -1098,7 +1103,7 @@ int atomicvalid_rtncheck(int ret, int atomic_size, static inline int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, - atomic_support_lv atomic_sup) + atomic_support_lv atomic_sup, size_t idx) { int i, j; size_t atomic_size; @@ -1106,7 +1111,7 @@ int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1120,7 +1125,7 @@ int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, static inline int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, - int *OPS, atomic_support_lv atomic_sup) + int *OPS, atomic_support_lv atomic_sup, size_t idx) { int i, j; size_t atomic_size; @@ -1128,7 +1133,7 @@ int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_compare_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_compare_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1142,7 +1147,7 @@ int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, static inline int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, - atomic_support_lv atomic_sup) + atomic_support_lv atomic_sup, size_t idx) { int i, j; size_t atomic_size; @@ -1150,7 +1155,7 @@ int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = 
SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_fetch_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_fetch_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1163,7 +1168,7 @@ int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, } static inline -int atomic_limitations_check(void) +int atomic_limitations_check(size_t idx) { /* Retrieve messaging limitations from OFI * @@ -1182,54 +1187,54 @@ int atomic_limitations_check(void) /* Standard OPS check */ ret = atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_OPS, DT_AMO_STANDARD, - AMO_STANDARD_OPS, general_atomic_sup); + AMO_STANDARD_OPS, general_atomic_sup, idx); if (ret) return ret; ret = fetch_atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_FOPS, DT_AMO_STANDARD, FETCH_AMO_STANDARD_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; ret = compare_atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_COPS, DT_AMO_STANDARD, COMPARE_AMO_STANDARD_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; /* Extended OPS check */ ret = atomicvalid_DTxOP(SIZEOF_AMO_EX_DT, SIZEOF_AMO_EX_OPS, DT_AMO_EXTENDED, - AMO_EXTENDED_OPS, general_atomic_sup); + AMO_EXTENDED_OPS, general_atomic_sup, idx); if (ret) return ret; ret = fetch_atomicvalid_DTxOP(SIZEOF_AMO_EX_DT, SIZEOF_AMO_EX_FOPS, DT_AMO_EXTENDED, FETCH_AMO_EXTENDED_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; /* Reduction OPS check */ ret = atomicvalid_DTxOP(SIZEOF_RED_DT, SIZEOF_RED_OPS, DT_REDUCE_BITWISE, - REDUCE_BITWISE_OPS, reduction_sup); + REDUCE_BITWISE_OPS, reduction_sup, idx); if (ret) return ret; ret = atomicvalid_DTxOP(SIZEOF_REDC_DT, SIZEOF_REDC_OPS, DT_REDUCE_COMPARE, - REDUCE_COMPARE_OPS, reduction_sup); + REDUCE_COMPARE_OPS, reduction_sup, idx); if (ret) return ret; ret = atomicvalid_DTxOP(SIZEOF_REDA_DT, SIZEOF_REDA_OPS, DT_REDUCE_ARITH, - REDUCE_ARITH_OPS, 
reduction_sup); + REDUCE_ARITH_OPS, reduction_sup, idx); if (ret) return ret; /* Internal atomic requirement */ ret = compare_atomicvalid_DTxOP(SIZEOF_INTERNAL_REQ_DT, SIZEOF_INTERNAL_REQ_OPS, DT_INTERNAL_REQ, INTERNAL_REQ_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; @@ -1356,6 +1361,11 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p ret = hwloc_get_proc_last_cpu_location(shmem_internal_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RAISE_WARN_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return provs[shmem_internal_my_pe % num_nics]; } @@ -1371,11 +1381,21 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_internal_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); if (!io_device) { RAISE_WARN_MSG("hwloc_get_pcidev_by_busid failed\n"); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return provs[shmem_internal_my_pe % num_nics]; }; hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_internal_topology, io_device); if (!first_non_io) { RAISE_WARN_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return provs[shmem_internal_my_pe % num_nics]; } @@ -1392,7 +1412,11 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p if 
(!close_provs) { RAISE_WARN_MSG("Could not detect any NICs with affinity to the process\n"); - + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; /* If no 'close' NICs, select from list of all NICs using round-robin assignment */ return provs[shmem_internal_my_pe % num_nics]; } @@ -1400,16 +1424,17 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p last_added->next = NULL; int idx = 0; - struct fi_info **prov_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); + provider_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); for (struct fi_info *cur_fabric = close_provs; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + provider_list[idx++] = cur_fabric; } hwloc_bitmap_free(bindset); - struct fi_info *provider = prov_list[shmem_internal_my_pe % num_close_nics]; - free(prov_list); + struct fi_info *provider = provider_list[shmem_internal_my_pe % num_close_nics]; + //free(prov_list); + shmem_transport_ofi_num_nics = num_close_nics; return provider; } #endif @@ -1565,7 +1590,10 @@ int query_for_fabric(struct fabric_info *info) info->p_info = NULL; if (shmem_internal_params.OFI_DISABLE_MULTIRAIL) { + provider_list = (struct fi_info **) malloc(sizeof(struct fi_info *)); + provider_list[0] = fabrics_list_head; info->p_info = fabrics_list_head; + shmem_transport_ofi_num_nics = 1; } else { /* Generate a linked list of all fabrics with a non-null nic value */ @@ -1581,26 +1609,34 @@ int query_for_fabric(struct fabric_info *info) if (multirail_fabric_list_tail) multirail_fabric_list_tail->next = NULL; if (num_nics == 0) { + provider_list = (struct fi_info **) malloc(sizeof(struct fi_info *)); + provider_list[0] = fallback; info->p_info = fallback; + shmem_transport_ofi_num_nics = 1; } else { int idx = 0; - 
struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + struct fi_info **sorted_prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); for (struct fi_info *cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + sorted_prov_list[idx++] = cur_fabric; } - qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); + qsort(sorted_prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC - info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); + info->p_info = assign_nic_with_hwloc(info->p_info, sorted_prov_list, num_nics); #else /* Round-robin assignment of NICs to PEs * FIXME: A more suitable indexing value would be * shmem_team_my_pe(SHMEM_TEAM_NODE) % num_nics, but it is too early in initialization to * do that here. We would also want to replace the similar occurrences in the * assign_nic_with_hwloc function. */ - info->p_info = prov_list[shmem_internal_my_pe % num_nics]; + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = sorted_prov_list[idx]; + } + info->p_info = provider_list[shmem_internal_my_pe % num_nics]; + shmem_transport_ofi_num_nics = num_nics; #endif - free(prov_list); + //free(prov_list); //Add free(provider_list) to cleanup } } if (NULL == info->p_info) { @@ -1734,37 +1770,67 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) struct fabric_info* info = &shmem_transport_ofi_info; - info->p_info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? 
FI_SHARED_CONTEXT : 0; - info->p_info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; - info->p_info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; - info->p_info->mode = 0; - info->p_info->tx_attr->mode = 0; - info->p_info->rx_attr->mode = 0; - info->p_info->tx_attr->caps = info->p_info->caps; - info->p_info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; + // Need to do these steps for all providers in provider_list? + //info->p_info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? FI_SHARED_CONTEXT : 0; + //info->p_info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; + //info->p_info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + //info->p_info->mode = 0; + //info->p_info->tx_attr->mode = 0; + //info->p_info->rx_attr->mode = 0; + //info->p_info->tx_attr->caps = info->p_info->caps; + //info->p_info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; ctx->id = id; + ctx->ep = (struct fid_ep **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_ep *)); + ctx->put_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); + ctx->get_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); +#ifdef USE_CTX_LOCK + ctx->pending_put_cntr = (uint64_t *) malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctx->pending_get_cntr = (uint64_t *) malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); +#else + ctx->pending_put_cntr = (shmem_internal_cntr_t *) malloc(shmem_transport_ofi_num_nics * sizeof(shmem_internal_cntr_t)); + ctx->pending_get_cntr = (shmem_internal_cntr_t *) malloc(shmem_transport_ofi_num_nics * sizeof(shmem_internal_cntr_t)); +#endif + ctx->cq = (struct fid_cq **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cq *)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { +#ifdef USE_CTX_LOCK + ctx->pending_put_cntr[idx] = 0; + ctx->pending_get_cntr[idx] = 0; +#else + 
shmem_internal_cntr_write(&ctx->pending_put_cntr[idx], 0); + shmem_internal_cntr_write(&ctx->pending_get_cntr[idx], 0); +#endif + /* FIX */ + //shmem_transport_ofi_eps[idx]->info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? FI_SHARED_CONTEXT : 0; + //shmem_transport_ofi_eps[idx]->info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; + //shmem_transport_ofi_eps[idx]->info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + //shmem_transport_ofi_eps[idx]->info->mode = 0; + //shmem_transport_ofi_eps[idx]->info->tx_attr->mode = 0; + //shmem_transport_ofi_eps[idx]->info->rx_attr->mode = 0; + //shmem_transport_ofi_eps[idx]->info->tx_attr->caps = info->p_info->caps; + //shmem_transport_ofi_eps[idx]->info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; #ifdef USE_CTX_LOCK SHMEM_MUTEX_INIT(ctx->lock); #endif - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_put_attr, - &ctx->put_cntr, NULL); - OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); + ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_put_attr, + &ctx->put_cntr[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_get_attr, - &ctx->get_cntr, NULL); - OFI_CHECK_RETURN_MSG(ret, "get_cntr creation failed (%s)\n", fi_strerror(errno)); + ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_get_attr, + &ctx->get_cntr[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "get_cntr creation failed (%s)\n", fi_strerror(errno)); - ret = fi_cq_open(shmem_transport_ofi_domainfd, &cq_attr, &ctx->cq, NULL); - if (ret && errno == FI_EMFILE) { - DEBUG_STR("Context creation failed because of open files limit, consider increasing with 'ulimit' command"); - } - OFI_CHECK_RETURN_MSG(ret, "cq_open failed (%s)\n", fi_strerror(errno)); + ret = fi_cq_open(shmem_transport_ofi_domainfd, &cq_attr, &ctx->cq[idx], NULL); + if (ret && errno == FI_EMFILE) { + DEBUG_STR("Context creation 
failed because of open files limit, consider increasing with 'ulimit' command"); + } + OFI_CHECK_RETURN_MSG(ret, "cq_open failed (%s)\n", fi_strerror(errno)); - ret = fi_endpoint(shmem_transport_ofi_domainfd, - info->p_info, &ctx->ep, NULL); - OFI_CHECK_RETURN_MSG(ret, "ep creation failed (%s)\n", fi_strerror(errno)); + ret = fi_endpoint(shmem_transport_ofi_domainfd, + info->p_info, &ctx->ep[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "ep creation failed (%s)\n", fi_strerror(errno)); + } /* TODO: Fill in TX attr */ @@ -1773,11 +1839,12 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) shmem_transport_ofi_is_private(ctx->options)) { ctx->tid = shmem_transport_ofi_gettid(); } - shmem_transport_ofi_stx_allocate(ctx); - - ret = bind_enable_ep_resources(ctx); - OFI_CHECK_RETURN_MSG(ret, "context bind/enable endpoint failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ofi_stx_allocate(ctx, idx); + ret = bind_enable_ep_resources(ctx, idx); + OFI_CHECK_RETURN_MSG(ret, "context bind/enable endpoint failed (%s)\n", fi_strerror(errno)); + } if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER && shmem_transport_ofi_bounce_buffer_size > 0 && shmem_transport_ofi_max_bounce_buffers > 0) @@ -1892,7 +1959,7 @@ int shmem_transport_init(void) ret = shmem_transport_ofi_target_ep_init(); if (ret != 0) return ret; - ret = publish_mr_info(); + ret = publish_mr_info(shmem_transport_ofi_info.p_info); if (ret != 0) return ret; ret = publish_av_info(&shmem_transport_ofi_info); @@ -1906,72 +1973,83 @@ int shmem_transport_startup(void) int ret; int i; - if (shmem_internal_params.OFI_STX_AUTO && shmem_transport_ofi_stx_max == 0) { - RAISE_WARN_STR("STXs disabled, ignoring request for automatic STX management"); + shmem_transport_ofi_stx_pool = (shmem_transport_ofi_stx_t **) malloc(shmem_transport_ofi_num_nics * + sizeof(shmem_transport_ofi_stx_t *)); + for (size_t idx = 0; idx < 
shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ofi_stx_pool[idx] = NULL; } - else if (shmem_internal_params.OFI_STX_AUTO) { + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (shmem_internal_params.OFI_STX_AUTO && shmem_transport_ofi_stx_max == 0) { + RAISE_WARN_STR("STXs disabled, ignoring request for automatic STX management"); + } + else if (shmem_internal_params.OFI_STX_AUTO) { + long ofi_tx_ctx_cnt = /*shmem_transport_ofi_info.fabrics*/provider_list[idx]->domain_attr->tx_ctx_cnt; + int num_on_node = shmem_runtime_get_node_size(); - long ofi_tx_ctx_cnt = shmem_transport_ofi_info.fabrics->domain_attr->tx_ctx_cnt; - int num_on_node = shmem_runtime_get_node_size(); + if (shmem_internal_params.OFI_STX_MAX_provided) { + RAISE_WARN_MSG("Auto-setting STX_MAX; ignoring provided STX_MAX value '%ld'\n", + shmem_internal_params.OFI_STX_MAX); + } - if (shmem_internal_params.OFI_STX_MAX_provided) { - RAISE_WARN_MSG("Auto-setting STX_MAX; ignoring provided STX_MAX value '%ld'\n", - shmem_internal_params.OFI_STX_MAX); - } + if (ofi_tx_ctx_cnt <= 0) + RAISE_ERROR_MSG("Invalid number of TX contexts (%ld)\n", ofi_tx_ctx_cnt); + + /* Paritition TX resources evenly across node-local PEs */ + /* Note: we assume that the domain reports the same tx_ctx_cnt for + * every PE on the node. We also assume that the resource reported + * should be divided equally among all PEs. These assumptions may not + * be valid in all cases, for example when the provider has already + * partitioned resources or when a node has multiple NICs. 
*/ + shmem_transport_ofi_stx_max = ofi_tx_ctx_cnt / num_on_node; + int remainder = ofi_tx_ctx_cnt % num_on_node; + int node_pe = shmem_internal_my_pe % shmem_internal_num_pes; + if (remainder > 0 && ((node_pe % num_on_node) < remainder)) { + shmem_transport_ofi_stx_max++; + } - if (ofi_tx_ctx_cnt <= 0) - RAISE_ERROR_MSG("Invalid number of TX contexts (%ld)\n", ofi_tx_ctx_cnt); - - /* Paritition TX resources evenly across node-local PEs */ - /* Note: we assume that the domain reports the same tx_ctx_cnt for - * every PE on the node. We also assume that the resource reported - * should be divided equally among all PEs. These assumptions may not - * be valid in all cases, for example when the provider has already - * partitioned resources or when a node has multiple NICs. */ - shmem_transport_ofi_stx_max = ofi_tx_ctx_cnt / num_on_node; - int remainder = ofi_tx_ctx_cnt % num_on_node; - int node_pe = shmem_internal_my_pe % shmem_internal_num_pes; - if (remainder > 0 && ((node_pe % num_on_node) < remainder)) { - shmem_transport_ofi_stx_max++; - } + if (shmem_transport_ofi_stx_max <= 0) + RAISE_ERROR_MSG("Not enough TX contexts (%d)\n", num_on_node); - if (shmem_transport_ofi_stx_max <= 0) - RAISE_ERROR_MSG("Not enough TX contexts (%d)\n", num_on_node); + /* When running more PEs than available STXs, must assign each PE at least 1 */ + if (shmem_transport_ofi_stx_max <= 0) { + shmem_transport_ofi_stx_max = 1; + RAISE_WARN_MSG("Need at least 1 STX per PE, but detected %ld available STXs for %d PEs\n", + ofi_tx_ctx_cnt, num_on_node); + } - /* When running more PEs than available STXs, must assign each PE at least 1 */ - if (shmem_transport_ofi_stx_max <= 0) { - shmem_transport_ofi_stx_max = 1; - RAISE_WARN_MSG("Need at least 1 STX per PE, but detected %ld available STXs for %d PEs\n", - ofi_tx_ctx_cnt, num_on_node); + DEBUG_MSG("Auto-set STX max to %ld\n", shmem_transport_ofi_stx_max); } - DEBUG_MSG("Auto-set STX max to %ld\n", shmem_transport_ofi_stx_max); - } - - /* 
Allocate STX array with max length */ - if (shmem_transport_ofi_stx_max > 0) { - shmem_transport_ofi_stx_pool = malloc(shmem_transport_ofi_stx_max * - sizeof(shmem_transport_ofi_stx_t)); - if (shmem_transport_ofi_stx_pool == NULL) { - RAISE_ERROR_STR("Out of memory when allocating OFI STX pool"); + /* Allocate STX array with max length */ + if (shmem_transport_ofi_stx_max > 0) { + shmem_transport_ofi_stx_pool[idx] = malloc(shmem_transport_ofi_stx_max * + sizeof(shmem_transport_ofi_stx_t)); + if (shmem_transport_ofi_stx_pool == NULL) { + RAISE_ERROR_STR("Out of memory when allocating OFI STX pool"); + } } - } - for (i = 0; i < shmem_transport_ofi_stx_max; i++) { - ret = fi_stx_context(shmem_transport_ofi_domainfd, NULL, - &shmem_transport_ofi_stx_pool[i].stx, NULL); - OFI_CHECK_RETURN_MSG(ret, "STX context creation failed (%s)\n", fi_strerror(ret)); - shmem_transport_ofi_stx_pool[i].ref_cnt = 0; - shmem_transport_ofi_stx_pool[i].is_private = 0; + for (i = 0; i < shmem_transport_ofi_stx_max; i++) { + ret = fi_stx_context(shmem_transport_ofi_domainfd, NULL, + &shmem_transport_ofi_stx_pool[idx][i].stx, NULL); + OFI_CHECK_RETURN_MSG(ret, "STX context creation failed (%s)\n", fi_strerror(ret)); + shmem_transport_ofi_stx_pool[idx][i].ref_cnt = 0; + shmem_transport_ofi_stx_pool[idx][i].is_private = 0; + } } - shmem_transport_ctx_default.team = &shmem_internal_team_world; + shmem_transport_ctx_default.stx_idx = malloc(shmem_transport_ofi_num_nics * sizeof(int)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ctx_default.stx_idx[idx] = -1; + } ret = shmem_transport_ofi_ctx_init(&shmem_transport_ctx_default, SHMEM_TRANSPORT_CTX_DEFAULT_ID); if (ret != 0) return ret; - ret = atomic_limitations_check(); - if (ret != 0) return ret; + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + ret = atomic_limitations_check(idx); + if (ret != 0) return ret; + } ret = populate_mr_tables(); if (ret != 0) return ret; @@ -2020,12 
+2098,20 @@ int shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, memset(ctxp, 0, sizeof(shmem_transport_ctx_t)); + ctxp->pending_put_cntr = malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctxp->pending_get_cntr = malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctxp->stx_idx = malloc(shmem_transport_ofi_num_nics * sizeof(int)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #ifndef USE_CTX_LOCK - shmem_internal_cntr_write(&ctxp->pending_put_cntr, 0); - shmem_internal_cntr_write(&ctxp->pending_get_cntr, 0); + shmem_internal_cntr_write(&ctxp->pending_put_cntr, 0); + shmem_internal_cntr_write(&ctxp->pending_get_cntr, 0); +#else + ctxp->pending_put_cntr[idx] = 0; + ctxp->pending_get_cntr[idx] = 0; #endif - ctxp->stx_idx = -1; + ctxp->stx_idx[idx] = -1; + } ctxp->options = options; ctxp->team = team; @@ -2054,6 +2140,9 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) if(shmem_internal_params.DEBUG) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (ctx->bounce_buffers) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); + // TODO: May want to include pending/completed counters for ALL NICs or at least an aggregate + // for each counter type +/* Causes seg. 
fault right now for obvious reasons DEBUG_MSG("id = %d, options = %#0lx, stx_idx = %d\n" RAISE_PE_PREFIX "pending_put_cntr = %9"PRIu64", completed_put_cntr = %9"PRIu64"\n" RAISE_PE_PREFIX "pending_get_cntr = %9"PRIu64", completed_get_cntr = %9"PRIu64"\n" @@ -2068,60 +2157,67 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) shmem_internal_my_pe, ctx->pending_bb_cntr, ctx->completed_bb_cntr ); +*/ if (ctx->bounce_buffers) SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } - if (ctx->ep) { - ret = fi_close(&ctx->ep->fid); - OFI_CHECK_ERROR_MSG(ret, "Context endpoint close failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->ep[idx]) { + ret = fi_close(&ctx->ep[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context endpoint close failed (%s)\n", fi_strerror(errno)); + } } if (ctx->bounce_buffers) { shmem_free_list_destroy(ctx->bounce_buffers); } - if (ctx->stx_idx >= 0) { - SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); - if (shmem_transport_ofi_is_private(ctx->options)) { - shmem_transport_ofi_stx_kvs_t *e; - HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, - sizeof(struct shmem_internal_tid), e); - if (e) { - shmem_transport_ofi_stx_t *stx = &shmem_transport_ofi_stx_pool[ctx->stx_idx]; - stx->ref_cnt--; - if (stx->ref_cnt == 0) { - HASH_DEL(shmem_transport_ofi_stx_kvs, e); - free(e); - shmem_transport_ofi_stx_pool[ctx->stx_idx].is_private = 0; + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->stx_idx[idx] >= 0) { + SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); + if (shmem_transport_ofi_is_private(ctx->options)) { + shmem_transport_ofi_stx_kvs_t *e; + HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, + sizeof(struct shmem_internal_tid), e); + if (e) { + shmem_transport_ofi_stx_t *stx = &shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]]; + stx->ref_cnt--; + if (stx->ref_cnt == 0) { + HASH_DEL(shmem_transport_ofi_stx_kvs, e); + free(e); 
+ shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].is_private = 0; + } + } + else { + RAISE_WARN_STR("Unable to locate private STX"); + } + } else { + shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].ref_cnt--; + if (shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].is_private) { + SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); + RAISE_ERROR_STR("Destroyed a ctx with an inconsistent is_private field"); } } - else { - RAISE_WARN_STR("Unable to locate private STX"); - } - } else { - shmem_transport_ofi_stx_pool[ctx->stx_idx].ref_cnt--; - if (shmem_transport_ofi_stx_pool[ctx->stx_idx].is_private) { - SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); - RAISE_ERROR_STR("Destroyed a ctx with an inconsistent is_private field"); - } + SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); } - SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); } - if (ctx->put_cntr) { - ret = fi_close(&ctx->put_cntr->fid); - OFI_CHECK_ERROR_MSG(ret, "Context put CNTR close failed (%s)\n", fi_strerror(errno)); - } + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->put_cntr && ctx->put_cntr[idx]) { + ret = fi_close(&ctx->put_cntr[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context put CNTR close failed (%s)\n", fi_strerror(errno)); + } - if (ctx->get_cntr) { - ret = fi_close(&ctx->get_cntr->fid); - OFI_CHECK_ERROR_MSG(ret, "Context get CNTR close failed (%s)\n", fi_strerror(errno)); - } + if (ctx->get_cntr && ctx->get_cntr[idx]) { + ret = fi_close(&ctx->get_cntr[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context get CNTR close failed (%s)\n", fi_strerror(errno)); + } - if (ctx->cq) { - ret = fi_close(&ctx->cq->fid); - OFI_CHECK_ERROR_MSG(ret, "Context CQ close failed (%s)\n", fi_strerror(errno)); + if (ctx->cq && ctx->cq[idx]) { + ret = fi_close(&ctx->cq[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context CQ close failed (%s)\n", fi_strerror(errno)); + } } #ifdef USE_CTX_LOCK @@ -2161,13 +2257,15 @@ int shmem_transport_fini(void) RAISE_WARN_MSG("Key/value store contained %d unfreed 
private contexts\n", stx_len); } - for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt != 0) - RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", - shmem_transport_ofi_stx_pool[i].is_private ? "private" : "shared", - i, shmem_transport_ofi_stx_pool[i].ref_cnt); - ret = fi_close(&shmem_transport_ofi_stx_pool[i].stx->fid); - OFI_CHECK_ERROR_MSG(ret, "STX context close failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt != 0) + RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", + shmem_transport_ofi_stx_pool[idx][i].is_private ? "private" : "shared", + i, shmem_transport_ofi_stx_pool[idx][i].ref_cnt); + ret = fi_close(&shmem_transport_ofi_stx_pool[idx][i].stx->fid); + OFI_CHECK_ERROR_MSG(ret, "STX context close failed (%s)\n", fi_strerror(errno)); + } } if (shmem_transport_ofi_stx_pool) free(shmem_transport_ofi_stx_pool); diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 616526bba..532f08478 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -33,6 +33,7 @@ #include "shmem_team.h" #include +extern size_t shmem_transport_ofi_num_nics; #if !defined(ENABLE_HARD_POLLING) #define ENABLE_TARGET_CNTR 1 @@ -316,23 +317,23 @@ struct shmem_transport_ctx_t { shmem_internal_mutex_t lock; #endif long options; - struct fid_ep* ep; - struct fid_cntr* put_cntr; - struct fid_cntr* get_cntr; - struct fid_cq* cq; + struct fid_ep** ep; + struct fid_cntr** put_cntr; + struct fid_cntr** get_cntr; + struct fid_cq** cq; #ifdef USE_CTX_LOCK /* Pending cntr accesses are protected by ctx lock */ - uint64_t pending_put_cntr; - uint64_t pending_get_cntr; + uint64_t* pending_put_cntr; + uint64_t* pending_get_cntr; #else - shmem_internal_cntr_t pending_put_cntr; - shmem_internal_cntr_t pending_get_cntr; + 
shmem_internal_cntr_t* pending_put_cntr; + shmem_internal_cntr_t* pending_get_cntr; #endif /* These counters are protected by the BB lock */ uint64_t pending_bb_cntr; uint64_t completed_bb_cntr; shmem_free_list_t *bounce_buffers; - int stx_idx; + int* stx_idx; struct shmem_internal_tid tid; struct shmem_internal_team_t *team; }; @@ -503,10 +504,14 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) long poll_count = 0; while (poll_count < shmem_transport_ofi_put_poll_limit || shmem_transport_ofi_put_poll_limit < 0) { - success = fi_cntr_read(ctx->put_cntr); - fail = fi_cntr_readerr(ctx->put_cntr); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + success = 0; + fail = 0; + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + success = fi_cntr_read(ctx->put_cntr[idx]); /* FIX */ + fail = fi_cntr_readerr(ctx->put_cntr[idx]); /* FIX */ + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + } shmem_transport_probe(); if (success < cnt && fail == 0) { @@ -521,14 +526,16 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); - do { - cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr, cnt, -1); - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); - OFI_CTX_CHECK_ERROR(ctx, ret); - } while (cnt < cnt_new); - shmem_internal_assert(cnt == cnt_new); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + do { + cnt = cnt_new; + ssize_t ret = fi_cntr_wait(ctx->put_cntr[idx], cnt, -1); /* FIX */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + OFI_CTX_CHECK_ERROR(ctx, ret); + } while (cnt < cnt_new); + shmem_internal_assert(cnt == cnt_new); + } SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -621,11 +628,11 @@ void shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void 
*target, const shmem_internal_assert(len <= shmem_transport_ofi_max_buffered_send); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_inject_write(ctx->ep, + ret = fi_inject_write(ctx->ep[1], /* FIX */ source, len, GET_DEST(dst), @@ -660,10 +667,11 @@ void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, con (size_t) (((uint8_t *) source) + len - frag_source)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_write(ctx->ep, + + ret = fi_write(ctx->ep[1], frag_source, frag_len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), GET_DEST(dst), frag_target, @@ -695,7 +703,7 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void } else if (len <= shmem_transport_ofi_bounce_buffer_size && ctx->bounce_buffers) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ shmem_transport_ofi_get_mr(target, pe, &addr, &key); shmem_transport_ofi_bounce_buffer_t *buff = @@ -715,7 +723,7 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void .data = 0 }; do { - ret = fi_writemsg(ctx->ep, &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); + ret = fi_writemsg(ctx->ep[1], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIX */ } while (try_again(ctx, ret, &polled)); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -764,7 +772,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE | FI_INJECT); + ret = fi_writemsg(ctx->ep[1], &msg, FI_DELIVERY_COMPLETE | FI_INJECT); /* FIX */ } while (try_again(ctx, ret, &polled)); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -809,10 +817,10 @@ void 
shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co msg.rma_iov = &rma_iov; msg.context = frag_source; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE); + ret = fi_writemsg(ctx->ep[1], &msg, FI_DELIVERY_COMPLETE); /* FIX */ } while (try_again(ctx, ret, &polled)); frag_source += frag_len; @@ -837,7 +845,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co int atomic_op = (sig_op == SHMEM_SIGNAL_ADD) ? FI_SUM : FI_ATOMIC_WRITE; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ const struct fi_ioc msg_iov_signal = { .addr = (uint8_t *) &signal, @@ -862,7 +870,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_atomicmsg(ctx->ep, &msg_signal, flags_signal); + ret = fi_atomicmsg(ctx->ep[1], &msg_signal, flags_signal); /* FIX */ } while (try_again(ctx, ret, &polled)); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -909,9 +917,9 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (len <= shmem_transport_ofi_max_msg_size) { - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_read(ctx->ep, + ret = fi_read(ctx->ep[1], /* FIX */ target, len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), @@ -931,10 +939,10 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s (size_t) (((uint8_t *) target) + len - frag_target)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_read(ctx->ep, + ret = fi_read(ctx->ep[1], frag_target, frag_len, 
GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), GET_DEST(dst), frag_source, @@ -967,9 +975,9 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) while (poll_count < shmem_transport_ofi_get_poll_limit || shmem_transport_ofi_get_poll_limit < 0) { - success = fi_cntr_read(ctx->get_cntr); - fail = fi_cntr_readerr(ctx->get_cntr); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + success = fi_cntr_read(ctx->get_cntr[1]); /* FIX */ + fail = fi_cntr_readerr(ctx->get_cntr[1]); /* FIX */ + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ shmem_transport_probe(); @@ -985,11 +993,11 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->get_cntr, cnt, -1); - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + ssize_t ret = fi_cntr_wait(ctx->get_cntr[1], cnt, -1); /* FIX */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); @@ -1031,10 +1039,10 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_compare_atomicmsg(ctx->ep, + ret = fi_compare_atomicmsg(ctx->ep[1], /* FIX */ &msg, &comparev, NULL, @@ -1072,10 +1080,10 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = 
fi_compare_atomic(ctx->ep, + ret = fi_compare_atomic(ctx->ep[1], /* FIX */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1111,10 +1119,10 @@ void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_compare_atomic(ctx->ep, + ret = fi_compare_atomic(ctx->ep[1], /* FIX */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1148,10 +1156,10 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_inject_atomic(ctx->ep, + ret = fi_inject_atomic(ctx->ep[1], /* FIX */ source, 1, GET_DEST(dst), @@ -1181,7 +1189,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi shmem_internal_assert(SHMEM_Dtsize[dt] * len == full_len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - ret = fi_atomicvalid(ctx->ep, dt, op, + ret = fi_atomicvalid(ctx->ep[1], dt, op, /* FIX */ &max_atomic_size); max_atomic_size = max_atomic_size * SHMEM_Dtsize[dt]; if (max_atomic_size > shmem_transport_ofi_max_msg_size @@ -1198,10 +1206,10 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_inject_atomic(ctx->ep, + ret = fi_inject_atomic(ctx->ep[1], /* FIX */ source, len, GET_DEST(dst), @@ -1219,7 +1227,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi 
create_bounce_buffer(ctx, source, full_len); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ const struct fi_ioc msg_iov = { .addr = buff->data, .count = len }; const struct fi_rma_ioc rma_iov = { .addr = (uint64_t) addr, .count = len, .key = key }; @@ -1236,7 +1244,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi .data = 0 }; do { - ret = fi_atomicmsg(ctx->ep, &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); + ret = fi_atomicmsg(ctx->ep[1], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIX */ } while (try_again(ctx, ret, &polled)); } else { @@ -1247,9 +1255,9 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi size_t chunksize = MIN((len-sent), (max_atomic_size/SHMEM_Dtsize[dt])); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ do { - ret = fi_atomic(ctx->ep, + ret = fi_atomic(ctx->ep[1], /* FIX */ (void *)((char *)source + (sent*SHMEM_Dtsize[dt])), chunksize, @@ -1305,10 +1313,10 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_fetch_atomicmsg(ctx->ep, + ret = fi_fetch_atomicmsg(ctx->ep[1], /* FIX */ &msg, &resultv, GET_MR_DESC_ADDR(shmem_transport_ofi_get_mr_desc_index(dest)), @@ -1344,10 +1352,10 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ do { - ret = fi_fetch_atomic(ctx->ep, + ret = fi_fetch_atomic(ctx->ep[1], /* FIX */ source, 1, 
GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1432,7 +1440,7 @@ int shmem_transport_atomic_supported(shm_internal_op_t op, * actually required by FI_THREAD_COMPLETION. */ SHMEM_TRANSPORT_OFI_CTX_LOCK(&shmem_transport_ctx_default); - int ret = fi_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[1], /* FIX */ SHMEM_TRANSPORT_DTYPE(datatype), op, &size); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(&shmem_transport_ctx_default); @@ -1531,7 +1539,7 @@ uint64_t shmem_transport_pcntr_get_issued_write(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[1]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { @@ -1547,7 +1555,7 @@ uint64_t shmem_transport_pcntr_get_issued_read(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); return cnt; } @@ -1557,7 +1565,7 @@ uint64_t shmem_transport_pcntr_get_completed_write(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = fi_cntr_read(ctx->put_cntr); + cnt = fi_cntr_read(ctx->put_cntr[1]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { @@ -1573,7 +1581,7 @@ uint64_t shmem_transport_pcntr_get_completed_read(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = fi_cntr_read(ctx->get_cntr); + cnt = fi_cntr_read(ctx->get_cntr[1]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); return cnt; } @@ -1610,11 +1618,11 @@ void shmem_transport_pcntr_get_all(shmem_transport_ctx_t *ctx, shmemx_pcntr_t *p pcntr->pending_put = ctx->pending_bb_cntr; SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); } 
- pcntr->completed_put += fi_cntr_read(ctx->put_cntr); - pcntr->completed_get = fi_cntr_read(ctx->get_cntr); + pcntr->completed_put += fi_cntr_read(ctx->put_cntr[1]); /* FIX */ + pcntr->completed_get = fi_cntr_read(ctx->get_cntr[1]); /* FIX */ - pcntr->pending_put += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); - pcntr->pending_get = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + pcntr->pending_put += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[1]); /* FIX */ + pcntr->pending_get = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); pcntr->target = shmem_transport_pcntr_get_completed_target(); From cc8b0e7c44e67b76ae98940b723f5353528e4a60 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 7 May 2024 13:24:35 -0700 Subject: [PATCH 02/11] src: Add initial NIC selection logic for multiplexing --- src/atomic_c.c4 | 80 +++++++++++++++++++++++++++------------- src/atomic_nbi_c.c4 | 5 ++- src/data_c.c4 | 71 +++++++++++++++++++++++++---------- src/shmem_comm.h | 19 +++++----- src/shmem_internal.h | 16 ++++++++ src/shmem_lock.h | 28 +++++++------- src/transport_none.h | 2 +- src/transport_ofi.h | 24 +++++++----- src/transport_portals4.h | 6 +-- src/transport_ucx.h | 2 +- 10 files changed, 169 insertions(+), 84 deletions(-) diff --git a/src/atomic_c.c4 b/src/atomic_c.c4 index 7b9c648b5..f8b20dd6c 100644 --- a/src/atomic_c.c4 +++ b/src/atomic_c.c4 @@ -232,9 +232,12 @@ SHMEM_DEFINE_FOR_EXTENDED_AMO(`SHMEM_PROF_DEF_CTX_ATOMIC_SET') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_swap(ctx, target, &value, &newval, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -251,8 +254,11 @@ shmem_swap(long *target, long value, int pe) 
SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(long)); - shmem_internal_swap(SHMEM_CTX_DEFAULT, target, &value, &newval, sizeof(long), pe, SHM_INTERNAL_LONG); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + + shmem_internal_swap(SHMEM_CTX_DEFAULT, target, &value, &newval, sizeof(long), pe, SHM_INTERNAL_LONG, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); return newval; } #endif @@ -267,9 +273,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap(ctx, target, &value, &newval, &cond, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -283,9 +292,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap(ctx, target, &value, &newval, &cond, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -311,10 +323,13 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &tmp, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -327,10 +342,13 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ 
SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &tmp, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -358,10 +376,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -375,10 +395,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -393,9 +415,11 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_fetch(ctx, &val, (void *) source, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return val; \ } @@ -454,7 +478,7 @@ shmem_swap(long *target, long value, int pe) #define SHMEM_DEF_FETCH_XOR(STYPE,TYPE,ITYPE) \ TYPE SHMEM_FUNCTION_ATTRIBUTES \ - SHMEM_FUNC_PROTOTYPE(STYPE, fetch_xor, TYPE *target, TYPE value, \ + SHMEM_FUNC_PROTOTYPE(STYPE, fetch_xor, TYPE *target, TYPE value, \ int pe) \ TYPE oldval; \ SHMEM_ERR_CHECK_INITIALIZED(); \ 
@@ -462,10 +486,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BXOR, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -479,10 +505,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BAND, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -496,10 +524,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BOR, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } diff --git a/src/atomic_nbi_c.c4 b/src/atomic_nbi_c.c4 index b8e644058..3b69d6a89 100644 --- a/src/atomic_nbi_c.c4 +++ b/src/atomic_nbi_c.c4 @@ -179,8 +179,11 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_fetch(ctx, fetch, (void *) source, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } diff --git a/src/data_c.c4 b/src/data_c.c4 index 31233b998..0973192cf 100644 --- a/src/data_c.c4 +++ b/src/data_c.c4 @@ -318,9 +318,12 @@ 
SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(addr, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, &tmp, addr, sizeof(TYPE),\ - pe); \ - shmem_internal_get_wait(ctx); \ + pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return tmp; \ } @@ -413,9 +416,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, \ - sizeof(TYPE) * nelems, pe); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE) * nelems, pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } @@ -432,9 +438,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE)*nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, (SIZE)*nelems, \ - pe); \ - shmem_internal_get_wait(ctx); \ + pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } @@ -451,8 +460,10 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, sizeof(TYPE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -469,7 +480,10 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ - shmem_internal_get(ctx, target, source, (SIZE)*nelems, pe);\ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ + shmem_internal_get(ctx, target, source, (SIZE)*nelems, \ + pe, nic_idx); \ } #define 
SHMEM_DEF_IPUT(STYPE,TYPE) \ @@ -593,13 +607,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nelems-1) * tst + 1), \ sizeof(TYPE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_get(ctx, target, source, sizeof(TYPE), \ - pe); \ + pe, nic_idx); \ target += tst; \ source += sst; \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IBGET(STYPE,TYPE) \ @@ -619,13 +636,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nblocks-1) * tst + bsize), \ sizeof(TYPE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_get(ctx, target, source, \ - bsize * sizeof(TYPE), pe); \ + bsize * sizeof(TYPE), pe, nic_idx); \ target += tst; \ source += sst; \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IGET_N(NAME,SIZE) \ @@ -646,12 +666,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nelems-1) * tst + 1), \ (SIZE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ - shmem_internal_get(ctx, target, source, (SIZE), pe);\ + shmem_internal_get(ctx, target, source, (SIZE), \ + pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IBGET_N(NAME,SIZE) \ @@ -672,13 +696,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nblocks-1) * tst + bsize), \ (SIZE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; 
nblocks > 0 ; --nblocks) { \ shmem_internal_get(ctx, target, source, \ - bsize * (SIZE), pe); \ + bsize * (SIZE), pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_PUT_SIGNAL(STYPE,TYPE) \ @@ -871,10 +898,12 @@ shmem_signal_fetch(const uint64_t* sig_addr) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) sig_addr, sizeof(uint64_t), shmem_internal_my_pe, - SHM_INTERNAL_UINT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + SHM_INTERNAL_UINT64, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); return val; } @@ -932,8 +961,10 @@ shmemx_getmem_ct(shmemx_ct_t ct, void *target, const void *source, size_t nelems SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_NULL(target, nelems); - shmem_internal_get_ct(ct, target, source, nelems, pe); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_get_ct(ct, target, source, nelems, pe, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES shmemx_putmem_ct(shmemx_ct_t ct, void *target, const void *source, diff --git a/src/shmem_comm.h b/src/shmem_comm.h index a08ede1db..f58889191 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -123,7 +123,7 @@ shmem_internal_put_ct_nb(shmemx_ct_t ct, void *target, const void *source, size_ static inline void -shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { if (len == 0) return; @@ -137,7 +137,8 @@ shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len static inline 
void -shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, int pe) +shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, + int pe, size_t nic_idx) { /* TODO: add shortcut for on-node-comms */ shmem_transport_get_ct((shmem_transport_ct_t *) ct, @@ -147,16 +148,16 @@ shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t l static inline void -shmem_internal_get_wait(shmem_ctx_t ctx) +shmem_internal_get_wait(shmem_ctx_t ctx, size_t idx) { - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, idx); /* on-node is always blocking, so this is a no-op for them */ } static inline void shmem_internal_swap(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -188,7 +189,7 @@ shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, static inline void shmem_internal_cswap(shmem_ctx_t ctx, void *target, void *source, void *dest, void *operand, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -221,7 +222,7 @@ shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, static inline void shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, void *mask, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -262,7 +263,7 @@ shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t static inline void shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len 
> 0); @@ -304,7 +305,7 @@ static inline void shmem_internal_fetch_atomic(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, shm_internal_op_t op, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); diff --git a/src/shmem_internal.h b/src/shmem_internal.h index d6fe359be..59bc7549c 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -186,6 +186,22 @@ extern hwloc_topology_t shmem_internal_topology; } \ } while(0) +/* TODO: Add definition if not using OFI or if multiplexing disabled. + * Would just return 0, or just do nothing since nic_idx will already + * be initialized to 0. + */ +#ifdef USE_OFI +#define SHMEM_GET_TRANSMIT_NIC_IDX(idx) \ + do { \ + int rand_int = rand_r(&shmem_internal_rand_seed); \ + double normalized = (double)rand_int / (double)RAND_MAX; \ + int range = shmem_transport_ofi_num_nics - 1; \ + idx = (int)(normalized * range); \ + } while (0) +#else +#define SHMEM_GET_TRANSMIT_NIC_IDX(idx) +#endif + #ifdef ENABLE_ERROR_CHECKING #define SHMEM_ERR_CHECK_INITIALIZED() \ do { \ diff --git a/src/shmem_lock.h b/src/shmem_lock.h index e0c2812ce..158cafc84 100644 --- a/src/shmem_lock.h +++ b/src/shmem_lock.h @@ -47,8 +47,8 @@ shmem_internal_clear_lock(long *lockp) /* release the lock if I'm the last to try to obtain it */ cond = shmem_internal_my_pe + 1; shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &zero, &curr, &cond, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? 
/* if local PE was not the last to hold the lock, look for the next in line */ if (curr != shmem_internal_my_pe + 1) { @@ -58,8 +58,8 @@ shmem_internal_clear_lock(long *lockp) for (;;) { shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), sizeof(int), shmem_internal_my_pe, - SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? if (NEXT(cur_data) != 0) break; @@ -69,8 +69,8 @@ shmem_internal_clear_lock(long *lockp) /* set the signal bit on new lock holder */ shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &sig, &curr, - &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, 0);// Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? } } @@ -88,24 +88,24 @@ shmem_internal_set_lock(long *lockp) /* update last with my value to add me to the queue */ shmem_internal_swap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? /* If I wasn't the first, need to add myself to the previous last's next */ if (0 != curr) { int next_mask = NEXT_MASK; shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &me, &curr, - &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? 
/* now wait for the signal part of data to be non-zero */ for (;;) { int cur_data; shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), - sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? if (SIGNAL(cur_data) != 0) break; @@ -134,8 +134,8 @@ shmem_internal_test_lock(long *lockp) /* add self to last if and only if the lock is zero (ie, no one has the lock) */ shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, &zero, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? if (0 == curr) { shmem_internal_membar_acquire(); diff --git a/src/transport_none.h b/src/transport_none.h index f2a8dfc3a..f0d517d07 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -157,7 +157,7 @@ shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { /* Nop */ } diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 532f08478..39545b535 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -408,7 +408,7 @@ int shmem_transport_fini(void); extern size_t SHMEM_Dtsize[FI_DATATYPE_LAST]; -static inline void shmem_transport_get_wait(shmem_transport_ctx_t* ctx); +static inline void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx); /* Drain all available events from the CQ. 
Note, ctx->bounce_buffers must be * locked before calling this routine */ @@ -545,7 +545,9 @@ int shmem_transport_quiet(shmem_transport_ctx_t* ctx) { shmem_transport_put_quiet(ctx); - shmem_transport_get_wait(ctx); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_get_wait(ctx, idx); + } return 0; } @@ -560,7 +562,9 @@ int shmem_transport_fence(shmem_transport_ctx_t* ctx) shmem_transport_put_quiet(ctx); #endif /* Complete fetching ops; needed to support nonblocking fetch-atomics */ - shmem_transport_get_wait(ctx); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_get_wait(ctx, idx); + } return 0; } @@ -958,7 +962,7 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s static inline -void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { /* wait for get counter to meet outstanding count value */ @@ -975,9 +979,9 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) while (poll_count < shmem_transport_ofi_get_poll_limit || shmem_transport_ofi_get_poll_limit < 0) { - success = fi_cntr_read(ctx->get_cntr[1]); /* FIX */ - fail = fi_cntr_readerr(ctx->get_cntr[1]); /* FIX */ - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ + success = fi_cntr_read(ctx->get_cntr[idx]); + fail = fi_cntr_readerr(ctx->get_cntr[idx]); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); shmem_transport_probe(); @@ -993,11 +997,11 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->get_cntr[1], cnt, -1); /* FIX */ - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[1]); /* FIX */ + ssize_t ret = fi_cntr_wait(ctx->get_cntr[idx], 
cnt, -1); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index af0223d9d..b31f1fb47 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -242,7 +242,7 @@ int shmem_transport_startup(void); int shmem_transport_fini(void); -static inline void shmem_transport_get_wait(shmem_transport_ctx_t*); +static inline void shmem_transport_get_wait(shmem_transport_ctx_t*, size_t idx); static inline void shmem_transport_probe(void) { return; @@ -257,7 +257,7 @@ shmem_transport_quiet(shmem_transport_ctx_t* ctx) uint64_t cnt, cnt_new; /* wait for completion of all pending NB get events */ - shmem_transport_get_wait(ctx); + shmem_transport_get_wait(ctx, 0); /* wait for remote completion (acks) of all buffered puts */ /* NOTE-MT: continue to wait if additional operations are issued during the quiet */ @@ -696,7 +696,7 @@ void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { int ret; ptl_ct_event_t ct; diff --git a/src/transport_ucx.h b/src/transport_ucx.h index 779c55ba7..c74165007 100644 --- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -315,7 +315,7 @@ shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { /* Blocking fetching ops are completed in place, so this is a nop */ } From 2068d75245aa5fe92b9942d23134c75c6549ddd5 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 8 May 2024 12:53:39 -0700 Subject: [PATCH 03/11] src: Add nic_idx parameter to where needed in Fortran files --- src/atomic_f.c | 60 +++++++++++++++++++++++++-------------------------
src/data_f.c4 | 10 ++++----- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/atomic_f.c b/src/atomic_f.c index 663e033f4..d084be71c 100644 --- a/src/atomic_f.c +++ b/src/atomic_f.c @@ -41,8 +41,8 @@ FC_SHMEM_SWAP(fortran_integer_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, SIZEOF_FORTRAN_INTEGER); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, SIZEOF_FORTRAN_INTEGER, - *pe, SHM_INTERNAL_FORTRAN_INTEGER); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_FORTRAN_INTEGER, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -64,8 +64,8 @@ FC_SHMEM_INT4_SWAP(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 4, - *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -87,8 +87,8 @@ FC_SHMEM_INT8_SWAP(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 8, - *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -112,8 +112,8 @@ FC_SHMEM_REAL4_SWAP(float *target, shmem_internal_assert(sizeof(float) == 4); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 4, - *pe, SHM_INTERNAL_FLOAT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_FLOAT, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -137,8 +137,8 @@ FC_SHMEM_REAL8_SWAP(double *target, shmem_internal_assert(sizeof(double) == 8); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 8, - *pe, SHM_INTERNAL_DOUBLE); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_DOUBLE, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -163,8 +163,8 @@ FC_SHMEM_INT4_CSWAP(int32_t *target, 
shmem_internal_cswap(SHMEM_CTX_DEFAULT, target, value, &newval, cond, 4, - *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -189,8 +189,8 @@ FC_SHMEM_INT8_CSWAP(int64_t *target, shmem_internal_cswap(SHMEM_CTX_DEFAULT, target, value, &newval, cond, 8, - *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -212,8 +212,8 @@ FC_SHMEM_INT4_FADD(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, value, &oldval, 4, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -235,8 +235,8 @@ FC_SHMEM_INT8_FADD(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, value, &oldval, 8, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -256,8 +256,8 @@ FC_SHMEM_INT4_FINC(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, &tmp, &oldval, 4, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -277,8 +277,8 @@ FC_SHMEM_INT8_FINC(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, &tmp, &oldval, 8, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64, 0); + 
shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -373,8 +373,8 @@ FC_SHMEM_INT4_FETCH(int32_t *source, SHMEM_ERR_CHECK_PE(*pe); SHMEM_ERR_CHECK_SYMMETRIC(source, 4); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -394,8 +394,8 @@ FC_SHMEM_INT8_FETCH(int64_t *source, SHMEM_ERR_CHECK_PE(*pe); SHMEM_ERR_CHECK_SYMMETRIC(source, 8); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -417,8 +417,8 @@ FC_SHMEM_REAL4_FETCH(float *source, shmem_internal_assert(sizeof(float) == 4); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -440,8 +440,8 @@ FC_SHMEM_REAL8_FETCH(double *source, shmem_internal_assert(sizeof(double) == 8); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } diff --git a/src/data_f.c4 b/src/data_f.c4 index 5753463d3..568286157 100644 --- a/src/data_f.c4 +++ b/src/data_f.c4 @@ -136,8 +136,8 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_IPUT_SIZE') SHMEM_ERR_CHECK_NULL(target, *len); \ \ 
shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, \ - SIZE * *len, *pe); \ - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); \ + SIZE * *len, *pe, 0); \ + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); \ } define(`SHMEM_WRAP_FC_GET', @@ -161,7 +161,7 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_GET_SIZE') SHMEM_ERR_CHECK_NULL(target, *nelems); \ \ shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, \ - SIZE * *nelems, *pe); \ + SIZE * *nelems, *pe, 0); \ } define(`SHMEM_WRAP_FC_GET_NBI', @@ -195,11 +195,11 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_GET_NBI_SIZE') \ for ( ; len > 0 ; --len ) { \ shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, SIZE, \ - *pe); \ + *pe, 0); \ target += (*tst * SIZE); \ source += (*sst * SIZE); \ } \ - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); \ + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); \ } define(`SHMEM_WRAP_FC_IGET', From 0cb61b12f4137e5aec187da6eaba79ad4291f601 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 8 May 2024 15:17:44 -0700 Subject: [PATCH 04/11] src: Pass NIC index to transport layer functions --- src/atomic_c.c4 | 25 ++++-- src/atomic_nbi_c.c4 | 32 +++++-- src/data_c.c4 | 111 +++++++++++++++++------- src/shmem_comm.h | 79 +++++++++-------- src/shmem_internal.h | 3 +- src/shr_transport.h4 | 11 +-- src/transport_none.h | 33 +++---- src/transport_ofi.c | 73 +++++++++++----- src/transport_ofi.h | 180 ++++++++++++++++++++------------------- src/transport_portals4.h | 47 +++++----- src/transport_ucx.h | 39 ++++----- 11 files changed, 376 insertions(+), 257 deletions(-) diff --git a/src/atomic_c.c4 b/src/atomic_c.c4 index f8b20dd6c..7ae7ac7c4 100644 --- a/src/atomic_c.c4 +++ b/src/atomic_c.c4 @@ -310,8 +310,11 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &tmp, sizeof(TYPE), \ - pe, SHM_INTERNAL_SUM, ITYPE); 
\ + pe, SHM_INTERNAL_SUM, ITYPE, nic_idx); \ } @@ -361,8 +364,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_SUM, ITYPE); \ + pe, SHM_INTERNAL_SUM, ITYPE, nic_idx); \ } @@ -432,8 +437,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(dest, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_set(ctx, (void *) dest, &value, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } @@ -445,8 +452,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_BXOR, ITYPE); \ + pe, SHM_INTERNAL_BXOR, ITYPE, nic_idx); \ } @@ -458,8 +467,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_BAND, ITYPE); \ + pe, SHM_INTERNAL_BAND, ITYPE, nic_idx); \ } @@ -471,8 +482,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_BOR, ITYPE); \ + pe, SHM_INTERNAL_BOR, ITYPE, nic_idx); \ } diff --git a/src/atomic_nbi_c.c4 b/src/atomic_nbi_c.c4 index 3b69d6a89..fdd687e7e 100644 --- a/src/atomic_nbi_c.c4 +++ b/src/atomic_nbi_c.c4 @@ -124,8 +124,12 @@ 
SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_swap_nbi(ctx, target, &value, fetch, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, \ + nic_idx); \ } @@ -151,9 +155,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &tmp, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -165,9 +172,13 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, \ - SHM_INTERNAL_SUM, ITYPE); \ + SHM_INTERNAL_SUM, ITYPE, \ + nic_idx); \ } @@ -195,9 +206,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BXOR,\ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -209,9 +223,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BAND,\ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -223,9 +240,12 @@ 
SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BOR, \ - ITYPE); \ + ITYPE, nic_idx); \ } /* Function prototype for v1.4 routines with the default context: */ diff --git a/src/data_c.c4 b/src/data_c.c4 index 0973192cf..45a934445 100644 --- a/src/data_c.c4 +++ b/src/data_c.c4 @@ -305,8 +305,11 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(addr, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_scalar(ctx, addr, &value, sizeof(TYPE), \ - pe); \ + pe, nic_idx); \ } #define SHMEM_DEF_G(STYPE,TYPE) \ @@ -340,10 +343,13 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ - &completion); \ - shmem_internal_put_wait(ctx, &completion); \ + &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ } @@ -361,9 +367,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems,\ - pe, &completion); \ - shmem_internal_put_wait(ctx, &completion); \ + pe, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ } @@ -379,9 +388,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, 
sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nbi(ctx, target, source, \ sizeof(TYPE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -398,8 +410,11 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nbi(ctx, target, source, (SIZE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -502,9 +517,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nelems-1) * tst + 1), \ sizeof(TYPE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_put_scalar(ctx, target, source, \ - sizeof(TYPE), pe); \ + sizeof(TYPE), pe, nic_idx); \ target += tst; \ source += sst; \ } \ @@ -527,14 +545,17 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nblocks-1) * tst + bsize), \ sizeof(TYPE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_put_nb(ctx, target, source, \ bsize * sizeof(TYPE), pe, \ - &completion); \ + &completion, nic_idx); \ target += tst; \ source += sst; \ } \ - shmem_internal_put_wait(ctx, &completion); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ } #define SHMEM_DEF_IPUT_N(NAME,SIZE) \ @@ -554,9 +575,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nelems-1) * tst + 1), \ (SIZE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_put_scalar(ctx, target, source, (SIZE), \ - pe); \ + pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ 
source = (uint8_t *) source + sst * (SIZE); \ } \ @@ -580,14 +604,17 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nblocks-1) * tst + bsize), \ (SIZE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_put_nb(ctx, target, source, \ bsize * (SIZE), pe, \ - &completion); \ + &completion, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_put_wait(ctx, &completion); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ } #define SHMEM_DEF_IGET(STYPE,TYPE) \ @@ -698,7 +725,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') 0, (shmem_internal_my_pe == pe)); \ \ size_t nic_idx = 0; \ - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_get(ctx, target, source, \ bsize * (SIZE), pe, nic_idx); \ @@ -724,19 +751,22 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ - &completion); \ - shmem_internal_put_wait(ctx, &completion); \ + &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ pe, SHM_INTERNAL_SUM, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ else \ shmem_internal_atomic_set(ctx, sig_addr, &signal, \ sizeof(uint64_t), pe, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ } @@ -756,18 +786,21 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems, \ - pe, &completion); \ - shmem_internal_put_wait(ctx, &completion); \ + pe, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion, nic_idx); \ shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ pe, SHM_INTERNAL_SUM, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ else \ shmem_internal_atomic_set(ctx, sig_addr, &signal, \ sizeof(uint64_t), pe, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ } #define SHMEM_DEF_PUT_SIGNAL_NBI(STYPE,TYPE) \ @@ -784,10 +817,14 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, sig_addr, sizeof(TYPE) * nelems, \ sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ + \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_signal_nbi(ctx, target, source, \ sizeof(TYPE) * nelems, sig_addr, \ - signal, sig_op, pe); \ + signal, sig_op, pe, nic_idx); \ } @@ -806,8 +843,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_signal_nbi(ctx, target, source, (SIZE) * nelems, \ - sig_addr, signal, sig_op, pe); \ + sig_addr, signal, sig_op, \ + pe, nic_idx); \ } @@ -914,8 +955,10 @@ shmemx_signal_add(uint64_t *sig_addr, uint64_t signal, int pe) SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic(SHMEM_CTX_DEFAULT, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -926,8 +969,10 @@ shmemx_ctx_signal_add(shmem_ctx_t ctx, uint64_t 
*sig_addr, uint64_t signal, int SHMEM_ERR_CHECK_CTX(ctx); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -937,8 +982,10 @@ shmemx_signal_set(uint64_t *sig_addr, uint64_t signal, int pe) SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, (void *) sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -949,8 +996,10 @@ shmemx_ctx_signal_set(shmem_ctx_t ctx, uint64_t *sig_addr, uint64_t signal, int SHMEM_ERR_CHECK_CTX(ctx); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_set(ctx, (void *) sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -977,8 +1026,10 @@ void SHMEM_FUNCTION_ATTRIBUTES shmemx_putmem_ct(shmemx_ct_t ct, void *target, co SHMEM_ERR_CHECK_SYMMETRIC(target, nelems); SHMEM_ERR_CHECK_NULL(source, nelems); - shmem_internal_put_ct_nb(ct, target, source, nelems, pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_put_ct_nb(ct, target, source, nelems, pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); } diff --git a/src/shmem_comm.h b/src/shmem_comm.h index f58889191..32185ac78 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -33,7 +33,7 @@ static inline void shmem_internal_put_nb(shmem_ctx_t ctx, void *target, const void 
*source, size_t len, int pe, - long *completion) + long *completion, size_t nic_idx) { if (len == 0) return; @@ -41,23 +41,23 @@ shmem_internal_put_nb(shmem_ctx_t ctx, void *target, const void *source, size_t if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { shmem_shr_transport_put(ctx, target, source, len, pe); } else { - shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, completion); + shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, completion, nic_idx); } } static inline void -shmem_internal_put_wait(shmem_ctx_t ctx, long *completion) +shmem_internal_put_wait(shmem_ctx_t ctx, long *completion, size_t nic_idx) { - shmem_transport_put_wait((shmem_transport_ctx_t *)ctx, completion); + shmem_transport_put_wait((shmem_transport_ctx_t *)ctx, completion, nic_idx); /* on-node is always blocking, so this is a no-op for them */ } static inline void -shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -65,11 +65,11 @@ shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, siz shmem_shr_transport_put_scalar(ctx, target, source, len, pe); } else { #ifndef DISABLE_OFI_INJECT - shmem_transport_put_scalar((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_put_scalar((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); #else long completion = 0; - shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, &completion); - shmem_internal_put_wait(ctx, &completion); + shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, &completion, nic_idx); + shmem_internal_put_wait(ctx, &completion, nic_idx); #endif } } @@ -77,35 +77,35 @@ shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, siz static 
inline void shmem_internal_put_signal_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { if (len == 0) { if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); return; } if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { - shmem_shr_transport_put_signal(ctx, target, source, len, sig_addr, signal, sig_op, pe); + shmem_shr_transport_put_signal(ctx, target, source, len, sig_addr, signal, sig_op, pe, nic_idx); } else { - shmem_transport_put_signal_nbi((shmem_transport_ctx_t *) ctx, target, source, len, sig_addr, signal, sig_op, pe); + shmem_transport_put_signal_nbi((shmem_transport_ctx_t *) ctx, target, source, len, sig_addr, signal, sig_op, pe, nic_idx); } } static inline void -shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { if (len == 0) return; if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { shmem_shr_transport_put(ctx, target, source, len, pe); } else { - shmem_transport_put_nbi((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_put_nbi((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); } } @@ -113,11 +113,11 @@ shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t static inline void shmem_internal_put_ct_nb(shmemx_ct_t ct, void *target, const void *source, size_t len, int pe, - long *completion) + long *completion, 
size_t nic_idx) { /* TODO: add shortcut for on-node-comms */ shmem_transport_put_ct_nb((shmem_transport_ct_t *) - ct, target, source, len, pe, completion); + ct, target, source, len, pe, completion, nic_idx); } @@ -130,7 +130,7 @@ shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len if (shmem_shr_transport_use_read(ctx, target, source, len, pe)) { shmem_shr_transport_get(ctx, target, source, len, pe); } else { - shmem_transport_get((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_get((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); } } @@ -142,7 +142,7 @@ shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t l { /* TODO: add shortcut for on-node-comms */ shmem_transport_get_ct((shmem_transport_ct_t *) ct, - target, source, len, pe); + target, source, len, pe, nic_idx); } @@ -164,7 +164,7 @@ shmem_internal_swap(shmem_ctx_t ctx, void *target, void *source, void *dest, siz if (shmem_shr_transport_use_atomic(ctx, target, len, pe, datatype)) { shmem_shr_transport_swap(ctx, target, source, dest, len, pe, datatype); } else { - shmem_transport_swap((shmem_transport_ctx_t *)ctx, target, source, dest, len, pe, datatype); + shmem_transport_swap((shmem_transport_ctx_t *)ctx, target, source, dest, len, pe, datatype, nic_idx); } } @@ -173,7 +173,7 @@ static inline void shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -181,7 +181,7 @@ shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, shmem_shr_transport_swap(ctx, target, source, dest, len, pe, datatype); } else { shmem_transport_swap_nbi((shmem_transport_ctx_t *)ctx, target, source, - dest, len, pe, datatype); + dest, len, pe, datatype, nic_idx); } } @@ -197,7 +197,7 @@ shmem_internal_cswap(shmem_ctx_t ctx, void *target, void *source, void 
*dest, vo shmem_shr_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); } else { shmem_transport_cswap((shmem_transport_ctx_t *)ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, datatype, nic_idx); } } @@ -230,7 +230,7 @@ shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, vo shmem_shr_transport_mswap(ctx, target, source, dest, mask, len, pe, datatype); } else { shmem_transport_mswap((shmem_transport_ctx_t *)ctx, target, source, - dest, mask, len, pe, datatype); + dest, mask, len, pe, datatype, nic_idx); } } @@ -238,7 +238,8 @@ shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, vo static inline void shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { shmem_internal_assert(len > 0); @@ -254,8 +255,7 @@ shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); #else shmem_transport_atomic((shmem_transport_ctx_t *)ctx, target, source, - len, pe, op, datatype); -#endif + len, pe, op, datatype, nic_idx); } } @@ -271,7 +271,7 @@ shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, s shmem_shr_transport_atomic_fetch(ctx, target, source, len, pe, datatype); } else { shmem_transport_atomic_fetch((shmem_transport_ctx_t *)ctx, target, - source, len, pe, datatype); + source, len, pe, datatype, nic_idx); } } @@ -279,7 +279,7 @@ shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, s static inline void shmem_internal_atomic_set(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -295,8 +295,7 @@ 
shmem_internal_atomic_set(shmem_ctx_t ctx, void *target, const void *source, siz shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); #else shmem_transport_atomic_set((shmem_transport_ctx_t *)ctx, target, - source, len, pe, datatype); -#endif + source, len, pe, datatype, nic_idx); } } @@ -314,7 +313,7 @@ shmem_internal_fetch_atomic(shmem_ctx_t ctx, void *target, void *source, void *d op, datatype); } else { shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, dest, len, pe, op, datatype); + source, dest, len, pe, op, datatype, nic_idx); } } @@ -323,7 +322,7 @@ static inline void shmem_internal_atomicv(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, shm_internal_op_t op, - shm_internal_datatype_t datatype, long *completion) + shm_internal_datatype_t datatype, long *completion, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -336,14 +335,14 @@ shmem_internal_atomicv(shmem_ctx_t ctx, void *target, const void *source, for (size_t i = 0; i < count; i++) { shmem_internal_fetch_atomic(ctx, ((uint8_t *) target) + (i * type_size), ((uint8_t *) source) + (i * type_size), &tmp_fetch, type_size, - pe, op, datatype); + pe, op, datatype, nic_idx); } #else if (shmem_shr_transport_use_atomic(ctx, target, len, pe, datatype)) { shmem_shr_transport_atomicv(ctx, target, source, len, pe, op, datatype); } else { shmem_transport_atomicv((shmem_transport_ctx_t *)ctx, target, source, len, - pe, op, datatype, completion); + pe, op, datatype, completion, nic_idx); } #endif } @@ -353,7 +352,7 @@ static inline void shmem_internal_fetch_atomic_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -362,7 +361,7 @@ shmem_internal_fetch_atomic_nbi(shmem_ctx_t ctx, void *target, void *source, op, datatype); } else { 
shmem_transport_fetch_atomic_nbi((shmem_transport_ctx_t *)ctx, target, - source, dest, len, pe, op, datatype); + source, dest, len, pe, op, datatype, nic_idx); } } @@ -404,7 +403,7 @@ void shmem_internal_ct_wait(shmemx_ct_t ct, long wait_for) /* Uses internal put for external heap config; otherwise memcpy */ static inline -void shmem_internal_copy_self(void *dest, const void *source, size_t nelems) +void shmem_internal_copy_self(void *dest, const void *source, size_t nelems, size_t nic_idx) { #ifdef USE_FI_HMEM // "completion" set to 1 to wait for completion of put operation initiated @@ -412,8 +411,8 @@ void shmem_internal_copy_self(void *dest, const void *source, size_t nelems) // to shmem_internal_put_nb. long completion = 1; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, dest, source, nelems, - shmem_internal_my_pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_my_pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); #else memcpy(dest, source, nelems); #endif diff --git a/src/shmem_internal.h b/src/shmem_internal.h index 59bc7549c..377a6814e 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -195,8 +195,7 @@ extern hwloc_topology_t shmem_internal_topology; do { \ int rand_int = rand_r(&shmem_internal_rand_seed); \ double normalized = (double)rand_int / (double)RAND_MAX; \ - int range = shmem_transport_ofi_num_nics - 1; \ - idx = (int)(normalized * range); \ + idx = (int)(normalized * shmem_transport_ofi_num_nics); \ } while (0) #else #define SHMEM_GET_TRANSMIT_NIC_IDX(idx) diff --git a/src/shr_transport.h4 b/src/shr_transport.h4 index 9379ef2e5..fd7db7633 100644 --- a/src/shr_transport.h4 +++ b/src/shr_transport.h4 @@ -566,7 +566,8 @@ SHMEM_DEFINE_FOR_AMO(SHMEM_DEF_SUM_OP) static inline void shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, 
uint64_t signal, int sig_op, int pe, + size_t nic_idx) { #if USE_MEMCPY memcpy(target, source, len); @@ -587,10 +588,10 @@ shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, #else if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); #endif #elif USE_CMA shmem_transport_cma_put(target, source, len, pe, @@ -600,10 +601,10 @@ shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, /* Using network atomics as CMA does not support atomic operations */ if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); #else RAISE_ERROR_STR("No path to peer"); #endif diff --git a/src/transport_none.h b/src/transport_none.h index f0d517d07..f85d121c9 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -112,7 +112,7 @@ shmem_transport_fence(shmem_transport_ctx_t* ctx) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -128,14 +128,14 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void 
*source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { /* No op */ } @@ -143,14 +143,14 @@ shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void -shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -166,7 +166,7 @@ shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -174,7 +174,7 @@ shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *sourc static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -183,7 +183,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No 
path to peer"); } @@ -201,7 +201,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -209,7 +209,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -225,7 +225,7 @@ shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -233,7 +233,7 @@ shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -241,7 +241,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -249,7 +249,7 @@ 
shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -294,14 +294,15 @@ void shmem_transport_ct_wait(shmem_transport_ct_t *ct, long wait_for) static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void - *source, size_t len, int pe, long *completion) + *source, size_t len, int pe, long *completion, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void - *target, const void *source, size_t len, int pe) + *target, const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 02d49264e..690c55054 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -631,8 +631,9 @@ int bind_enable_ep_resources(shmem_transport_ctx_t *ctx, size_t idx) FI_SELECTIVE_COMPLETION | FI_TRANSMIT | FI_RECV); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind CQ to endpoint failed"); - ret = fi_ep_bind(ctx->ep[idx], &shmem_transport_ofi_avfd->fid, 0); - OFI_CHECK_RETURN_STR(ret, "fi_ep_bind AV to endpoint failed"); + ret = fi_ep_bind(ctx->ep[idx], /*&shmem_transport_ofi_avfd->fid*/ &ctx->av[idx]->fid, 0); /* Currently failing */ + //OFI_CHECK_RETURN_STR(ret, "fi_ep_bind AV to endpoint failed"); + OFI_CHECK_RETURN_MSG(ret, "fi_ep_bind AV to endpoint failed(%s)\n", fi_strerror(errno)); ret = fi_enable(ctx->ep[idx]); OFI_CHECK_RETURN_STR(ret, "fi_enable on endpoint failed"); @@ -1265,11 +1266,12 @@ int publish_av_info(struct fabric_info *info) return ret; } +char * alladdrs = NULL; static inline int populate_av(void) { int i, ret, err = 0; - char *alladdrs = NULL; + //char *alladdrs = NULL; alladdrs = 
malloc(shmem_internal_num_pes * shmem_transport_ofi_addrlen); if (alladdrs == NULL) { @@ -1296,7 +1298,7 @@ int populate_av(void) return ret; } - free(alladdrs); + //free(alladdrs); return 0; } @@ -1781,6 +1783,9 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) //info->p_info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; ctx->id = id; + ctx->fabric = (struct fid_fabric **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_fabric *)); + ctx->domain = (struct fid_domain **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_domain *)); + ctx->av = (struct fid_av **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_av *)); ctx->ep = (struct fid_ep **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_ep *)); ctx->put_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); ctx->get_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); @@ -1794,41 +1799,65 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) ctx->cq = (struct fid_cq **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cq *)); for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #ifdef USE_CTX_LOCK - ctx->pending_put_cntr[idx] = 0; - ctx->pending_get_cntr[idx] = 0; + ctx->pending_put_cntr[idx] = 0; + ctx->pending_get_cntr[idx] = 0; #else - shmem_internal_cntr_write(&ctx->pending_put_cntr[idx], 0); - shmem_internal_cntr_write(&ctx->pending_get_cntr[idx], 0); + shmem_internal_cntr_write(&ctx->pending_put_cntr[idx], 0); + shmem_internal_cntr_write(&ctx->pending_get_cntr[idx], 0); #endif - /* FIX */ - //shmem_transport_ofi_eps[idx]->info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? 
FI_SHARED_CONTEXT : 0; - //shmem_transport_ofi_eps[idx]->info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; - //shmem_transport_ofi_eps[idx]->info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; - //shmem_transport_ofi_eps[idx]->info->mode = 0; - //shmem_transport_ofi_eps[idx]->info->tx_attr->mode = 0; - //shmem_transport_ofi_eps[idx]->info->rx_attr->mode = 0; - //shmem_transport_ofi_eps[idx]->info->tx_attr->caps = info->p_info->caps; - //shmem_transport_ofi_eps[idx]->info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; + provider_list[idx]->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? FI_SHARED_CONTEXT : 0; + provider_list[idx]->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; + provider_list[idx]->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + provider_list[idx]->mode = 0; + provider_list[idx]->tx_attr->mode = 0; + provider_list[idx]->rx_attr->mode = 0; + provider_list[idx]->tx_attr->caps = provider_list[idx]->caps; + provider_list[idx]->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; #ifdef USE_CTX_LOCK SHMEM_MUTEX_INIT(ctx->lock); #endif + ret = fi_fabric(provider_list[idx]->fabric_attr, &ctx->fabric[idx], NULL); + OFI_CHECK_RETURN_STR(ret, "fabric initialization failed"); - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_put_attr, + ret = fi_domain(/*shmem_transport_ofi_fabfd*/ ctx->fabric[idx], provider_list[idx], + &ctx->domain[idx], NULL); + OFI_CHECK_RETURN_STR(ret, "domain initialization failed"); + + struct fi_av_attr av_attr = {0}; +#ifdef USE_AV_MAP + av_attr.type = FI_AV_MAP; +#else + av_attr.type = FI_AV_TABLE; +#endif + ret = fi_av_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], + &av_attr, + /*&shmem_transport_ofi_avfd*/ &ctx->av[idx], + NULL); + OFI_CHECK_RETURN_STR(ret, "AV creation failed"); + + ret = fi_av_insert(/*shmem_transport_ofi_avfd*/ ctx->av[idx], + alladdrs, + shmem_internal_num_pes, + addr_table, + 0, + NULL); + + ret = fi_cntr_open(/*shmem_transport_ofi_domainfd*/ 
ctx->domain[idx], &cntr_put_attr, &ctx->put_cntr[idx], NULL); OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_get_attr, + ret = fi_cntr_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cntr_get_attr, &ctx->get_cntr[idx], NULL); OFI_CHECK_RETURN_MSG(ret, "get_cntr creation failed (%s)\n", fi_strerror(errno)); - ret = fi_cq_open(shmem_transport_ofi_domainfd, &cq_attr, &ctx->cq[idx], NULL); + ret = fi_cq_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cq_attr, &ctx->cq[idx], NULL); if (ret && errno == FI_EMFILE) { DEBUG_STR("Context creation failed because of open files limit, consider increasing with 'ulimit' command"); } OFI_CHECK_RETURN_MSG(ret, "cq_open failed (%s)\n", fi_strerror(errno)); - ret = fi_endpoint(shmem_transport_ofi_domainfd, - info->p_info, &ctx->ep[idx], NULL); + ret = fi_endpoint(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], + /*info->p_info*/ provider_list[idx], &ctx->ep[idx], NULL); OFI_CHECK_RETURN_MSG(ret, "ep creation failed (%s)\n", fi_strerror(errno)); } diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 39545b535..2edb1d577 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -317,6 +317,9 @@ struct shmem_transport_ctx_t { shmem_internal_mutex_t lock; #endif long options; + struct fid_fabric** fabric; + struct fid_domain** domain; + struct fid_av** av; struct fid_ep** ep; struct fid_cntr** put_cntr; struct fid_cntr** get_cntr; @@ -508,9 +511,9 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) fail = 0; for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { - success = fi_cntr_read(ctx->put_cntr[idx]); /* FIX */ - fail = fi_cntr_readerr(ctx->put_cntr[idx]); /* FIX */ - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + success = fi_cntr_read(ctx->put_cntr[idx]); /* FIXED? */ + fail = fi_cntr_readerr(ctx->put_cntr[idx]); /* FIXED? 
*/ + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ } shmem_transport_probe(); @@ -527,11 +530,11 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) poll_count++; } for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr[idx], cnt, -1); /* FIX */ - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIX */ + ssize_t ret = fi_cntr_wait(ctx->put_cntr[idx], cnt, -1); /* FIXED? */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); @@ -574,8 +577,7 @@ int shmem_transport_fence(shmem_transport_ctx_t* ctx) * to reclaim resources and indicate that the operation should be retried. If * retry limit (ofi_max_poll) is exceeded, abort. */ static inline -int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { - +int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled, size_t nic_idx) { if (ret) { if (ret == -FI_EAGAIN) { if (ctx->bounce_buffers) { @@ -586,9 +588,9 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { else { /* Poke CQ for errors to encourage progress */ struct fi_cq_err_entry e = {0}; - ssize_t ret = fi_cq_readerr(ctx->cq, (void *)&e, 0); + ssize_t ret = fi_cq_readerr(ctx->cq[nic_idx], (void *)&e, 0); /* FIXED? */ if (ret == 1) { - const char *errmsg = fi_cq_strerror(ctx->cq, e.prov_errno, + const char *errmsg = fi_cq_strerror(ctx->cq[nic_idx], e.prov_errno, /* FIXED? 
*/ e.err_data, NULL, 0); RAISE_ERROR_MSG("Error in operation: %s\n", errmsg); } else if (ret && ret != -FI_EAGAIN) { @@ -619,7 +621,7 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { static inline void shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const - void *source, size_t len, int pe) + void *source, size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -632,24 +634,24 @@ void shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const shmem_internal_assert(len <= shmem_transport_ofi_max_buffered_send); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_write(ctx->ep[1], /* FIX */ + ret = fi_inject_write(ctx->ep[nic_idx], /* FIXED? */ source, len, GET_DEST(dst), (uint64_t) addr, key); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, const void *source, - size_t len, int pe) + size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -671,16 +673,15 @@ void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, con (size_t) (((uint8_t *) source) + len - frag_source)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - - ret = fi_write(ctx->ep[1], + ret = fi_write(ctx->ep[nic_idx], frag_source, frag_len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), GET_DEST(dst), frag_target, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ frag_source += frag_len; frag_target += frag_len; @@ -690,7 +691,7 @@ void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, con static inline void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, long *completion) + int pe, long *completion, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -702,12 +703,12 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void if (len <= shmem_transport_ofi_max_buffered_send) { - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } else if (len <= shmem_transport_ofi_bounce_buffer_size && ctx->bounce_buffers) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ shmem_transport_ofi_get_mr(target, pe, &addr, &key); shmem_transport_ofi_bounce_buffer_t *buff = @@ -727,19 +728,19 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void .data = 0 }; do { - ret = fi_writemsg(ctx->ep[1], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIX */ - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } else { - shmem_transport_ofi_put_large(ctx, target, source,len, pe); + shmem_transport_ofi_put_large(ctx, target, source,len, pe, nic_idx); (*completion)++; } } static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -776,8 +777,8 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_writemsg(ctx->ep[1], &msg, FI_DELIVERY_COMPLETE | FI_INJECT); /* FIX */ - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_DELIVERY_COMPLETE | FI_INJECT); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } else { @@ -821,11 +822,11 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co msg.rma_iov = &rma_iov; msg.context = frag_source; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_writemsg(ctx->ep[1], &msg, FI_DELIVERY_COMPLETE); /* FIX */ - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ frag_source += frag_len; frag_target += frag_len; @@ -875,14 +876,14 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co do { ret = fi_atomicmsg(ctx->ep[1], &msg_signal, flags_signal); /* FIX */ - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, 1)); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } /* compatibility with Portals transport */ static inline -void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { +void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { shmem_internal_assert((*completion) >= 0); @@ -894,21 +895,21 @@ void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { if (len <= shmem_transport_ofi_max_buffered_send) { - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } else { - shmem_transport_ofi_put_large(ctx, target, source, len, pe); + shmem_transport_ofi_put_large(ctx, target, source, len, pe, nic_idx); } } static inline -void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -921,9 +922,9 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (len <= shmem_transport_ofi_max_msg_size) { - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_read(ctx->ep[1], /* FIX */ + ret = fi_read(ctx->ep[nic_idx], /* FIXED? 
*/ target, len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), @@ -931,7 +932,7 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s (uint64_t) addr, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ } else { uint8_t *frag_target = (uint8_t *) target; @@ -943,15 +944,15 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s (size_t) (((uint8_t *) target) + len - frag_target)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_read(ctx->ep[1], + ret = fi_read(ctx->ep[nic_idx], frag_target, frag_len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), GET_DEST(dst), frag_source, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ frag_source += frag_len; frag_target += frag_len; @@ -1056,14 +1057,14 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const 1, FI_INJECT); /* FI_DELIVERY_COMPLETE is not required as it is implied for fetch atomicmsgs */ - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, 1)); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - const void *operand, size_t len, int pe, int datatype) + const void *operand, size_t len, int pe, int datatype, size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE @@ -1084,10 +1085,10 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX 
*/ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomic(ctx->ep[1], /* FIX */ + ret = fi_compare_atomic(ctx->ep[nic_idx], /* FIXED? */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1101,7 +1102,7 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void SHMEM_TRANSPORT_DTYPE(datatype), FI_CSWAP, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); #endif } @@ -1109,7 +1110,7 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - const void *mask, size_t len, int pe, int datatype) + const void *mask, size_t len, int pe, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1123,10 +1124,10 @@ void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomic(ctx->ep[1], /* FIX */ + ret = fi_compare_atomic(ctx->ep[nic_idx], /* FIXED? */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1140,14 +1141,14 @@ void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void SHMEM_TRANSPORT_DTYPE(datatype), FI_MSWAP, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, int op, int datatype) + int pe, int op, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1160,10 +1161,10 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_atomic(ctx->ep[1], /* FIX */ + ret = fi_inject_atomic(ctx->ep[nic_idx], /* FIXED? */ source, 1, GET_DEST(dst), @@ -1171,7 +1172,7 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void key, SHMEM_TRANSPORT_DTYPE(datatype), op); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -1179,7 +1180,7 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void static inline void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t full_len, int pe, int op, int datatype, - long *completion) + long *completion, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1193,7 +1194,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi shmem_internal_assert(SHMEM_Dtsize[dt] * len == full_len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - ret = fi_atomicvalid(ctx->ep[1], dt, op, /* FIX */ + ret = fi_atomicvalid(ctx->ep[nic_idx], dt, op, /* FIXED? 
*/ &max_atomic_size); max_atomic_size = max_atomic_size * SHMEM_Dtsize[dt]; if (max_atomic_size > shmem_transport_ofi_max_msg_size @@ -1210,10 +1211,10 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_atomic(ctx->ep[1], /* FIX */ + ret = fi_inject_atomic(ctx->ep[nic_idx], /* FIXED? */ source, len, GET_DEST(dst), @@ -1221,7 +1222,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi key, dt, op); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ } else if (full_len <= MIN(shmem_transport_ofi_bounce_buffer_size, max_atomic_size) && @@ -1231,7 +1232,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi create_bounce_buffer(ctx, source, full_len); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ const struct fi_ioc msg_iov = { .addr = buff->data, .count = len }; const struct fi_rma_ioc rma_iov = { .addr = (uint64_t) addr, .count = len, .key = key }; @@ -1248,8 +1249,8 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi .data = 0 }; do { - ret = fi_atomicmsg(ctx->ep[1], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIX */ - } while (try_again(ctx, ret, &polled)); + ret = fi_atomicmsg(ctx->ep[nic_idx], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ } else { size_t sent = 0; @@ -1259,9 +1260,9 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi size_t chunksize = MIN((len-sent), (max_atomic_size/SHMEM_Dtsize[dt])); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_atomic(ctx->ep[1], /* FIX */ + ret = fi_atomic(ctx->ep[nic_idx], /* FIXED? */ (void *)((char *)source + (sent*SHMEM_Dtsize[dt])), chunksize, @@ -1273,7 +1274,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi dt, op, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ sent += chunksize; } @@ -1288,7 +1289,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int op, int datatype) + size_t len, int pe, int op, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1317,17 +1318,17 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_fetch_atomicmsg(ctx->ep[1], /* FIX */ + ret = fi_fetch_atomicmsg(ctx->ep[nic_idx], /* FIXED? */ &msg, &resultv, GET_MR_DESC_ADDR(shmem_transport_ofi_get_mr_desc_index(dest)), 1, FI_INJECT); /* FI_DELIVERY_COMPLETE is not required as it's implied for fetch atomicmsgs */ - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -1335,14 +1336,15 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int op, int datatype) + size_t len, int pe, int op, int datatype, + size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE * That is why non-blocking API is used which uses FI_INJECT. FI_ATOMIC_READ is * also not supported currently */ shmem_transport_fetch_atomic_nbi(ctx, target, source, - dest, len, pe, op, datatype); + dest, len, pe, op, datatype, nic_idx); #else int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1356,10 +1358,10 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_fetch_atomic(ctx->ep[1], /* FIX */ + ret = fi_fetch_atomic(ctx->ep[nic_idx], /* FIXED */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1371,7 +1373,7 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, SHMEM_TRANSPORT_DTYPE(datatype), op, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); #endif } @@ -1380,37 +1382,38 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int datatype) + size_t len, int pe, int datatype, + size_t nic_idx) { shmem_transport_fetch_atomic(ctx, target, source, dest, len, pe, - FI_ATOMIC_WRITE, datatype); + FI_ATOMIC_WRITE, datatype, nic_idx); } static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_transport_fetch_atomic_nbi(ctx, target, source, dest, len, pe, - FI_ATOMIC_WRITE, datatype); + FI_ATOMIC_WRITE, datatype, nic_idx); } static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, - int datatype) + int datatype, size_t nic_idx) { shmem_transport_atomic(ctx, target, source, len, pe, FI_ATOMIC_WRITE, - datatype); + datatype, nic_idx); } static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, - int datatype) + int datatype, size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE @@ -1418,10 +1421,10 @@ void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, * also not supported currently */ long long dummy = 0; shmem_transport_fetch_atomic_nbi(ctx, (void *) source, (const void *) &dummy, - target, len, pe, FI_SUM, datatype); + target, len, pe, FI_SUM, datatype, nic_idx); #else - shmem_transport_fetch_atomic_nbi(ctx, (void *) source, (const void *) NULL, - target, len, pe, FI_ATOMIC_READ, datatype); + shmem_transport_fetch_atomic(ctx, (void *) source, (const void *) NULL, + target, len, pe, FI_ATOMIC_READ, datatype, nic_idx); #endif } @@ -1457,14 
+1460,15 @@ int shmem_transport_atomic_supported(shm_internal_op_t op, static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *source, size_t len, int pe, - long *completion) + long *completion, size_t nic_idx) { RAISE_ERROR_STR("OFI transport does not currently support CT operations"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, - const void *source, size_t len, int pe) + const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("OFI transport does not currently support CT operations"); } diff --git a/src/transport_portals4.h b/src/transport_portals4.h index b31f1fb47..b578e900d 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -368,7 +368,7 @@ shmem_transport_portals4_drain_eq(void) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { int ret; ptl_process_t peer; @@ -571,7 +571,7 @@ shmem_transport_portals4_put_nbi_internal(shmem_transport_ctx_t* ctx, void *targ static inline void -shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_put_nbi_internal(ctx, target, source, len, pe, @@ -588,7 +588,7 @@ shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, long *completion) + int pe, long *completion, size_t nic_idx) { if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING @@ -603,7 +603,7 @@ 
shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou shmem_transport_portals4_heap_pt); #endif } else { - shmem_transport_put_nbi(ctx, target, source, len, pe); + shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); } } @@ -611,7 +611,7 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *source, - size_t len, int pe, long *completion) + size_t len, int pe, long *completion, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_put_nb_internal((shmem_transport_ctx_t *)SHMEM_CTX_DEFAULT, target, source, len, pe, @@ -624,7 +624,7 @@ shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *so static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { while (*completion > 0) { @@ -668,7 +668,7 @@ shmem_transport_portals4_get_internal(shmem_transport_ctx_t* ctx, void *target, static inline -void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_get_internal(ctx, target, source, len, pe, @@ -683,7 +683,8 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, - const void *source, size_t len, int pe) + const void *source, size_t len, int pe, + size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_get_internal((shmem_transport_ctx_t *)SHMEM_CTX_DEFAULT, target, source, len, pe, ct->shr_pt, -1); @@ -718,7 +719,7 @@ 
shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, ptl_datatype_t datatype) + int pe, ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -758,7 +759,7 @@ static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, ptl_datatype_t datatype) + int pe, ptl_datatype_t datatype, size_t nic_idx) { /* transport_swap already buffers the source argument */ shmem_transport_swap(ctx, target, source, dest, len, pe, datatype); @@ -769,7 +770,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -821,7 +822,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -860,7 +861,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, ptl_op_t op, ptl_datatype_t datatype) + int pe, ptl_op_t op, ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_pt_index_t pt; @@ -1020,7 +1021,7 @@ static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, int pe, ptl_op_t op, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_pt_index_t pt; @@ -1060,7 +1061,7 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t 
len, int pe, ptl_op_t op, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { /* transport_fetch_atomic already buffers the source argument */ shmem_transport_fetch_atomic(ctx, target, source, dest, len, pe, op, datatype); @@ -1070,22 +1071,22 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_internal_assert(len <= shmem_transport_portals4_max_atomic_size); - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_internal_assert(len <= shmem_transport_portals4_max_fetch_atomic_size); - shmem_transport_get(ctx, target, source, len, pe); + shmem_transport_get(ctx, target, source, len, pe, nic_idx); } @@ -1102,16 +1103,16 @@ int shmem_transport_atomic_supported(ptl_op_t op, ptl_datatype_t datatype) static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { /* FIXME: Need to optimize non-blocking put with signal for Portals. 
Current implementation below keeps * * the "fence" in between data and signal put */ - shmem_transport_put_nbi(ctx, target, source, len, pe); + shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); shmem_transport_fence(ctx); if (sig_op == SHMEM_SIGNAL_ADD) - shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else - shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } static inline diff --git a/src/transport_ucx.h b/src/transport_ucx.h index c74165007..5f906d21d 100644 --- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -230,7 +230,7 @@ shmem_transport_fence(shmem_transport_ctx_t* ctx) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { ucs_status_t status; ucp_rkey_h rkey; @@ -275,7 +275,7 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { while (__atomic_load_n(completion, __ATOMIC_ACQUIRE) > 0) shmem_transport_probe(); @@ -284,7 +284,7 @@ shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { ucs_status_t status; ucp_rkey_h rkey; @@ -298,7 +298,7 @@ shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void 
-shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { ucs_status_ptr_t pstatus; ucp_rkey_h rkey; @@ -324,7 +324,7 @@ shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -361,7 +361,7 @@ shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *sourc static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -402,7 +402,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -484,7 +484,7 @@ shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void * static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -530,7 +530,7 @@ shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t 
op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -570,7 +570,7 @@ shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -613,7 +613,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -632,7 +632,7 @@ shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -675,7 +675,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -718,18 +718,18 @@ void shmem_transport_syncmem(void) static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { - shmem_transport_put_nbi(ctx, target, source, len, pe); 
+ shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); shmem_transport_fence(ctx); switch (sig_op) { case SHMEM_SIGNAL_ADD: shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); break; case SHMEM_SIGNAL_SET: shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_UINT64, nic_idx); break; default: RAISE_ERROR_MSG("Unsupported operation (%d)\n", sig_op); @@ -772,14 +772,15 @@ void shmem_transport_ct_wait(shmem_transport_ct_t *ct, long wait_for) static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void - *source, size_t len, int pe, long *completion) + *source, size_t len, int pe, long *completion, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void - *target, const void *source, size_t len, int pe) + *target, const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } From cf7376233f9dc24132e2b619d4b79a9a73848565 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Fri, 17 May 2024 14:16:00 -0700 Subject: [PATCH 05/11] src: Continue passing NIC index where needed --- src/collectives.c | 233 +++++++++++++++++++++------------------- src/collectives_c.c4 | 142 +++++++++++++++++------- src/init.c | 4 +- src/lock_c.c | 12 ++- src/shmem_collectives.h | 120 ++++++++++++--------- src/shmem_lock.h | 34 +++--- src/shmem_team.c | 32 +++--- src/shmem_team.h | 7 +- src/symmetric_heap_c.c | 30 ++++-- src/teams_c.c4 | 10 +- 10 files changed, 371 insertions(+), 253 deletions(-) diff --git a/src/collectives.c b/src/collectives.c index ee51f869e..7a277ebba 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -244,7 +244,8 @@ shmem_internal_collectives_init(void) * *****************************************/ void -shmem_internal_sync_linear(int 
PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; @@ -259,27 +260,27 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down psync tree */ for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } } else { /* send message to root */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), PE_start, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -287,7 +288,8 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync void -shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; int parent, num_children, *children; @@ -318,13 +320,13 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], 
SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } else { @@ -332,20 +334,20 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send ack to parent */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack from parent */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children + 1); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } @@ -354,21 +356,22 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send message up psync tree */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), parent, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } void -shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int one = 1, neg_one = -1; int distance, to, i; @@ -389,7 +392,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync to = PE_start + (to * PE_stride); shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + to, 
SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* There's a path where the next update from a peer can get @@ -399,7 +402,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); } /* Ensure local pSync decrements are done before a subsequent barrier */ @@ -415,7 +418,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; int real_root = PE_start + PE_root * PE_stride; @@ -432,16 +435,16 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* send data to all peers */ for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion); + shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion, nic_idx); } - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion ack to all peers */ for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe, nic_idx); } if (1 == complete) { @@ -450,7 +453,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear 
pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -460,13 +463,13 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); if (1 == complete) { /* send ack back to root */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } } @@ -475,7 +478,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -510,23 +513,23 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } /* send data to all leaves */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, send_buf, len, children[i], - &completion); + &completion, nic_idx); } - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion ack to all peers */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - children[i]); + children[i], nic_idx); } if (1 == complete) { @@ -539,7 +542,7 @@ shmem_internal_bcast_tree(void 
*target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -549,12 +552,12 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } @@ -569,7 +572,8 @@ void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; @@ -586,22 +590,22 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, will flush any atomic cache value that may currently exist. 
*/ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_my_pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size - 1); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -609,22 +613,22 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, SHMEM_WAIT(pSync, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* send data, ack, and wait for completion */ shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, source, count * type_size, - PE_start, op, datatype, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + PE_start, op, datatype, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ 
shmem_internal_bcast(target, target, count * type_size, 0, - PE_start, PE_stride, PE_size, pSync + 2, 0); + PE_start, PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -635,7 +639,8 @@ void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int group_rank = (shmem_internal_my_pe - PE_start) / PE_stride; long zero = 0, one = 1; @@ -650,7 +655,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) - shmem_internal_copy_self(target, source, count * type_size); + shmem_internal_copy_self(target, source, count * type_size, nic_idx); return; } @@ -662,11 +667,11 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (NULL == tmp) RAISE_ERROR_MSG("Unable to allocate %zub temporary buffer\n", count*type_size); - shmem_internal_copy_self(tmp, target, count * type_size); + shmem_internal_copy_self(tmp, target, count * type_size, nic_idx); free_source = 1; source = tmp; - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2, nic_idx); } /* Perform reduce-scatter: @@ -700,10 +705,10 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si i == 0 ? 
((uint8_t *) source) + chunk_out_disp : ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i+1); @@ -714,7 +719,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si } /* Reset reduce-scatter pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Perform all-gather: @@ -733,17 +738,17 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + chunk_out_disp, ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync+1, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_GE, i+1); } /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_EQ, 0); if (free_source) @@ -755,7 +760,8 @@ void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, 
shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -766,7 +772,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } return; } @@ -791,20 +797,20 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si will flush any atomic cache value that may currently exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + shmem_internal_my_pe, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ for (i = 0 ; i < num_children ; ++i) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i]); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i], nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -813,24 +819,24 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si SHMEM_WAIT(pSync + 1, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync + 1, SHMEM_CMP_EQ, 0); /* send data, ack, and wait for completion */ 
shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, (num_children == 0) ? source : target, count * type_size, parent, - op, datatype, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + op, datatype, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ shmem_internal_bcast(target, target, count * type_size, 0, PE_start, - PE_stride, PE_size, pSync + 2, 0); + PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -838,7 +844,8 @@ void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int log2_proc = 1, pow2_proc = 2; @@ -851,7 +858,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } free(current_target); return; @@ -896,17 +903,17 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, - &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer); 
+ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); } else { if (my_id < PE_size - pow2_proc) { int peer = (my_id + pow2_proc) * PE_stride + PE_start; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_reduce_local(op, datatype, count, target, current_target); @@ -922,25 +929,25 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (shmem_internal_my_pe < peer) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_target_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + wrk_size, peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } else { SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + wrk_size, peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); } @@ -954,11 +961,11 @@ 
shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun int peer = (my_id + pow2_proc) * PE_stride + PE_start; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, - peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } memcpy(target, current_target, wrk_size); @@ -978,7 +985,8 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun *****************************************/ void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { size_t my_offset; long tmp[2]; @@ -991,7 +999,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, target, source, len, PE_start, PE_stride, PE_size, (void*) pSync); if (PE_size == 1) { - if (target != source) shmem_internal_copy_self(target, source, len); + if (target != source) shmem_internal_copy_self(target, source, len, nic_idx); return; } @@ -1000,7 +1008,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, my_offset = 0; tmp[0] = (long) len; /* FIXME: Potential truncation of size_t into long */ tmp[1] = 1; /* FIXME: Packing flag with data relies on byte ordering */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride, nic_idx); } else { /* wait for send data */ @@ -1012,7 +1020,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, tmp[0] = (long) (my_offset + len); tmp[1] = 1; 
shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), - shmem_internal_my_pe + PE_stride); + shmem_internal_my_pe + PE_stride, nic_idx); } } @@ -1024,13 +1032,13 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, do { if (len > 0) { shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + my_offset, source, - len, peer); + len, peer, nic_idx); } peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2]); + shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2], nic_idx); pSync[0] = SHMEM_SYNC_VALUE; pSync[1] = SHMEM_SYNC_VALUE; @@ -1047,7 +1055,8 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, *****************************************/ void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long tmp = 1; long completion = 0; @@ -1057,36 +1066,36 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, if (PE_start == shmem_internal_my_pe) { /* Copy data into the target */ - if (source != target) shmem_internal_copy_self(target, source, len); + if (source != target) shmem_internal_copy_self(target, source, len, nic_idx); /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for N updates */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size); /* Clear pSync */ tmp = 0; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { /* Push data into the target */ 
size_t offset = ((shmem_internal_my_pe - PE_start) / PE_stride) * len; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + offset, source, len, PE_start, - &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); /* ensure ordering */ shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } shmem_internal_bcast(target, target, len * PE_size, 0, PE_start, PE_stride, - PE_size, pSync + 1, 0); + PE_size, pSync + 1, 0, nic_idx); } @@ -1099,7 +1108,8 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int i; /* my_id is the index in a theoretical 0...N-1 array of @@ -1115,7 +1125,7 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, if (len == 0) return; /* copy my portion to the right place */ - shmem_internal_copy_self((char*) target + (my_id * len), source, len); + shmem_internal_copy_self((char*) target + (my_id * len), source, len, nic_idx); /* send n - 1 messages to the next highest proc. 
Each message contains what we received the previous step (including our own @@ -1125,8 +1135,8 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, /* send data to me + 1 */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + iter_offset, (char*) target + iter_offset, - len, next_proc, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + len, next_proc, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion for this round to next proc. Note that we @@ -1134,14 +1144,14 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, between successive calls to the put above. So a rolling counter is safe here. */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for completion for this round */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i); } /* zero out psync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -1155,7 +1165,8 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int i; @@ -1179,7 +1190,7 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* copy my portion to the right place */ curr_offset = my_id * len; - shmem_internal_copy_self((char*) target + curr_offset, source, len); + shmem_internal_copy_self((char*) target + curr_offset, 
source, len, nic_idx); for (i = 0, distance = 0x1 ; distance < PE_size ; i++, distance <<= 1) { int peer = my_id ^ distance; @@ -1187,19 +1198,19 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* send data to peer */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + curr_offset, (char*) target + curr_offset, - distance * len, real_peer, &completion); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); + distance * len, real_peer, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* mark completion for this round */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); if (my_id > peer) { curr_offset -= (distance * len); @@ -1212,7 +1223,8 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_ptr = (uint8_t *) dest + my_as_rank * len; @@ -1232,12 +1244,12 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, int peer_as_rank = (peer - PE_start) / PE_stride; /* Peer's index in active set */ shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source + peer_as_rank * len, - len, peer); + len, peer, nic_idx); peer = 
shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; @@ -1247,7 +1259,8 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_base = (uint8_t *) dest + my_as_rank * nelems * dst * elem_size; @@ -1279,7 +1292,7 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, for (i = nelems ; i > 0; i--) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source_ptr, - elem_size, peer); + elem_size, peer, nic_idx); source_ptr += sst * elem_size; dest_ptr += dst * elem_size; @@ -1288,7 +1301,7 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; diff --git a/src/collectives_c.c4 b/src/collectives_c.c4 index 70c8876b5..62c9c7ce5 100644 --- a/src/collectives_c.c4 +++ b/src/collectives_c.c4 @@ -158,7 +158,9 @@ shmem_barrier_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); } @@ -169,7 +171,9 @@ shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); 
SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -178,7 +182,9 @@ shmem_sync_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_sync_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync_all(nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -188,7 +194,9 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } /* Team-based Collective Routines */ @@ -199,9 +207,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_TEAM_VALID(team); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, SYNC); - shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync); + long *psync = shmem_internal_team_choose_psync(myteam, SYNC, nic_idx); + shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, SYNC); return 0; } @@ -228,9 +238,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE)*nreduce, \ sizeof(TYPE)*nreduce, 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_op_to_all(target, source, nreduce, sizeof(TYPE), \ PE_start, 1 << logPE_stride, PE_size, \ - pWrk, pSync, IOP, ITYPE); \ + pWrk, pSync, IOP, ITYPE, nic_idx); \ } #define 
SHMEM_DEF_REDUCE(STYPE,TYPE,ITYPE,SOP,IOP) \ @@ -247,11 +259,14 @@ shmem_team_sync(shmem_team_t team) sizeof(TYPE)*nreduce, 1, 1); \ TYPE *pWrk = NULL; \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, REDUCE); \ + long *psync = shmem_internal_team_choose_psync(myteam, REDUCE, \ + nic_idx); \ shmem_internal_op_to_all(dest, source, nreduce, sizeof(TYPE), \ myteam->start, myteam->stride, myteam->size, pWrk, \ - psync, IOP, ITYPE); \ + psync, IOP, ITYPE, nic_idx); \ shmem_internal_team_release_psyncs(myteam, REDUCE); \ return 0; \ } @@ -292,9 +307,11 @@ shmem_broadcast32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 4, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } @@ -311,9 +328,11 @@ shmem_broadcast64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 8, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -327,15 +346,17 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); + long *psync = shmem_internal_team_choose_psync(myteam, BCAST, 
nic_idx); shmem_internal_bcast(dest, source, nelems, PE_root, myteam->start, myteam->stride, myteam->size, - psync, 1); + psync, 1, nic_idx); shmem_internal_team_release_psyncs(myteam, BCAST); int team_root = myteam->start + PE_root * myteam->stride; if (shmem_internal_my_pe == team_root && dest != source) - shmem_internal_copy_self(dest, source, nelems); + shmem_internal_copy_self(dest, source, nelems, nic_idx); return 0; } @@ -353,16 +374,19 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); \ + long *psync = shmem_internal_team_choose_psync(myteam, BCAST, \ + nic_idx); \ shmem_internal_bcast(dest, source, nelems * sizeof(TYPE), \ PE_root, myteam->start, myteam->stride, \ - myteam->size, psync, 1); \ + myteam->size, psync, 1, nic_idx); \ shmem_internal_team_release_psyncs(myteam, BCAST); \ int team_root = myteam->start + PE_root * myteam->stride; \ if (shmem_internal_my_pe == team_root && dest != source) { \ shmem_internal_copy_self(dest, source, \ - nelems * sizeof(TYPE)); \ + nelems * sizeof(TYPE), nic_idx); \ } \ return 0; \ } @@ -380,8 +404,10 @@ shmem_collect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -396,8 +422,10 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, 
source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_COLLECT(STYPE,TYPE) \ @@ -412,12 +440,15 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_collect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -434,10 +465,12 @@ shmem_collectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_collect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -453,8 +486,10 @@ shmem_fcollect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + 
PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -469,8 +504,10 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_FCOLLECT(STYPE,TYPE) \ @@ -485,12 +522,15 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_fcollect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -507,10 +547,12 @@ shmem_fcollectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_fcollect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -526,8 +568,10 @@ shmem_alltoall32(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, 
sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 4, nelems * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -542,8 +586,10 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 8, nelems * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALL(STYPE,TYPE) \ @@ -558,12 +604,15 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - ALLTOALL); \ + ALLTOALL, \ + nic_idx); \ shmem_internal_alltoall(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -580,10 +629,12 @@ shmem_alltoallmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoall(dest, source, nelems, 
myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } @@ -602,8 +653,10 @@ shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 4 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 4, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -620,8 +673,10 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 8 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 8, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALLS(STYPE,TYPE) \ @@ -635,11 +690,14 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); \ + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, \ + nic_idx); \ shmem_internal_alltoalls(dest, source, dst, sst, sizeof(TYPE), \ nelems, myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -655,11 +713,13 @@ shmem_alltoallsmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(dest, 
nelems); SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 1, nelems, myteam->start, myteam->stride, myteam->size, - psync); + psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } diff --git a/src/init.c b/src/init.c index 01ca23dfd..b7480ebe2 100644 --- a/src/init.c +++ b/src/init.c @@ -143,7 +143,9 @@ shmem_internal_shutdown(void) return; } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_finalized = 1; diff --git a/src/lock_c.c b/src/lock_c.c index 269dd0ed2..7008dd8f8 100644 --- a/src/lock_c.c +++ b/src/lock_c.c @@ -44,7 +44,9 @@ shmem_clear_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_clear_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_clear_lock(lockp, nic_idx); } @@ -54,7 +56,9 @@ shmem_set_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_set_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_set_lock(lockp, nic_idx); } @@ -64,5 +68,7 @@ shmem_test_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - return shmem_internal_test_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + return shmem_internal_test_lock(lockp, nic_idx); } diff --git a/src/shmem_collectives.h b/src/shmem_collectives.h index 6409c5178..acfae7b41 100644 --- a/src/shmem_collectives.h +++ b/src/shmem_collectives.h @@ -40,13 +40,13 @@ extern coll_type_t 
shmem_internal_reduce_type; extern coll_type_t shmem_internal_collect_type; extern coll_type_t shmem_internal_fcollect_type; -void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync); +void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); static inline void -shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { if (shmem_internal_params.BARRIERS_FLUSH) { fflush(stdout); @@ -58,19 +58,19 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) switch (shmem_internal_barrier_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { - shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); } else { - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); } break; case LINEAR: - shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case TREE: - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case DISSEM: - shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal barrier/sync type (%d)\n", @@ 
-85,60 +85,64 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) static inline void -shmem_internal_sync_all(void) +shmem_internal_sync_all(size_t nic_idx) { - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync, nic_idx); } static inline void -shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync, nic_idx); } static inline void -shmem_internal_barrier_all(void) +shmem_internal_barrier_all(size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync, nic_idx); } void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); static inline void shmem_internal_bcast(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { switch (shmem_internal_bcast_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } else { shmem_internal_bcast_tree(target, 
source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } break; case LINEAR: shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; case TREE: shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; default: RAISE_ERROR_MSG("Illegal broadcast type (%d)\n", @@ -150,20 +154,24 @@ shmem_internal_bcast(void *target, const void *source, size_t len, void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); static inline void @@ -171,7 +179,7 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, size_t type_size, int PE_start, 
int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(type_size > 0); @@ -181,21 +189,21 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } } else { if (count * type_size < shmem_internal_params.COLL_SIZE_CROSSOVER) shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); else shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; @@ -203,33 +211,33 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RING: shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; case TREE: if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, 
op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RECDBL: shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; default: RAISE_ERROR_MSG("Illegal reduction type (%d)\n", @@ -239,21 +247,23 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_collect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_collect_type) { case AUTO: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal collect type (%d)\n", @@ -263,37 +273,41 @@ shmem_internal_collect(void *target, const void *source, size_t len, void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int 
PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_fcollect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_fcollect_type) { case AUTO: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_fcollect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RING: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RECDBL: if (0 == (PE_size & (PE_size - 1))) { shmem_internal_fcollect_recdbl(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } else { shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } break; default: @@ -304,9 +318,11 @@ shmem_internal_fcollect(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); #endif diff --git a/src/shmem_lock.h b/src/shmem_lock.h index 158cafc84..ca78e5410 100644 --- a/src/shmem_lock.h +++ b/src/shmem_lock.h @@ -37,7 +37,7 @@ typedef struct lock_t lock_t; static inline void -shmem_internal_clear_lock(long *lockp) +shmem_internal_clear_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, cond, 
zero = 0, sig = SIGNAL_MASK; @@ -47,8 +47,8 @@ shmem_internal_clear_lock(long *lockp) /* release the lock if I'm the last to try to obtain it */ cond = shmem_internal_my_pe + 1; shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &zero, &curr, &cond, - sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? /* if local PE was not the last to hold the lock, look for the next in line */ if (curr != shmem_internal_my_pe + 1) { @@ -58,8 +58,8 @@ shmem_internal_clear_lock(long *lockp) for (;;) { shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), sizeof(int), shmem_internal_my_pe, - SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? if (NEXT(cur_data) != 0) break; @@ -69,21 +69,21 @@ shmem_internal_clear_lock(long *lockp) /* set the signal bit on new lock holder */ shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &sig, &curr, - &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, 0);// Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, nic_idx);// Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
} } static inline void -shmem_internal_set_lock(long *lockp) +shmem_internal_set_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, zero = 0, me = shmem_internal_my_pe + 1; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* update last with my value to add me to the queue */ @@ -96,16 +96,16 @@ shmem_internal_set_lock(long *lockp) int next_mask = NEXT_MASK; shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &me, &curr, - &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? /* now wait for the signal part of data to be non-zero */ for (;;) { int cur_data; shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), - sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
if (SIGNAL(cur_data) != 0) break; @@ -122,20 +122,20 @@ shmem_internal_set_lock(long *lockp) static inline int -shmem_internal_test_lock(long *lockp) +shmem_internal_test_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, me = shmem_internal_my_pe + 1, zero = 0; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* add self to last if and only if the lock is zero (ie, no one has the lock) */ shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, &zero, - sizeof(int), 0, SHM_INTERNAL_INT, 0); // Multiplex across NICs? - shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); // Multiplex across NICs? + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
if (0 == curr) { shmem_internal_membar_acquire(); diff --git a/src/shmem_team.c b/src/shmem_team.c index ed54fd239..833a03a17 100644 --- a/src/shmem_team.c +++ b/src/shmem_team.c @@ -289,7 +289,7 @@ int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t **new_team) + shmem_internal_team_t **new_team, size_t nic_idx) { *new_team = SHMEM_TEAM_INVALID; @@ -320,7 +320,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int my_pe = shmem_internal_pe_in_active_set(shmem_internal_my_pe, global_PE_start, PE_stride, PE_size); - long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_team_t *myteam = NULL; *team_ret_val = 0; *team_ret_val_reduced = 0; @@ -366,7 +366,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE shmem_internal_op_to_all(psync_pool_avail_reduced, psync_pool_avail, N_PSYNC_BYTES, 1, myteam->start, PE_stride, PE_size, NULL, - psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR); + psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR, nic_idx); /* We cannot release the psync here, because this reduction may not * have been performed on the entire parent team. */ @@ -406,18 +406,18 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE /* This barrier on the parent team eliminates problematic race conditions * during psync allocation between back-to-back team creations. 
*/ - psync = shmem_internal_team_choose_psync(parent_team, SYNC); + psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync); + shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); - /* This MAX reduction assures all PEs return the same value. */ - psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + /* This MAX reduction assures all PEs return the same value. */ + psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_op_to_all(team_ret_val_reduced, team_ret_val, 1, sizeof(int), parent_team->start, parent_team->stride, parent_team->size, NULL, - psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT); + psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT, nic_idx); shmem_internal_team_release_psyncs(parent_team, REDUCE); @@ -433,7 +433,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, const shmem_team_config_t *yaxis_config, - long yaxis_mask, shmem_internal_team_t **yaxis_team) + long yaxis_mask, shmem_internal_team_t **yaxis_team, size_t nic_idx) { *xaxis_team = SHMEM_TEAM_INVALID; *yaxis_team = SHMEM_TEAM_INVALID; @@ -460,7 +460,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int xsize = (i == num_xteams - 1 && parent_size % xrange) ?
parent_size % xrange : xrange; ret = shmem_internal_team_split_strided(parent_team, start, parent_stride, - xsize, xaxis_config, xaxis_mask, &my_xteam); + xsize, xaxis_config, xaxis_mask, &my_xteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of x-axis team %d of %d failed\n", i+1, num_xteams); } @@ -481,7 +482,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int ysize = (remainder && i < remainder) ? yrange + 1 : yrange; ret = shmem_internal_team_split_strided(parent_team, start, xrange*parent_stride, - ysize, yaxis_config, yaxis_mask, &my_yteam); + ysize, yaxis_config, yaxis_mask, &my_yteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of y-axis team %d of %d failed\n", i+1, num_yteams); } @@ -493,9 +495,9 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, } } - long *psync = shmem_internal_team_choose_psync(parent_team, SYNC); + long *psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_start, parent_stride, parent_size, psync); + shmem_internal_barrier(parent_start, parent_stride, parent_size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); @@ -535,7 +537,7 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) /* Returns a psync from the given team that can be safely used for the * specified collective operation. 
*/ -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op) +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx) { switch (op) { @@ -556,7 +558,7 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter size_t psync = team->psync_idx * SHMEM_SYNC_SIZE; shmem_internal_sync(team->start, team->stride, team->size, - &shmem_internal_psync_barrier_pool[psync]); + &shmem_internal_psync_barrier_pool[psync], nic_idx); for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; diff --git a/src/shmem_team.h b/src/shmem_team.h index 195730864..bf006c8b6 100644 --- a/src/shmem_team.h +++ b/src/shmem_team.h @@ -58,11 +58,12 @@ int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t **new_team); + shmem_internal_team_t **new_team, size_t nic_idx); int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, - const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team); + const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team, + size_t nic_idx); int shmem_internal_team_destroy(shmem_internal_team_t *team); @@ -70,7 +71,7 @@ int shmem_internal_team_create_ctx(shmem_internal_team_t *team, long options, sh int shmem_internal_ctx_get_team(shmem_ctx_t ctx, shmem_internal_team_t **team); -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op); +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx); void 
shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op); diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index 30b319ea9..176f4d01b 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -295,7 +295,9 @@ shmem_malloc(size_t size) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -313,7 +315,9 @@ shmem_calloc(size_t count, size_t size) ret = dlcalloc(count, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -326,7 +330,9 @@ shmem_free(void *ptr) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_free(ptr); } @@ -344,7 +350,9 @@ shmem_realloc(void *ptr, size_t size) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); if (size == 0 && ptr != NULL) { @@ -355,7 +363,7 @@ shmem_realloc(void *ptr, size_t size) } SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -376,7 +384,9 @@ shmem_align(size_t alignment, size_t size) ret = dlmemalign(alignment, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -430,9 +440,11 @@ shmem_malloc_with_hints(size_t size, long hints) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - if (!(hints & 
SHMEMX_MALLOC_NO_BARRIER)) - shmem_internal_barrier_all(); - + if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) { + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); + } return ret; } diff --git a/src/teams_c.c4 b/src/teams_c.c4 index c86065f73..1c0e3aa7f 100644 --- a/src/teams_c.c4 +++ b/src/teams_c.c4 @@ -115,9 +115,12 @@ shmem_team_split_strided(shmem_team_t parent_team, int PE_start, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_strided((shmem_internal_team_t *)parent_team, PE_start, PE_stride, PE_size, config, - config_mask, (shmem_internal_team_t **)new_team); + config_mask, (shmem_internal_team_t **)new_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -128,11 +131,14 @@ shmem_team_split_2d(shmem_team_t parent_team, int xrange, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_2d((shmem_internal_team_t *)parent_team, xrange, xaxis_config, xaxis_mask, (shmem_internal_team_t **)xaxis_team, yaxis_config, yaxis_mask, - (shmem_internal_team_t **)yaxis_team); + (shmem_internal_team_t **)yaxis_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES From 168131ba22a7814858b45750c3b19d11a92e0a2d Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Mon, 3 Jun 2024 10:10:28 -0700 Subject: [PATCH 06/11] src: More fixes --- src/atomic_nbi_c.c4 | 5 ++++- src/shmem_comm.h | 4 ++-- src/shmem_synchronization.h | 9 ++++++++- src/transport_none.h | 2 +- src/transport_ofi.c | 40 +++++++++++++++++++++++++++---------- src/transport_ofi.h | 38 +++++++++++++++++------------------ src/transport_portals4.h | 2 +- src/transport_ucx.h | 2 +- 8 files changed, 66 insertions(+), 36 deletions(-) diff --git a/src/atomic_nbi_c.c4 b/src/atomic_nbi_c.c4 index fdd687e7e..924b2b2b2 100644 --- a/src/atomic_nbi_c.c4 +++ b/src/atomic_nbi_c.c4 @@ -141,8 +141,11 @@ 
SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap_nbi(ctx, target, &value, fetch, &cond, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } diff --git a/src/shmem_comm.h b/src/shmem_comm.h index 32185ac78..506cd5a28 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -206,7 +206,7 @@ static inline void shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -214,7 +214,7 @@ shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, shmem_shr_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); } else { shmem_transport_cswap_nbi((shmem_transport_ctx_t *)ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, datatype, nic_idx); } } diff --git a/src/shmem_synchronization.h b/src/shmem_synchronization.h index 0270d6d7b..407cc6314 100644 --- a/src/shmem_synchronization.h +++ b/src/shmem_synchronization.h @@ -108,9 +108,16 @@ shmem_internal_fence(shmem_ctx_t ctx) #define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ do { \ - int cmpret; \ + /* Adding volatile attribute resolves + hanging behavior observed in put/get perf + tests, though put perf test will still hang + frequently, and specifically after 4096 byte msg + size test*/ \ + volatile int cmpret; \ \ + /*shmem_transport_probe();*/ \ COMP(cond, SYNC_LOAD(var), value, cmpret); \ + /*shmem_transport_probe();*/ \ while (!cmpret) { \ shmem_transport_probe(); \ SPINLOCK_BODY(); \ diff --git a/src/transport_none.h b/src/transport_none.h index f85d121c9..6d4a9c547 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -192,7 +192,7 @@ static 
inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 690c55054..256ab18e7 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -1266,12 +1266,11 @@ int publish_av_info(struct fabric_info *info) return ret; } -char * alladdrs = NULL; static inline int populate_av(void) { int i, ret, err = 0; - //char *alladdrs = NULL; + char *alladdrs = NULL; alladdrs = malloc(shmem_internal_num_pes * shmem_transport_ofi_addrlen); if (alladdrs == NULL) { @@ -1298,7 +1297,20 @@ int populate_av(void) return ret; } - //free(alladdrs); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + ret = fi_av_insert(shmem_transport_ctx_default.av[idx], + alladdrs, + shmem_internal_num_pes, + addr_table, + 0, + NULL); + if (ret != shmem_internal_num_pes) { + RAISE_WARN_STR("av insert failed"); + return ret; + } + } + + free(alladdrs); return 0; } @@ -1835,13 +1847,6 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) NULL); OFI_CHECK_RETURN_STR(ret, "AV creation failed"); - ret = fi_av_insert(/*shmem_transport_ofi_avfd*/ ctx->av[idx], - alladdrs, - shmem_internal_num_pes, - addr_table, - 0, - NULL); - ret = fi_cntr_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cntr_put_attr, &ctx->put_cntr[idx], NULL); OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); @@ -2247,6 +2252,21 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) ret = fi_close(&ctx->cq[idx]->fid); OFI_CHECK_ERROR_MSG(ret, "Context CQ close failed (%s)\n", fi_strerror(errno)); } + + if (ctx->av && ctx->av[idx]) { + ret = fi_close(&ctx->av[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context AV close failed (%s)\n", fi_strerror(errno)); + } + + if 
(ctx->domain && ctx->domain[idx]) { + ret = fi_close(&ctx->domain[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context domain close failed (%s)\n", fi_strerror(errno)); + } + + if (ctx->fabric && ctx->fabric[idx]) { + ret = fi_close(&ctx->fabric[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context fabric close failed (%s)\n", fi_strerror(errno)); + } } #ifdef USE_CTX_LOCK diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 2edb1d577..9319fa4ee 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -89,9 +89,9 @@ extern pthread_mutex_t shmem_transport_ofi_progress_lock; do { \ if ((err) == -FI_EAVAIL) { \ struct fi_cq_err_entry e = {0}; \ - ssize_t ret = fi_cq_readerr((ctx)->cq, (void *)&e, 0); \ + ssize_t ret = fi_cq_readerr((ctx)->cq, (void *)&e, 0); /* FIX */ \ if (ret == 1) { \ - const char *errmsg = fi_cq_strerror((ctx)->cq, e.prov_errno, \ + const char *errmsg = fi_cq_strerror((ctx)->cq /* FIX */, e.prov_errno, \ e.err_data, NULL, 0); \ RAISE_ERROR_MSG("Error in operation: %s\n", errmsg); \ } else { \ @@ -422,7 +422,7 @@ void shmem_transport_ofi_drain_cq(shmem_transport_ctx_t *ctx) struct fi_cq_entry buf; for (;;) { - ret = fi_cq_read(ctx->cq, (void *)&buf, 1); + ret = fi_cq_read(ctx->cq, (void *)&buf, 1); /* FIX */ if (ret == -FI_EAGAIN) break; /* No events */ @@ -850,7 +850,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co int atomic_op = (sig_op == SHMEM_SIGNAL_ADD) ? FI_SUM : FI_ATOMIC_WRITE; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? 
*/ const struct fi_ioc msg_iov_signal = { .addr = (uint8_t *) &signal, @@ -875,8 +875,8 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_atomicmsg(ctx->ep[1], &msg_signal, flags_signal); /* FIX */ - } while (try_again(ctx, ret, &polled, 1)); /* FIX */ + ret = fi_atomicmsg(ctx->ep[nic_idx], &msg_signal, flags_signal); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -963,7 +963,7 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s static inline -void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) +void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t nic_idx) { /* wait for get counter to meet outstanding count value */ @@ -980,9 +980,9 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) while (poll_count < shmem_transport_ofi_get_poll_limit || shmem_transport_ofi_get_poll_limit < 0) { - success = fi_cntr_read(ctx->get_cntr[idx]); - fail = fi_cntr_readerr(ctx->get_cntr[idx]); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); + success = fi_cntr_read(ctx->get_cntr[nic_idx]); + fail = fi_cntr_readerr(ctx->get_cntr[nic_idx]); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); shmem_transport_probe(); @@ -998,11 +998,11 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->get_cntr[idx], cnt, -1); - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[idx]); + ssize_t ret = fi_cntr_wait(ctx->get_cntr[nic_idx], cnt, -1); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == 
cnt_new); @@ -1014,7 +1014,7 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) static inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, - size_t len, int pe, int datatype) + size_t len, int pe, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1044,10 +1044,10 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[1]); /* FIX */ + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomicmsg(ctx->ep[1], /* FIX */ + ret = fi_compare_atomicmsg(ctx->ep[nic_idx], /* FIXED? */ &msg, &comparev, NULL, @@ -1057,7 +1057,7 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const 1, FI_INJECT); /* FI_DELIVERY_COMPLETE is not required as it is implied for fetch atomicmsgs */ - } while (try_again(ctx, ret, &polled, 1)); /* FIX */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -1071,7 +1071,7 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void * That is why non-blocking API is used which uses FI_INJECT. FI_ATOMIC_READ is * also not supported currently */ shmem_transport_cswap_nbi(ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, datatype, nic_idx); #else int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1447,7 +1447,7 @@ int shmem_transport_atomic_supported(shm_internal_op_t op, * actually required by FI_THREAD_COMPLETION. 
*/ SHMEM_TRANSPORT_OFI_CTX_LOCK(&shmem_transport_ctx_default); - int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[1], /* FIX */ + int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[0], /* FIX */ SHMEM_TRANSPORT_DTYPE(datatype), op, &size); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(&shmem_transport_ctx_default); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index b578e900d..9d7383a9d 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -811,7 +811,7 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { /* transport_cswap already buffers the source and operand arguments */ shmem_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); diff --git a/src/transport_ucx.h b/src/transport_ucx.h index 5f906d21d..a561682a6 100644 --- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -442,7 +442,7 @@ static inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; From 4e08aef0be1a1fabda1b5217e50d7703c6aa4e2a Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 5 Jun 2024 09:43:16 -0700 Subject: [PATCH 07/11] src: Fix shmem_transport_put_quiet --- src/shmem_internal.h | 4 ---- src/shmem_synchronization.h | 11 +++-------- src/transport_ofi.h | 28 +++++++++++++--------------- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/src/shmem_internal.h b/src/shmem_internal.h index 377a6814e..83c2ff477 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -186,10 +186,6 @@ extern hwloc_topology_t shmem_internal_topology; } \ } while(0) -/* TODO: Add definition if not using OFI or if multiplexing disabled. 
- * Would just return 0, or just do nothing since nic_idx will already - * be initialized to 0. - */ #ifdef USE_OFI #define SHMEM_GET_TRANSMIT_NIC_IDX(idx) \ do { \ diff --git a/src/shmem_synchronization.h b/src/shmem_synchronization.h index 407cc6314..590d79787 100644 --- a/src/shmem_synchronization.h +++ b/src/shmem_synchronization.h @@ -108,16 +108,11 @@ shmem_internal_fence(shmem_ctx_t ctx) #define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ do { \ - /* Adding volatile attribute resolves - hanging behavior observed in put/get perf - tests, though put perf test will still hang - frequently, and specifically after 4096 byte msg - size test*/ \ - volatile int cmpret; \ + int cmpret; \ \ - /*shmem_transport_probe();*/ \ + /*shmem_transport_probe();*/ \ COMP(cond, SYNC_LOAD(var), value, cmpret); \ - /*shmem_transport_probe();*/ \ + /*shmem_transport_probe();*/ \ while (!cmpret) { \ shmem_transport_probe(); \ SPINLOCK_BODY(); \ diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 9319fa4ee..1a1607b51 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -480,7 +480,7 @@ shmem_transport_ofi_bounce_buffer_t * create_bounce_buffer(shmem_transport_ctx_t } static inline -void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) +void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t idx) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); @@ -510,11 +510,11 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) success = 0; fail = 0; - for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + //for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { success = fi_cntr_read(ctx->put_cntr[idx]); /* FIXED? */ fail = fi_cntr_readerr(ctx->put_cntr[idx]); /* FIXED? */ cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? 
*/ - } + //} shmem_transport_probe(); if (success < cnt && fail == 0) { @@ -529,7 +529,7 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) } poll_count++; } - for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + //for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ do { cnt = cnt_new; @@ -538,7 +538,7 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); - } + //} SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -546,9 +546,8 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) static inline int shmem_transport_quiet(shmem_transport_ctx_t* ctx) { - - shmem_transport_put_quiet(ctx); for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_put_quiet(ctx, idx); shmem_transport_get_wait(ctx, idx); } @@ -559,13 +558,13 @@ int shmem_transport_quiet(shmem_transport_ctx_t* ctx) static inline int shmem_transport_fence(shmem_transport_ctx_t* ctx) { + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #if WANT_TOTAL_DATA_ORDERING == 0 - /* Communication is unordered; must wait for puts and buffered (injected) - * non-fetching atomics to be completed in order to ensure ordering. */ - shmem_transport_put_quiet(ctx); + /* Communication is unordered; must wait for puts and buffered (injected) + * non-fetching atomics to be completed in order to ensure ordering. 
*/ + shmem_transport_put_quiet(ctx, idx); #endif - /* Complete fetching ops; needed to support nonblocking fetch-atomics */ - for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + /* Complete fetching ops; needed to support nonblocking fetch-atomics */ shmem_transport_get_wait(ctx, idx); } @@ -733,7 +732,7 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } else { - shmem_transport_ofi_put_large(ctx, target, source,len, pe, nic_idx); + shmem_transport_ofi_put_large(ctx, target, source, len, pe, nic_idx); (*completion)++; } } @@ -886,9 +885,8 @@ static inline void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { shmem_internal_assert((*completion) >= 0); - if((*completion) > 0) { - shmem_transport_put_quiet(ctx); + shmem_transport_put_quiet(ctx, nic_idx); (*completion)--; } } From f7e20dc6af7354744d4d71ee98256f23969bff70 Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 11 Jun 2024 14:25:19 -0700 Subject: [PATCH 08/11] src: Experiment with changes to shmem_transport_probe --- src/shmem_comm.h | 10 ++++++---- src/transport_ofi.c | 4 ++-- src/transport_ofi.h | 14 ++++++++++---- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/shmem_comm.h b/src/shmem_comm.h index 506cd5a28..02034ed6d 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -251,11 +251,12 @@ shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t the CXI provider */ unsigned long long tmp_fetch = 0; shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, &tmp_fetch, len, pe, op, datatype); - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + source, &tmp_fetch, len, pe, op, datatype, nic_idx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, nic_idx); #else shmem_transport_atomic((shmem_transport_ctx_t *)ctx, target, source, len, pe, op, datatype, nic_idx); +#endif } } @@ -291,11 +292,12 @@ 
shmem_internal_atomic_set(shmem_ctx_t ctx, void *target, const void *source, siz the CXI provider */ unsigned long long tmp_fetch = 0; shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, &tmp_fetch, len, pe, FI_ATOMIC_WRITE, datatype); - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + source, &tmp_fetch, len, pe, FI_ATOMIC_WRITE, datatype, nic_idx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, nic_idx); #else shmem_transport_atomic_set((shmem_transport_ctx_t *)ctx, target, source, len, pe, datatype, nic_idx); +#endif } } diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 256ab18e7..7a4451eb5 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -2137,8 +2137,8 @@ int shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, ctxp->stx_idx = malloc(shmem_transport_ofi_num_nics * sizeof(int)); for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #ifndef USE_CTX_LOCK - shmem_internal_cntr_write(&ctxp->pending_put_cntr, 0); - shmem_internal_cntr_write(&ctxp->pending_get_cntr, 0); + shmem_internal_cntr_write(&ctxp->pending_put_cntr[idx], 0); + shmem_internal_cntr_write(&ctxp->pending_get_cntr[idx], 0); #else ctxp->pending_put_cntr[idx] = 0; ctxp->pending_get_cntr[idx] = 0; diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 1a1607b51..90517dd95 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -389,10 +389,16 @@ void shmem_transport_probe(void) # ifdef USE_THREAD_COMPLETION if (0 == pthread_mutex_trylock(&shmem_transport_ofi_progress_lock)) { # endif - struct fi_cq_entry buf; - int ret = fi_cq_read(shmem_transport_ofi_target_cq, &buf, 1); - if (ret == 1) - RAISE_WARN_STR("Unexpected event"); +// struct fi_cq_entry buf; +// int ret = fi_cq_read(shmem_transport_ofi_target_cq, &buf, 1); +// if (ret == 1) +// RAISE_WARN_STR("Unexpected event"); + for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { + struct fi_cq_entry buf; + int ret = 
fi_cq_read(shmem_transport_ctx_default.cq[i], &buf, 1); + if (ret == 1) + RAISE_WARN_STR("Unexpected event"); + } # ifdef USE_THREAD_COMPLETION pthread_mutex_unlock(&shmem_transport_ofi_progress_lock); } From 8134b9733bf2f74e7f12f5701ab2d0d37e5a4c8d Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Mon, 24 Jun 2024 11:55:25 -0700 Subject: [PATCH 09/11] src: Work on support with bounce buffers --- src/transport_ofi.h | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 90517dd95..5f37b1672 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -422,14 +422,13 @@ static inline void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t i /* Drain all available events from the CQ. Note, ctx->bounce_buffers must be * locked before calling this routine */ static inline -void shmem_transport_ofi_drain_cq(shmem_transport_ctx_t *ctx) +void shmem_transport_ofi_drain_cq(shmem_transport_ctx_t *ctx, size_t nic_idx) { ssize_t ret = 0; struct fi_cq_entry buf; for (;;) { - ret = fi_cq_read(ctx->cq, (void *)&buf, 1); /* FIX */ - + ret = fi_cq_read(ctx->cq[nic_idx], (void *)&buf, 1); /* FIX */ if (ret == -FI_EAGAIN) break; /* No events */ else if (ret == 1) { @@ -467,7 +466,9 @@ shmem_transport_ofi_bounce_buffer_t * create_bounce_buffer(shmem_transport_ctx_t shmem_internal_assert(shmem_transport_ofi_max_bounce_buffers > 0); while (ctx->bounce_buffers->nalloc >= (uint64_t) shmem_transport_ofi_max_bounce_buffers) { - shmem_transport_ofi_drain_cq(ctx); + for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { + shmem_transport_ofi_drain_cq(ctx, i); + } } buff = (shmem_transport_ofi_bounce_buffer_t*) shmem_free_list_alloc(ctx->bounce_buffers); @@ -486,7 +487,7 @@ shmem_transport_ofi_bounce_buffer_t * create_bounce_buffer(shmem_transport_ctx_t } static inline -void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t idx) +void 
shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); @@ -495,7 +496,7 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t idx) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); while (ctx->bounce_buffers->nalloc > 0) { - shmem_transport_ofi_drain_cq(ctx); + shmem_transport_ofi_drain_cq(ctx, nic_idx); } SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); @@ -513,14 +514,10 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t idx) long poll_count = 0; while (poll_count < shmem_transport_ofi_put_poll_limit || shmem_transport_ofi_put_poll_limit < 0) { - success = 0; - fail = 0; - - //for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { - success = fi_cntr_read(ctx->put_cntr[idx]); /* FIXED? */ - fail = fi_cntr_readerr(ctx->put_cntr[idx]); /* FIXED? */ - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ - //} + + success = fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ + fail = fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ shmem_transport_probe(); if (success < cnt && fail == 0) { @@ -535,16 +532,14 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t idx) } poll_count++; } - //for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ - do { - cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr[idx], cnt, -1); /* FIXED? */ - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[idx]); /* FIXED? */ - OFI_CTX_CHECK_ERROR(ctx, ret); - } while (cnt < cnt_new); - shmem_internal_assert(cnt == cnt_new); - //} + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ + do { + cnt = cnt_new; + ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? 
*/ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ + OFI_CTX_CHECK_ERROR(ctx, ret); + } while (cnt < cnt_new); + shmem_internal_assert(cnt == cnt_new); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -587,7 +582,7 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled, size_ if (ret == -FI_EAGAIN) { if (ctx->bounce_buffers) { SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); - shmem_transport_ofi_drain_cq(ctx); + shmem_transport_ofi_drain_cq(ctx, nic_idx); SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); } else { From 3b01d15c28b5c1e2c534f6d219bb3f96e1d6405c Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Tue, 25 Jun 2024 07:28:02 -0700 Subject: [PATCH 10/11] src: Make shmem_transport_probe take a NIC index --- src/collectives.c | 26 +++++++------- src/data_c.c4 | 14 ++++---- src/shmem_comm.h | 8 ++--- src/shmem_synchronization.h | 59 ++++++++++++++++--------------- src/symmetric_heap_c.c | 40 +++++++++++---------- src/synchronization_c.c4 | 38 ++++++++++---------- src/transport_none.h | 2 +- src/transport_ofi.h | 69 +++++++++++++++++++++++-------------- src/transport_portals4.h | 4 +-- src/transport_ucx.c | 2 +- src/transport_ucx.h | 18 +++++----- 11 files changed, 152 insertions(+), 128 deletions(-) diff --git a/src/collectives.c b/src/collectives.c index 7a277ebba..07f9c311c 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -437,7 +437,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, if (pe == shmem_internal_my_pe) continue; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion, nic_idx); } - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); @@ -522,7 +522,7 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, send_buf, len, children[i], &completion, nic_idx); } - 
shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); @@ -591,7 +591,7 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, shmem_internal_my_pe, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ @@ -619,7 +619,7 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, /* send data, ack, and wait for completion */ shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, source, count * type_size, PE_start, op, datatype, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), @@ -798,7 +798,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, shmem_internal_my_pe, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ @@ -827,7 +827,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si (num_children == 0) ? 
source : target, count * type_size, parent, op, datatype, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), @@ -904,7 +904,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer, nic_idx); @@ -934,7 +934,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, sizeof(long), peer, nic_idx); @@ -944,7 +944,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, sizeof(long), peer, nic_idx); @@ -962,7 +962,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, 
&completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer, nic_idx); @@ -1084,7 +1084,7 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, size_t offset = ((shmem_internal_my_pe - PE_start) / PE_stride) * len; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + offset, source, len, PE_start, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); /* ensure ordering */ shmem_internal_fence(SHMEM_CTX_DEFAULT); @@ -1136,7 +1136,7 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, /* send data to me + 1 */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + iter_offset, (char*) target + iter_offset, len, next_proc, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* send completion for this round to next proc. 
Note that we @@ -1199,7 +1199,7 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* send data to peer */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + curr_offset, (char*) target + curr_offset, distance * len, real_peer, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* mark completion for this round */ diff --git a/src/data_c.c4 b/src/data_c.c4 index 45a934445..d0f08a9e2 100644 --- a/src/data_c.c4 +++ b/src/data_c.c4 @@ -349,7 +349,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ &completion, nic_idx); \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ } @@ -372,7 +372,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems,\ pe, &completion, nic_idx); \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ } @@ -555,7 +555,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') target += tst; \ source += sst; \ } \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ } #define SHMEM_DEF_IPUT_N(NAME,SIZE) \ @@ -614,7 +614,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ } #define SHMEM_DEF_IGET(STYPE,TYPE) \ @@ -757,7 +757,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ &completion, nic_idx); \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ 
shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ @@ -791,7 +791,7 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems, \ pe, &completion, nic_idx); \ - shmem_internal_put_wait(ctx, &completion, nic_idx); \ + shmem_internal_put_wait(ctx, &completion); \ shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ @@ -1029,7 +1029,7 @@ void SHMEM_FUNCTION_ATTRIBUTES shmemx_putmem_ct(shmemx_ct_t ct, void *target, co size_t nic_idx = 0; SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_put_ct_nb(ct, target, source, nelems, pe, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); } diff --git a/src/shmem_comm.h b/src/shmem_comm.h index 02034ed6d..54ea6ce20 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -48,9 +48,9 @@ shmem_internal_put_nb(shmem_ctx_t ctx, void *target, const void *source, size_t static inline void -shmem_internal_put_wait(shmem_ctx_t ctx, long *completion, size_t nic_idx) +shmem_internal_put_wait(shmem_ctx_t ctx, long *completion) { - shmem_transport_put_wait((shmem_transport_ctx_t *)ctx, completion, nic_idx); + shmem_transport_put_wait((shmem_transport_ctx_t *)ctx, completion); /* on-node is always blocking, so this is a no-op for them */ } @@ -69,7 +69,7 @@ shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, siz #else long completion = 0; shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, &completion, nic_idx); - shmem_internal_put_wait(ctx, &completion, nic_idx); + shmem_internal_put_wait(ctx, &completion); #endif } } @@ -414,7 +414,7 @@ void shmem_internal_copy_self(void *dest, const void *source, size_t nelems, siz long completion = 1; 
shmem_internal_put_nb(SHMEM_CTX_DEFAULT, dest, source, nelems, shmem_internal_my_pe, &completion, nic_idx); - shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion, nic_idx); + shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); #else memcpy(dest, source, nelems); #endif diff --git a/src/shmem_synchronization.h b/src/shmem_synchronization.h index 590d79787..4e15a9266 100644 --- a/src/shmem_synchronization.h +++ b/src/shmem_synchronization.h @@ -99,37 +99,42 @@ shmem_internal_fence(shmem_ctx_t ctx) #define SHMEM_TEST(type, a, b, ret) COMP(type, SYNC_LOAD(a), b, ret) -#define SHMEM_WAIT_POLL(var, value) \ - do { \ - while (SYNC_LOAD(var) == value) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); } \ +#define SHMEM_WAIT_POLL(var, value) \ + do { \ + while (SYNC_LOAD(var) == value) { \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ + shmem_transport_probe(nic_idx); \ + SPINLOCK_BODY(); \ + } \ + } \ } while(0) -#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ - do { \ - int cmpret; \ - \ - /*shmem_transport_probe();*/ \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - /*shmem_transport_probe();*/ \ - while (!cmpret) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - } \ +#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ + do { \ + int cmpret; \ + \ + COMP(cond, SYNC_LOAD(var), value, cmpret); \ + while (!cmpret) { \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ + shmem_transport_probe(nic_idx); \ + SPINLOCK_BODY(); \ + } \ + COMP(cond, SYNC_LOAD(var), value, cmpret); \ + } \ } while(0) -#define SHMEM_SIGNAL_WAIT_UNTIL_POLL(var, cond, value, sat_value) \ - do { \ - int cmpret; \ - \ - COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ - while (!cmpret) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); \ - COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value);\ - } \ +#define SHMEM_SIGNAL_WAIT_UNTIL_POLL(var, cond, 
value, sat_value) \ + do { \ + int cmpret; \ + \ + COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ + while (!cmpret) { \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ + shmem_transport_probe(nic_idx); \ + SPINLOCK_BODY(); \ + } \ + COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ + } \ } while(0) #define SHMEM_WAIT_BLOCK(var, value) \ diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index 176f4d01b..16d8df706 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -295,9 +295,9 @@ shmem_malloc(size_t size) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } return ret; } @@ -315,9 +315,9 @@ shmem_calloc(size_t count, size_t size) ret = dlcalloc(count, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } return ret; } @@ -330,9 +330,9 @@ shmem_free(void *ptr) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } shmem_internal_free(ptr); } @@ -350,9 +350,9 @@ shmem_realloc(void *ptr, size_t size) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); if (size == 0 && 
ptr != NULL) { @@ -363,7 +363,9 @@ shmem_realloc(void *ptr, size_t size) } SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } return ret; } @@ -384,9 +386,9 @@ shmem_align(size_t alignment, size_t size) ret = dlmemalign(alignment, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } return ret; } @@ -441,9 +443,9 @@ shmem_malloc_with_hints(size_t size, long hints) SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) { - size_t nic_idx = 0; - SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); - shmem_internal_barrier_all(nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_internal_barrier_all(nic_idx); + } } return ret; } diff --git a/src/synchronization_c.c4 b/src/synchronization_c.c4 index 6e3eef0f2..8016ea9fd 100644 --- a/src/synchronization_c.c4 +++ b/src/synchronization_c.c4 @@ -232,7 +232,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return; \ } \ \ @@ -268,7 +268,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL') } \ \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); }; \ return; \ } \ \ @@ -304,7 +304,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < 
shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return SIZE_MAX; \ } \ \ @@ -324,7 +324,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL_VECTOR') } \ } \ } \ - if (!cmpret) shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ } \ \ shmem_internal_membar_acq_rel(); \ @@ -354,7 +354,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return SIZE_MAX; \ } \ \ @@ -374,7 +374,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY') } \ } \ } \ - if (!cmpret) shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ } \ \ shmem_internal_membar_acq_rel(); \ @@ -408,7 +408,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return 0; \ } \ \ @@ -423,7 +423,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY_VECTOR') } \ } \ } \ - if (!cmpret) shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ @@ -456,7 +456,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return 0; \ } \ \ @@ -471,7 +471,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME') } \ } \ } \ - if (!cmpret) 
shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ @@ -495,7 +495,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME_VECTOR') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ } \ return cmpret; \ } @@ -520,7 +520,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST') int cmpret; \ SHMEM_TEST(cond, &vars[i], value, cmpret); \ if (!cmpret) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return 0; \ } \ } \ @@ -551,7 +551,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ALL') int cmpret; \ SHMEM_TEST(cond, &vars[i], values[i], cmpret); \ if (!cmpret) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return 0; \ } \ } \ @@ -596,7 +596,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ALL_VECTOR') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ \ return found_idx; \ } @@ -635,7 +635,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ \ return found_idx; \ } @@ -666,7 +666,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { 
shmem_transport_probe( nic_idx ); } \ return 0; \ } \ \ @@ -680,7 +680,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY_VECTOR') } \ } \ } \ - if (!cmpret) shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ return ncompleted; \ @@ -712,7 +712,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_SOME') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - shmem_transport_probe(); \ + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ return 0; \ } \ \ @@ -726,7 +726,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_SOME') } \ } \ } \ - if (!cmpret) shmem_transport_probe(); \ + if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ return ncompleted; \ diff --git a/src/transport_none.h b/src/transport_none.h index 6d4a9c547..3411be4a7 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -59,7 +59,7 @@ shmem_transport_fini(void) static inline void -shmem_transport_probe(void) +shmem_transport_probe(size_t nic_idx) { return; } diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 5f37b1672..c7aae42f5 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -383,22 +383,24 @@ extern struct fid_ep* shmem_transport_ofi_target_ep; } while (0) static inline -void shmem_transport_probe(void) +void shmem_transport_probe(size_t nic_idx) { #if defined(ENABLE_MANUAL_PROGRESS) # ifdef USE_THREAD_COMPLETION if (0 == pthread_mutex_trylock(&shmem_transport_ofi_progress_lock)) { # endif -// struct fi_cq_entry buf; -// int ret = fi_cq_read(shmem_transport_ofi_target_cq, &buf, 1); -// if (ret == 1) -// RAISE_WARN_STR("Unexpected event"); - for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { - struct fi_cq_entry buf; 
- int ret = fi_cq_read(shmem_transport_ctx_default.cq[i], &buf, 1); - if (ret == 1) - RAISE_WARN_STR("Unexpected event"); - } + struct fi_cq_entry buf; + int ret = fi_cq_read(shmem_transport_ctx_default.cq[nic_idx], &buf, 1); + if (ret == 1) + RAISE_WARN_STR("Unexpected event A"); + if (ret < 0) + RAISE_WARN_STR("Unexpected event B"); +// for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { +// struct fi_cq_entry buf; +// int ret = fi_cq_read(shmem_transport_ctx_default.cq[i], &buf, 1); +// if (ret == 1) +// RAISE_WARN_STR("Unexpected event"); +// } # ifdef USE_THREAD_COMPLETION pthread_mutex_unlock(&shmem_transport_ofi_progress_lock); } @@ -496,7 +498,9 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); while (ctx->bounce_buffers->nalloc > 0) { - shmem_transport_ofi_drain_cq(ctx, nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_transport_ofi_drain_cq(ctx, nic_idx); + } } SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); @@ -515,10 +519,19 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) while (poll_count < shmem_transport_ofi_put_poll_limit || shmem_transport_ofi_put_poll_limit < 0) { +// success = 0; +// fail = 0; +// cnt = 0; +// for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { +// success += fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ +// fail += fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ +// cnt += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ +// } +// shmem_transport_probe(); success = fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ fail = fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? 
*/ - shmem_transport_probe(); + shmem_transport_probe(nic_idx); if (success < cnt && fail == 0) { SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -532,14 +545,17 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ - do { - cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? */ + + //for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ - OFI_CTX_CHECK_ERROR(ctx, ret); - } while (cnt < cnt_new); - shmem_internal_assert(cnt == cnt_new); + do { + cnt = cnt_new; + ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ + OFI_CTX_CHECK_ERROR(ctx, ret); + } while (cnt < cnt_new); + shmem_internal_assert(cnt == cnt_new); + //} SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -598,10 +614,9 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled, size_ } } - shmem_transport_probe(); - + shmem_transport_probe(nic_idx); + (*polled)++; - if ((*polled) <= shmem_transport_ofi_max_poll) { return 1; } @@ -883,11 +898,13 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co /* compatibility with Portals transport */ static inline -void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { +void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { shmem_internal_assert((*completion) >= 0); if((*completion) > 0) { - shmem_transport_put_quiet(ctx, nic_idx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_transport_put_quiet(ctx, nic_idx); + } (*completion)--; } } @@ -983,7 +1000,7 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t nic_idx) fail = 
fi_cntr_readerr(ctx->get_cntr[nic_idx]); cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); - shmem_transport_probe(); + shmem_transport_probe(nic_idx); if (success < cnt && fail == 0) { SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index 9d7383a9d..46f1aed84 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -244,7 +244,7 @@ int shmem_transport_fini(void); static inline void shmem_transport_get_wait(shmem_transport_ctx_t*, size_t idx); -static inline void shmem_transport_probe(void) { +static inline void shmem_transport_probe(size_t nic_idx) { return; } @@ -624,7 +624,7 @@ shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *so static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { while (*completion > 0) { diff --git a/src/transport_ucx.c b/src/transport_ucx.c index 5d2c9f265..f6cb9b833 100644 --- a/src/transport_ucx.c +++ b/src/transport_ucx.c @@ -72,7 +72,7 @@ static int shmem_transport_ucx_progress_thread_enabled = 1; static void * shmem_transport_ucx_progress_thread_func(void *arg) { while (__atomic_load_n(&shmem_transport_ucx_progress_thread_enabled, __ATOMIC_ACQUIRE)) { - shmem_transport_probe(); + shmem_transport_probe(0); usleep(shmem_internal_params.PROGRESS_INTERVAL); } diff --git a/src/transport_ucx.h b/src/transport_ucx.h index a561682a6..19ec9fe09 100644 --- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -83,7 +83,7 @@ int shmem_transport_fini(void); static inline void -shmem_transport_probe(void) +shmem_transport_probe(size_t nic_idx) { ucp_worker_progress(shmem_transport_ucp_worker); } @@ -93,14 +93,14 @@ ucs_status_t shmem_transport_ucx_complete_op(ucs_status_ptr_t req) { if (req == NULL) { /* All calls to complete_op must generate progress to avoid 
deadlock * in application-level polling loops */ - shmem_transport_probe(); + shmem_transport_probe(0); return UCS_OK; } else if (UCS_PTR_IS_ERR(req)) { return UCS_PTR_STATUS(req); } else { ucs_status_t status; do { - shmem_transport_probe(); + shmem_transport_probe(0); status = ucp_request_check_status(req); } while (status == UCS_INPROGRESS); ucp_request_free(req); @@ -275,10 +275,10 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { while (__atomic_load_n(completion, __ATOMIC_ACQUIRE) > 0) - shmem_transport_probe(); + shmem_transport_probe(0); } static inline @@ -392,7 +392,7 @@ shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *s &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(); + shmem_transport_probe(0); ucs_status_t status = shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -475,7 +475,7 @@ shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void * &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(); + shmem_transport_probe(0); ucs_status_t status = shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -604,7 +604,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(); + shmem_transport_probe(0); ucs_status_t status = shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -698,7 +698,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour if (*(uint32_t *)dest == v) done = 1; /* Manual 
progress to avoid deadlock for application-level polling */ - shmem_transport_probe(); + shmem_transport_probe(0); } } From aad2e23dbed0285667d144efcdae62783b83136e Mon Sep 17 00:00:00 2001 From: Philip Marshall Date: Wed, 26 Jun 2024 13:00:35 -0700 Subject: [PATCH 11/11] Undo changes to shmem_transport_probe --- src/shmem_synchronization.h | 42 +++++++++++++++------------------- src/symmetric_heap_c.c | 41 +++++++++++++++++---------------- src/synchronization_c.c4 | 38 +++++++++++++++---------------- src/transport_none.h | 2 +- src/transport_ofi.h | 45 +++++++++++-------------------------- src/transport_portals4.h | 2 +- src/transport_ucx.c | 2 +- src/transport_ucx.h | 16 ++++++------- 8 files changed, 81 insertions(+), 107 deletions(-) diff --git a/src/shmem_synchronization.h b/src/shmem_synchronization.h index 4e15a9266..f18fb4072 100644 --- a/src/shmem_synchronization.h +++ b/src/shmem_synchronization.h @@ -99,28 +99,24 @@ shmem_internal_fence(shmem_ctx_t ctx) #define SHMEM_TEST(type, a, b, ret) COMP(type, SYNC_LOAD(a), b, ret) -#define SHMEM_WAIT_POLL(var, value) \ - do { \ - while (SYNC_LOAD(var) == value) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ - shmem_transport_probe(nic_idx); \ - SPINLOCK_BODY(); \ - } \ - } \ +#define SHMEM_WAIT_POLL(var, value) \ + do { \ + while (SYNC_LOAD(var) == value) { \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ + } \ } while(0) -#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ - do { \ - int cmpret; \ - \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - while (!cmpret) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ - shmem_transport_probe(nic_idx); \ - SPINLOCK_BODY(); \ - } \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - } \ +#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ + do { \ + int cmpret; \ + \ + COMP(cond, SYNC_LOAD(var), value, cmpret); \ + while (!cmpret) { \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ + COMP(cond, 
SYNC_LOAD(var), value, cmpret); \ + } \ } while(0) #define SHMEM_SIGNAL_WAIT_UNTIL_POLL(var, cond, value, sat_value) \ @@ -129,10 +125,8 @@ shmem_internal_fence(shmem_ctx_t ctx) \ COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ while (!cmpret) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { \ - shmem_transport_probe(nic_idx); \ - SPINLOCK_BODY(); \ - } \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ } \ } while(0) diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index 16d8df706..4230c1e6b 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -295,9 +295,9 @@ shmem_malloc(size_t size) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -315,9 +315,9 @@ shmem_calloc(size_t count, size_t size) ret = dlcalloc(count, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -330,9 +330,9 @@ shmem_free(void *ptr) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_free(ptr); } @@ -350,9 +350,9 @@ shmem_realloc(void *ptr, size_t size) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); if (size == 0 && ptr != NULL) { @@ -363,9 +363,8 @@ shmem_realloc(void *ptr, size_t size) } SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -386,9 +385,9 @@ shmem_align(size_t alignment, size_t size) ret = dlmemalign(alignment, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -443,9 +442,9 @@ shmem_malloc_with_hints(size_t size, long hints) SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) { - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { - shmem_internal_barrier_all(nic_idx); - } + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); } return ret; } diff --git a/src/synchronization_c.c4 b/src/synchronization_c.c4 index 8016ea9fd..6e3eef0f2 100644 --- a/src/synchronization_c.c4 +++ b/src/synchronization_c.c4 @@ -232,7 +232,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return; \ } \ \ @@ -268,7 +268,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL') } \ \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); }; \ + shmem_transport_probe(); \ return; \ } \ \ @@ -304,7 +304,7 
@@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return SIZE_MAX; \ } \ \ @@ -324,7 +324,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ALL_VECTOR') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ } \ \ shmem_internal_membar_acq_rel(); \ @@ -354,7 +354,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return SIZE_MAX; \ } \ \ @@ -374,7 +374,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ } \ \ shmem_internal_membar_acq_rel(); \ @@ -408,7 +408,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ \ @@ -423,7 +423,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_ANY_VECTOR') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ @@ -456,7 +456,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { 
shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ \ @@ -471,7 +471,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ } \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ @@ -495,7 +495,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_WAIT_UNTIL_SOME_VECTOR') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ } \ return cmpret; \ } @@ -520,7 +520,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST') int cmpret; \ SHMEM_TEST(cond, &vars[i], value, cmpret); \ if (!cmpret) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ } \ @@ -551,7 +551,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ALL') int cmpret; \ SHMEM_TEST(cond, &vars[i], values[i], cmpret); \ if (!cmpret) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ } \ @@ -596,7 +596,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ALL_VECTOR') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ \ return found_idx; \ } @@ -635,7 +635,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY') shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ } else \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ \ return found_idx; \ } @@ -666,7 +666,7 @@ 
SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY_VECTOR') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ \ @@ -680,7 +680,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_ANY_VECTOR') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ return ncompleted; \ @@ -712,7 +712,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_SOME') } \ } \ if (nelems == 0 || num_ignored == nelems) { \ - for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + shmem_transport_probe(); \ return 0; \ } \ \ @@ -726,7 +726,7 @@ SHMEM_BIND_C_SYNC(`SHMEM_DEF_TEST_SOME') } \ } \ } \ - if (!cmpret) for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { shmem_transport_probe( nic_idx ); } \ + if (!cmpret) shmem_transport_probe(); \ shmem_internal_membar_acq_rel(); \ shmem_transport_syncmem(); \ return ncompleted; \ diff --git a/src/transport_none.h b/src/transport_none.h index 3411be4a7..6d4a9c547 100644 --- a/src/transport_none.h +++ b/src/transport_none.h @@ -59,7 +59,7 @@ shmem_transport_fini(void) static inline void -shmem_transport_probe(size_t nic_idx) +shmem_transport_probe(void) { return; } diff --git a/src/transport_ofi.h b/src/transport_ofi.h index c7aae42f5..8b2049e99 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -383,24 +383,16 @@ extern struct fid_ep* shmem_transport_ofi_target_ep; } while (0) static inline -void shmem_transport_probe(size_t nic_idx) +void shmem_transport_probe(void) { #if defined(ENABLE_MANUAL_PROGRESS) # ifdef USE_THREAD_COMPLETION if (0 == pthread_mutex_trylock(&shmem_transport_ofi_progress_lock)) { # endif struct fi_cq_entry buf; - int 
ret = fi_cq_read(shmem_transport_ctx_default.cq[nic_idx], &buf, 1); + int ret = fi_cq_read(shmem_transport_ofi_target_cq, &buf, 1); if (ret == 1) - RAISE_WARN_STR("Unexpected event A"); - if (ret < 0) - RAISE_WARN_STR("Unexpected event B"); -// for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { -// struct fi_cq_entry buf; -// int ret = fi_cq_read(shmem_transport_ctx_default.cq[i], &buf, 1); -// if (ret == 1) -// RAISE_WARN_STR("Unexpected event"); -// } + RAISE_WARN_STR("Unexpected event"); # ifdef USE_THREAD_COMPLETION pthread_mutex_unlock(&shmem_transport_ofi_progress_lock); } @@ -519,19 +511,10 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) while (poll_count < shmem_transport_ofi_put_poll_limit || shmem_transport_ofi_put_poll_limit < 0) { -// success = 0; -// fail = 0; -// cnt = 0; -// for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { -// success += fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ -// fail += fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ -// cnt += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ -// } -// shmem_transport_probe(); success = fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ fail = fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ - shmem_transport_probe(nic_idx); + shmem_transport_probe(); if (success < cnt && fail == 0) { SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); @@ -546,16 +529,14 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) poll_count++; } - //for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ + do { + cnt = cnt_new; + ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? */ cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? 
*/ - do { - cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? */ - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ - OFI_CTX_CHECK_ERROR(ctx, ret); - } while (cnt < cnt_new); - shmem_internal_assert(cnt == cnt_new); - //} + OFI_CTX_CHECK_ERROR(ctx, ret); + } while (cnt < cnt_new); + shmem_internal_assert(cnt == cnt_new); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -614,7 +595,7 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled, size_ } } - shmem_transport_probe(nic_idx); + shmem_transport_probe(); (*polled)++; if ((*polled) <= shmem_transport_ofi_max_poll) { @@ -1000,7 +981,7 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t nic_idx) fail = fi_cntr_readerr(ctx->get_cntr[nic_idx]); cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); - shmem_transport_probe(nic_idx); + shmem_transport_probe(); if (success < cnt && fail == 0) { SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index 46f1aed84..b66fa5210 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -244,7 +244,7 @@ int shmem_transport_fini(void); static inline void shmem_transport_get_wait(shmem_transport_ctx_t*, size_t idx); -static inline void shmem_transport_probe(size_t nic_idx) { +static inline void shmem_transport_probe(void) { return; } diff --git a/src/transport_ucx.c b/src/transport_ucx.c index f6cb9b833..5d2c9f265 100644 --- a/src/transport_ucx.c +++ b/src/transport_ucx.c @@ -72,7 +72,7 @@ static int shmem_transport_ucx_progress_thread_enabled = 1; static void * shmem_transport_ucx_progress_thread_func(void *arg) { while (__atomic_load_n(&shmem_transport_ucx_progress_thread_enabled, __ATOMIC_ACQUIRE)) { - shmem_transport_probe(0); + shmem_transport_probe(); usleep(shmem_internal_params.PROGRESS_INTERVAL); } diff --git a/src/transport_ucx.h b/src/transport_ucx.h index 19ec9fe09..fb1b3299f 100644 
--- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -83,7 +83,7 @@ int shmem_transport_fini(void); static inline void -shmem_transport_probe(size_t nic_idx) +shmem_transport_probe(void) { ucp_worker_progress(shmem_transport_ucp_worker); } @@ -93,14 +93,14 @@ ucs_status_t shmem_transport_ucx_complete_op(ucs_status_ptr_t req) { if (req == NULL) { /* All calls to complete_op must generate progress to avoid deadlock * in application-level polling loops */ - shmem_transport_probe(0); + shmem_transport_probe(); return UCS_OK; } else if (UCS_PTR_IS_ERR(req)) { return UCS_PTR_STATUS(req); } else { ucs_status_t status; do { - shmem_transport_probe(0); + shmem_transport_probe(); status = ucp_request_check_status(req); } while (status == UCS_INPROGRESS); ucp_request_free(req); @@ -278,7 +278,7 @@ void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { while (__atomic_load_n(completion, __ATOMIC_ACQUIRE) > 0) - shmem_transport_probe(0); + shmem_transport_probe(); } static inline @@ -392,7 +392,7 @@ shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *s &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(0); + shmem_transport_probe(); ucs_status_t status = shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -475,7 +475,7 @@ shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void * &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(0); + shmem_transport_probe(); ucs_status_t status = shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -604,7 +604,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const &shmem_transport_ucx_cb_nop); /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(0); + shmem_transport_probe(); ucs_status_t status = 
shmem_transport_ucx_release_op(pstatus); UCX_CHECK_STATUS_INPROGRESS(status); @@ -698,7 +698,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour if (*(uint32_t *)dest == v) done = 1; /* Manual progress to avoid deadlock for application-level polling */ - shmem_transport_probe(0); + shmem_transport_probe(); } }