diff --git a/src/atomic_c.c4 b/src/atomic_c.c4 index 7b9c648b5..7ae7ac7c4 100644 --- a/src/atomic_c.c4 +++ b/src/atomic_c.c4 @@ -232,9 +232,12 @@ SHMEM_DEFINE_FOR_EXTENDED_AMO(`SHMEM_PROF_DEF_CTX_ATOMIC_SET') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_swap(ctx, target, &value, &newval, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -251,8 +254,11 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(long)); - shmem_internal_swap(SHMEM_CTX_DEFAULT, target, &value, &newval, sizeof(long), pe, SHM_INTERNAL_LONG); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + + shmem_internal_swap(SHMEM_CTX_DEFAULT, target, &value, &newval, sizeof(long), pe, SHM_INTERNAL_LONG, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); return newval; } #endif @@ -267,9 +273,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap(ctx, target, &value, &newval, &cond, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -283,9 +292,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap(ctx, target, &value, &newval, &cond, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, 
nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return newval; \ } @@ -298,8 +310,11 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &tmp, sizeof(TYPE), \ - pe, SHM_INTERNAL_SUM, ITYPE); \ + pe, SHM_INTERNAL_SUM, ITYPE, nic_idx); \ } @@ -311,10 +326,13 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &tmp, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -327,10 +345,13 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &tmp, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -343,8 +364,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_SUM, ITYPE); \ + pe, SHM_INTERNAL_SUM, ITYPE, nic_idx); \ } @@ -358,10 +381,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, 
&oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -375,10 +400,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -393,9 +420,11 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_fetch(ctx, &val, (void *) source, \ - sizeof(TYPE), pe, ITYPE); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return val; \ } @@ -408,8 +437,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(dest, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_set(ctx, (void *) dest, &value, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } @@ -421,8 +452,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_BXOR, ITYPE); \ + pe, SHM_INTERNAL_BXOR, ITYPE, nic_idx); \ } @@ -434,8 +467,10 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, 
sizeof(TYPE), \ - pe, SHM_INTERNAL_BAND, ITYPE); \ + pe, SHM_INTERNAL_BAND, ITYPE, nic_idx); \ } @@ -447,14 +482,16 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic(ctx, target, &value, sizeof(TYPE), \ - pe, SHM_INTERNAL_BOR, ITYPE); \ + pe, SHM_INTERNAL_BOR, ITYPE, nic_idx); \ } #define SHMEM_DEF_FETCH_XOR(STYPE,TYPE,ITYPE) \ TYPE SHMEM_FUNCTION_ATTRIBUTES \ - SHMEM_FUNC_PROTOTYPE(STYPE, fetch_xor, TYPE *target, TYPE value, \ + SHMEM_FUNC_PROTOTYPE(STYPE, fetch_xor, TYPE *target, TYPE value, \ int pe) \ TYPE oldval; \ SHMEM_ERR_CHECK_INITIALIZED(); \ @@ -462,10 +499,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BXOR, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -479,10 +518,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BAND, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } @@ -496,10 +537,12 @@ shmem_swap(long *target, long value, int pe) SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic(ctx, target, &value, &oldval, \ sizeof(TYPE), pe, SHM_INTERNAL_BOR, \ - ITYPE); \ - shmem_internal_get_wait(ctx); \ + ITYPE, nic_idx); \ + 
shmem_internal_get_wait(ctx, nic_idx); \ return oldval; \ } diff --git a/src/atomic_f.c b/src/atomic_f.c index 663e033f4..d084be71c 100644 --- a/src/atomic_f.c +++ b/src/atomic_f.c @@ -41,8 +41,8 @@ FC_SHMEM_SWAP(fortran_integer_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, SIZEOF_FORTRAN_INTEGER); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, SIZEOF_FORTRAN_INTEGER, - *pe, SHM_INTERNAL_FORTRAN_INTEGER); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_FORTRAN_INTEGER, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -64,8 +64,8 @@ FC_SHMEM_INT4_SWAP(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 4, - *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -87,8 +87,8 @@ FC_SHMEM_INT8_SWAP(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 8, - *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -112,8 +112,8 @@ FC_SHMEM_REAL4_SWAP(float *target, shmem_internal_assert(sizeof(float) == 4); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 4, - *pe, SHM_INTERNAL_FLOAT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_FLOAT, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -137,8 +137,8 @@ FC_SHMEM_REAL8_SWAP(double *target, shmem_internal_assert(sizeof(double) == 8); shmem_internal_swap(SHMEM_CTX_DEFAULT, target, value, &newval, 8, - *pe, SHM_INTERNAL_DOUBLE); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_DOUBLE, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -163,8 +163,8 @@ FC_SHMEM_INT4_CSWAP(int32_t *target, 
shmem_internal_cswap(SHMEM_CTX_DEFAULT, target, value, &newval, cond, 4, - *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -189,8 +189,8 @@ FC_SHMEM_INT8_CSWAP(int64_t *target, shmem_internal_cswap(SHMEM_CTX_DEFAULT, target, value, &newval, cond, 8, - *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return newval; } @@ -212,8 +212,8 @@ FC_SHMEM_INT4_FADD(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, value, &oldval, 4, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -235,8 +235,8 @@ FC_SHMEM_INT8_FADD(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, value, &oldval, 8, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -256,8 +256,8 @@ FC_SHMEM_INT4_FINC(int32_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 4); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, &tmp, &oldval, 4, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -277,8 +277,8 @@ FC_SHMEM_INT8_FINC(int64_t *target, SHMEM_ERR_CHECK_SYMMETRIC(target, 8); shmem_internal_fetch_atomic(SHMEM_CTX_DEFAULT, target, &tmp, &oldval, 8, - *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + *pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT64, 0); + 
shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return oldval; } @@ -373,8 +373,8 @@ FC_SHMEM_INT4_FETCH(int32_t *source, SHMEM_ERR_CHECK_PE(*pe); SHMEM_ERR_CHECK_SYMMETRIC(source, 4); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -394,8 +394,8 @@ FC_SHMEM_INT8_FETCH(int64_t *source, SHMEM_ERR_CHECK_PE(*pe); SHMEM_ERR_CHECK_SYMMETRIC(source, 8); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -417,8 +417,8 @@ FC_SHMEM_REAL4_FETCH(float *source, shmem_internal_assert(sizeof(float) == 4); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 4, *pe, SHM_INTERNAL_INT32, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } @@ -440,8 +440,8 @@ FC_SHMEM_REAL8_FETCH(double *source, shmem_internal_assert(sizeof(double) == 8); - shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) source, 8, *pe, SHM_INTERNAL_INT64, 0); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); return val; } diff --git a/src/atomic_nbi_c.c4 b/src/atomic_nbi_c.c4 index b8e644058..924b2b2b2 100644 --- a/src/atomic_nbi_c.c4 +++ b/src/atomic_nbi_c.c4 @@ -124,8 +124,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') 
SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_swap_nbi(ctx, target, &value, fetch, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, \ + nic_idx); \ } @@ -137,8 +141,11 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_cswap_nbi(ctx, target, &value, fetch, &cond, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } @@ -151,9 +158,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &tmp, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_SUM, \ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -165,9 +175,13 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, \ - SHM_INTERNAL_SUM, ITYPE); \ + SHM_INTERNAL_SUM, ITYPE, \ + nic_idx); \ } @@ -179,8 +193,11 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(source, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_atomic_fetch(ctx, fetch, (void *) source, \ - sizeof(TYPE), pe, ITYPE); \ + sizeof(TYPE), pe, ITYPE, nic_idx); \ } @@ -192,9 +209,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') 
SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BXOR,\ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -206,9 +226,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BAND,\ - ITYPE); \ + ITYPE, nic_idx); \ } @@ -220,9 +243,12 @@ SHMEM_DEFINE_FOR_BITWISE_AMO(`SHMEM_PROF_DEF_CTX_FETCH_XOR_NBI') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(target, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_fetch_atomic_nbi(ctx, target, &value, fetch, \ sizeof(TYPE), pe, SHM_INTERNAL_BOR, \ - ITYPE); \ + ITYPE, nic_idx); \ } /* Function prototype for v1.4 routines with the default context: */ diff --git a/src/collectives.c b/src/collectives.c index ee51f869e..07f9c311c 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -244,7 +244,8 @@ shmem_internal_collectives_init(void) * *****************************************/ void -shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; @@ -259,27 +260,27 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down psync tree */ for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; 
i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } } else { /* send message to root */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), PE_start, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -287,7 +288,8 @@ shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync void -shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long zero = 0, one = 1; int parent, num_children, *children; @@ -318,13 +320,13 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } else { @@ -332,20 +334,20 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send ack to parent */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack from parent */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children + 1); /* Clear pSync */ 
shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Send acks down to children */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + children[i], SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } @@ -354,21 +356,22 @@ shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync) /* send message up psync tree */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), parent, - SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for ack down psync tree */ SHMEM_WAIT(pSync, 0); /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } void -shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int one = 1, neg_one = -1; int distance, to, i; @@ -389,7 +392,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync to = PE_start + (to * PE_stride); shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + to, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* There's a path where the next update from a peer can get @@ -399,7 +402,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, 
SHM_INTERNAL_INT, nic_idx); } /* Ensure local pSync decrements are done before a subsequent barrier */ @@ -415,7 +418,7 @@ shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; int real_root = PE_start + PE_root * PE_stride; @@ -432,7 +435,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* send data to all peers */ for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion); + shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, len, pe, &completion, nic_idx); } shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); @@ -441,7 +444,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* send completion ack to all peers */ for (pe = PE_start,i=0; i < PE_size; pe += PE_stride, i++) { if (pe == shmem_internal_my_pe) continue; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), pe, nic_idx); } if (1 == complete) { @@ -450,7 +453,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -460,13 +463,13 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); if (1 == complete) { /* send ack back to root */ 
shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + real_root, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } } @@ -475,7 +478,7 @@ shmem_internal_bcast_linear(void *target, const void *source, size_t len, void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -510,14 +513,14 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } } /* send data to all leaves */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, send_buf, len, children[i], - &completion); + &completion, nic_idx); } shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); @@ -526,7 +529,7 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* send completion ack to all peers */ for (i = 0 ; i < num_children ; ++i) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - children[i]); + children[i], nic_idx); } if (1 == complete) { @@ -539,7 +542,7 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -549,12 +552,12 @@ shmem_internal_bcast_tree(void *target, const void *source, size_t len, /* if complete, send ack */ if (1 == complete) { shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, 
nic_idx); } /* Clear pSync */ shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), - shmem_internal_my_pe); + shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } } @@ -569,7 +572,8 @@ void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; @@ -586,7 +590,7 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, will flush any atomic cache value that may currently exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); + shmem_internal_my_pe, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_quiet(SHMEM_CTX_DEFAULT); @@ -594,14 +598,14 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, for (pe = PE_start + PE_stride, i = 1 ; i < PE_size ; i++, pe += PE_stride) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), pe, nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size - 1); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { @@ -609,22 +613,22 @@ shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, SHMEM_WAIT(pSync, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, 
&zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* send data, ack, and wait for completion */ shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, source, count * type_size, - PE_start, op, datatype, &completion); + PE_start, op, datatype, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ shmem_internal_bcast(target, target, count * type_size, 0, - PE_start, PE_stride, PE_size, pSync + 2, 0); + PE_start, PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -635,7 +639,8 @@ void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int group_rank = (shmem_internal_my_pe - PE_start) / PE_stride; long zero = 0, one = 1; @@ -650,7 +655,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) - shmem_internal_copy_self(target, source, count * type_size); + shmem_internal_copy_self(target, source, count * type_size, nic_idx); return; } @@ -662,11 +667,11 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si if (NULL == tmp) RAISE_ERROR_MSG("Unable to allocate %zub temporary buffer\n", count*type_size); - shmem_internal_copy_self(tmp, target, count * type_size); + shmem_internal_copy_self(tmp, target, count * type_size, nic_idx); free_source = 1; source = tmp; - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync + 2, nic_idx); } /* Perform reduce-scatter: @@ 
-700,10 +705,10 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si i == 0 ? ((uint8_t *) source) + chunk_out_disp : ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i+1); @@ -714,7 +719,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si } /* Reset reduce-scatter pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); /* Perform all-gather: @@ -733,17 +738,17 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) target) + chunk_out_disp, ((uint8_t *) target) + chunk_out_disp, - chunk_out_count * type_size, peer); + chunk_out_count * type_size, peer, nic_idx); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync+1, &one, sizeof(one), - peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + peer, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* Wait for chunk */ SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_GE, i+1); } /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync+1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync+1, SHMEM_CMP_EQ, 0); if (free_source) @@ -755,7 +760,8 @@ void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void 
*pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { long zero = 0, one = 1; long completion = 0; @@ -766,7 +772,7 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } return; } @@ -791,20 +797,20 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si will flush any atomic cache value that may currently exist. */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, source, count * type_size, - shmem_internal_my_pe, &completion); + shmem_internal_my_pe, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* let everyone know that it's safe to send to us */ for (i = 0 ; i < num_children ; ++i) { - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i]); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &one, sizeof(one), children[i], nic_idx); } /* Wait for others to acknowledge sending data */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, num_children); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -813,24 +819,24 @@ shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, si SHMEM_WAIT(pSync + 1, 0); /* reset pSync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync + 1, &zero, sizeof(zero), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync + 1, SHMEM_CMP_EQ, 0); /* send data, ack, 
and wait for completion */ shmem_internal_atomicv(SHMEM_CTX_DEFAULT, target, (num_children == 0) ? source : target, count * type_size, parent, - op, datatype, &completion); + op, datatype, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(one), - parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + parent, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } /* broadcast out */ shmem_internal_bcast(target, target, count * type_size, 0, PE_start, - PE_stride, PE_size, pSync + 2, 0); + PE_stride, PE_size, pSync + 2, 0, nic_idx); } @@ -838,7 +844,8 @@ void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int log2_proc = 1, pow2_proc = 2; @@ -851,7 +858,7 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (PE_size == 1) { if (target != source) { - shmem_internal_copy_self(target, source, type_size * count); + shmem_internal_copy_self(target, source, type_size * count, nic_idx); } free(current_target); return; @@ -896,17 +903,17 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, peer, - &completion); + &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, sizeof(long), peer, nic_idx); 
SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); } else { if (my_id < PE_size - pow2_proc) { int peer = (my_id + pow2_proc) * PE_stride + PE_start; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_target_ready, sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(pSync_extra_peer, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_reduce_local(op, datatype, count, target, current_target); @@ -922,25 +929,25 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun if (shmem_internal_my_pe < peer) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_target_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); + wrk_size, peer, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } else { SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_target_ready); shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, - wrk_size, peer, &completion); + wrk_size, peer, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, step_psync, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); SHMEM_WAIT_UNTIL(step_psync, SHMEM_CMP_EQ, ps_data_ready); } @@ -954,11 +961,11 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun int peer = (my_id + pow2_proc) * PE_stride + PE_start; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, target, current_target, wrk_size, - peer, &completion); + peer, &completion, nic_idx); 
shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync_extra_peer, &ps_data_ready, - sizeof(long), peer); + sizeof(long), peer, nic_idx); } memcpy(target, current_target, wrk_size); @@ -978,7 +985,8 @@ shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t coun *****************************************/ void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { size_t my_offset; long tmp[2]; @@ -991,7 +999,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, target, source, len, PE_start, PE_stride, PE_size, (void*) pSync); if (PE_size == 1) { - if (target != source) shmem_internal_copy_self(target, source, len); + if (target != source) shmem_internal_copy_self(target, source, len, nic_idx); return; } @@ -1000,7 +1008,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, my_offset = 0; tmp[0] = (long) len; /* FIXME: Potential truncation of size_t into long */ tmp[1] = 1; /* FIXME: Packing flag with data relies on byte ordering */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), PE_start + PE_stride, nic_idx); } else { /* wait for send data */ @@ -1012,7 +1020,7 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, tmp[0] = (long) (my_offset + len); tmp[1] = 1; shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, tmp, 2 * sizeof(long), - shmem_internal_my_pe + PE_stride); + shmem_internal_my_pe + PE_stride, nic_idx); } } @@ -1024,13 +1032,13 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, do { if (len > 0) { shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, ((uint8_t *) 
target) + my_offset, source, - len, peer); + len, peer, nic_idx); } peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2]); + shmem_internal_barrier(PE_start, PE_stride, PE_size, &pSync[2], nic_idx); pSync[0] = SHMEM_SYNC_VALUE; pSync[1] = SHMEM_SYNC_VALUE; @@ -1047,7 +1055,8 @@ shmem_internal_collect_linear(void *target, const void *source, size_t len, *****************************************/ void shmem_internal_fcollect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { long tmp = 1; long completion = 0; @@ -1057,24 +1066,24 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, if (PE_start == shmem_internal_my_pe) { /* Copy data into the target */ - if (source != target) shmem_internal_copy_self(target, source, len); + if (source != target) shmem_internal_copy_self(target, source, len, nic_idx); /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for N updates */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, PE_size); /* Clear pSync */ tmp = 0; - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(tmp), PE_start, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } else { /* Push data into the target */ size_t offset = ((shmem_internal_my_pe - PE_start) / PE_stride) * len; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + offset, source, len, PE_start, - &completion); + &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); /* ensure ordering */ @@ -1082,11 +1091,11 @@ shmem_internal_fcollect_linear(void 
*target, const void *source, size_t len, /* send completion update */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &tmp, sizeof(long), - PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + PE_start, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); } shmem_internal_bcast(target, target, len * PE_size, 0, PE_start, PE_stride, - PE_size, pSync + 1, 0); + PE_size, pSync + 1, 0, nic_idx); } @@ -1099,7 +1108,8 @@ shmem_internal_fcollect_linear(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int i; /* my_id is the index in a theoretical 0...N-1 array of @@ -1115,7 +1125,7 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, if (len == 0) return; /* copy my portion to the right place */ - shmem_internal_copy_self((char*) target + (my_id * len), source, len); + shmem_internal_copy_self((char*) target + (my_id * len), source, len, nic_idx); /* send n - 1 messages to the next highest proc. Each message contains what we received the previous step (including our own @@ -1125,7 +1135,7 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, /* send data to me + 1 */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + iter_offset, (char*) target + iter_offset, - len, next_proc, &completion); + len, next_proc, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); @@ -1134,14 +1144,14 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, between successive calls to the put above. So a rolling counter is safe here. 
*/ shmem_internal_atomic(SHMEM_CTX_DEFAULT, pSync, &one, sizeof(long), - next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG); + next_proc, SHM_INTERNAL_SUM, SHM_INTERNAL_LONG, nic_idx); /* wait for completion for this round */ SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_GE, i); } /* zero out psync */ - shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe); + shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, pSync, &zero, sizeof(long), shmem_internal_my_pe, nic_idx); SHMEM_WAIT_UNTIL(pSync, SHMEM_CMP_EQ, 0); } @@ -1155,7 +1165,8 @@ shmem_internal_fcollect_ring(void *target, const void *source, size_t len, */ void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { int my_id = ((shmem_internal_my_pe - PE_start) / PE_stride); int i; @@ -1179,7 +1190,7 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* copy my portion to the right place */ curr_offset = my_id * len; - shmem_internal_copy_self((char*) target + curr_offset, source, len); + shmem_internal_copy_self((char*) target + curr_offset, source, len, nic_idx); for (i = 0, distance = 0x1 ; distance < PE_size ; i++, distance <<= 1) { int peer = my_id ^ distance; @@ -1187,19 +1198,19 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, /* send data to peer */ shmem_internal_put_nb(SHMEM_CTX_DEFAULT, (char*) target + curr_offset, (char*) target + curr_offset, - distance * len, real_peer, &completion); + distance * len, real_peer, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); shmem_internal_fence(SHMEM_CTX_DEFAULT); /* mark completion for this round */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &one, sizeof(int), - real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + real_peer, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); 
SHMEM_WAIT_UNTIL(&pSync_ints[i], SHMEM_CMP_NE, 0); /* this slot is no longer used, so subtract off results now */ shmem_internal_atomic(SHMEM_CTX_DEFAULT, &pSync_ints[i], &neg_one, sizeof(int), - shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT); + shmem_internal_my_pe, SHM_INTERNAL_SUM, SHM_INTERNAL_INT, nic_idx); if (my_id > peer) { curr_offset -= (distance * len); @@ -1212,7 +1223,8 @@ shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_ptr = (uint8_t *) dest + my_as_rank * len; @@ -1232,12 +1244,12 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, int peer_as_rank = (peer - PE_start) / PE_stride; /* Peer's index in active set */ shmem_internal_put_nbi(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source + peer_as_rank * len, - len, peer); + len, peer, nic_idx); peer = shmem_internal_circular_iter_next(peer, PE_start, PE_stride, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; @@ -1247,7 +1259,8 @@ shmem_internal_alltoall(void *dest, const void *source, size_t len, void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { const int my_as_rank = (shmem_internal_my_pe - PE_start) / PE_stride; const void *dest_base = (uint8_t *) dest + my_as_rank * nelems * dst * elem_size; @@ -1279,7 +1292,7 @@ shmem_internal_alltoalls(void 
*dest, const void *source, ptrdiff_t dst, for (i = nelems ; i > 0; i--) { shmem_internal_put_scalar(SHMEM_CTX_DEFAULT, (void *) dest_ptr, (uint8_t *) source_ptr, - elem_size, peer); + elem_size, peer, nic_idx); source_ptr += sst * elem_size; dest_ptr += dst * elem_size; @@ -1288,7 +1301,7 @@ shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, PE_size); } while (peer != start_pe); - shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync); + shmem_internal_barrier(PE_start, PE_stride, PE_size, pSync, nic_idx); for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i++) pSync[i] = SHMEM_SYNC_VALUE; diff --git a/src/collectives_c.c4 b/src/collectives_c.c4 index 70c8876b5..62c9c7ce5 100644 --- a/src/collectives_c.c4 +++ b/src/collectives_c.c4 @@ -158,7 +158,9 @@ shmem_barrier_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); } @@ -169,7 +171,9 @@ shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -178,7 +182,9 @@ shmem_sync_all(void) { SHMEM_ERR_CHECK_INITIALIZED(); - shmem_internal_sync_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync_all(nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -188,7 +194,9 @@ shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) SHMEM_ERR_CHECK_ACTIVE_SET(PE_start, 1 << logPE_stride, PE_size); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BARRIER_SYNC_SIZE); - shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync); + size_t nic_idx = 0; + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_sync(PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } /* Team-based Collective Routines */ @@ -199,9 +207,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_TEAM_VALID(team); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, SYNC); - shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync); + long *psync = shmem_internal_team_choose_psync(myteam, SYNC, nic_idx); + shmem_internal_sync(myteam->start, myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, SYNC); return 0; } @@ -228,9 +238,11 @@ shmem_team_sync(shmem_team_t team) SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE)*nreduce, \ sizeof(TYPE)*nreduce, 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_op_to_all(target, source, nreduce, sizeof(TYPE), \ PE_start, 1 << logPE_stride, PE_size, \ - pWrk, pSync, IOP, ITYPE); \ + pWrk, pSync, IOP, ITYPE, nic_idx); \ } #define SHMEM_DEF_REDUCE(STYPE,TYPE,ITYPE,SOP,IOP) \ @@ -247,11 +259,14 @@ shmem_team_sync(shmem_team_t team) sizeof(TYPE)*nreduce, 1, 1); \ TYPE *pWrk = NULL; \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, REDUCE); \ + long *psync = shmem_internal_team_choose_psync(myteam, REDUCE, \ + nic_idx); \ shmem_internal_op_to_all(dest, source, nreduce, sizeof(TYPE), \ myteam->start, myteam->stride, myteam->size, pWrk, \ - psync, IOP, ITYPE); \ + psync, IOP, ITYPE, nic_idx); \ shmem_internal_team_release_psyncs(myteam, REDUCE); \ return 0; \ } @@ -292,9 +307,11 @@ shmem_broadcast32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); 
SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 4, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } @@ -311,9 +328,11 @@ shmem_broadcast64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long)*SHMEM_BCAST_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_bcast(target, source, nlong * 8, PE_root, PE_start, 1 << logPE_stride, PE_size, - pSync, 1); + pSync, 1, nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -327,15 +346,17 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); + long *psync = shmem_internal_team_choose_psync(myteam, BCAST, nic_idx); shmem_internal_bcast(dest, source, nelems, PE_root, myteam->start, myteam->stride, myteam->size, - psync, 1); + psync, 1, nic_idx); shmem_internal_team_release_psyncs(myteam, BCAST); int team_root = myteam->start + PE_root * myteam->stride; if (shmem_internal_my_pe == team_root && dest != source) - shmem_internal_copy_self(dest, source, nelems); + shmem_internal_copy_self(dest, source, nelems, nic_idx); return 0; } @@ -353,16 +374,19 @@ shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, BCAST); \ + long *psync = 
shmem_internal_team_choose_psync(myteam, BCAST, \ + nic_idx); \ shmem_internal_bcast(dest, source, nelems * sizeof(TYPE), \ PE_root, myteam->start, myteam->stride, \ - myteam->size, psync, 1); \ + myteam->size, psync, 1, nic_idx); \ shmem_internal_team_release_psyncs(myteam, BCAST); \ int team_root = myteam->start + PE_root * myteam->stride; \ if (shmem_internal_my_pe == team_root && dest != source) { \ shmem_internal_copy_self(dest, source, \ - nelems * sizeof(TYPE)); \ + nelems * sizeof(TYPE), nic_idx); \ } \ return 0; \ } @@ -380,8 +404,10 @@ shmem_collect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -396,8 +422,10 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_collect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_COLLECT(STYPE,TYPE) \ @@ -412,12 +440,15 @@ shmem_collect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_collect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + 
myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -434,10 +465,12 @@ shmem_collectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_collect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -453,8 +486,10 @@ shmem_fcollect32(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 4, nlong * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -469,8 +504,10 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_COLLECT_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(target, source, nlong * 8, nlong * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_fcollect(target, source, nlong * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_FCOLLECT(STYPE,TYPE) \ @@ -485,12 +522,15 @@ shmem_fcollect64(void *target, const void *source, size_t nlong, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = 
(shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - COLLECT); \ + COLLECT, \ + nic_idx); \ shmem_internal_fcollect(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, COLLECT); \ return 0; \ } @@ -507,10 +547,12 @@ shmem_fcollectmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, COLLECT); + long *psync = shmem_internal_team_choose_psync(myteam, COLLECT, nic_idx); shmem_internal_fcollect(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, COLLECT); return 0; } @@ -526,8 +568,10 @@ shmem_alltoall32(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 4, nelems * 4, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 4, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -542,8 +586,10 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * 8, nelems * 8, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoall(dest, source, nelems * 8, - PE_start, 1 << logPE_stride, PE_size, pSync); + PE_start, 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALL(STYPE,TYPE) \ @@ 
-558,12 +604,15 @@ shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems * sizeof(TYPE), \ nelems * sizeof(TYPE), 1, 1); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ long *psync = shmem_internal_team_choose_psync(myteam, \ - ALLTOALL); \ + ALLTOALL, \ + nic_idx); \ shmem_internal_alltoall(dest, source, nelems * sizeof(TYPE), \ myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -580,10 +629,12 @@ shmem_alltoallmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_OVERLAP(dest, source, nelems, nelems, 1, 1); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoall(dest, source, nelems, myteam->start, - myteam->stride, myteam->size, psync); + myteam->stride, myteam->size, psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } @@ -602,8 +653,10 @@ shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 4 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 4, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } @@ -620,8 +673,10 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(source, 8 * ((nelems-1) * sst + 1)); SHMEM_ERR_CHECK_SYMMETRIC(pSync, sizeof(long) * 
SHMEM_ALLTOALL_SYNC_SIZE); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 8, nelems, PE_start, - 1 << logPE_stride, PE_size, pSync); + 1 << logPE_stride, PE_size, pSync, nic_idx); } #define SHMEM_DEF_ALLTOALLS(STYPE,TYPE) \ @@ -635,11 +690,14 @@ shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems * sizeof(TYPE)); \ SHMEM_ERR_CHECK_SYMMETRIC(source, nelems * sizeof(TYPE)); \ \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; \ - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); \ + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, \ + nic_idx); \ shmem_internal_alltoalls(dest, source, dst, sst, sizeof(TYPE), \ nelems, myteam->start, myteam->stride, \ - myteam->size, psync); \ + myteam->size, psync, nic_idx); \ shmem_internal_team_release_psyncs(myteam, ALLTOALL); \ return 0; \ } @@ -655,11 +713,13 @@ shmem_alltoallsmem(shmem_team_t team, void *dest, const void *source, SHMEM_ERR_CHECK_SYMMETRIC(dest, nelems); SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_team_t *myteam = (shmem_internal_team_t *)team; - long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL); + long *psync = shmem_internal_team_choose_psync(myteam, ALLTOALL, nic_idx); shmem_internal_alltoalls(dest, source, dst, sst, 1, nelems, myteam->start, myteam->stride, myteam->size, - psync); + psync, nic_idx); shmem_internal_team_release_psyncs(myteam, ALLTOALL); return 0; } diff --git a/src/data_c.c4 b/src/data_c.c4 index 31233b998..d0f08a9e2 100644 --- a/src/data_c.c4 +++ b/src/data_c.c4 @@ -305,8 +305,11 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(addr, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_scalar(ctx, addr, &value, sizeof(TYPE), \ - pe); \ + pe, nic_idx); \ } #define SHMEM_DEF_G(STYPE,TYPE) \ @@ -318,9 +321,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_PE(pe); \ SHMEM_ERR_CHECK_CTX(ctx); \ SHMEM_ERR_CHECK_SYMMETRIC(addr, sizeof(TYPE)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, &tmp, addr, sizeof(TYPE),\ - pe); \ - shmem_internal_get_wait(ctx); \ + pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ return tmp; \ } @@ -337,9 +343,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ - &completion); \ + &completion, nic_idx); \ shmem_internal_put_wait(ctx, &completion); \ } @@ -358,8 +367,11 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems,\ - pe, &completion); \ + pe, &completion, nic_idx); \ shmem_internal_put_wait(ctx, &completion); \ } @@ -376,9 +388,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nbi(ctx, target, source, \ sizeof(TYPE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -395,8 +410,11 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nbi(ctx, target, source, (SIZE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -413,9 +431,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, \ - sizeof(TYPE) * nelems, pe); \ - shmem_internal_get_wait(ctx); \ + sizeof(TYPE) * nelems, pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } @@ -432,9 +453,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE)*nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, (SIZE)*nelems, \ - pe); \ - shmem_internal_get_wait(ctx); \ + pe, nic_idx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } @@ -451,8 +475,10 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, sizeof(TYPE) * \ nelems, sizeof(TYPE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_get(ctx, target, source, sizeof(TYPE)*nelems, \ - pe); \ + pe, nic_idx); \ } @@ -469,7 +495,10 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, source, (SIZE) * nelems, \ (SIZE) * nelems, 0, \ (shmem_internal_my_pe == pe)); \ - shmem_internal_get(ctx, target, source, (SIZE)*nelems, pe);\ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ + shmem_internal_get(ctx, target, source, (SIZE)*nelems, \ + pe, nic_idx); \ } #define SHMEM_DEF_IPUT(STYPE,TYPE) \ @@ -488,9 +517,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nelems-1) * tst + 1), \ sizeof(TYPE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + 
SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_put_scalar(ctx, target, source, \ - sizeof(TYPE), pe); \ + sizeof(TYPE), pe, nic_idx); \ target += tst; \ source += sst; \ } \ @@ -513,10 +545,13 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nblocks-1) * tst + bsize), \ sizeof(TYPE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_put_nb(ctx, target, source, \ bsize * sizeof(TYPE), pe, \ - &completion); \ + &completion, nic_idx); \ target += tst; \ source += sst; \ } \ @@ -540,9 +575,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nelems-1) * tst + 1), \ (SIZE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_put_scalar(ctx, target, source, (SIZE), \ - pe); \ + pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ @@ -566,10 +604,13 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nblocks-1) * tst + bsize), \ (SIZE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_put_nb(ctx, target, source, \ bsize * (SIZE), pe, \ - &completion); \ + &completion, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ @@ -593,13 +634,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nelems-1) * tst + 1), \ sizeof(TYPE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ shmem_internal_get(ctx, target, source, sizeof(TYPE), \ - pe); \ + pe, nic_idx); \ target += 
tst; \ source += sst; \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IBGET(STYPE,TYPE) \ @@ -619,13 +663,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(TYPE) * ((nblocks-1) * tst + bsize), \ sizeof(TYPE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_get(ctx, target, source, \ - bsize * sizeof(TYPE), pe); \ + bsize * sizeof(TYPE), pe, nic_idx); \ target += tst; \ source += sst; \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IGET_N(NAME,SIZE) \ @@ -646,12 +693,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nelems-1) * tst + 1), \ (SIZE) * ((nelems-1) * sst + 1), 0, \ (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nelems > 0 ; --nelems) { \ - shmem_internal_get(ctx, target, source, (SIZE), pe);\ + shmem_internal_get(ctx, target, source, (SIZE), \ + pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_IBGET_N(NAME,SIZE) \ @@ -672,13 +723,16 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') (SIZE) * ((nblocks-1) * tst + bsize), \ (SIZE) * ((nblocks-1) * sst + bsize), \ 0, (shmem_internal_my_pe == pe)); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ for ( ; nblocks > 0 ; --nblocks) { \ shmem_internal_get(ctx, target, source, \ - bsize * (SIZE), pe); \ + bsize * (SIZE), pe, nic_idx); \ target = (uint8_t *) target + tst * (SIZE); \ source = (uint8_t *) source + sst * (SIZE); \ } \ - shmem_internal_get_wait(ctx); \ + shmem_internal_get_wait(ctx, nic_idx); \ } #define SHMEM_DEF_PUT_SIGNAL(STYPE,TYPE) \ @@ -697,19 +751,22 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') 
sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, \ sizeof(TYPE) * nelems, pe, \ - &completion); \ + &completion, nic_idx); \ shmem_internal_put_wait(ctx, &completion); \ shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ pe, SHM_INTERNAL_SUM, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ else \ shmem_internal_atomic_set(ctx, sig_addr, &signal, \ sizeof(uint64_t), pe, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ } @@ -729,18 +786,21 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_nb(ctx, target, source, (SIZE) * nelems, \ - pe, &completion); \ + pe, &completion, nic_idx); \ shmem_internal_put_wait(ctx, &completion); \ shmem_internal_fence(ctx); \ if (sig_op == SHMEM_SIGNAL_ADD) \ shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), \ pe, SHM_INTERNAL_SUM, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ else \ shmem_internal_atomic_set(ctx, sig_addr, &signal, \ sizeof(uint64_t), pe, \ - SHM_INTERNAL_UINT64); \ + SHM_INTERNAL_UINT64, nic_idx); \ } #define SHMEM_DEF_PUT_SIGNAL_NBI(STYPE,TYPE) \ @@ -757,10 +817,14 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') SHMEM_ERR_CHECK_OVERLAP(target, sig_addr, sizeof(TYPE) * nelems, \ sizeof(uint64_t), 0, \ (shmem_internal_my_pe == pe)); \ + \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_signal_nbi(ctx, target, source, \ sizeof(TYPE) * nelems, sig_addr, \ - signal, sig_op, pe); \ + signal, sig_op, pe, nic_idx); \ } @@ -779,8 +843,12 @@ SHMEM_PROF_DEF_CTX_PUT_N_SIGNAL_NBI(`mem') sizeof(uint64_t), 
0, \ (shmem_internal_my_pe == pe)); \ SHMEM_ERR_CHECK_SIG_OP(sig_op); \ + \ + size_t nic_idx = 0; \ + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); \ shmem_internal_put_signal_nbi(ctx, target, source, (SIZE) * nelems, \ - sig_addr, signal, sig_op, pe); \ + sig_addr, signal, sig_op, \ + pe, nic_idx); \ } @@ -871,10 +939,12 @@ shmem_signal_fetch(const uint64_t* sig_addr) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &val, (void *) sig_addr, sizeof(uint64_t), shmem_internal_my_pe, - SHM_INTERNAL_UINT64); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + SHM_INTERNAL_UINT64, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); return val; } @@ -885,8 +955,10 @@ shmemx_signal_add(uint64_t *sig_addr, uint64_t signal, int pe) SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic(SHMEM_CTX_DEFAULT, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -897,8 +969,10 @@ shmemx_ctx_signal_add(shmem_ctx_t ctx, uint64_t *sig_addr, uint64_t signal, int SHMEM_ERR_CHECK_CTX(ctx); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -908,8 +982,10 @@ shmemx_signal_set(uint64_t *sig_addr, uint64_t signal, int pe) SHMEM_ERR_CHECK_PE(pe); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, (void *) sig_addr, &signal, - sizeof(uint64_t), pe, 
SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -920,8 +996,10 @@ shmemx_ctx_signal_set(shmem_ctx_t ctx, uint64_t *sig_addr, uint64_t signal, int SHMEM_ERR_CHECK_CTX(ctx); SHMEM_ERR_CHECK_SYMMETRIC(sig_addr, sizeof(uint64_t)); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); shmem_internal_atomic_set(ctx, (void *) sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES @@ -932,8 +1010,10 @@ shmemx_getmem_ct(shmemx_ct_t ct, void *target, const void *source, size_t nelems SHMEM_ERR_CHECK_SYMMETRIC(source, nelems); SHMEM_ERR_CHECK_NULL(target, nelems); - shmem_internal_get_ct(ct, target, source, nelems, pe); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_get_ct(ct, target, source, nelems, pe, nic_idx); + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); } void SHMEM_FUNCTION_ATTRIBUTES shmemx_putmem_ct(shmemx_ct_t ct, void *target, const void *source, @@ -946,7 +1026,9 @@ void SHMEM_FUNCTION_ATTRIBUTES shmemx_putmem_ct(shmemx_ct_t ct, void *target, co SHMEM_ERR_CHECK_SYMMETRIC(target, nelems); SHMEM_ERR_CHECK_NULL(source, nelems); - shmem_internal_put_ct_nb(ct, target, source, nelems, pe, &completion); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_put_ct_nb(ct, target, source, nelems, pe, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); } diff --git a/src/data_f.c4 b/src/data_f.c4 index 5753463d3..568286157 100644 --- a/src/data_f.c4 +++ b/src/data_f.c4 @@ -136,8 +136,8 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_IPUT_SIZE') SHMEM_ERR_CHECK_NULL(target, *len); \ \ shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, \ - SIZE * *len, *pe); \ - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); \ + SIZE * *len, *pe, 0); \ + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); \ 
} define(`SHMEM_WRAP_FC_GET', @@ -161,7 +161,7 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_GET_SIZE') SHMEM_ERR_CHECK_NULL(target, *nelems); \ \ shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, \ - SIZE * *nelems, *pe); \ + SIZE * *nelems, *pe, 0); \ } define(`SHMEM_WRAP_FC_GET_NBI', @@ -195,11 +195,11 @@ SHMEM_BIND_F_SIZES(`SHMEM_WRAP_FC_GET_NBI_SIZE') \ for ( ; len > 0 ; --len ) { \ shmem_internal_get(SHMEM_CTX_DEFAULT, target, source, SIZE, \ - *pe); \ + *pe, 0); \ target += (*tst * SIZE); \ source += (*sst * SIZE); \ } \ - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); \ + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, 0); \ } define(`SHMEM_WRAP_FC_IGET', diff --git a/src/init.c b/src/init.c index 01ca23dfd..b7480ebe2 100644 --- a/src/init.c +++ b/src/init.c @@ -143,7 +143,9 @@ shmem_internal_shutdown(void) return; } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_finalized = 1; diff --git a/src/lock_c.c b/src/lock_c.c index 269dd0ed2..7008dd8f8 100644 --- a/src/lock_c.c +++ b/src/lock_c.c @@ -44,7 +44,9 @@ shmem_clear_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_clear_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_clear_lock(lockp, nic_idx); } @@ -54,7 +56,9 @@ shmem_set_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - shmem_internal_set_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_set_lock(lockp, nic_idx); } @@ -64,5 +68,7 @@ shmem_test_lock(long *lockp) SHMEM_ERR_CHECK_INITIALIZED(); SHMEM_ERR_CHECK_SYMMETRIC(lockp, sizeof(long)); - return shmem_internal_test_lock(lockp); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + return shmem_internal_test_lock(lockp, nic_idx); } diff --git a/src/runtime-mpi.c b/src/runtime-mpi.c index 3713ec3fd..7856556a9 
100644 --- a/src/runtime-mpi.c +++ b/src/runtime-mpi.c @@ -28,7 +28,7 @@ /* Note: Increase MAX_KV_COUNT if more key/values are needed. MAX_KV_COUNT is * 2 * the number of key/value pairs. */ -#define MAX_KV_COUNT 20 +#define MAX_KV_COUNT 40 #define MAX_KV_LENGTH 512 static int rank = -1; diff --git a/src/shmem_collectives.h b/src/shmem_collectives.h index 6409c5178..acfae7b41 100644 --- a/src/shmem_collectives.h +++ b/src/shmem_collectives.h @@ -40,13 +40,13 @@ extern coll_type_t shmem_internal_reduce_type; extern coll_type_t shmem_internal_collect_type; extern coll_type_t shmem_internal_fcollect_type; -void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync); -void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync); +void shmem_internal_sync_linear(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_tree(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); +void shmem_internal_sync_dissem(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx); static inline void -shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { if (shmem_internal_params.BARRIERS_FLUSH) { fflush(stdout); @@ -58,19 +58,19 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) switch (shmem_internal_barrier_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { - shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); } else { - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); } break; case LINEAR: - shmem_internal_sync_linear(PE_start, PE_stride, 
PE_size, pSync); + shmem_internal_sync_linear(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case TREE: - shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_tree(PE_start, PE_stride, PE_size, pSync, nic_idx); break; case DISSEM: - shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync_dissem(PE_start, PE_stride, PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal barrier/sync type (%d)\n", @@ -85,60 +85,64 @@ shmem_internal_sync(int PE_start, int PE_stride, int PE_size, long *pSync) static inline void -shmem_internal_sync_all(void) +shmem_internal_sync_all(size_t nic_idx) { - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_sync_all_psync, nic_idx); } static inline void -shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync) +shmem_internal_barrier(int PE_start, int PE_stride, int PE_size, long *pSync, size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(PE_start, PE_stride, PE_size, pSync); + shmem_internal_sync(PE_start, PE_stride, PE_size, pSync, nic_idx); } static inline void -shmem_internal_barrier_all(void) +shmem_internal_barrier_all(size_t nic_idx) { shmem_internal_quiet(SHMEM_CTX_DEFAULT); - shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync); + shmem_internal_sync(0, 1, shmem_internal_num_pes, shmem_internal_barrier_all_psync, nic_idx); } void shmem_internal_bcast_linear(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); void shmem_internal_bcast_tree(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete); + long *pSync, int complete, size_t nic_idx); static inline void 
shmem_internal_bcast(void *target, const void *source, size_t len, int PE_root, int PE_start, int PE_stride, int PE_size, - long *pSync, int complete) + long *pSync, int complete, size_t nic_idx) { switch (shmem_internal_bcast_type) { case AUTO: if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } else { shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); } break; case LINEAR: shmem_internal_bcast_linear(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; case TREE: shmem_internal_bcast_tree(target, source, len, PE_root, PE_start, - PE_stride, PE_size, pSync, complete); + PE_stride, PE_size, pSync, complete, + nic_idx); break; default: RAISE_ERROR_MSG("Illegal broadcast type (%d)\n", @@ -150,20 +154,24 @@ shmem_internal_bcast(void *target, const void *source, size_t len, void shmem_internal_op_to_all_linear(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_tree(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, 
shm_internal_datatype_t datatype, + size_t nic_idx); void shmem_internal_op_to_all_recdbl_sw(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, - shm_internal_op_t op, shm_internal_datatype_t datatype); + shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx); static inline void @@ -171,7 +179,7 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, size_t type_size, int PE_start, int PE_stride, int PE_size, void *pWrk, long *pSync, shm_internal_op_t op, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(type_size > 0); @@ -181,21 +189,21 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (PE_size < shmem_internal_params.COLL_CROSSOVER) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } } else { if (count * type_size < shmem_internal_params.COLL_SIZE_CROSSOVER) shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); else shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; @@ -203,33 +211,33 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_linear(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, 
type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RING: shmem_internal_op_to_all_ring(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; case TREE: if (shmem_transport_atomic_supported(op, datatype)) { shmem_internal_op_to_all_tree(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } else { shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); } break; case RECDBL: shmem_internal_op_to_all_recdbl_sw(target, source, count, type_size, PE_start, PE_stride, PE_size, - pWrk, pSync, op, datatype); + pWrk, pSync, op, datatype, nic_idx); break; default: RAISE_ERROR_MSG("Illegal reduction type (%d)\n", @@ -239,21 +247,23 @@ shmem_internal_op_to_all(void *target, const void *source, size_t count, void shmem_internal_collect_linear(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_collect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_collect_type) { case AUTO: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_collect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; default: RAISE_ERROR_MSG("Illegal collect type (%d)\n", @@ -263,37 +273,41 @@ shmem_internal_collect(void *target, const void *source, size_t len, void shmem_internal_fcollect_linear(void 
*target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_ring(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_fcollect_recdbl(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); static inline void shmem_internal_fcollect(void *target, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync) + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx) { switch (shmem_internal_fcollect_type) { case AUTO: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case LINEAR: shmem_internal_fcollect_linear(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RING: shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); break; case RECDBL: if (0 == (PE_size & (PE_size - 1))) { shmem_internal_fcollect_recdbl(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } else { shmem_internal_fcollect_ring(target, source, len, PE_start, PE_stride, - PE_size, pSync); + PE_size, pSync, nic_idx); } break; default: @@ -304,9 +318,11 @@ shmem_internal_fcollect(void *target, const void *source, size_t len, void shmem_internal_alltoall(void *dest, const void *source, size_t len, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); void shmem_internal_alltoalls(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t 
elem_size, size_t nelems, - int PE_start, int PE_stride, int PE_size, long *pSync); + int PE_start, int PE_stride, int PE_size, long *pSync, + size_t nic_idx); #endif diff --git a/src/shmem_comm.h b/src/shmem_comm.h index a08ede1db..54ea6ce20 100644 --- a/src/shmem_comm.h +++ b/src/shmem_comm.h @@ -33,7 +33,7 @@ static inline void shmem_internal_put_nb(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, - long *completion) + long *completion, size_t nic_idx) { if (len == 0) return; @@ -41,7 +41,7 @@ shmem_internal_put_nb(shmem_ctx_t ctx, void *target, const void *source, size_t if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { shmem_shr_transport_put(ctx, target, source, len, pe); } else { - shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, completion); + shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, completion, nic_idx); } } @@ -57,7 +57,7 @@ shmem_internal_put_wait(shmem_ctx_t ctx, long *completion) static inline void -shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -65,11 +65,11 @@ shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, siz shmem_shr_transport_put_scalar(ctx, target, source, len, pe); } else { #ifndef DISABLE_OFI_INJECT - shmem_transport_put_scalar((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_put_scalar((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); #else long completion = 0; - shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, &completion); - shmem_internal_put_wait(ctx, &completion); + shmem_transport_put_nb((shmem_transport_ctx_t *)ctx, target, source, len, pe, &completion, nic_idx); + shmem_internal_put_wait(ctx, &completion); #endif } } @@ -77,35 +77,35 @@ 
shmem_internal_put_scalar(shmem_ctx_t ctx, void *target, const void *source, siz static inline void shmem_internal_put_signal_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { if (len == 0) { if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); return; } if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { - shmem_shr_transport_put_signal(ctx, target, source, len, sig_addr, signal, sig_op, pe); + shmem_shr_transport_put_signal(ctx, target, source, len, sig_addr, signal, sig_op, pe, nic_idx); } else { - shmem_transport_put_signal_nbi((shmem_transport_ctx_t *) ctx, target, source, len, sig_addr, signal, sig_op, pe); + shmem_transport_put_signal_nbi((shmem_transport_ctx_t *) ctx, target, source, len, sig_addr, signal, sig_op, pe, nic_idx); } } static inline void -shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { if (len == 0) return; if (shmem_shr_transport_use_write(ctx, target, source, len, pe)) { shmem_shr_transport_put(ctx, target, source, len, pe); } else { - shmem_transport_put_nbi((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_put_nbi((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); } } @@ -113,57 +113,58 @@ shmem_internal_put_nbi(shmem_ctx_t ctx, void *target, const void *source, size_t static inline void shmem_internal_put_ct_nb(shmemx_ct_t ct, void 
*target, const void *source, size_t len, int pe, - long *completion) + long *completion, size_t nic_idx) { /* TODO: add shortcut for on-node-comms */ shmem_transport_put_ct_nb((shmem_transport_ct_t *) - ct, target, source, len, pe, completion); + ct, target, source, len, pe, completion, nic_idx); } static inline void -shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe) +shmem_internal_get(shmem_ctx_t ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { if (len == 0) return; if (shmem_shr_transport_use_read(ctx, target, source, len, pe)) { shmem_shr_transport_get(ctx, target, source, len, pe); } else { - shmem_transport_get((shmem_transport_ctx_t *)ctx, target, source, len, pe); + shmem_transport_get((shmem_transport_ctx_t *)ctx, target, source, len, pe, nic_idx); } } static inline void -shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, int pe) +shmem_internal_get_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, + int pe, size_t nic_idx) { /* TODO: add shortcut for on-node-comms */ shmem_transport_get_ct((shmem_transport_ct_t *) ct, - target, source, len, pe); + target, source, len, pe, nic_idx); } static inline void -shmem_internal_get_wait(shmem_ctx_t ctx) +shmem_internal_get_wait(shmem_ctx_t ctx, size_t idx) { - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, idx); /* on-node is always blocking, so this is a no-op for them */ } static inline void shmem_internal_swap(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); if (shmem_shr_transport_use_atomic(ctx, target, len, pe, datatype)) { shmem_shr_transport_swap(ctx, target, source, dest, len, pe, datatype); } else { - shmem_transport_swap((shmem_transport_ctx_t *)ctx, target, source, 
dest, len, pe, datatype); + shmem_transport_swap((shmem_transport_ctx_t *)ctx, target, source, dest, len, pe, datatype, nic_idx); } } @@ -172,7 +173,7 @@ static inline void shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -180,7 +181,7 @@ shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, shmem_shr_transport_swap(ctx, target, source, dest, len, pe, datatype); } else { shmem_transport_swap_nbi((shmem_transport_ctx_t *)ctx, target, source, - dest, len, pe, datatype); + dest, len, pe, datatype, nic_idx); } } @@ -188,7 +189,7 @@ shmem_internal_swap_nbi(shmem_ctx_t ctx, void *target, void *source, static inline void shmem_internal_cswap(shmem_ctx_t ctx, void *target, void *source, void *dest, void *operand, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -196,7 +197,7 @@ shmem_internal_cswap(shmem_ctx_t ctx, void *target, void *source, void *dest, vo shmem_shr_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); } else { shmem_transport_cswap((shmem_transport_ctx_t *)ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, datatype, nic_idx); } } @@ -205,7 +206,7 @@ static inline void shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -213,7 +214,7 @@ shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, shmem_shr_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); } else { shmem_transport_cswap_nbi((shmem_transport_ctx_t *)ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, 
datatype, nic_idx); } } @@ -221,7 +222,7 @@ shmem_internal_cswap_nbi(shmem_ctx_t ctx, void *target, void *source, static inline void shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, void *mask, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -229,7 +230,7 @@ shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, vo shmem_shr_transport_mswap(ctx, target, source, dest, mask, len, pe, datatype); } else { shmem_transport_mswap((shmem_transport_ctx_t *)ctx, target, source, - dest, mask, len, pe, datatype); + dest, mask, len, pe, datatype, nic_idx); } } @@ -237,7 +238,8 @@ shmem_internal_mswap(shmem_ctx_t ctx, void *target, void *source, void *dest, vo static inline void shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, + size_t nic_idx) { shmem_internal_assert(len > 0); @@ -249,11 +251,11 @@ shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t the CXI provider */ unsigned long long tmp_fetch = 0; shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, &tmp_fetch, len, pe, op, datatype); - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + source, &tmp_fetch, len, pe, op, datatype, nic_idx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, nic_idx); #else shmem_transport_atomic((shmem_transport_ctx_t *)ctx, target, source, - len, pe, op, datatype); + len, pe, op, datatype, nic_idx); #endif } } @@ -262,7 +264,7 @@ shmem_internal_atomic(shmem_ctx_t ctx, void *target, const void *source, size_t static inline void shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, 
size_t nic_idx) { shmem_internal_assert(len > 0); @@ -270,7 +272,7 @@ shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, s shmem_shr_transport_atomic_fetch(ctx, target, source, len, pe, datatype); } else { shmem_transport_atomic_fetch((shmem_transport_ctx_t *)ctx, target, - source, len, pe, datatype); + source, len, pe, datatype, nic_idx); } } @@ -278,7 +280,7 @@ shmem_internal_atomic_fetch(shmem_ctx_t ctx, void *target, const void *source, s static inline void shmem_internal_atomic_set(shmem_ctx_t ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -290,11 +292,11 @@ shmem_internal_atomic_set(shmem_ctx_t ctx, void *target, const void *source, siz the CXI provider */ unsigned long long tmp_fetch = 0; shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, &tmp_fetch, len, pe, FI_ATOMIC_WRITE, datatype); - shmem_transport_get_wait((shmem_transport_ctx_t *)ctx); + source, &tmp_fetch, len, pe, FI_ATOMIC_WRITE, datatype, nic_idx); + shmem_transport_get_wait((shmem_transport_ctx_t *)ctx, nic_idx); #else shmem_transport_atomic_set((shmem_transport_ctx_t *)ctx, target, - source, len, pe, datatype); + source, len, pe, datatype, nic_idx); #endif } } @@ -304,7 +306,7 @@ static inline void shmem_internal_fetch_atomic(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, shm_internal_op_t op, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -313,7 +315,7 @@ shmem_internal_fetch_atomic(shmem_ctx_t ctx, void *target, void *source, void *d op, datatype); } else { shmem_transport_fetch_atomic((shmem_transport_ctx_t *)ctx, target, - source, dest, len, pe, op, datatype); + source, dest, len, pe, op, datatype, nic_idx); } } @@ -322,7 +324,7 @@ static inline void shmem_internal_atomicv(shmem_ctx_t ctx, 
void *target, const void *source, size_t len, int pe, shm_internal_op_t op, - shm_internal_datatype_t datatype, long *completion) + shm_internal_datatype_t datatype, long *completion, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -335,14 +337,14 @@ shmem_internal_atomicv(shmem_ctx_t ctx, void *target, const void *source, for (size_t i = 0; i < count; i++) { shmem_internal_fetch_atomic(ctx, ((uint8_t *) target) + (i * type_size), ((uint8_t *) source) + (i * type_size), &tmp_fetch, type_size, - pe, op, datatype); + pe, op, datatype, nic_idx); } #else if (shmem_shr_transport_use_atomic(ctx, target, len, pe, datatype)) { shmem_shr_transport_atomicv(ctx, target, source, len, pe, op, datatype); } else { shmem_transport_atomicv((shmem_transport_ctx_t *)ctx, target, source, len, - pe, op, datatype, completion); + pe, op, datatype, completion, nic_idx); } #endif } @@ -352,7 +354,7 @@ static inline void shmem_internal_fetch_atomic_nbi(shmem_ctx_t ctx, void *target, void *source, void *dest, size_t len, int pe, - shm_internal_op_t op, shm_internal_datatype_t datatype) + shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { shmem_internal_assert(len > 0); @@ -361,7 +363,7 @@ shmem_internal_fetch_atomic_nbi(shmem_ctx_t ctx, void *target, void *source, op, datatype); } else { shmem_transport_fetch_atomic_nbi((shmem_transport_ctx_t *)ctx, target, - source, dest, len, pe, op, datatype); + source, dest, len, pe, op, datatype, nic_idx); } } @@ -403,7 +405,7 @@ void shmem_internal_ct_wait(shmemx_ct_t ct, long wait_for) /* Uses internal put for external heap config; otherwise memcpy */ static inline -void shmem_internal_copy_self(void *dest, const void *source, size_t nelems) +void shmem_internal_copy_self(void *dest, const void *source, size_t nelems, size_t nic_idx) { #ifdef USE_FI_HMEM // "completion" set to 1 to wait for completion of put operation initiated @@ -411,7 +413,7 @@ void shmem_internal_copy_self(void *dest, const void *source, size_t nelems) 
// to shmem_internal_put_nb. long completion = 1; shmem_internal_put_nb(SHMEM_CTX_DEFAULT, dest, source, nelems, - shmem_internal_my_pe, &completion); + shmem_internal_my_pe, &completion, nic_idx); shmem_internal_put_wait(SHMEM_CTX_DEFAULT, &completion); #else memcpy(dest, source, nelems); diff --git a/src/shmem_internal.h b/src/shmem_internal.h index d6fe359be..83c2ff477 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -186,6 +186,17 @@ extern hwloc_topology_t shmem_internal_topology; } \ } while(0) +#ifdef USE_OFI +#define SHMEM_GET_TRANSMIT_NIC_IDX(idx) \ + do { \ + int rand_int = rand_r(&shmem_internal_rand_seed); \ + double normalized = (double)rand_int / ((double)RAND_MAX + 1.0); \ + idx = (size_t)(normalized * shmem_transport_ofi_num_nics); \ + } while (0) +#else +#define SHMEM_GET_TRANSMIT_NIC_IDX(idx) +#endif + #ifdef ENABLE_ERROR_CHECKING #define SHMEM_ERR_CHECK_INITIALIZED() \ do { \ diff --git a/src/shmem_lock.h b/src/shmem_lock.h index e0c2812ce..ca78e5410 100644 --- a/src/shmem_lock.h +++ b/src/shmem_lock.h @@ -37,7 +37,7 @@ typedef struct lock_t lock_t; static inline void -shmem_internal_clear_lock(long *lockp) +shmem_internal_clear_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, cond, zero = 0, sig = SIGNAL_MASK; @@ -47,8 +47,8 @@ shmem_internal_clear_lock(long *lockp) /* release the lock if I'm the last to try to obtain it */ cond = shmem_internal_my_pe + 1; shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &zero, &curr, &cond, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs?
/* if local PE was not the last to hold the lock, look for the next in line */ if (curr != shmem_internal_my_pe + 1) { @@ -58,8 +58,8 @@ shmem_internal_clear_lock(long *lockp) for (;;) { shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), sizeof(int), shmem_internal_my_pe, - SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? if (NEXT(cur_data) != 0) break; @@ -69,43 +69,43 @@ /* set the signal bit on new lock holder */ shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &sig, &curr, - &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + &sig, sizeof(int), NEXT(cur_data) - 1, SHM_INTERNAL_INT, nic_idx);// Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? } } static inline void -shmem_internal_set_lock(long *lockp) +shmem_internal_set_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, zero = 0, me = shmem_internal_my_pe + 1; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* update last with my value to add me to the queue */ shmem_internal_swap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs?
/* If I wasn't the first, need to add myself to the previous last's next */ if (0 != curr) { int next_mask = NEXT_MASK; shmem_internal_mswap(SHMEM_CTX_DEFAULT, &(lock->data), &me, &curr, - &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + &next_mask, sizeof(int), curr - 1, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? /* now wait for the signal part of data to be non-zero */ for (;;) { int cur_data; shmem_internal_atomic_fetch(SHMEM_CTX_DEFAULT, &cur_data, &(lock->data), - sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? if (SIGNAL(cur_data) != 0) break; @@ -122,20 +122,20 @@ shmem_internal_set_lock(long *lockp) static inline int -shmem_internal_test_lock(long *lockp) +shmem_internal_test_lock(long *lockp, size_t nic_idx) { lock_t *lock = (lock_t*) lockp; int curr, me = shmem_internal_my_pe + 1, zero = 0; /* initialize my elements to zero */ shmem_internal_atomic_set(SHMEM_CTX_DEFAULT, &(lock->data), &zero, - sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT); + sizeof(zero), shmem_internal_my_pe, SHM_INTERNAL_INT, nic_idx); shmem_internal_quiet(SHMEM_CTX_DEFAULT); /* add self to last if and only if the lock is zero (ie, no one has the lock) */ shmem_internal_cswap(SHMEM_CTX_DEFAULT, &(lock->last), &me, &curr, &zero, - sizeof(int), 0, SHM_INTERNAL_INT); - shmem_internal_get_wait(SHMEM_CTX_DEFAULT); + sizeof(int), 0, SHM_INTERNAL_INT, nic_idx); // Multiplex across NICs? + shmem_internal_get_wait(SHMEM_CTX_DEFAULT, nic_idx); // Multiplex across NICs? 
if (0 == curr) { shmem_internal_membar_acquire(); diff --git a/src/shmem_synchronization.h b/src/shmem_synchronization.h index 0270d6d7b..f18fb4072 100644 --- a/src/shmem_synchronization.h +++ b/src/shmem_synchronization.h @@ -99,35 +99,36 @@ shmem_internal_fence(shmem_ctx_t ctx) #define SHMEM_TEST(type, a, b, ret) COMP(type, SYNC_LOAD(a), b, ret) -#define SHMEM_WAIT_POLL(var, value) \ - do { \ - while (SYNC_LOAD(var) == value) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); } \ +#define SHMEM_WAIT_POLL(var, value) \ + do { \ + while (SYNC_LOAD(var) == value) { \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ + } \ } while(0) -#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ - do { \ - int cmpret; \ - \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - while (!cmpret) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); \ - COMP(cond, SYNC_LOAD(var), value, cmpret); \ - } \ +#define SHMEM_WAIT_UNTIL_POLL(var, cond, value) \ + do { \ + int cmpret; \ + \ + COMP(cond, SYNC_LOAD(var), value, cmpret); \ + while (!cmpret) { \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ + COMP(cond, SYNC_LOAD(var), value, cmpret); \ + } \ } while(0) -#define SHMEM_SIGNAL_WAIT_UNTIL_POLL(var, cond, value, sat_value) \ - do { \ - int cmpret; \ - \ - COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ - while (!cmpret) { \ - shmem_transport_probe(); \ - SPINLOCK_BODY(); \ - COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value);\ - } \ +#define SHMEM_SIGNAL_WAIT_UNTIL_POLL(var, cond, value, sat_value) \ + do { \ + int cmpret; \ + \ + COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ + while (!cmpret) { \ + shmem_transport_probe(); \ + SPINLOCK_BODY(); \ + COMP_SIGNAL(cond, SYNC_LOAD(var), value, cmpret, sat_value); \ + } \ } while(0) #define SHMEM_WAIT_BLOCK(var, value) \ diff --git a/src/shmem_team.c b/src/shmem_team.c index ed54fd239..833a03a17 100644 --- a/src/shmem_team.c +++ b/src/shmem_team.c @@ -289,7 +289,7 @@ int 
shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t **new_team) + shmem_internal_team_t **new_team, size_t nic_idx) { *new_team = SHMEM_TEAM_INVALID; @@ -320,7 +320,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int my_pe = shmem_internal_pe_in_active_set(shmem_internal_my_pe, global_PE_start, PE_stride, PE_size); - long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + long *psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_team_t *myteam = NULL; *team_ret_val = 0; *team_ret_val_reduced = 0; @@ -366,7 +366,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE shmem_internal_op_to_all(psync_pool_avail_reduced, psync_pool_avail, N_PSYNC_BYTES, 1, myteam->start, PE_stride, PE_size, NULL, - psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR); + psync, SHM_INTERNAL_BAND, SHM_INTERNAL_UCHAR, nic_idx); /* We cannot release the psync here, because this reduction may not * have been performed on the entire parent team. */ @@ -406,18 +406,18 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE /* This barrier on the parent team eliminates problematic race conditions * during psync allocation between back-to-back team creations. */ - psync = shmem_internal_team_choose_psync(parent_team, SYNC); + psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync); + shmem_internal_barrier(parent_team->start, parent_team->stride, parent_team->size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); - /* This MAX reduction assures all PEs return the same value. 
*/ - psync = shmem_internal_team_choose_psync(parent_team, REDUCE); + /* This MAX reduction assures all PEs return the same value. */ + psync = shmem_internal_team_choose_psync(parent_team, REDUCE, nic_idx); shmem_internal_op_to_all(team_ret_val_reduced, team_ret_val, 1, sizeof(int), parent_team->start, parent_team->stride, parent_team->size, NULL, - psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT); + psync, SHM_INTERNAL_MAX, SHM_INTERNAL_INT, nic_idx); shmem_internal_team_release_psyncs(parent_team, REDUCE); @@ -433,7 +433,7 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, const shmem_team_config_t *yaxis_config, - long yaxis_mask, shmem_internal_team_t **yaxis_team) + long yaxis_mask, shmem_internal_team_t **yaxis_team, size_t nic_idx) { *xaxis_team = SHMEM_TEAM_INVALID; *yaxis_team = SHMEM_TEAM_INVALID; @@ -460,7 +460,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int xsize = (i == num_xteams - 1 && parent_size % xrange) ? parent_size % xrange : xrange; ret = shmem_internal_team_split_strided(parent_team, start, parent_stride, - xsize, xaxis_config, xaxis_mask, &my_xteam); + xsize, xaxis_config, xaxis_mask, &my_xteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of x-axis team %d of %d failed\n", i+1, num_xteams); } @@ -481,7 +482,8 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, int ysize = (remainder && i < remainder) ?
yrange + 1 : yrange; ret = shmem_internal_team_split_strided(parent_team, start, xrange*parent_stride, - ysize, yaxis_config, yaxis_mask, &my_yteam); + ysize, yaxis_config, yaxis_mask, &my_yteam, + nic_idx); if (ret) { RAISE_ERROR_MSG("Creation of y-axis team %d of %d failed\n", i+1, num_yteams); } @@ -493,9 +495,9 @@ int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, } } - long *psync = shmem_internal_team_choose_psync(parent_team, SYNC); + long *psync = shmem_internal_team_choose_psync(parent_team, SYNC, nic_idx); - shmem_internal_barrier(parent_start, parent_stride, parent_size, psync); + shmem_internal_barrier(parent_start, parent_stride, parent_size, psync, nic_idx); shmem_internal_team_release_psyncs(parent_team, SYNC); @@ -535,7 +537,7 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) /* Returns a psync from the given team that can be safely used for the * specified collective operation. */ -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op) +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx) { switch (op) { @@ -556,7 +558,7 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter size_t psync = team->psync_idx * SHMEM_SYNC_SIZE; shmem_internal_sync(team->start, team->stride, team->size, - &shmem_internal_psync_barrier_pool[psync]); + &shmem_internal_psync_barrier_pool[psync], nic_idx); for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; diff --git a/src/shmem_team.h b/src/shmem_team.h index 195730864..bf006c8b6 100644 --- a/src/shmem_team.h +++ b/src/shmem_team.h @@ -58,11 +58,12 @@ int shmem_internal_team_translate_pe(shmem_internal_team_t *src_team, int src_pe int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE_start, int PE_stride, int PE_size, const shmem_team_config_t *config, long config_mask, - shmem_internal_team_t 
**new_team); + shmem_internal_team_t **new_team, size_t nic_idx); int shmem_internal_team_split_2d(shmem_internal_team_t *parent_team, int xrange, const shmem_team_config_t *xaxis_config, long xaxis_mask, shmem_internal_team_t **xaxis_team, - const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team); + const shmem_team_config_t *yaxis_config, long yaxis_mask, shmem_internal_team_t **yaxis_team, + size_t nic_idx); int shmem_internal_team_destroy(shmem_internal_team_t *team); @@ -70,7 +71,7 @@ int shmem_internal_team_create_ctx(shmem_internal_team_t *team, long options, sh int shmem_internal_ctx_get_team(shmem_ctx_t ctx, shmem_internal_team_t **team); -long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op); +long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op, size_t nic_idx); void shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op); diff --git a/src/shr_transport.h4 b/src/shr_transport.h4 index 9379ef2e5..fd7db7633 100644 --- a/src/shr_transport.h4 +++ b/src/shr_transport.h4 @@ -566,7 +566,8 @@ SHMEM_DEFINE_FOR_AMO(SHMEM_DEF_SUM_OP) static inline void shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, + size_t nic_idx) { #if USE_MEMCPY memcpy(target, source, len); @@ -587,10 +588,10 @@ shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, #else if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, 
nic_idx); #endif #elif USE_CMA shmem_transport_cma_put(target, source, len, pe, @@ -600,10 +601,10 @@ shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, /* Using network atomics as CMA does not support atomic operations */ if (sig_op == SHMEM_SIGNAL_ADD) shmem_transport_atomic((shmem_transport_ctx_t *) ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else shmem_transport_atomic_set((shmem_transport_ctx_t *) ctx, sig_addr, &signal, - sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); #else RAISE_ERROR_STR("No path to peer"); #endif diff --git a/src/symmetric_heap_c.c b/src/symmetric_heap_c.c index 30b319ea9..4230c1e6b 100644 --- a/src/symmetric_heap_c.c +++ b/src/symmetric_heap_c.c @@ -295,7 +295,9 @@ shmem_malloc(size_t size) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -313,7 +315,9 @@ shmem_calloc(size_t count, size_t size) ret = dlcalloc(count, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -326,7 +330,9 @@ shmem_free(void *ptr) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); shmem_internal_free(ptr); } @@ -344,7 +350,9 @@ shmem_realloc(void *ptr, size_t size) SHMEM_ERR_CHECK_SYMMETRIC_HEAP(ptr); } - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); SHMEM_MUTEX_LOCK(shmem_internal_mutex_alloc); if (size == 0 && ptr != NULL) { @@ -355,7 +363,8 @@ shmem_realloc(void *ptr, size_t size) } 
SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -376,7 +385,9 @@ shmem_align(size_t alignment, size_t size) ret = dlmemalign(alignment, size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - shmem_internal_barrier_all(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); return ret; } @@ -430,9 +441,11 @@ shmem_malloc_with_hints(size_t size, long hints) ret = dlmalloc(size); SHMEM_MUTEX_UNLOCK(shmem_internal_mutex_alloc); - if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) - shmem_internal_barrier_all(); - + if (!(hints & SHMEMX_MALLOC_NO_BARRIER)) { + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); + shmem_internal_barrier_all(nic_idx); + } return ret; } diff --git a/src/teams_c.c4 b/src/teams_c.c4 index c86065f73..1c0e3aa7f 100644 --- a/src/teams_c.c4 +++ b/src/teams_c.c4 @@ -115,9 +115,12 @@ shmem_team_split_strided(shmem_team_t parent_team, int PE_start, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_strided((shmem_internal_team_t *)parent_team, PE_start, PE_stride, PE_size, config, - config_mask, (shmem_internal_team_t **)new_team); + config_mask, (shmem_internal_team_t **)new_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES @@ -128,11 +131,14 @@ shmem_team_split_2d(shmem_team_t parent_team, int xrange, { SHMEM_ERR_CHECK_INITIALIZED(); + size_t nic_idx = 0; + SHMEM_GET_TRANSMIT_NIC_IDX(nic_idx); return shmem_internal_team_split_2d((shmem_internal_team_t *)parent_team, xrange, xaxis_config, xaxis_mask, (shmem_internal_team_t **)xaxis_team, yaxis_config, yaxis_mask, - (shmem_internal_team_t **)yaxis_team); + (shmem_internal_team_t **)yaxis_team, + nic_idx); } int SHMEM_FUNCTION_ATTRIBUTES diff --git a/src/transport_none.h b/src/transport_none.h index f2a8dfc3a..6d4a9c547 100644 --- a/src/transport_none.h 
+++ b/src/transport_none.h @@ -112,7 +112,7 @@ shmem_transport_fence(shmem_transport_ctx_t* ctx) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -128,14 +128,14 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void -shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) +shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion, size_t nic_idx) { /* No op */ } @@ -143,21 +143,21 @@ shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void -shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { /* Nop */ } @@ -166,7 +166,7 @@ shmem_transport_get_wait(shmem_transport_ctx_t* ctx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t 
datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -174,7 +174,7 @@ shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *sourc static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -183,7 +183,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -192,7 +192,7 @@ static inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -201,7 +201,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -209,7 +209,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -225,7 +225,7 @@ shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, 
size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -233,7 +233,7 @@ shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -241,7 +241,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -249,7 +249,7 @@ shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } @@ -294,14 +294,15 @@ void shmem_transport_ct_wait(shmem_transport_ct_t *ct, long wait_for) static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void - *source, size_t len, int pe, long *completion) + *source, size_t len, int pe, long *completion, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void - *target, const void *source, size_t len, int pe) + *target, const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 
1fdd9fbc0..7a4451eb5 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -52,6 +52,9 @@ #include "runtime.h" #include "uthash.h" +struct fi_info **provider_list = NULL; +size_t shmem_transport_ofi_num_nics = 0; + struct fabric_info { struct fi_info *fabrics; struct fi_info *p_info; @@ -386,7 +389,7 @@ struct shmem_transport_ofi_stx_t { int is_private; }; typedef struct shmem_transport_ofi_stx_t shmem_transport_ofi_stx_t; -static shmem_transport_ofi_stx_t* shmem_transport_ofi_stx_pool = NULL; +static shmem_transport_ofi_stx_t** shmem_transport_ofi_stx_pool = NULL; struct shmem_transport_ofi_stx_kvs_t { int stx_idx; @@ -397,7 +400,7 @@ typedef struct shmem_transport_ofi_stx_kvs_t shmem_transport_ofi_stx_kvs_t; static shmem_transport_ofi_stx_kvs_t* shmem_transport_ofi_stx_kvs = NULL; static inline -void shmem_transport_ofi_dump_stx(void) { +void shmem_transport_ofi_dump_stx(size_t idx) { char stx_str[256]; int i, offset; @@ -407,8 +410,8 @@ void shmem_transport_ofi_dump_stx(void) { for (i = offset = 0; i < shmem_transport_ofi_stx_max; i++) offset += snprintf(stx_str+offset, 256-offset, (i == shmem_transport_ofi_stx_max-1) ? "%ld%s" : "%ld%s ", - shmem_transport_ofi_stx_pool[i].ref_cnt, - shmem_transport_ofi_stx_pool[i].is_private ? "P" : "S"); + shmem_transport_ofi_stx_pool[idx][i].ref_cnt, + shmem_transport_ofi_stx_pool[idx][i].is_private ? 
"P" : "S"); DEBUG_MSG("STX[%ld] = [ %s ]\n", shmem_transport_ofi_stx_max, stx_str); } @@ -432,13 +435,13 @@ void shmem_transport_ofi_stx_rand_init(void) { } static inline -int shmem_transport_ofi_stx_search_unused(void) +int shmem_transport_ofi_stx_search_unused(size_t idx) { int stx_idx = -1, i; for (i = 0; i < shmem_transport_ofi_stx_max; i++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt == 0) { - shmem_internal_assert(!shmem_transport_ofi_stx_pool[i].is_private); + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt == 0) { + shmem_internal_assert(!shmem_transport_ofi_stx_pool[idx][i].is_private); stx_idx = i; break; } @@ -449,7 +452,7 @@ int shmem_transport_ofi_stx_search_unused(void) static inline -int shmem_transport_ofi_stx_search_shared(long threshold) +int shmem_transport_ofi_stx_search_shared(long threshold, size_t idx) { static int rr_start_idx = 0; int stx_idx = -1, i, count; @@ -458,9 +461,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) case ROUNDROBIN: i = rr_start_idx; for (count = 0; count < shmem_transport_ofi_stx_max; count++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[i].ref_cnt <= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[i].is_private) { + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][i].ref_cnt <= threshold || threshold == -1) && + !shmem_transport_ofi_stx_pool[idx][i].is_private) { stx_idx = i; rr_start_idx = (i + 1) % shmem_transport_ofi_stx_max; break; @@ -473,9 +476,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) case RANDOM: for (i = count = 0; i < shmem_transport_ofi_stx_max; i++) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[i].ref_cnt <= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[i].is_private) + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][i].ref_cnt <= threshold || threshold == -1) 
&& + !shmem_transport_ofi_stx_pool[idx][i].is_private) { ++count; break; @@ -489,9 +492,9 @@ int shmem_transport_ofi_stx_search_shared(long threshold) else { do { stx_idx = (int) (rand_r(&rand_pool_seed) / (RAND_MAX + 1.0) * shmem_transport_ofi_stx_max); - } while (!(shmem_transport_ofi_stx_pool[stx_idx].ref_cnt > 0 && - (shmem_transport_ofi_stx_pool[stx_idx].ref_cnt <= threshold || threshold == -1) && - !shmem_transport_ofi_stx_pool[stx_idx].is_private)); + } while (!(shmem_transport_ofi_stx_pool[idx][stx_idx].ref_cnt > 0 && + (shmem_transport_ofi_stx_pool[idx][stx_idx].ref_cnt <= threshold || threshold == -1) && + !shmem_transport_ofi_stx_pool[idx][stx_idx].is_private)); } break; @@ -506,21 +509,23 @@ int shmem_transport_ofi_stx_search_shared(long threshold) static inline -void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) +void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx, size_t idx) { if (shmem_transport_ofi_stx_max == 0) { - ctx->stx_idx = -1; + ctx->stx_idx[idx] = -1; } else if (shmem_transport_ofi_is_private(ctx->options)) { /* SHMEM contexts that are private to the same thread (i.e. have * SHMEM_CTX_PRIVATE option set) share the same STX. */ + // TODO: Should f be an array of shmem_transport_ofi_stx_kvs_t pointers, or single pointer and + // stx_idx field is an array? 
shmem_transport_ofi_stx_kvs_t *f; HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, sizeof(struct shmem_internal_tid), f); if (f) { - shmem_transport_ofi_stx_pool[f->stx_idx].ref_cnt++; - ctx->stx_idx = f->stx_idx; + shmem_transport_ofi_stx_pool[idx][f->stx_idx].ref_cnt++; + ctx->stx_idx[idx] = f->stx_idx; } else { /* No STX allocated to the given TID, attempt to allocate one */ @@ -528,21 +533,21 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) int stx_idx; shmem_transport_ofi_stx_t *stx = NULL; - stx_idx = shmem_transport_ofi_stx_search_unused(); + stx_idx = shmem_transport_ofi_stx_search_unused(idx); /* Couldn't get new STX, assign a shared one */ /* Note: When stx_max > 0, shared STX allocation is always successful */ if (stx_idx < 0) { DEBUG_STR("private STX unavailable, falling back to STX sharing"); is_unused = 0; - stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold); + stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold, idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_shared(-1); + stx_idx = shmem_transport_ofi_stx_search_shared(-1, idx); } shmem_internal_assert(stx_idx >= 0); - stx = &shmem_transport_ofi_stx_pool[stx_idx]; - ctx->stx_idx = stx_idx; + stx = &shmem_transport_ofi_stx_pool[idx][stx_idx]; + ctx->stx_idx[idx] = stx_idx; stx->ref_cnt++; if (is_unused) { @@ -552,7 +557,7 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) RAISE_ERROR_STR("out of memory when allocating STX KVS entry"); } e->tid = ctx->tid; - e->stx_idx = ctx->stx_idx; + e->stx_idx = ctx->stx_idx[idx]; /* FIX? */ HASH_ADD(hh, shmem_transport_ofi_stx_kvs, tid, sizeof(struct shmem_internal_tid), e); } else { @@ -561,20 +566,20 @@ void shmem_transport_ofi_stx_allocate(shmem_transport_ctx_t *ctx) } /* TODO: Optimize this case? 
else if (ctx->options & SHMEM_CTX_SERIALIZED) */ } else { - int stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold); + int stx_idx = shmem_transport_ofi_stx_search_shared(shmem_transport_ofi_stx_threshold, idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_unused(); + stx_idx = shmem_transport_ofi_stx_search_unused(idx); if (stx_idx < 0) - stx_idx = shmem_transport_ofi_stx_search_shared(-1); + stx_idx = shmem_transport_ofi_stx_search_shared(-1, idx); shmem_internal_assert(stx_idx >= 0); - ctx->stx_idx = stx_idx; - shmem_transport_ofi_stx_pool[ctx->stx_idx].ref_cnt++; + ctx->stx_idx[idx] = stx_idx; + shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].ref_cnt++; } - shmem_transport_ofi_dump_stx(); + shmem_transport_ofi_dump_stx(idx); return; } @@ -592,24 +597,24 @@ void init_bounce_buffer(shmem_free_list_item_t *item) static inline -int bind_enable_ep_resources(shmem_transport_ctx_t *ctx) +int bind_enable_ep_resources(shmem_transport_ctx_t *ctx, size_t idx) { int ret = 0; /* If using SOS-managed STXs, bind the STX */ - if (ctx->stx_idx >= 0) { - ret = fi_ep_bind(ctx->ep, &shmem_transport_ofi_stx_pool[ctx->stx_idx].stx->fid, 0); + if (ctx->stx_idx[idx] >= 0) { + ret = fi_ep_bind(ctx->ep[idx], &shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].stx->fid, 0); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind STX to endpoint failed"); } /* Put counter captures completions for non-fetching operations (put, * atomic, etc.) */ - ret = fi_ep_bind(ctx->ep, &ctx->put_cntr->fid, FI_WRITE); + ret = fi_ep_bind(ctx->ep[idx], &ctx->put_cntr[idx]->fid, FI_WRITE); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind put CNTR to endpoint failed"); /* Get counter captures completions for fetching operations (get, * fetch-atomic, etc.) 
*/ - ret = fi_ep_bind(ctx->ep, &ctx->get_cntr->fid, FI_READ); + ret = fi_ep_bind(ctx->ep[idx], &ctx->get_cntr[idx]->fid, FI_READ); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind get CNTR to endpoint failed"); /* In addition to incrementing the put counter, bounce buffered puts and @@ -622,14 +627,15 @@ int bind_enable_ep_resources(shmem_transport_ctx_t *ctx) * removed below. However, there aren't currently any cases where removing * FI_RECV significantly improves performance or resource usage. */ - ret = fi_ep_bind(ctx->ep, &ctx->cq->fid, + ret = fi_ep_bind(ctx->ep[idx], &ctx->cq[idx]->fid, FI_SELECTIVE_COMPLETION | FI_TRANSMIT | FI_RECV); OFI_CHECK_RETURN_STR(ret, "fi_ep_bind CQ to endpoint failed"); - ret = fi_ep_bind(ctx->ep, &shmem_transport_ofi_avfd->fid, 0); - OFI_CHECK_RETURN_STR(ret, "fi_ep_bind AV to endpoint failed"); + ret = fi_ep_bind(ctx->ep[idx], /*&shmem_transport_ofi_avfd->fid*/ &ctx->av[idx]->fid, 0); /* Currently failing */ + //OFI_CHECK_RETURN_STR(ret, "fi_ep_bind AV to endpoint failed"); + OFI_CHECK_RETURN_MSG(ret, "fi_ep_bind AV to endpoint failed(%s)\n", fi_strerror(errno)); - ret = fi_enable(ctx->ep); + ret = fi_enable(ctx->ep[idx]); OFI_CHECK_RETURN_STR(ret, "fi_enable on endpoint failed"); return ret; @@ -872,14 +878,14 @@ int publish_external_mr_info(void) #endif static -int publish_mr_info(void) +int publish_mr_info(struct fi_info *info) { #ifndef ENABLE_MR_SCALABLE { int err; uint64_t heap_key, data_key; - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_PROV_KEY) { + if (info->domain_attr->mr_mode & FI_MR_PROV_KEY) { heap_key = fi_mr_key(shmem_transport_ofi_target_heap_mrfd); data_key = fi_mr_key(shmem_transport_ofi_target_data_mrfd); } else { @@ -901,7 +907,7 @@ int publish_mr_info(void) } #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) + if (info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) shmem_transport_ofi_use_absolute_address = 1; else 
shmem_transport_ofi_use_absolute_address = 0; @@ -910,7 +916,7 @@ int publish_mr_info(void) int err; void *heap_base, *data_base; - if (shmem_transport_ofi_info.p_info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) { + if (info->domain_attr->mr_mode & FI_MR_VIRT_ADDR) { heap_base = shmem_internal_heap_base; data_base = shmem_internal_data_base; } else { @@ -1098,7 +1104,7 @@ int atomicvalid_rtncheck(int ret, int atomic_size, static inline int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, - atomic_support_lv atomic_sup) + atomic_support_lv atomic_sup, size_t idx) { int i, j; size_t atomic_size; @@ -1106,7 +1112,7 @@ int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1120,7 +1126,7 @@ int atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, static inline int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, - int *OPS, atomic_support_lv atomic_sup) + int *OPS, atomic_support_lv atomic_sup, size_t idx) { int i, j; size_t atomic_size; @@ -1128,7 +1134,7 @@ int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_compare_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_compare_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1142,7 +1148,7 @@ int compare_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, static inline int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, - atomic_support_lv atomic_sup) + atomic_support_lv atomic_sup, size_t 
idx) { int i, j; size_t atomic_size; @@ -1150,7 +1156,7 @@ int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, for (i = 0; i < DT_MAX; i++) { for (j = 0; j < OPS_MAX; j++) { int dt = SHMEM_TRANSPORT_DTYPE(DT[i]); - int ret = fi_fetch_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_fetch_atomicvalid(shmem_transport_ctx_default.ep[idx], dt, OPS[j], &atomic_size); if (atomicvalid_rtncheck(ret, atomic_size, atomic_sup, SHMEM_OpName[OPS[j]], @@ -1163,7 +1169,7 @@ int fetch_atomicvalid_DTxOP(int DT_MAX, int OPS_MAX, int *DT, int *OPS, } static inline -int atomic_limitations_check(void) +int atomic_limitations_check(size_t idx) { /* Retrieve messaging limitations from OFI * @@ -1182,54 +1188,54 @@ int atomic_limitations_check(void) /* Standard OPS check */ ret = atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_OPS, DT_AMO_STANDARD, - AMO_STANDARD_OPS, general_atomic_sup); + AMO_STANDARD_OPS, general_atomic_sup, idx); if (ret) return ret; ret = fetch_atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_FOPS, DT_AMO_STANDARD, FETCH_AMO_STANDARD_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; ret = compare_atomicvalid_DTxOP(SIZEOF_AMO_DT, SIZEOF_AMO_COPS, DT_AMO_STANDARD, COMPARE_AMO_STANDARD_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; /* Extended OPS check */ ret = atomicvalid_DTxOP(SIZEOF_AMO_EX_DT, SIZEOF_AMO_EX_OPS, DT_AMO_EXTENDED, - AMO_EXTENDED_OPS, general_atomic_sup); + AMO_EXTENDED_OPS, general_atomic_sup, idx); if (ret) return ret; ret = fetch_atomicvalid_DTxOP(SIZEOF_AMO_EX_DT, SIZEOF_AMO_EX_FOPS, DT_AMO_EXTENDED, FETCH_AMO_EXTENDED_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; /* Reduction OPS check */ ret = atomicvalid_DTxOP(SIZEOF_RED_DT, SIZEOF_RED_OPS, DT_REDUCE_BITWISE, - REDUCE_BITWISE_OPS, reduction_sup); + REDUCE_BITWISE_OPS, reduction_sup, idx); if (ret) return ret; ret = atomicvalid_DTxOP(SIZEOF_REDC_DT, SIZEOF_REDC_OPS, DT_REDUCE_COMPARE, - 
REDUCE_COMPARE_OPS, reduction_sup); + REDUCE_COMPARE_OPS, reduction_sup, idx); if (ret) return ret; ret = atomicvalid_DTxOP(SIZEOF_REDA_DT, SIZEOF_REDA_OPS, DT_REDUCE_ARITH, - REDUCE_ARITH_OPS, reduction_sup); + REDUCE_ARITH_OPS, reduction_sup, idx); if (ret) return ret; /* Internal atomic requirement */ ret = compare_atomicvalid_DTxOP(SIZEOF_INTERNAL_REQ_DT, SIZEOF_INTERNAL_REQ_OPS, DT_INTERNAL_REQ, INTERNAL_REQ_OPS, - general_atomic_sup); + general_atomic_sup, idx); if (ret) return ret; @@ -1291,6 +1297,19 @@ int populate_av(void) return ret; } + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + ret = fi_av_insert(shmem_transport_ctx_default.av[idx], + alladdrs, + shmem_internal_num_pes, + addr_table, + 0, + NULL); + if (ret != shmem_internal_num_pes) { + RAISE_WARN_STR("av insert failed"); + return ret; + } + } + free(alladdrs); return 0; @@ -1356,6 +1375,11 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p ret = hwloc_get_proc_last_cpu_location(shmem_internal_topology, getpid(), bindset, HWLOC_CPUBIND_PROCESS); if (ret < 0) { RAISE_WARN_MSG("hwloc_get_proc_last_cpu_location failed (%s)\n", strerror(errno)); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return provs[shmem_internal_my_pe % num_nics]; } @@ -1371,11 +1395,21 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p hwloc_obj_t io_device = hwloc_get_pcidev_by_busid(shmem_internal_topology, pci.domain_id, pci.bus_id, pci.device_id, pci.function_id); if (!io_device) { RAISE_WARN_MSG("hwloc_get_pcidev_by_busid failed\n"); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return 
provs[shmem_internal_my_pe % num_nics]; }; hwloc_obj_t first_non_io = hwloc_get_non_io_ancestor_obj(shmem_internal_topology, io_device); if (!first_non_io) { RAISE_WARN_MSG("hwloc_get_non_io_ancestor_obj failed\n"); + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; return provs[shmem_internal_my_pe % num_nics]; } @@ -1392,7 +1426,11 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p if (!close_provs) { RAISE_WARN_MSG("Could not detect any NICs with affinity to the process\n"); - + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = provs[idx]; + } + shmem_transport_ofi_num_nics = num_nics; /* If no 'close' NICs, select from list of all NICs using round-robin assignment */ return provs[shmem_internal_my_pe % num_nics]; } @@ -1400,16 +1438,17 @@ struct fi_info *assign_nic_with_hwloc(struct fi_info *fabric, struct fi_info **p last_added->next = NULL; int idx = 0; - struct fi_info **prov_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); + provider_list = (struct fi_info **) malloc(num_close_nics * sizeof(struct fi_info *)); for (struct fi_info *cur_fabric = close_provs; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + provider_list[idx++] = cur_fabric; } hwloc_bitmap_free(bindset); - struct fi_info *provider = prov_list[shmem_internal_my_pe % num_close_nics]; - free(prov_list); + struct fi_info *provider = provider_list[shmem_internal_my_pe % num_close_nics]; + //free(prov_list); + shmem_transport_ofi_num_nics = num_close_nics; return provider; } #endif @@ -1565,7 +1604,10 @@ int query_for_fabric(struct fabric_info *info) info->p_info = NULL; if (shmem_internal_params.OFI_DISABLE_MULTIRAIL) { + provider_list = (struct 
fi_info **) malloc(sizeof(struct fi_info *)); + provider_list[0] = fabrics_list_head; info->p_info = fabrics_list_head; + shmem_transport_ofi_num_nics = 1; } else { /* Generate a linked list of all fabrics with a non-null nic value */ @@ -1581,26 +1623,34 @@ int query_for_fabric(struct fabric_info *info) if (multirail_fabric_list_tail) multirail_fabric_list_tail->next = NULL; if (num_nics == 0) { + provider_list = (struct fi_info **) malloc(sizeof(struct fi_info *)); + provider_list[0] = fallback; info->p_info = fallback; + shmem_transport_ofi_num_nics = 1; } else { int idx = 0; - struct fi_info **prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + struct fi_info **sorted_prov_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); for (struct fi_info *cur_fabric = multirail_fabric_list_head; cur_fabric; cur_fabric = cur_fabric->next) { - prov_list[idx++] = cur_fabric; + sorted_prov_list[idx++] = cur_fabric; } - qsort(prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); + qsort(sorted_prov_list, num_nics, sizeof(struct fi_info *), compare_nic_names); #ifdef USE_HWLOC - info->p_info = assign_nic_with_hwloc(info->p_info, prov_list, num_nics); + info->p_info = assign_nic_with_hwloc(info->p_info, sorted_prov_list, num_nics); #else /* Round-robin assignment of NICs to PEs * FIXME: A more suitable indexing value would be * shmem_team_my_pe(SHMEM_TEAM_NODE) % num_nics, but it is too early in initialization to * do that here. We would also want to replace the similar occurrences in the * assign_nic_with_hwloc function. 
*/ - info->p_info = prov_list[shmem_internal_my_pe % num_nics]; + provider_list = (struct fi_info **) malloc(num_nics * sizeof(struct fi_info *)); + for (size_t idx = 0; idx < num_nics; idx++) { + provider_list[idx] = sorted_prov_list[idx]; + } + info->p_info = provider_list[shmem_internal_my_pe % num_nics]; + shmem_transport_ofi_num_nics = num_nics; #endif - free(prov_list); + //free(prov_list); //Add free(provider_list) to cleanup } } if (NULL == info->p_info) { @@ -1734,37 +1784,87 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) struct fabric_info* info = &shmem_transport_ofi_info; - info->p_info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? FI_SHARED_CONTEXT : 0; - info->p_info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; - info->p_info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; - info->p_info->mode = 0; - info->p_info->tx_attr->mode = 0; - info->p_info->rx_attr->mode = 0; - info->p_info->tx_attr->caps = info->p_info->caps; - info->p_info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; + // Need to do these steps for all providers in provider_list? + //info->p_info->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? 
FI_SHARED_CONTEXT : 0; + //info->p_info->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; + //info->p_info->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + //info->p_info->mode = 0; + //info->p_info->tx_attr->mode = 0; + //info->p_info->rx_attr->mode = 0; + //info->p_info->tx_attr->caps = info->p_info->caps; + //info->p_info->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; ctx->id = id; + ctx->fabric = (struct fid_fabric **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_fabric *)); + ctx->domain = (struct fid_domain **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_domain *)); + ctx->av = (struct fid_av **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_av *)); + ctx->ep = (struct fid_ep **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_ep *)); + ctx->put_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); + ctx->get_cntr = (struct fid_cntr **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cntr *)); +#ifdef USE_CTX_LOCK + ctx->pending_put_cntr = (uint64_t *) malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctx->pending_get_cntr = (uint64_t *) malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); +#else + ctx->pending_put_cntr = (shmem_internal_cntr_t *) malloc(shmem_transport_ofi_num_nics * sizeof(shmem_internal_cntr_t)); + ctx->pending_get_cntr = (shmem_internal_cntr_t *) malloc(shmem_transport_ofi_num_nics * sizeof(shmem_internal_cntr_t)); +#endif + ctx->cq = (struct fid_cq **) malloc(shmem_transport_ofi_num_nics * sizeof(struct fid_cq *)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { +#ifdef USE_CTX_LOCK + ctx->pending_put_cntr[idx] = 0; + ctx->pending_get_cntr[idx] = 0; +#else + shmem_internal_cntr_write(&ctx->pending_put_cntr[idx], 0); + shmem_internal_cntr_write(&ctx->pending_get_cntr[idx], 0); +#endif + provider_list[idx]->ep_attr->tx_ctx_cnt = shmem_transport_ofi_stx_max > 0 ? 
FI_SHARED_CONTEXT : 0; + provider_list[idx]->caps = FI_RMA | FI_WRITE | FI_READ | FI_ATOMIC | FI_RECV; + provider_list[idx]->tx_attr->op_flags = FI_DELIVERY_COMPLETE; + provider_list[idx]->mode = 0; + provider_list[idx]->tx_attr->mode = 0; + provider_list[idx]->rx_attr->mode = 0; + provider_list[idx]->tx_attr->caps = provider_list[idx]->caps; + provider_list[idx]->rx_attr->caps = FI_RECV; /* to drive progress on the CQ */; #ifdef USE_CTX_LOCK SHMEM_MUTEX_INIT(ctx->lock); #endif + ret = fi_fabric(provider_list[idx]->fabric_attr, &ctx->fabric[idx], NULL); + OFI_CHECK_RETURN_STR(ret, "fabric initialization failed"); - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_put_attr, - &ctx->put_cntr, NULL); - OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); + ret = fi_domain(/*shmem_transport_ofi_fabfd*/ ctx->fabric[idx], provider_list[idx], + &ctx->domain[idx], NULL); + OFI_CHECK_RETURN_STR(ret, "domain initialization failed"); - ret = fi_cntr_open(shmem_transport_ofi_domainfd, &cntr_get_attr, - &ctx->get_cntr, NULL); - OFI_CHECK_RETURN_MSG(ret, "get_cntr creation failed (%s)\n", fi_strerror(errno)); + struct fi_av_attr av_attr = {0}; +#ifdef USE_AV_MAP + av_attr.type = FI_AV_MAP; +#else + av_attr.type = FI_AV_TABLE; +#endif + ret = fi_av_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], + &av_attr, + /*&shmem_transport_ofi_avfd*/ &ctx->av[idx], + NULL); + OFI_CHECK_RETURN_STR(ret, "AV creation failed"); - ret = fi_cq_open(shmem_transport_ofi_domainfd, &cq_attr, &ctx->cq, NULL); - if (ret && errno == FI_EMFILE) { - DEBUG_STR("Context creation failed because of open files limit, consider increasing with 'ulimit' command"); - } - OFI_CHECK_RETURN_MSG(ret, "cq_open failed (%s)\n", fi_strerror(errno)); + ret = fi_cntr_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cntr_put_attr, + &ctx->put_cntr[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "put_cntr creation failed (%s)\n", fi_strerror(errno)); - ret = 
fi_endpoint(shmem_transport_ofi_domainfd, - info->p_info, &ctx->ep, NULL); - OFI_CHECK_RETURN_MSG(ret, "ep creation failed (%s)\n", fi_strerror(errno)); + ret = fi_cntr_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cntr_get_attr, + &ctx->get_cntr[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "get_cntr creation failed (%s)\n", fi_strerror(errno)); + + ret = fi_cq_open(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], &cq_attr, &ctx->cq[idx], NULL); + if (ret && errno == FI_EMFILE) { + DEBUG_STR("Context creation failed because of open files limit, consider increasing with 'ulimit' command"); + } + OFI_CHECK_RETURN_MSG(ret, "cq_open failed (%s)\n", fi_strerror(errno)); + + ret = fi_endpoint(/*shmem_transport_ofi_domainfd*/ ctx->domain[idx], + /*info->p_info*/ provider_list[idx], &ctx->ep[idx], NULL); + OFI_CHECK_RETURN_MSG(ret, "ep creation failed (%s)\n", fi_strerror(errno)); + } /* TODO: Fill in TX attr */ @@ -1773,11 +1873,12 @@ static int shmem_transport_ofi_ctx_init(shmem_transport_ctx_t *ctx, int id) shmem_transport_ofi_is_private(ctx->options)) { ctx->tid = shmem_transport_ofi_gettid(); } - shmem_transport_ofi_stx_allocate(ctx); - - ret = bind_enable_ep_resources(ctx); - OFI_CHECK_RETURN_MSG(ret, "context bind/enable endpoint failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ofi_stx_allocate(ctx, idx); + ret = bind_enable_ep_resources(ctx, idx); + OFI_CHECK_RETURN_MSG(ret, "context bind/enable endpoint failed (%s)\n", fi_strerror(errno)); + } if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER && shmem_transport_ofi_bounce_buffer_size > 0 && shmem_transport_ofi_max_bounce_buffers > 0) @@ -1892,7 +1993,7 @@ int shmem_transport_init(void) ret = shmem_transport_ofi_target_ep_init(); if (ret != 0) return ret; - ret = publish_mr_info(); + ret = publish_mr_info(shmem_transport_ofi_info.p_info); if (ret != 0) return ret; ret = publish_av_info(&shmem_transport_ofi_info); @@ -1906,72 +2007,83 @@ 
int shmem_transport_startup(void) int ret; int i; - if (shmem_internal_params.OFI_STX_AUTO && shmem_transport_ofi_stx_max == 0) { - RAISE_WARN_STR("STXs disabled, ignoring request for automatic STX management"); + shmem_transport_ofi_stx_pool = (shmem_transport_ofi_stx_t **) malloc(shmem_transport_ofi_num_nics * + sizeof(shmem_transport_ofi_stx_t *)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ofi_stx_pool[idx] = NULL; } - else if (shmem_internal_params.OFI_STX_AUTO) { + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (shmem_internal_params.OFI_STX_AUTO && shmem_transport_ofi_stx_max == 0) { + RAISE_WARN_STR("STXs disabled, ignoring request for automatic STX management"); + } + else if (shmem_internal_params.OFI_STX_AUTO) { + long ofi_tx_ctx_cnt = /*shmem_transport_ofi_info.fabrics*/provider_list[idx]->domain_attr->tx_ctx_cnt; + int num_on_node = shmem_runtime_get_node_size(); - long ofi_tx_ctx_cnt = shmem_transport_ofi_info.fabrics->domain_attr->tx_ctx_cnt; - int num_on_node = shmem_runtime_get_node_size(); + if (shmem_internal_params.OFI_STX_MAX_provided) { + RAISE_WARN_MSG("Auto-setting STX_MAX; ignoring provided STX_MAX value '%ld'\n", + shmem_internal_params.OFI_STX_MAX); + } - if (shmem_internal_params.OFI_STX_MAX_provided) { - RAISE_WARN_MSG("Auto-setting STX_MAX; ignoring provided STX_MAX value '%ld'\n", - shmem_internal_params.OFI_STX_MAX); - } + if (ofi_tx_ctx_cnt <= 0) + RAISE_ERROR_MSG("Invalid number of TX contexts (%ld)\n", ofi_tx_ctx_cnt); + + /* Paritition TX resources evenly across node-local PEs */ + /* Note: we assume that the domain reports the same tx_ctx_cnt for + * every PE on the node. We also assume that the resource reported + * should be divided equally among all PEs. These assumptions may not + * be valid in all cases, for example when the provider has already + * partitioned resources or when a node has multiple NICs. 
*/ + shmem_transport_ofi_stx_max = ofi_tx_ctx_cnt / num_on_node; + int remainder = ofi_tx_ctx_cnt % num_on_node; + int node_pe = shmem_internal_my_pe % shmem_internal_num_pes; + if (remainder > 0 && ((node_pe % num_on_node) < remainder)) { + shmem_transport_ofi_stx_max++; + } - if (ofi_tx_ctx_cnt <= 0) - RAISE_ERROR_MSG("Invalid number of TX contexts (%ld)\n", ofi_tx_ctx_cnt); - - /* Paritition TX resources evenly across node-local PEs */ - /* Note: we assume that the domain reports the same tx_ctx_cnt for - * every PE on the node. We also assume that the resource reported - * should be divided equally among all PEs. These assumptions may not - * be valid in all cases, for example when the provider has already - * partitioned resources or when a node has multiple NICs. */ - shmem_transport_ofi_stx_max = ofi_tx_ctx_cnt / num_on_node; - int remainder = ofi_tx_ctx_cnt % num_on_node; - int node_pe = shmem_internal_my_pe % shmem_internal_num_pes; - if (remainder > 0 && ((node_pe % num_on_node) < remainder)) { - shmem_transport_ofi_stx_max++; - } + if (shmem_transport_ofi_stx_max <= 0) + RAISE_ERROR_MSG("Not enough TX contexts (%d)\n", num_on_node); - if (shmem_transport_ofi_stx_max <= 0) - RAISE_ERROR_MSG("Not enough TX contexts (%d)\n", num_on_node); + /* When running more PEs than available STXs, must assign each PE at least 1 */ + if (shmem_transport_ofi_stx_max <= 0) { + shmem_transport_ofi_stx_max = 1; + RAISE_WARN_MSG("Need at least 1 STX per PE, but detected %ld available STXs for %d PEs\n", + ofi_tx_ctx_cnt, num_on_node); + } - /* When running more PEs than available STXs, must assign each PE at least 1 */ - if (shmem_transport_ofi_stx_max <= 0) { - shmem_transport_ofi_stx_max = 1; - RAISE_WARN_MSG("Need at least 1 STX per PE, but detected %ld available STXs for %d PEs\n", - ofi_tx_ctx_cnt, num_on_node); + DEBUG_MSG("Auto-set STX max to %ld\n", shmem_transport_ofi_stx_max); } - DEBUG_MSG("Auto-set STX max to %ld\n", shmem_transport_ofi_stx_max); - } - - /* 
Allocate STX array with max length */ - if (shmem_transport_ofi_stx_max > 0) { - shmem_transport_ofi_stx_pool = malloc(shmem_transport_ofi_stx_max * - sizeof(shmem_transport_ofi_stx_t)); - if (shmem_transport_ofi_stx_pool == NULL) { - RAISE_ERROR_STR("Out of memory when allocating OFI STX pool"); + /* Allocate STX array with max length */ + if (shmem_transport_ofi_stx_max > 0) { + shmem_transport_ofi_stx_pool[idx] = malloc(shmem_transport_ofi_stx_max * + sizeof(shmem_transport_ofi_stx_t)); + if (shmem_transport_ofi_stx_pool == NULL) { + RAISE_ERROR_STR("Out of memory when allocating OFI STX pool"); + } } - } - for (i = 0; i < shmem_transport_ofi_stx_max; i++) { - ret = fi_stx_context(shmem_transport_ofi_domainfd, NULL, - &shmem_transport_ofi_stx_pool[i].stx, NULL); - OFI_CHECK_RETURN_MSG(ret, "STX context creation failed (%s)\n", fi_strerror(ret)); - shmem_transport_ofi_stx_pool[i].ref_cnt = 0; - shmem_transport_ofi_stx_pool[i].is_private = 0; + for (i = 0; i < shmem_transport_ofi_stx_max; i++) { + ret = fi_stx_context(shmem_transport_ofi_domainfd, NULL, + &shmem_transport_ofi_stx_pool[idx][i].stx, NULL); + OFI_CHECK_RETURN_MSG(ret, "STX context creation failed (%s)\n", fi_strerror(ret)); + shmem_transport_ofi_stx_pool[idx][i].ref_cnt = 0; + shmem_transport_ofi_stx_pool[idx][i].is_private = 0; + } } - shmem_transport_ctx_default.team = &shmem_internal_team_world; + shmem_transport_ctx_default.stx_idx = malloc(shmem_transport_ofi_num_nics * sizeof(int)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_ctx_default.stx_idx[idx] = -1; + } ret = shmem_transport_ofi_ctx_init(&shmem_transport_ctx_default, SHMEM_TRANSPORT_CTX_DEFAULT_ID); if (ret != 0) return ret; - ret = atomic_limitations_check(); - if (ret != 0) return ret; + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + ret = atomic_limitations_check(idx); + if (ret != 0) return ret; + } ret = populate_mr_tables(); if (ret != 0) return ret; @@ -2020,12 
+2132,20 @@ int shmem_transport_ctx_create(struct shmem_internal_team_t *team, long options, memset(ctxp, 0, sizeof(shmem_transport_ctx_t)); + ctxp->pending_put_cntr = malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctxp->pending_get_cntr = malloc(shmem_transport_ofi_num_nics * sizeof(uint64_t)); + ctxp->stx_idx = malloc(shmem_transport_ofi_num_nics * sizeof(int)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #ifndef USE_CTX_LOCK - shmem_internal_cntr_write(&ctxp->pending_put_cntr, 0); - shmem_internal_cntr_write(&ctxp->pending_get_cntr, 0); + shmem_internal_cntr_write(&ctxp->pending_put_cntr[idx], 0); + shmem_internal_cntr_write(&ctxp->pending_get_cntr[idx], 0); +#else + ctxp->pending_put_cntr[idx] = 0; + ctxp->pending_get_cntr[idx] = 0; #endif - ctxp->stx_idx = -1; + ctxp->stx_idx[idx] = -1; + } ctxp->options = options; ctxp->team = team; @@ -2054,6 +2174,9 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) if(shmem_internal_params.DEBUG) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (ctx->bounce_buffers) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); + // TODO: May want to include pending/completed counters for ALL NICs or at least an aggregate + // for each counter type +/* Causes seg. 
fault right now for obvious reasons DEBUG_MSG("id = %d, options = %#0lx, stx_idx = %d\n" RAISE_PE_PREFIX "pending_put_cntr = %9"PRIu64", completed_put_cntr = %9"PRIu64"\n" RAISE_PE_PREFIX "pending_get_cntr = %9"PRIu64", completed_get_cntr = %9"PRIu64"\n" @@ -2068,60 +2191,82 @@ void shmem_transport_ctx_destroy(shmem_transport_ctx_t *ctx) shmem_internal_my_pe, ctx->pending_bb_cntr, ctx->completed_bb_cntr ); +*/ if (ctx->bounce_buffers) SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } - if (ctx->ep) { - ret = fi_close(&ctx->ep->fid); - OFI_CHECK_ERROR_MSG(ret, "Context endpoint close failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->ep[idx]) { + ret = fi_close(&ctx->ep[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context endpoint close failed (%s)\n", fi_strerror(errno)); + } } if (ctx->bounce_buffers) { shmem_free_list_destroy(ctx->bounce_buffers); } - if (ctx->stx_idx >= 0) { - SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); - if (shmem_transport_ofi_is_private(ctx->options)) { - shmem_transport_ofi_stx_kvs_t *e; - HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, - sizeof(struct shmem_internal_tid), e); - if (e) { - shmem_transport_ofi_stx_t *stx = &shmem_transport_ofi_stx_pool[ctx->stx_idx]; - stx->ref_cnt--; - if (stx->ref_cnt == 0) { - HASH_DEL(shmem_transport_ofi_stx_kvs, e); - free(e); - shmem_transport_ofi_stx_pool[ctx->stx_idx].is_private = 0; + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->stx_idx[idx] >= 0) { + SHMEM_MUTEX_LOCK(shmem_transport_ofi_lock); + if (shmem_transport_ofi_is_private(ctx->options)) { + shmem_transport_ofi_stx_kvs_t *e; + HASH_FIND(hh, shmem_transport_ofi_stx_kvs, &ctx->tid, + sizeof(struct shmem_internal_tid), e); + if (e) { + shmem_transport_ofi_stx_t *stx = &shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]]; + stx->ref_cnt--; + if (stx->ref_cnt == 0) { + HASH_DEL(shmem_transport_ofi_stx_kvs, e); + free(e); 
+ shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].is_private = 0; + } + } + else { + RAISE_WARN_STR("Unable to locate private STX"); + } + } else { + shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].ref_cnt--; + if (shmem_transport_ofi_stx_pool[idx][ctx->stx_idx[idx]].is_private) { + SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); + RAISE_ERROR_STR("Destroyed a ctx with an inconsistent is_private field"); } } - else { - RAISE_WARN_STR("Unable to locate private STX"); - } - } else { - shmem_transport_ofi_stx_pool[ctx->stx_idx].ref_cnt--; - if (shmem_transport_ofi_stx_pool[ctx->stx_idx].is_private) { - SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); - RAISE_ERROR_STR("Destroyed a ctx with an inconsistent is_private field"); - } + SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); } - SHMEM_MUTEX_UNLOCK(shmem_transport_ofi_lock); } - if (ctx->put_cntr) { - ret = fi_close(&ctx->put_cntr->fid); - OFI_CHECK_ERROR_MSG(ret, "Context put CNTR close failed (%s)\n", fi_strerror(errno)); - } + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + if (ctx->put_cntr && ctx->put_cntr[idx]) { + ret = fi_close(&ctx->put_cntr[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context put CNTR close failed (%s)\n", fi_strerror(errno)); + } - if (ctx->get_cntr) { - ret = fi_close(&ctx->get_cntr->fid); - OFI_CHECK_ERROR_MSG(ret, "Context get CNTR close failed (%s)\n", fi_strerror(errno)); - } + if (ctx->get_cntr && ctx->get_cntr[idx]) { + ret = fi_close(&ctx->get_cntr[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context get CNTR close failed (%s)\n", fi_strerror(errno)); + } + + if (ctx->cq && ctx->cq[idx]) { + ret = fi_close(&ctx->cq[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context CQ close failed (%s)\n", fi_strerror(errno)); + } + + if (ctx->av && ctx->av[idx]) { + ret = fi_close(&ctx->av[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context AV close failed (%s)\n", fi_strerror(errno)); + } - if (ctx->cq) { - ret = fi_close(&ctx->cq->fid); - OFI_CHECK_ERROR_MSG(ret, "Context CQ close failed 
(%s)\n", fi_strerror(errno)); + if (ctx->domain && ctx->domain[idx]) { + ret = fi_close(&ctx->domain[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context domain close failed (%s)\n", fi_strerror(errno)); + } + + if (ctx->fabric && ctx->fabric[idx]) { + ret = fi_close(&ctx->fabric[idx]->fid); + OFI_CHECK_ERROR_MSG(ret, "Context fabric close failed (%s)\n", fi_strerror(errno)); + } } #ifdef USE_CTX_LOCK @@ -2161,13 +2306,15 @@ int shmem_transport_fini(void) RAISE_WARN_MSG("Key/value store contained %d unfreed private contexts\n", stx_len); } - for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { - if (shmem_transport_ofi_stx_pool[i].ref_cnt != 0) - RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", - shmem_transport_ofi_stx_pool[i].is_private ? "private" : "shared", - i, shmem_transport_ofi_stx_pool[i].ref_cnt); - ret = fi_close(&shmem_transport_ofi_stx_pool[i].stx->fid); - OFI_CHECK_ERROR_MSG(ret, "STX context close failed (%s)\n", fi_strerror(errno)); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { + if (shmem_transport_ofi_stx_pool[idx][i].ref_cnt != 0) + RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", + shmem_transport_ofi_stx_pool[idx][i].is_private ? 
"private" : "shared", + i, shmem_transport_ofi_stx_pool[idx][i].ref_cnt); + ret = fi_close(&shmem_transport_ofi_stx_pool[idx][i].stx->fid); + OFI_CHECK_ERROR_MSG(ret, "STX context close failed (%s)\n", fi_strerror(errno)); + } } if (shmem_transport_ofi_stx_pool) free(shmem_transport_ofi_stx_pool); diff --git a/src/transport_ofi.h b/src/transport_ofi.h index 616526bba..8b2049e99 100644 --- a/src/transport_ofi.h +++ b/src/transport_ofi.h @@ -33,6 +33,7 @@ #include "shmem_team.h" #include +extern size_t shmem_transport_ofi_num_nics; #if !defined(ENABLE_HARD_POLLING) #define ENABLE_TARGET_CNTR 1 @@ -88,9 +89,9 @@ extern pthread_mutex_t shmem_transport_ofi_progress_lock; do { \ if ((err) == -FI_EAVAIL) { \ struct fi_cq_err_entry e = {0}; \ - ssize_t ret = fi_cq_readerr((ctx)->cq, (void *)&e, 0); \ + ssize_t ret = fi_cq_readerr((ctx)->cq, (void *)&e, 0); /* FIX */ \ if (ret == 1) { \ - const char *errmsg = fi_cq_strerror((ctx)->cq, e.prov_errno, \ + const char *errmsg = fi_cq_strerror((ctx)->cq /* FIX */, e.prov_errno, \ e.err_data, NULL, 0); \ RAISE_ERROR_MSG("Error in operation: %s\n", errmsg); \ } else { \ @@ -316,23 +317,26 @@ struct shmem_transport_ctx_t { shmem_internal_mutex_t lock; #endif long options; - struct fid_ep* ep; - struct fid_cntr* put_cntr; - struct fid_cntr* get_cntr; - struct fid_cq* cq; + struct fid_fabric** fabric; + struct fid_domain** domain; + struct fid_av** av; + struct fid_ep** ep; + struct fid_cntr** put_cntr; + struct fid_cntr** get_cntr; + struct fid_cq** cq; #ifdef USE_CTX_LOCK /* Pending cntr accesses are protected by ctx lock */ - uint64_t pending_put_cntr; - uint64_t pending_get_cntr; + uint64_t* pending_put_cntr; + uint64_t* pending_get_cntr; #else - shmem_internal_cntr_t pending_put_cntr; - shmem_internal_cntr_t pending_get_cntr; + shmem_internal_cntr_t* pending_put_cntr; + shmem_internal_cntr_t* pending_get_cntr; #endif /* These counters are protected by the BB lock */ uint64_t pending_bb_cntr; uint64_t completed_bb_cntr; 
shmem_free_list_t *bounce_buffers; - int stx_idx; + int* stx_idx; struct shmem_internal_tid tid; struct shmem_internal_team_t *team; }; @@ -407,19 +411,18 @@ int shmem_transport_fini(void); extern size_t SHMEM_Dtsize[FI_DATATYPE_LAST]; -static inline void shmem_transport_get_wait(shmem_transport_ctx_t* ctx); +static inline void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx); /* Drain all available events from the CQ. Note, ctx->bounce_buffers must be * locked before calling this routine */ static inline -void shmem_transport_ofi_drain_cq(shmem_transport_ctx_t *ctx) +void shmem_transport_ofi_drain_cq(shmem_transport_ctx_t *ctx, size_t nic_idx) { ssize_t ret = 0; struct fi_cq_entry buf; for (;;) { - ret = fi_cq_read(ctx->cq, (void *)&buf, 1); - + ret = fi_cq_read(ctx->cq[nic_idx], (void *)&buf, 1); /* FIX */ if (ret == -FI_EAGAIN) break; /* No events */ else if (ret == 1) { @@ -457,7 +460,9 @@ shmem_transport_ofi_bounce_buffer_t * create_bounce_buffer(shmem_transport_ctx_t shmem_internal_assert(shmem_transport_ofi_max_bounce_buffers > 0); while (ctx->bounce_buffers->nalloc >= (uint64_t) shmem_transport_ofi_max_bounce_buffers) { - shmem_transport_ofi_drain_cq(ctx); + for (size_t i = 0; i < shmem_transport_ofi_num_nics; i++) { + shmem_transport_ofi_drain_cq(ctx, i); + } } buff = (shmem_transport_ofi_bounce_buffer_t*) shmem_free_list_alloc(ctx->bounce_buffers); @@ -476,7 +481,7 @@ shmem_transport_ofi_bounce_buffer_t * create_bounce_buffer(shmem_transport_ctx_t } static inline -void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) +void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx, size_t nic_idx) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); @@ -485,7 +490,9 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); while (ctx->bounce_buffers->nalloc > 0) { - shmem_transport_ofi_drain_cq(ctx); + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) { + shmem_transport_ofi_drain_cq(ctx, 
nic_idx); + } } SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); @@ -503,10 +510,10 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) long poll_count = 0; while (poll_count < shmem_transport_ofi_put_poll_limit || shmem_transport_ofi_put_poll_limit < 0) { - success = fi_cntr_read(ctx->put_cntr); - fail = fi_cntr_readerr(ctx->put_cntr); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + success = fi_cntr_read(ctx->put_cntr[nic_idx]); /* FIXED? */ + fail = fi_cntr_readerr(ctx->put_cntr[nic_idx]); /* FIXED? */ + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ shmem_transport_probe(); if (success < cnt && fail == 0) { @@ -521,11 +528,12 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->put_cntr, cnt, -1); - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + ssize_t ret = fi_cntr_wait(ctx->put_cntr[nic_idx], cnt, -1); /* FIXED? */ + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[nic_idx]); /* FIXED? 
*/ OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); @@ -536,9 +544,10 @@ void shmem_transport_put_quiet(shmem_transport_ctx_t* ctx) static inline int shmem_transport_quiet(shmem_transport_ctx_t* ctx) { - - shmem_transport_put_quiet(ctx); - shmem_transport_get_wait(ctx); + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { + shmem_transport_put_quiet(ctx, idx); + shmem_transport_get_wait(ctx, idx); + } return 0; } @@ -547,13 +556,15 @@ int shmem_transport_quiet(shmem_transport_ctx_t* ctx) static inline int shmem_transport_fence(shmem_transport_ctx_t* ctx) { + for (size_t idx = 0; idx < shmem_transport_ofi_num_nics; idx++) { #if WANT_TOTAL_DATA_ORDERING == 0 - /* Communication is unordered; must wait for puts and buffered (injected) - * non-fetching atomics to be completed in order to ensure ordering. */ - shmem_transport_put_quiet(ctx); + /* Communication is unordered; must wait for puts and buffered (injected) + * non-fetching atomics to be completed in order to ensure ordering. */ + shmem_transport_put_quiet(ctx, idx); #endif - /* Complete fetching ops; needed to support nonblocking fetch-atomics */ - shmem_transport_get_wait(ctx); + /* Complete fetching ops; needed to support nonblocking fetch-atomics */ + shmem_transport_get_wait(ctx, idx); + } return 0; } @@ -563,21 +574,20 @@ int shmem_transport_fence(shmem_transport_ctx_t* ctx) * to reclaim resources and indicate that the operation should be retried. If * retry limit (ofi_max_poll) is exceeded, abort. 
*/ static inline -int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { - +int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled, size_t nic_idx) { if (ret) { if (ret == -FI_EAGAIN) { if (ctx->bounce_buffers) { SHMEM_TRANSPORT_OFI_CTX_BB_LOCK(ctx); - shmem_transport_ofi_drain_cq(ctx); + shmem_transport_ofi_drain_cq(ctx, nic_idx); SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); } else { /* Poke CQ for errors to encourage progress */ struct fi_cq_err_entry e = {0}; - ssize_t ret = fi_cq_readerr(ctx->cq, (void *)&e, 0); + ssize_t ret = fi_cq_readerr(ctx->cq[nic_idx], (void *)&e, 0); /* FIXED? */ if (ret == 1) { - const char *errmsg = fi_cq_strerror(ctx->cq, e.prov_errno, + const char *errmsg = fi_cq_strerror(ctx->cq[nic_idx], e.prov_errno, /* FIXED? */ e.err_data, NULL, 0); RAISE_ERROR_MSG("Error in operation: %s\n", errmsg); } else if (ret && ret != -FI_EAGAIN) { @@ -586,9 +596,8 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { } shmem_transport_probe(); - + (*polled)++; - if ((*polled) <= shmem_transport_ofi_max_poll) { return 1; } @@ -608,7 +617,7 @@ int try_again(shmem_transport_ctx_t *ctx, const int ret, uint64_t *polled) { static inline void shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const - void *source, size_t len, int pe) + void *source, size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -621,24 +630,24 @@ void shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const shmem_internal_assert(len <= shmem_transport_ofi_max_buffered_send); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_write(ctx->ep, + ret = fi_inject_write(ctx->ep[nic_idx], /* FIXED? 
*/ source, len, GET_DEST(dst), (uint64_t) addr, key); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, const void *source, - size_t len, int pe) + size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -660,15 +669,15 @@ void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, con (size_t) (((uint8_t *) source) + len - frag_source)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_write(ctx->ep, + ret = fi_write(ctx->ep[nic_idx], frag_source, frag_len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), GET_DEST(dst), frag_target, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ frag_source += frag_len; frag_target += frag_len; @@ -678,7 +687,7 @@ void shmem_transport_ofi_put_large(shmem_transport_ctx_t* ctx, void *target, con static inline void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, long *completion) + int pe, long *completion, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -690,12 +699,12 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void if (len <= shmem_transport_ofi_max_buffered_send) { - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } else if (len <= shmem_transport_ofi_bounce_buffer_size && ctx->bounce_buffers) { SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? 
*/ shmem_transport_ofi_get_mr(target, pe, &addr, &key); shmem_transport_ofi_bounce_buffer_t *buff = @@ -715,19 +724,19 @@ void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void .data = 0 }; do { - ret = fi_writemsg(ctx->ep, &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } else { - shmem_transport_ofi_put_large(ctx, target, source,len, pe); + shmem_transport_ofi_put_large(ctx, target, source, len, pe, nic_idx); (*completion)++; } } static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -764,8 +773,8 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co }; do { - ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE | FI_INJECT); - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_DELIVERY_COMPLETE | FI_INJECT); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } else { @@ -809,11 +818,11 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co msg.rma_iov = &rma_iov; msg.context = frag_source; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_writemsg(ctx->ep, &msg, FI_DELIVERY_COMPLETE); - } while (try_again(ctx, ret, &polled)); + ret = fi_writemsg(ctx->ep[nic_idx], &msg, FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ frag_source += frag_len; frag_target += frag_len; @@ -837,7 +846,7 @@ void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, co int atomic_op = (sig_op == SHMEM_SIGNAL_ADD) ? FI_SUM : FI_ATOMIC_WRITE; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ const struct fi_ioc msg_iov_signal = { .addr = (uint8_t *) &signal, @@ -862,8 +871,8 @@ }; do { - ret = fi_atomicmsg(ctx->ep, &msg_signal, flags_signal); - } while (try_again(ctx, ret, &polled)); + ret = fi_atomicmsg(ctx->ep[nic_idx], &msg_signal, flags_signal); /* FIXED? */ + ) while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -873,30 +882,31 @@ static inline void shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) { shmem_internal_assert((*completion) >= 0); - if((*completion) > 0) { - shmem_transport_put_quiet(ctx); + if ((*completion) > 0) { + for (size_t nic_idx = 0; nic_idx < shmem_transport_ofi_num_nics; nic_idx++) + shmem_transport_put_quiet(ctx, nic_idx); (*completion)--; } } static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { if (len <= shmem_transport_ofi_max_buffered_send) { - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } else { - shmem_transport_ofi_put_large(ctx, target, source, len, pe); + shmem_transport_ofi_put_large(ctx, target, source, len, pe, nic_idx); } } static inline -void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ 
-909,9 +919,9 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); if (len <= shmem_transport_ofi_max_msg_size) { - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_read(ctx->ep, + ret = fi_read(ctx->ep[nic_idx], /* FIXED? */ target, len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), @@ -919,7 +929,7 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s (uint64_t) addr, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ } else { uint8_t *frag_target = (uint8_t *) target; @@ -931,15 +941,15 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s (size_t) (((uint8_t *) target) + len - frag_target)); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_read(ctx->ep, + ret = fi_read(ctx->ep[nic_idx], frag_target, frag_len, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(target)), GET_DEST(dst), frag_source, key, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ frag_source += frag_len; frag_target += frag_len; @@ -950,7 +960,7 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s static inline -void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +void shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t nic_idx) { /* wait for get counter to meet outstanding count value */ @@ -967,9 +977,9 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) while (poll_count < shmem_transport_ofi_get_poll_limit || shmem_transport_ofi_get_poll_limit < 0) { - success = fi_cntr_read(ctx->get_cntr); - fail = fi_cntr_readerr(ctx->get_cntr); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + success = fi_cntr_read(ctx->get_cntr[nic_idx]); + fail = fi_cntr_readerr(ctx->get_cntr[nic_idx]); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); shmem_transport_probe(); @@ -985,11 +995,11 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) } poll_count++; } - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); do { cnt = cnt_new; - ssize_t ret = fi_cntr_wait(ctx->get_cntr, cnt, -1); - cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + ssize_t ret = fi_cntr_wait(ctx->get_cntr[nic_idx], cnt, -1); + cnt_new = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[nic_idx]); OFI_CTX_CHECK_ERROR(ctx, ret); } while (cnt < cnt_new); shmem_internal_assert(cnt == cnt_new); @@ -1001,7 +1011,7 @@ void shmem_transport_get_wait(shmem_transport_ctx_t* ctx) static inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, - size_t len, int pe, int datatype) + size_t len, int pe, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1031,10 +1041,10 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - 
SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomicmsg(ctx->ep, + ret = fi_compare_atomicmsg(ctx->ep[nic_idx], /* FIXED? */ &msg, &comparev, NULL, @@ -1044,21 +1054,21 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const 1, FI_INJECT); /* FI_DELIVERY_COMPLETE is not required as it is implied for fetch atomicmsgs */ - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - const void *operand, size_t len, int pe, int datatype) + const void *operand, size_t len, int pe, int datatype, size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE * That is why non-blocking API is used which uses FI_INJECT. FI_ATOMIC_READ is * also not supported currently */ shmem_transport_cswap_nbi(ctx, target, source, - dest, operand, len, pe, datatype); + dest, operand, len, pe, datatype, nic_idx); #else int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1072,10 +1082,10 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomic(ctx->ep, + ret = fi_compare_atomic(ctx->ep[nic_idx], /* FIXED? 
*/ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1089,7 +1099,7 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void SHMEM_TRANSPORT_DTYPE(datatype), FI_CSWAP, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); #endif } @@ -1097,7 +1107,7 @@ void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - const void *mask, size_t len, int pe, int datatype) + const void *mask, size_t len, int pe, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1111,10 +1121,10 @@ void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_compare_atomic(ctx->ep, + ret = fi_compare_atomic(ctx->ep[nic_idx], /* FIXED? */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1128,14 +1138,14 @@ void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void SHMEM_TRANSPORT_DTYPE(datatype), FI_MSWAP, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, int op, int datatype) + int pe, int op, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1148,10 +1158,10 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_atomic(ctx->ep, + ret = fi_inject_atomic(ctx->ep[nic_idx], /* FIXED? */ source, 1, GET_DEST(dst), @@ -1159,7 +1169,7 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void key, SHMEM_TRANSPORT_DTYPE(datatype), op); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -1167,7 +1177,7 @@ void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void static inline void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t full_len, int pe, int op, int datatype, - long *completion) + long *completion, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1181,7 +1191,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi shmem_internal_assert(SHMEM_Dtsize[dt] * len == full_len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - ret = fi_atomicvalid(ctx->ep, dt, op, + ret = fi_atomicvalid(ctx->ep[nic_idx], dt, op, /* FIXED? 
*/ &max_atomic_size); max_atomic_size = max_atomic_size * SHMEM_Dtsize[dt]; if (max_atomic_size > shmem_transport_ofi_max_msg_size @@ -1198,10 +1208,10 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_inject_atomic(ctx->ep, + ret = fi_inject_atomic(ctx->ep[nic_idx], /* FIXED? */ source, len, GET_DEST(dst), @@ -1209,7 +1219,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi key, dt, op); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ } else if (full_len <= MIN(shmem_transport_ofi_bounce_buffer_size, max_atomic_size) && @@ -1219,7 +1229,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi create_bounce_buffer(ctx, source, full_len); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ const struct fi_ioc msg_iov = { .addr = buff->data, .count = len }; const struct fi_rma_ioc rma_iov = { .addr = (uint64_t) addr, .count = len, .key = key }; @@ -1236,8 +1246,8 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi .data = 0 }; do { - ret = fi_atomicmsg(ctx->ep, &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); - } while (try_again(ctx, ret, &polled)); + ret = fi_atomicmsg(ctx->ep[nic_idx], &msg, FI_COMPLETION | FI_DELIVERY_COMPLETE); /* FIXED? */ + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ } else { size_t sent = 0; @@ -1247,9 +1257,9 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi size_t chunksize = MIN((len-sent), (max_atomic_size/SHMEM_Dtsize[dt])); polled = 0; - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_put_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_atomic(ctx->ep, + ret = fi_atomic(ctx->ep[nic_idx], /* FIXED? */ (void *)((char *)source + (sent*SHMEM_Dtsize[dt])), chunksize, @@ -1261,7 +1271,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi dt, op, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? */ sent += chunksize; } @@ -1276,7 +1286,7 @@ void shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int op, int datatype) + size_t len, int pe, int op, int datatype, size_t nic_idx) { int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1305,17 +1315,17 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, }; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_fetch_atomicmsg(ctx->ep, + ret = fi_fetch_atomicmsg(ctx->ep[nic_idx], /* FIXED? */ &msg, &resultv, GET_MR_DESC_ADDR(shmem_transport_ofi_get_mr_desc_index(dest)), 1, FI_INJECT); /* FI_DELIVERY_COMPLETE is not required as it's implied for fetch atomicmsgs */ - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); } @@ -1323,14 +1333,15 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int op, int datatype) + size_t len, int pe, int op, int datatype, + size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE * That is why non-blocking API is used which uses FI_INJECT. FI_ATOMIC_READ is * also not supported currently */ shmem_transport_fetch_atomic_nbi(ctx, target, source, - dest, len, pe, op, datatype); + dest, len, pe, op, datatype, nic_idx); #else int ret = 0; uint64_t dst = (uint64_t) pe; @@ -1344,10 +1355,10 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, shmem_internal_assert(SHMEM_Dtsize[SHMEM_TRANSPORT_DTYPE(datatype)] == len); SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr); + SHMEM_TRANSPORT_OFI_CNTR_INC(&ctx->pending_get_cntr[nic_idx]); /* FIXED? */ do { - ret = fi_fetch_atomic(ctx->ep, + ret = fi_fetch_atomic(ctx->ep[nic_idx], /* FIXED */ source, 1, GET_MR_DESC(shmem_transport_ofi_get_mr_desc_index(source)), @@ -1359,7 +1370,7 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, SHMEM_TRANSPORT_DTYPE(datatype), op, NULL); - } while (try_again(ctx, ret, &polled)); + } while (try_again(ctx, ret, &polled, nic_idx)); /* FIXED? 
*/ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); #endif } @@ -1368,37 +1379,38 @@ void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, int datatype) + size_t len, int pe, int datatype, + size_t nic_idx) { shmem_transport_fetch_atomic(ctx, target, source, dest, len, pe, - FI_ATOMIC_WRITE, datatype); + FI_ATOMIC_WRITE, datatype, nic_idx); } static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_transport_fetch_atomic_nbi(ctx, target, source, dest, len, pe, - FI_ATOMIC_WRITE, datatype); + FI_ATOMIC_WRITE, datatype, nic_idx); } static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, - int datatype) + int datatype, size_t nic_idx) { shmem_transport_atomic(ctx, target, source, len, pe, FI_ATOMIC_WRITE, - datatype); + datatype, nic_idx); } static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, - int datatype) + int datatype, size_t nic_idx) { #ifdef ENABLE_MR_ENDPOINT /* CXI provider currently does not support fetch atomics with FI_DELIVERY_COMPLETE @@ -1406,10 +1418,10 @@ void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, * also not supported currently */ long long dummy = 0; shmem_transport_fetch_atomic_nbi(ctx, (void *) source, (const void *) &dummy, - target, len, pe, FI_SUM, datatype); + target, len, pe, FI_SUM, datatype, nic_idx); #else - shmem_transport_fetch_atomic_nbi(ctx, (void *) source, (const void *) NULL, - target, len, pe, FI_ATOMIC_READ, datatype); + shmem_transport_fetch_atomic(ctx, (void *) source, (const void *) NULL, + target, len, pe, FI_ATOMIC_READ, datatype, nic_idx); #endif } @@ -1432,7 
+1444,7 @@ int shmem_transport_atomic_supported(shm_internal_op_t op, * actually required by FI_THREAD_COMPLETION. */ SHMEM_TRANSPORT_OFI_CTX_LOCK(&shmem_transport_ctx_default); - int ret = fi_atomicvalid(shmem_transport_ctx_default.ep, + int ret = fi_atomicvalid(shmem_transport_ctx_default.ep[0], /* FIX */ SHMEM_TRANSPORT_DTYPE(datatype), op, &size); SHMEM_TRANSPORT_OFI_CTX_UNLOCK(&shmem_transport_ctx_default); @@ -1445,14 +1457,15 @@ static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *source, size_t len, int pe, - long *completion) + long *completion, size_t nic_idx) { RAISE_ERROR_STR("OFI transport does not currently support CT operations"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, - const void *source, size_t len, int pe) + const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("OFI transport does not currently support CT operations"); } @@ -1531,7 +1544,7 @@ uint64_t shmem_transport_pcntr_get_issued_write(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[0]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { @@ -1547,7 +1560,7 @@ uint64_t shmem_transport_pcntr_get_issued_read(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + cnt = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[0]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); return cnt; } @@ -1557,7 +1570,7 @@ uint64_t shmem_transport_pcntr_get_completed_write(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = fi_cntr_read(ctx->put_cntr); + cnt = fi_cntr_read(ctx->put_cntr[0]); /* FIX */ 
SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { @@ -1573,7 +1586,7 @@ uint64_t shmem_transport_pcntr_get_completed_read(shmem_transport_ctx_t *ctx) { uint64_t cnt; SHMEM_TRANSPORT_OFI_CTX_LOCK(ctx); - cnt = fi_cntr_read(ctx->get_cntr); + cnt = fi_cntr_read(ctx->get_cntr[0]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); return cnt; } @@ -1610,11 +1623,11 @@ void shmem_transport_pcntr_get_all(shmem_transport_ctx_t *ctx, shmemx_pcntr_t *p pcntr->pending_put = ctx->pending_bb_cntr; SHMEM_TRANSPORT_OFI_CTX_BB_UNLOCK(ctx); } - pcntr->completed_put += fi_cntr_read(ctx->put_cntr); - pcntr->completed_get = fi_cntr_read(ctx->get_cntr); + pcntr->completed_put += fi_cntr_read(ctx->put_cntr[0]); /* FIX */ + pcntr->completed_get = fi_cntr_read(ctx->get_cntr[0]); /* FIX */ - pcntr->pending_put += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr); - pcntr->pending_get = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr); + pcntr->pending_put += SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_put_cntr[0]); /* FIX */ + pcntr->pending_get = SHMEM_TRANSPORT_OFI_CNTR_READ(&ctx->pending_get_cntr[0]); /* FIX */ SHMEM_TRANSPORT_OFI_CTX_UNLOCK(ctx); pcntr->target = shmem_transport_pcntr_get_completed_target(); diff --git a/src/transport_portals4.h b/src/transport_portals4.h index af0223d9d..b66fa5210 100644 --- a/src/transport_portals4.h +++ b/src/transport_portals4.h @@ -242,7 +242,7 @@ int shmem_transport_startup(void); int shmem_transport_fini(void); -static inline void shmem_transport_get_wait(shmem_transport_ctx_t*); +static inline void shmem_transport_get_wait(shmem_transport_ctx_t*, size_t idx); static inline void shmem_transport_probe(void) { return; @@ -257,7 +257,7 @@ shmem_transport_quiet(shmem_transport_ctx_t* ctx) uint64_t cnt, cnt_new; /* wait for completion of all pending NB get events */ - shmem_transport_get_wait(ctx); + shmem_transport_get_wait(ctx, 0); /* wait for remote completion (acks) of all buffered puts */ /* NOTE-MT: 
continue to wait if additional operations are issued during the quiet */ @@ -368,7 +368,7 @@ shmem_transport_portals4_drain_eq(void) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { int ret; ptl_process_t peer; @@ -571,7 +571,7 @@ shmem_transport_portals4_put_nbi_internal(shmem_transport_ctx_t* ctx, void *targ static inline void -shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_put_nbi_internal(ctx, target, source, len, pe, @@ -588,7 +588,7 @@ shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, long *completion) + int pe, long *completion, size_t nic_idx) { if (ctx->options & SHMEMX_CTX_BOUNCE_BUFFER) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING @@ -603,7 +603,7 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou shmem_transport_portals4_heap_pt); #endif } else { - shmem_transport_put_nbi(ctx, target, source, len, pe); + shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); } } @@ -611,7 +611,7 @@ shmem_transport_put_nb(shmem_transport_ctx_t* ctx, void *target, const void *sou static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void *source, - size_t len, int pe, long *completion) + size_t len, int pe, long *completion, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_put_nb_internal((shmem_transport_ctx_t *)SHMEM_CTX_DEFAULT, target, source, len, pe, @@ 
-668,7 +668,7 @@ shmem_transport_portals4_get_internal(shmem_transport_ctx_t* ctx, void *target, static inline -void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_get_internal(ctx, target, source, len, pe, @@ -683,7 +683,8 @@ void shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *s static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, - const void *source, size_t len, int pe) + const void *source, size_t len, int pe, + size_t nic_idx) { #ifdef ENABLE_REMOTE_VIRTUAL_ADDRESSING shmem_transport_portals4_get_internal((shmem_transport_ctx_t *)SHMEM_CTX_DEFAULT, target, source, len, pe, ct->shr_pt, -1); @@ -696,7 +697,7 @@ void shmem_transport_get_ct(shmem_transport_ct_t *ct, void *target, static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { int ret; ptl_ct_event_t ct; @@ -718,7 +719,7 @@ shmem_transport_get_wait(shmem_transport_ctx_t* ctx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, ptl_datatype_t datatype) + int pe, ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -758,7 +759,7 @@ static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, ptl_datatype_t datatype) + int pe, ptl_datatype_t datatype, size_t nic_idx) { /* transport_swap already buffers the source argument */ - shmem_transport_swap(ctx, target, source, dest, len, pe, datatype); + shmem_transport_swap(ctx, target, source, dest, len, pe, datatype, nic_idx); @@ -769,7 +770,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, 
size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -810,7 +811,7 @@ void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { /* transport_cswap already buffers the source and operand arguments */ - shmem_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype); + shmem_transport_cswap(ctx, target, source, dest, operand, len, pe, datatype, nic_idx); @@ -821,7 +822,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_process_t peer; @@ -860,7 +861,7 @@ shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *sour static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, ptl_op_t op, ptl_datatype_t datatype) + int pe, ptl_op_t op, ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_pt_index_t pt; @@ -1020,7 +1021,7 @@ static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, int pe, ptl_op_t op, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { int ret; ptl_pt_index_t pt; @@ -1060,7 +1061,7 @@ void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, int pe, ptl_op_t op, - ptl_datatype_t datatype) + ptl_datatype_t datatype, size_t nic_idx) { /* transport_fetch_atomic already buffers the source argument */ - shmem_transport_fetch_atomic(ctx, target, source, dest, len, pe, op, datatype); + shmem_transport_fetch_atomic(ctx, target, source, dest, len, pe, op, datatype, nic_idx); @@ -1070,22 +1071,22 @@ static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void 
*source, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_internal_assert(len <= shmem_transport_portals4_max_atomic_size); - shmem_transport_put_scalar(ctx, target, source, len, pe); + shmem_transport_put_scalar(ctx, target, source, len, pe, nic_idx); } static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, int datatype) + int pe, int datatype, size_t nic_idx) { shmem_internal_assert(len <= shmem_transport_portals4_max_fetch_atomic_size); - shmem_transport_get(ctx, target, source, len, pe); + shmem_transport_get(ctx, target, source, len, pe, nic_idx); } @@ -1102,16 +1103,16 @@ int shmem_transport_atomic_supported(ptl_op_t op, ptl_datatype_t datatype) static inline void shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { /* FIXME: Need to optimize non-blocking put with signal for Portals. 
Current implementation below keeps * * the "fence" in between data and signal put */ - shmem_transport_put_nbi(ctx, target, source, len, pe); + shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); shmem_transport_fence(ctx); if (sig_op == SHMEM_SIGNAL_ADD) - shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); else - shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_UINT64); + shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), pe, SHM_INTERNAL_UINT64, nic_idx); } static inline diff --git a/src/transport_ucx.h b/src/transport_ucx.h index 779c55ba7..fb1b3299f 100644 --- a/src/transport_ucx.h +++ b/src/transport_ucx.h @@ -230,7 +230,7 @@ shmem_transport_fence(shmem_transport_ctx_t* ctx) static inline void -shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_put_scalar(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { ucs_status_t status; ucp_rkey_h rkey; @@ -284,7 +284,7 @@ shmem_transport_put_wait(shmem_transport_ctx_t* ctx, long *completion) static inline void shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe) + int pe, size_t nic_idx) { ucs_status_t status; ucp_rkey_h rkey; @@ -298,7 +298,7 @@ shmem_transport_put_nbi(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void -shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe) +shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, int pe, size_t nic_idx) { ucs_status_ptr_t pstatus; ucp_rkey_h rkey; @@ -315,7 +315,7 @@ shmem_transport_get(shmem_transport_ctx_t* ctx, void *target, const void 
*source static inline void -shmem_transport_get_wait(shmem_transport_ctx_t* ctx) +shmem_transport_get_wait(shmem_transport_ctx_t* ctx, size_t idx) { /* Blocking fetching ops are completed in place, so this is a nop */ } @@ -324,7 +324,7 @@ shmem_transport_get_wait(shmem_transport_ctx_t* ctx) static inline void shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -361,7 +361,7 @@ shmem_transport_swap(shmem_transport_ctx_t* ctx, void *target, const void *sourc static inline void shmem_transport_swap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, - size_t len, int pe, shm_internal_datatype_t datatype) + size_t len, int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -402,7 +402,7 @@ static inline void shmem_transport_cswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -442,7 +442,7 @@ static inline void shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *operand, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -484,7 +484,7 @@ shmem_transport_cswap_nbi(shmem_transport_ctx_t* ctx, void *target, const void * static inline void shmem_transport_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -530,7 
+530,7 @@ shmem_transport_atomicv(shmem_transport_ctx_t* ctx, void *target, const void *so static inline void shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -570,7 +570,7 @@ shmem_transport_fetch_atomic(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, size_t len, - int pe, shm_internal_op_t op, shm_internal_datatype_t datatype) + int pe, shm_internal_op_t op, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -613,7 +613,7 @@ shmem_transport_fetch_atomic_nbi(shmem_transport_ctx_t* ctx, void *target, const static inline void shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -632,7 +632,7 @@ shmem_transport_atomic_fetch(shmem_transport_ctx_t* ctx, void *target, const voi static inline void shmem_transport_atomic_set(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - int pe, shm_internal_datatype_t datatype) + int pe, shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -675,7 +675,7 @@ static inline void shmem_transport_mswap(shmem_transport_ctx_t* ctx, void *target, const void *source, void *dest, const void *mask, size_t len, int pe, - shm_internal_datatype_t datatype) + shm_internal_datatype_t datatype, size_t nic_idx) { uint8_t *remote_addr; ucp_rkey_h rkey; @@ -718,18 +718,18 @@ void shmem_transport_syncmem(void) static inline void 
shmem_transport_put_signal_nbi(shmem_transport_ctx_t* ctx, void *target, const void *source, size_t len, - uint64_t *sig_addr, uint64_t signal, int sig_op, int pe) + uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, size_t nic_idx) { - shmem_transport_put_nbi(ctx, target, source, len, pe); + shmem_transport_put_nbi(ctx, target, source, len, pe, nic_idx); shmem_transport_fence(ctx); switch (sig_op) { case SHMEM_SIGNAL_ADD: shmem_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_SUM, SHM_INTERNAL_UINT64, nic_idx); break; case SHMEM_SIGNAL_SET: shmem_transport_atomic_set(ctx, sig_addr, &signal, sizeof(uint64_t), - pe, SHM_INTERNAL_UINT64); + pe, SHM_INTERNAL_UINT64, nic_idx); break; default: RAISE_ERROR_MSG("Unsupported operation (%d)\n", sig_op); @@ -772,14 +772,15 @@ void shmem_transport_ct_wait(shmem_transport_ct_t *ct, long wait_for) static inline void shmem_transport_put_ct_nb(shmem_transport_ct_t *ct, void *target, const void - *source, size_t len, int pe, long *completion) + *source, size_t len, int pe, long *completion, size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); } static inline void shmem_transport_get_ct(shmem_transport_ct_t *ct, void - *target, const void *source, size_t len, int pe) + *target, const void *source, size_t len, int pe, + size_t nic_idx) { RAISE_ERROR_STR("No path to peer"); }