From 5eabd15f68248be3f9cc1fcc14372da349ddeebf Mon Sep 17 00:00:00 2001 From: Halit Dogan Date: Tue, 19 Nov 2019 22:35:54 -0500 Subject: [PATCH 1/8] XPMEM: cherry-pick Halit's commits for mmap'd heap Change-Id: Id52557bfc490c364fe812ef5de3d9bd23c0c4921 --- src/collectives.c | 5 ++-- src/shmem_internal.h | 12 ++++++--- src/shmem_team.c | 35 ++++++++++++++------------ src/transport_ofi.c | 5 ++-- src/transport_xpmem.c | 57 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 89 insertions(+), 25 deletions(-) diff --git a/src/collectives.c b/src/collectives.c index ee51f869e..83fed6028 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -677,7 +677,8 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si * PE 1 sends chunks 1, 0, 3. At the end, each PE has the reduced chunk * corresponding to its PE id + 1. */ - for (int i = 0; i < PE_size - 1; i++) { + int i = 0; + for (i = 0; i < PE_size - 1; i++) { size_t chunk_in = (group_rank - i - 1 + PE_size) % PE_size; size_t chunk_out = (group_rank - i + PE_size) % PE_size; @@ -722,7 +723,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si * Initially, each PE has the reduced chunk for PE id + 1. Forward chunks * around the ring until all PEs have all chunks. */ - for (int i = 0; i < PE_size - 1; i++) { + for (i = 0; i < PE_size - 1; i++) { size_t chunk_out = (group_rank + 1 - i + PE_size) % PE_size; size_t chunk_out_extra = chunk_out < count % PE_size; size_t chunk_out_count = count/PE_size + chunk_out_extra; diff --git a/src/shmem_internal.h b/src/shmem_internal.h index d6fe359be..38ff32f3c 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -613,9 +613,11 @@ static inline size_t shmem_internal_bit_1st_nonzero(const unsigned char *ptr, const size_t size) { /* The following ignores endianess: */ - for(size_t i = 0; i < size; i++) { + size_t i = 0; + for(i = 0; i < size; i++) { unsigned char bit_val = ptr[i]; - for (size_t j = 0; bit_val && j < CHAR_BIT; j++) { + size_t j = 0; + for (j = 0; bit_val && j < CHAR_BIT; j++) { if (bit_val & 1) return i * CHAR_BIT + j; bit_val >>= 1; } @@ -632,8 +634,10 @@ void shmem_internal_bit_to_string(char *str, size_t str_size, { size_t off = 0; - for (size_t i = 0; i < ptr_size; i++) { - for (size_t j = 0; j < CHAR_BIT; j++) { + size_t i = 0; + for (i = 0; i < ptr_size; i++) { + size_t j = 0; + for (j = 0; j < CHAR_BIT; j++) { off += snprintf(str+off, str_size-off, "%s", (ptr[i] & (1 << (CHAR_BIT-1-j))) ? "1" : "0"); if (off >= str_size) return; diff --git a/src/shmem_team.c b/src/shmem_team.c index f0476ab6b..519ac50cb 100644 --- a/src/shmem_team.c +++ b/src/shmem_team.c @@ -114,7 +114,8 @@ int shmem_internal_team_init(void) } else { /* Search for shared-memory peer PEs while checking for a consistent stride */ int start = -1, stride = -1, size = 0; - for (int pe = 0; pe < shmem_internal_num_pes; pe++) { + int pe = 0; + for (pe = 0; pe < shmem_internal_num_pes; pe++) { void *ret_ptr = shmem_internal_ptr(shmem_internal_heap_base, pe); if (ret_ptr == NULL) continue; @@ -179,9 +180,9 @@ int shmem_internal_team_init(void) shmem_internal_team_pool = malloc(shmem_internal_params.TEAMS_MAX * sizeof(shmem_internal_team_t*)); - - for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { - shmem_internal_team_pool[i] = NULL; + long j = 0; + for (j = 0; j < shmem_internal_params.TEAMS_MAX; j++) { + shmem_internal_team_pool[j] = NULL; } shmem_internal_team_pool[SHMEM_TEAM_WORLD_INDEX] = &shmem_internal_team_world; shmem_internal_team_pool[SHMEM_TEAM_SHARED_INDEX] = &shmem_internal_team_shared; @@ -199,8 +200,8 @@ int shmem_internal_team_init(void) shmem_internal_psync_pool = shmem_internal_shmalloc(sizeof(long) * psync_len); if (NULL == shmem_internal_psync_pool) goto cleanup; - for (long i = 0; i < psync_len; i++) { - shmem_internal_psync_pool[i] = SHMEM_SYNC_VALUE; + for (j = 0; j < psync_len; j++) { + shmem_internal_psync_pool[j] = SHMEM_SYNC_VALUE; } /* Convenience pointer to the group-3 pSync array (for barriers and syncs): */ @@ -213,7 +214,7 @@ int shmem_internal_team_init(void) /* Initialize the psync bits to 1, making all slots available: */ memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES); - for (size_t i = 0; i < (size_t) shmem_internal_params.TEAMS_MAX; i++) { + for (i = 0; i < (size_t) shmem_internal_params.TEAMS_MAX; i++) { shmem_internal_bit_set(psync_pool_avail, N_PSYNC_BYTES, i); } @@ -253,7 +254,8 @@ int shmem_internal_team_init(void) void shmem_internal_team_fini(void) { /* Destroy all undestroyed teams */ - for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { + long i = 0; + for (i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { if (shmem_internal_team_pool[i] != NULL) shmem_internal_team_destroy(shmem_internal_team_pool[i]); } @@ -392,8 +394,9 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE } else { /* Set the selected psync bit to 0, reserving that slot */ shmem_internal_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->psync_idx); - - for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++) + + size_t i = 0; + for (i = 0; i < N_PSYNCS_PER_TEAM; i++) myteam->psync_avail[i] = 1; *new_team = myteam; @@ -514,7 +517,8 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) } /* Destroy all undestroyed shareable contexts on this team */ - for (size_t i = 0; i < team->contexts_len; i++) { + size_t i = 0; + for (i = 0; i < team->contexts_len; i++) { if (team->contexts[i] != NULL) { if (team->contexts[i]->options & SHMEM_CTX_PRIVATE) RAISE_WARN_MSG("Destroying team with unfreed private context (%zu)\n", i); @@ -537,13 +541,13 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) * specified collective operation. */ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op) { - + int i = 0; switch (op) { case SYNC: return &shmem_internal_psync_barrier_pool[team->psync_idx * SHMEM_SYNC_SIZE]; default: - for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { if (team->psync_avail[i]) { team->psync_avail[i] = 0; return &shmem_internal_psync_pool[(team->psync_idx + i) * PSYNC_CHUNK_SIZE]; @@ -558,7 +562,7 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter shmem_internal_sync(team->start, team->stride, team->size, &shmem_internal_psync_barrier_pool[psync]); - for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; } team->psync_avail[0] = 0; @@ -569,9 +573,10 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter void shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op) { + size_t i = 0; switch (op) { case SYNC: - for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; } break; diff --git a/src/transport_ofi.c b/src/transport_ofi.c index 4b1f325f4..c37c23232 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -2142,6 +2142,7 @@ int shmem_transport_fini(void) int ret; shmem_transport_ofi_stx_kvs_t* e; int stx_len = 0; + long i = 0; /* The default context is not inserted into the list of contexts on * SHMEM_TEAM_WORLD, so it must be destroyed here */ @@ -2158,8 +2159,8 @@ int shmem_transport_fini(void) if (stx_len > 0) { RAISE_WARN_MSG("Key/value store contained %d unfreed private contexts\n", stx_len); } - - for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { + + for (i = 0; i < shmem_transport_ofi_stx_max; ++i) { if (shmem_transport_ofi_stx_pool[i].ref_cnt != 0) RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", shmem_transport_ofi_stx_pool[i].is_private ? "private" : "shared", diff --git a/src/transport_xpmem.c b/src/transport_xpmem.c index 4a2bdf129..6147459bc 100644 --- a/src/transport_xpmem.c +++ b/src/transport_xpmem.c @@ -27,6 +27,58 @@ #include "shmem_comm.h" #include "runtime.h" + +#define MPIDI_OFI_SHMGR_NAME_MAXLEN (128) +#define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_colls_area" + +void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { + struct timeval tv; + gettimeofday(&tv, NULL); + long long ticks = num; + snprintf(key, max_size, + "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); +} + +void *shm_create_region(char* base, const char *key, int shm_size) { + if(shm_size==0) return NULL; + + int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); + if(fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + exit(0); + } + + if(ftruncate(fd, shm_size) == -1) { + fprintf(stderr, "COLL_comm_init error ftruncate\n"); + exit(0); + } + + void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if(MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + exit(0); + } + + return shm_base_addr; +} + +void *shm_attach_region(char* base, const char *key, int shm_size) { + if(shm_size==0) return NULL; + + int fd = shm_open(key, O_RDWR, 0); + if(fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open\n"); + exit(0); + } + void *shm_base_addr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if(MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap 2 %s size %d\n", key, shm_size); + exit(0); + } + return shm_base_addr; +} + + struct share_info_t { xpmem_segid_t data_seg; size_t data_len; @@ -184,14 +236,14 @@ shmem_transport_xpmem_fini(void) if (0 != shmem_transport_xpmem_peers[peer_num].data_apid) { xpmem_release(shmem_transport_xpmem_peers[peer_num].data_apid); } - + /* if (NULL != shmem_transport_xpmem_peers[peer_num].heap_ptr) { xpmem_detach(shmem_transport_xpmem_peers[peer_num].heap_attach_ptr); } if (0 != shmem_transport_xpmem_peers[peer_num].heap_apid) { xpmem_release(shmem_transport_xpmem_peers[peer_num].heap_apid); - } + }*/ } free(shmem_transport_xpmem_peers); } @@ -199,6 +251,7 @@ shmem_transport_xpmem_fini(void) if (0 != my_info.data_seg) { xpmem_remove(my_info.data_seg); } + /* if (0 != my_info.heap_seg) { xpmem_remove(my_info.heap_seg); } From 2d2aff16ebf9c23fe33ecf935217c34034b05742 Mon Sep 17 00:00:00 2001 From: Halit Dogan Date: Tue, 5 Nov 2019 09:53:42 -0500 Subject: [PATCH 2/8] Modified xpmem transport code to use mmap for heap allocation Change-Id: Id9499dbd50ed9d95eef70dff389f8f00c3ca1fc0 --- src/transport_xpmem.c | 47 +++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/transport_xpmem.c b/src/transport_xpmem.c index 6147459bc..385633cc8 100644 --- a/src/transport_xpmem.c +++ b/src/transport_xpmem.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #define SHMEM_INTERNAL_INCLUDE #include "shmem.h" @@ -103,6 +105,8 @@ shmem_transport_xpmem_init(void) size_t len; int ret; char errmsg[256]; + char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; + char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; /* setup data region */ base = FIND_BASE(shmem_internal_data_base, page_size); @@ -117,9 +121,17 @@ shmem_transport_xpmem_init(void) my_info.data_len = len; /* setup heap region */ + // Halit - trying mmap instead of XPMEM base = FIND_BASE(shmem_internal_heap_base, page_size); - len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); - my_info.heap_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); + len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 2); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); + void*myaddr = shm_create_region(base, key, len); + my_info.heap_seg = 0; + //((int*)myaddr)[0] = 12*(shmem_internal_my_pe+1); + //printf("shmem_transport_xpmem_startup\n"); + //shmem_runtime_barrier(); + if (-1 == my_info.heap_seg) { RETURN_ERROR_MSG("xpmem_make failed: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -127,6 +139,7 @@ shmem_transport_xpmem_init(void) } my_info.heap_off = (char*) shmem_internal_heap_base - (char*) base; my_info.heap_len = len; + ret = shmem_runtime_put("xpmem-segids", &my_info, sizeof(struct share_info_t)); if (0 != ret) { @@ -145,6 +158,7 @@ shmem_transport_xpmem_startup(void) char errmsg[256]; struct share_info_t info; struct xpmem_addr addr; + long page_size = sysconf(_SC_PAGESIZE); num_on_node = shmem_runtime_get_node_size(); @@ -169,18 +183,23 @@ shmem_transport_xpmem_startup(void) RETURN_ERROR_MSG("runtime_get failed: %d\n", ret); return 1; } + + char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; + char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; + int len = 0; shmem_transport_xpmem_peers[peer_num].data_apid = xpmem_get(info.data_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); + if (shmem_transport_xpmem_peers[peer_num].data_apid < 0) { RETURN_ERROR_MSG("could not get data apid: %s\n", shmem_util_strerror(errno, errmsg, 256)); return 1; } - addr.apid = shmem_transport_xpmem_peers[peer_num].data_apid; addr.offset = 0; + shmem_transport_xpmem_peers[peer_num].data_attach_ptr = xpmem_attach(addr, info.data_len, NULL); if ((size_t) shmem_transport_xpmem_peers[peer_num].data_ptr == XPMEM_MAXADDR_SIZE) { @@ -191,19 +210,16 @@ shmem_transport_xpmem_startup(void) shmem_transport_xpmem_peers[peer_num].data_ptr = (char*) shmem_transport_xpmem_peers[peer_num].data_attach_ptr + info.data_off; - shmem_transport_xpmem_peers[peer_num].heap_apid = - xpmem_get(info.heap_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); - if (shmem_transport_xpmem_peers[peer_num].heap_apid < 0) { - RETURN_ERROR_MSG("could not get heap apid: %s\n", - shmem_util_strerror(errno, errmsg, 256)); - return 1; - } addr.apid = shmem_transport_xpmem_peers[peer_num].heap_apid; addr.offset = 0; + + // Halit attach to neighbors + len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 2); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); + shmem_transport_xpmem_peers[peer_num].heap_attach_ptr = shm_attach_region(NULL, key, len); - shmem_transport_xpmem_peers[peer_num].heap_attach_ptr = - xpmem_attach(addr, info.heap_len, NULL); if ((size_t) shmem_transport_xpmem_peers[peer_num].heap_ptr == XPMEM_MAXADDR_SIZE) { RETURN_ERROR_MSG("could not get data segment: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -211,6 +227,7 @@ shmem_transport_xpmem_startup(void) } shmem_transport_xpmem_peers[peer_num].heap_ptr = (char*) shmem_transport_xpmem_peers[peer_num].heap_attach_ptr + info.heap_off; + //printf("PE[%d] reads data from PE[%d]: %d\n", shmem_internal_my_pe, peer_num, ((int*) shmem_transport_xpmem_peers[peer_num].heap_ptr)[0]); } } @@ -248,13 +265,13 @@ shmem_transport_xpmem_fini(void) free(shmem_transport_xpmem_peers); } - if (0 != my_info.data_seg) { + /*if (0 != my_info.data_seg) { xpmem_remove(my_info.data_seg); } - /* + if (0 != my_info.heap_seg) { xpmem_remove(my_info.heap_seg); - } + }*/ return 0; } From 2f58fe77ac30ceda968b8be0ab4e9f040b2455e5 Mon Sep 17 00:00:00 2001 From: David Ozog Date: Fri, 10 Mar 2023 10:46:39 -0700 Subject: [PATCH 3/8] XPMEM: support mmap'd data segment and cleanup --- src/shmem_team.c | 35 +++++------ src/transport_xpmem.c | 137 ++++++++++++++++++++++++++++-------------- 2 files changed, 108 insertions(+), 64 deletions(-) diff --git a/src/shmem_team.c b/src/shmem_team.c index 519ac50cb..f0476ab6b 100644 --- a/src/shmem_team.c +++ b/src/shmem_team.c @@ -114,8 +114,7 @@ int shmem_internal_team_init(void) } else { /* Search for shared-memory peer PEs while checking for a consistent stride */ int start = -1, stride = -1, size = 0; - int pe = 0; - for (pe = 0; pe < shmem_internal_num_pes; pe++) { + for (int pe = 0; pe < shmem_internal_num_pes; pe++) { void *ret_ptr = shmem_internal_ptr(shmem_internal_heap_base, pe); if (ret_ptr == NULL) continue; @@ -180,9 +179,9 @@ int shmem_internal_team_init(void) shmem_internal_team_pool = malloc(shmem_internal_params.TEAMS_MAX * sizeof(shmem_internal_team_t*)); - long j = 0; - for (j = 0; j < shmem_internal_params.TEAMS_MAX; j++) { - shmem_internal_team_pool[j] = NULL; + + for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { + shmem_internal_team_pool[i] = NULL; } shmem_internal_team_pool[SHMEM_TEAM_WORLD_INDEX] = &shmem_internal_team_world; shmem_internal_team_pool[SHMEM_TEAM_SHARED_INDEX] = &shmem_internal_team_shared; @@ -200,8 +199,8 @@ int shmem_internal_team_init(void) shmem_internal_psync_pool = shmem_internal_shmalloc(sizeof(long) * psync_len); if (NULL == shmem_internal_psync_pool) goto cleanup; - for (j = 0; j < psync_len; j++) { - shmem_internal_psync_pool[j] = SHMEM_SYNC_VALUE; + for (long i = 0; i < psync_len; i++) { + shmem_internal_psync_pool[i] = SHMEM_SYNC_VALUE; } /* Convenience pointer to the group-3 pSync array (for barriers and syncs): */ @@ -214,7 +213,7 @@ int shmem_internal_team_init(void) /* Initialize the psync bits to 1, making all slots available: */ memset(psync_pool_avail, 0, 2 * N_PSYNC_BYTES); - for (i = 0; i < (size_t) shmem_internal_params.TEAMS_MAX; i++) { + for (size_t i = 0; i < (size_t) shmem_internal_params.TEAMS_MAX; i++) { shmem_internal_bit_set(psync_pool_avail, N_PSYNC_BYTES, i); } @@ -254,8 +253,7 @@ int shmem_internal_team_init(void) void shmem_internal_team_fini(void) { /* Destroy all undestroyed teams */ - long i = 0; - for (i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { + for (long i = 0; i < shmem_internal_params.TEAMS_MAX; i++) { if (shmem_internal_team_pool[i] != NULL) shmem_internal_team_destroy(shmem_internal_team_pool[i]); } @@ -394,9 +392,8 @@ int shmem_internal_team_split_strided(shmem_internal_team_t *parent_team, int PE } else { /* Set the selected psync bit to 0, reserving that slot */ shmem_internal_bit_clear(psync_pool_avail, N_PSYNC_BYTES, myteam->psync_idx); - - size_t i = 0; - for (i = 0; i < N_PSYNCS_PER_TEAM; i++) + + for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++) myteam->psync_avail[i] = 1; *new_team = myteam; @@ -517,8 +514,7 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) } /* Destroy all undestroyed shareable contexts on this team */ - size_t i = 0; - for (i = 0; i < team->contexts_len; i++) { + for (size_t i = 0; i < team->contexts_len; i++) { if (team->contexts[i] != NULL) { if (team->contexts[i]->options & SHMEM_CTX_PRIVATE) RAISE_WARN_MSG("Destroying team with unfreed private context (%zu)\n", i); @@ -541,13 +537,13 @@ int shmem_internal_team_destroy(shmem_internal_team_t *team) * specified collective operation. */ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_internal_team_op_t op) { - int i = 0; + switch (op) { case SYNC: return &shmem_internal_psync_barrier_pool[team->psync_idx * SHMEM_SYNC_SIZE]; default: - for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { if (team->psync_avail[i]) { team->psync_avail[i] = 0; return &shmem_internal_psync_pool[(team->psync_idx + i) * PSYNC_CHUNK_SIZE]; @@ -562,7 +558,7 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter shmem_internal_sync(team->start, team->stride, team->size, &shmem_internal_psync_barrier_pool[psync]); - for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (int i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; } team->psync_avail[0] = 0; @@ -573,10 +569,9 @@ long * shmem_internal_team_choose_psync(shmem_internal_team_t *team, shmem_inter void shmem_internal_team_release_psyncs(shmem_internal_team_t *team, shmem_internal_team_op_t op) { - size_t i = 0; switch (op) { case SYNC: - for (i = 0; i < N_PSYNCS_PER_TEAM; i++) { + for (size_t i = 0; i < N_PSYNCS_PER_TEAM; i++) { team->psync_avail[i] = 1; } break; diff --git a/src/transport_xpmem.c b/src/transport_xpmem.c index 385633cc8..c6bac0939 100644 --- a/src/transport_xpmem.c +++ b/src/transport_xpmem.c @@ -29,34 +29,67 @@ #include "shmem_comm.h" #include "runtime.h" - #define MPIDI_OFI_SHMGR_NAME_MAXLEN (128) #define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_colls_area" -void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { - struct timeval tv; - gettimeofday(&tv, NULL); - long long ticks = num; - snprintf(key, max_size, - "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); +static void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { + struct timeval tv; + gettimeofday(&tv, NULL); + long long ticks = num; + snprintf(key, max_size, "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); +} + +static void *shm_create_region_data_seg(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; + + int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + exit(1); + } + + /* Write all current contents of the data segment to the file */ + FILE *fp = fdopen(fd, "w"); + size_t ret = fwrite(base, shm_size, 1, fp); + + if (ret == 0) { + fprintf(stderr, "COLL_comm_init error fwrite\n"); + exit(1); + } + + if (ftruncate(fd, shm_size) == -1) { + fprintf(stderr, "COLL_comm_init error ftruncate with errno(%s)\n",strerror(errno)); + exit(1); + } + + void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if (MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + exit(1); + } + + fclose(fp); + + return shm_base_addr; } -void *shm_create_region(char* base, const char *key, int shm_size) { - if(shm_size==0) return NULL; + +static void *shm_create_region(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); - if(fd == -1) { + if (fd == -1) { fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); exit(0); } - if(ftruncate(fd, shm_size) == -1) { + if (ftruncate(fd, shm_size) == -1) { fprintf(stderr, "COLL_comm_init error ftruncate\n"); exit(0); } void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); - if(MAP_FAILED == shm_base_addr) { + if (MAP_FAILED == shm_base_addr) { fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); exit(0); } @@ -64,16 +97,16 @@ void *shm_create_region(char* base, const char *key, int shm_size) { return shm_base_addr; } -void *shm_attach_region(char* base, const char *key, int shm_size) { - if(shm_size==0) return NULL; +static void *shm_attach_region(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; int fd = shm_open(key, O_RDWR, 0); - if(fd == -1) { + if (fd == -1) { fprintf(stderr, "COLL_comm_init error shm_open\n"); exit(0); } void *shm_base_addr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if(MAP_FAILED == shm_base_addr) { + if (MAP_FAILED == shm_base_addr) { fprintf(stderr, "COLL_comm_init error mmap 2 %s size %d\n", key, shm_size); exit(0); } @@ -111,7 +144,14 @@ shmem_transport_xpmem_init(void) /* setup data region */ base = FIND_BASE(shmem_internal_data_base, page_size); len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); - my_info.data_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 1); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); + void* myaddr_data = shm_create_region_data_seg(base, key, len); + if (myaddr_data == NULL) return 1; + my_info.data_seg = 0; + + //my_info.data_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); + if (-1 == my_info.data_seg) { RETURN_ERROR_MSG("xpmem_make failed: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -121,16 +161,13 @@ shmem_transport_xpmem_init(void) my_info.data_len = len; /* setup heap region */ - // Halit - trying mmap instead of XPMEM base = FIND_BASE(shmem_internal_heap_base, page_size); len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 2); snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); - void*myaddr = shm_create_region(base, key, len); + void* myaddr_heap = shm_create_region(base, key, len); + if (myaddr_heap == NULL) return 1; my_info.heap_seg = 0; - //((int*)myaddr)[0] = 12*(shmem_internal_my_pe+1); - //printf("shmem_transport_xpmem_startup\n"); - //shmem_runtime_barrier(); if (-1 == my_info.heap_seg) { RETURN_ERROR_MSG("xpmem_make failed: %s\n", @@ -139,7 +176,6 @@ shmem_transport_xpmem_init(void) } my_info.heap_off = (char*) shmem_internal_heap_base - (char*) base; my_info.heap_len = len; - ret = shmem_runtime_put("xpmem-segids", &my_info, sizeof(struct share_info_t)); if (0 != ret) { @@ -154,10 +190,10 @@ shmem_transport_xpmem_init(void) int shmem_transport_xpmem_startup(void) { - int ret, i, peer_num, num_on_node; + int ret, peer_num, num_on_node; char errmsg[256]; struct share_info_t info; - struct xpmem_addr addr; + //struct xpmem_addr addr; long page_size = sysconf(_SC_PAGESIZE); num_on_node = shmem_runtime_get_node_size(); @@ -168,7 +204,7 @@ shmem_transport_xpmem_startup(void) if (NULL == shmem_transport_xpmem_peers) return 1; /* get local peer info and map into our address space ... */ - for (i = 0 ; i < shmem_internal_num_pes; ++i) { + for (int i = 0 ; i < shmem_internal_num_pes; ++i) { peer_num = shmem_runtime_get_node_rank(i); if (-1 == peer_num) continue; @@ -188,20 +224,37 @@ shmem_transport_xpmem_startup(void) char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; int len = 0; - shmem_transport_xpmem_peers[peer_num].data_apid = - xpmem_get(info.data_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); + //shmem_transport_xpmem_peers[peer_num].data_apid = + // xpmem_get(info.data_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); + + //if (shmem_transport_xpmem_peers[peer_num].data_apid < 0) { + // RETURN_ERROR_MSG("could not get data apid: %s\n", + // shmem_util_strerror(errno, errmsg, 256)); + // return 1; + //} + //addr.apid = shmem_transport_xpmem_peers[peer_num].data_apid; + //addr.offset = 0; + + + //shmem_transport_xpmem_peers[peer_num].data_attach_ptr = + // xpmem_attach(addr, info.data_len, NULL); + //if ((size_t) shmem_transport_xpmem_peers[peer_num].data_ptr == XPMEM_MAXADDR_SIZE) { + // RETURN_ERROR_MSG("could not get data segment: %s\n", + // shmem_util_strerror(errno, errmsg, 256)); + // return 1; + //} + //shmem_transport_xpmem_peers[peer_num].data_ptr = + // (char*) shmem_transport_xpmem_peers[peer_num].data_attach_ptr + info.data_off; + + //addr.apid = shmem_transport_xpmem_peers[peer_num].heap_apid; + //addr.offset = 0; + + //Dave attach to neighbors + len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 1); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); + shmem_transport_xpmem_peers[peer_num].data_attach_ptr = shm_attach_region(NULL, key, len); - if (shmem_transport_xpmem_peers[peer_num].data_apid < 0) { - RETURN_ERROR_MSG("could not get data apid: %s\n", - shmem_util_strerror(errno, errmsg, 256)); - return 1; - } - addr.apid = shmem_transport_xpmem_peers[peer_num].data_apid; - addr.offset = 0; - - - shmem_transport_xpmem_peers[peer_num].data_attach_ptr = - xpmem_attach(addr, info.data_len, NULL); if ((size_t) shmem_transport_xpmem_peers[peer_num].data_ptr == XPMEM_MAXADDR_SIZE) { RETURN_ERROR_MSG("could not get data segment: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -209,10 +262,6 @@ shmem_transport_xpmem_startup(void) } shmem_transport_xpmem_peers[peer_num].data_ptr = (char*) shmem_transport_xpmem_peers[peer_num].data_attach_ptr + info.data_off; - - - addr.apid = shmem_transport_xpmem_peers[peer_num].heap_apid; - addr.offset = 0; // Halit attach to neighbors len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); @@ -221,7 +270,7 @@ shmem_transport_xpmem_startup(void) shmem_transport_xpmem_peers[peer_num].heap_attach_ptr = shm_attach_region(NULL, key, len); if ((size_t) shmem_transport_xpmem_peers[peer_num].heap_ptr == XPMEM_MAXADDR_SIZE) { - RETURN_ERROR_MSG("could not get data segment: %s\n", + RETURN_ERROR_MSG("could not get heap segment: %s\n", shmem_util_strerror(errno, errmsg, 256)); return 1; } From 871dda7dad75042a3d9b30d9399c9e5a3f5645f1 Mon Sep 17 00:00:00 2001 From: David Ozog Date: Fri, 10 Mar 2023 13:31:45 -0700 Subject: [PATCH 4/8] MMAP: add transport sharing data & heap with mmap --- configure.ac | 52 ++++++-- src/Makefile.am | 6 + src/shr_transport.h4 | 37 ++++++ src/transport_mmap.c | 274 ++++++++++++++++++++++++++++++++++++++++++ src/transport_mmap.h | 114 ++++++++++++++++++ src/transport_xpmem.c | 193 ++++++----------------------- 6 files changed, 508 insertions(+), 168 deletions(-) create mode 100644 src/transport_mmap.c create mode 100644 src/transport_mmap.h diff --git a/configure.ac b/configure.ac index bd55391ab..90d61e4cb 100755 --- a/configure.ac +++ b/configure.ac @@ -307,6 +307,10 @@ AC_ARG_ENABLE([memcpy], [AC_HELP_STRING([--enable-memcpy], [Use memcpy to perform local put/get operations (default:disabled)])]) +AC_ARG_ENABLE([mmap], + [AC_HELP_STRING([--enable-mmap], + [Use mmap to share symmetric data and heap segments in shared memory (default:disabled)])]) + AC_ARG_ENABLE([ofi-fence], [AC_HELP_STRING([--enable-ofi-fence], [Use FI_FENCE feature to optimize put-with-signal operations. (default: disabled)])]) @@ -491,29 +495,48 @@ CHECK_CMA( [transport_cma="yes"], [transport_cma="no"]) -# If both XPMEM and CMA requested, user needs to choose one: -if test -n "$with_xpmem" -a "$with_xpmem" != "no" -a -n "$with_cma" -a "$with_cma" != "no" ; then - AC_MSG_ERROR([Cannot choose both XPMEM and CMA transports, see --help for details]) -# Check which was requested, XPMEM or CMA: + +num_shared_transports=0 +if test -n "$with_xpmem" -a "$with_xpmem" != "no" ; then + ((num_shared_transports+=1)) +fi +if test -n "$with_cma" -a "$with_cma" != "no" ; then + ((num_shared_transports+=1)) +fi +if test -n "$enable_mmap" -a "$enable_mmap" != "no" ; then + ((num_shared_transports+=1)) +fi + +echo "num_shared_transports is..." $num_shared_transports + +# If more than one shared memory transport requested, user needs to choose one: +if [[ $num_shared_transports -gt 1 ]] ; then + echo "Selected " $num_shared_transports " shared memory transports, please choose one." + AC_MSG_ERROR([Only one shared memory transport is allowed (mmap, XPMEM, or CMA), see --help for details]) +elif test -n "$enable_mmap" -a "$enable_mmap" != "no" ; then + transport_mmap="yes" + transport_cma="no" + transport_xpmem="no" + AC_DEFINE([USE_MMAP], [1], [Define if mmap transport is active]) elif test -n "$with_xpmem" -a "$with_xpmem" != "no" ; then + transport_mmap="no" transport_cma="no" - AC_DEFINE([USE_XPMEM], [1], [Define if XPMEM transport is active]) elif test -n "$with_cma" -a "$with_cma" != "no" ; then + transport_mmap="no" transport_xpmem="no" AC_DEFINE([USE_CMA], [1], [Define if Cross Memory Attach transport is active]) AC_DEFINE([_GNU_SOURCE], [1], [CMA transport header requires global definition of _GNU_SOURCE]) -# If neither, disable XPMEM and CMA: else + transport_mmap="no" transport_xpmem="no" transport_cma="no" - AC_MSG_RESULT([Neither XPMEM nor CMA transport requested]) - + AC_MSG_RESULT([Shared memory transport not requested]) fi -if test "$enable_memcpy" = "yes" -a "$transport_xpmem" = "no" -a "$transport_cma" = "no" ; then +if test "$enable_memcpy" = "yes" -a "$transport_xpmem" = "no" -a "$transport_cma" = "no" -a "$transport_mmap" = "no" ; then transport_memcpy="yes" AC_DEFINE([USE_MEMCPY], [1], [Define to use memcpy for local put/get communication]) -elif test "$transport_xpmem" = "yes" -o "$transport_cma" = "yes" ; then +elif test "$transport_xpmem" = "yes" -o "$transport_cma" = "yes" -o "$transport_mmap" = "yes" ; then transport_memcpy="yes" else transport_memcpy="no" @@ -521,8 +544,9 @@ fi AM_CONDITIONAL([USE_XPMEM], [test "$transport_xpmem" = "yes"]) AM_CONDITIONAL([USE_CMA], [test "$transport_cma" = "yes"]) +AM_CONDITIONAL([USE_MMAP], [test "$transport_mmap" = "yes"]) -AS_IF([test "$transport_xpmem" = "yes" -o "$transport_cma" = "yes"], +AS_IF([test "$transport_xpmem" = "yes" -o "$transport_cma" = "yes" -o "$transport_mmap" = "yes"], [AC_DEFINE([USE_ON_NODE_COMMS], [1], [Define if any on-node comm transport is available]) AC_DEFINE([ENABLE_HARD_POLLING], [1], [Enable hard polling]) ]) @@ -533,7 +557,7 @@ else transport_shr_atomics="no" fi -if test "$enable_shr_atomics" != "no" -a "$transport_xpmem" = "yes" -a "$transport" = "none"; then +if [[[ "$enable_shr_atomics" != "no" && ( "$transport_xpmem" = "yes" || "$transport_mmap" = "yes" ) && "$transport" = "none" ]]]; then transport_shr_atomics="yes" AC_DEFINE([USE_SHR_ATOMICS], [1], [If defined, the shared memory layer will perform processor atomics.]) fi @@ -870,6 +894,9 @@ fi if test -n "$with_xpmem" -a "$with_xpmem" != "no" -a "$with_xpmem" != "yes" ; then DISTCHECK_CONFIGURE_FLAGS="$DISTCHECK_CONFIGURE_FLAGS --with-xpmem=${with_xpmem}" fi +if test -n "$enable_mmap" -a "$enable_mmap" != "no" -a "$enable_mmap" != "yes" ; then + DISTCHECK_CONFIGURE_FLAGS="$DISTCHECK_CONFIGURE_FLAGS --enable-mmap=${enable_mmap}" +fi if test -n "$with_pmi" -a "$with_pmi" != "no" -a "$with_pmi" != "yes" ; then DISTCHECK_CONFIGURE_FLAGS="$DISTCHECK_CONFIGURE_FLAGS --with-pmi=${with_pmi}" fi @@ -1025,6 +1052,7 @@ echo "" echo "On Node Communication:" echo " XPMEM: $transport_xpmem" echo " CMA: $transport_cma" +echo " mmap: $transport_mmap" echo " memcpy (self): $transport_memcpy" echo " Shr. atomics: $transport_shr_atomics" echo "" diff --git a/src/Makefile.am b/src/Makefile.am index 427b0387e..f3ee55bd1 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -128,6 +128,12 @@ libsma_la_SOURCES += \ transport_cma.c endif +if USE_MMAP +libsma_la_SOURCES += \ + transport_mmap.h \ + transport_mmap.c +endif + if USE_PMI_SIMPLE AM_CPPFLAGS += -I$(top_srcdir)/pmi-simple libsma_la_SOURCES += \ diff --git a/src/shr_transport.h4 b/src/shr_transport.h4 index 9379ef2e5..7916fcbf9 100644 --- a/src/shr_transport.h4 +++ b/src/shr_transport.h4 @@ -27,6 +27,10 @@ include(shmem_bind_c.m4)dnl #include "transport_cma.h" #endif +#ifdef USE_MMAP +#include "transport_mmap.h" +#endif + static inline int shmem_shr_transport_init(void) { @@ -41,6 +45,11 @@ shmem_shr_transport_init(void) ret = shmem_transport_cma_init(); if (0 != ret) RETURN_ERROR_MSG("CMA init failed (%d)\n", ret); + +#elif USE_MMAP + ret = shmem_transport_mmap_init(); + if (0 != ret) + RETURN_ERROR_MSG("mmap init failed (%d)\n", ret); #endif return ret; @@ -63,6 +72,12 @@ shmem_shr_transport_startup(void) if (0 != ret) { RETURN_ERROR_MSG("CMA startup failed (%d)\n", ret); } + +#elif USE_MMAP + ret = shmem_transport_mmap_startup(); + if (0 != ret) { + RETURN_ERROR_MSG("mmap startup failed (%d)\n", ret); + } #endif return ret; @@ -76,6 +91,8 @@ shmem_shr_transport_fini(void) shmem_transport_xpmem_fini(); #elif USE_CMA shmem_transport_cma_fini(); +#elif USE_MMAP + shmem_transport_mmap_fini(); #endif } @@ -87,6 +104,8 @@ shmem_shr_transport_ptr(void *target, int noderank, void **local_ptr) { #if USE_XPMEM XPMEM_GET_REMOTE_ACCESS(target, noderank, *local_ptr); +#elif USE_MMAP + MMAP_GET_REMOTE_ACCESS(target, noderank, *local_ptr); #else RAISE_ERROR_MSG("No path to peer (%d)\n", noderank); #endif @@ -147,6 +166,10 @@ shmem_shr_transport_put_scalar(shmem_ctx_t ctx, void *target, #elif USE_CMA shmem_transport_cma_put(target, source, len, pe, shmem_internal_get_shr_rank(pe)); + +#elif USE_MMAP + shmem_transport_mmap_put(target, source, len, pe, + shmem_internal_get_shr_rank(pe)); #else RAISE_ERROR_STR("No path to peer"); #endif @@ -165,6 +188,10 @@ shmem_shr_transport_put(shmem_ctx_t ctx, void *target, const void *source, #elif USE_CMA shmem_transport_cma_put(target, source, len, pe, shmem_internal_get_shr_rank(pe)); + +#elif USE_MMAP + shmem_transport_mmap_put(target, source, len, pe, + shmem_internal_get_shr_rank(pe)); #else RAISE_ERROR_STR("No path to peer"); #endif @@ -183,6 +210,10 @@ shmem_shr_transport_get(shmem_ctx_t ctx, void *target, const void *source, #elif USE_CMA shmem_transport_cma_get(target, source, len, pe, shmem_internal_get_shr_rank(pe)); + +#elif USE_MMAP + shmem_transport_mmap_get(target, source, len, pe, + shmem_internal_get_shr_rank(pe)); #else RAISE_ERROR_STR("No path to peer"); #endif @@ -577,6 +608,12 @@ shmem_shr_transport_put_signal(shmem_ctx_t ctx, void *target, shmem_internal_get_shr_rank(pe)); shmem_internal_membar_acq_rel(); /* Memory fence to ensure target PE observes stores in the correct order */ +#elif USE_MMAP + shmem_transport_mmap_put(target, source, len, pe, + shmem_internal_get_shr_rank(pe)); + shmem_internal_membar_acq_rel(); /* Memory fence to ensure target PE observes + stores in the correct order */ + #if USE_SHR_ATOMICS if (sig_op == SHMEM_SIGNAL_ADD) shmem_shr_transport_atomic(ctx, sig_addr, &signal, sizeof(uint64_t), diff --git a/src/transport_mmap.c b/src/transport_mmap.c new file mode 100644 index 000000000..f2999c641 --- /dev/null +++ b/src/transport_mmap.c @@ -0,0 +1,274 @@ +/* -*- C -*- + * + * Copyright 2011 Sandia Corporation. Under the terms of Contract + * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government + * retains certain rights in this software. + * + * Copyright (c) 2017 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license. + * + * This file is part of the Sandia OpenSHMEM software package. For license + * information, see the LICENSE file in the top level directory of the + * distribution. + * + */ + +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#define SHMEM_INTERNAL_INCLUDE +#include "shmem.h" +#include "shmem_internal.h" +#include "shmem_comm.h" +#include "runtime.h" +#include "transport_mmap.h" + +#define MPIDI_OFI_SHMGR_NAME_MAXLEN (128) +#define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_colls_area" + +static void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { + struct timeval tv; + gettimeofday(&tv, NULL); + long long ticks = num; + snprintf(key, max_size, "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); +} + + +static void *shm_create_region(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; + + int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + exit(0); + } + + if (ftruncate(fd, shm_size) == -1) { + fprintf(stderr, "COLL_comm_init error ftruncate\n"); + exit(0); + } + + void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if (MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + exit(0); + } + + return shm_base_addr; +} + + +static void *shm_create_region_data_seg(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; + + int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + exit(1); + } + + /* Write all current contents of the data segment to the file */ + FILE *fp = fdopen(fd, "w"); + size_t ret = fwrite(base, shm_size, 1, fp); + + if (ret == 0) { + fprintf(stderr, "COLL_comm_init error fwrite\n"); + exit(1); + } + + if (ftruncate(fd, shm_size) == -1) { + fprintf(stderr, "COLL_comm_init error ftruncate with errno(%s)\n",strerror(errno)); + exit(1); + } + + void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); + if (MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + exit(1); + } + + fclose(fp); + + return shm_base_addr; +} + + +static void *shm_attach_region(char* base, const char *key, int shm_size) { + if (shm_size == 0) return NULL; + + int fd = shm_open(key, O_RDWR, 0); + if (fd == -1) { + fprintf(stderr, "COLL_comm_init error shm_open\n"); + exit(0); + } + void *shm_base_addr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (MAP_FAILED == shm_base_addr) { + fprintf(stderr, "COLL_comm_init error mmap 2 %s size %d\n", key, shm_size); + exit(0); + } + return shm_base_addr; +} + + +struct share_info_t { + size_t data_len; + size_t data_off; + size_t heap_len; + size_t heap_off; +}; + +struct shmem_transport_mmap_peer_info_t *shmem_transport_mmap_peers = NULL; +static struct share_info_t my_info; + +#define FIND_BASE(ptr, page_size) ((char*) (((uintptr_t) ptr / page_size) * page_size)) +#define FIND_LEN(ptr, len, page_size) ((((char*) ptr - FIND_BASE(ptr, page_size) + len - 1) / \ + page_size + 1) * page_size) + +int +shmem_transport_mmap_init(void) +{ + long page_size = sysconf(_SC_PAGESIZE); + char *base; + size_t len; + int ret; + char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; + char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; + + /* setup data region */ + base = FIND_BASE(shmem_internal_data_base, page_size); + len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 1); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); + void* myaddr_data = shm_create_region_data_seg(base, key, len); + if (myaddr_data == NULL) return 1; + + my_info.data_off = (char*) shmem_internal_data_base - (char*) base; + my_info.data_len = len; + + /* setup heap region */ + base = FIND_BASE(shmem_internal_heap_base, page_size); + len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 2); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); + void* myaddr_heap = shm_create_region(base, key, len); + if (myaddr_heap == NULL) return 1; + + my_info.heap_off = (char*) shmem_internal_heap_base - (char*) base; + my_info.heap_len = len; + + ret = shmem_runtime_put("mmap-segids", &my_info, sizeof(struct share_info_t)); + if (0 != ret) { + RETURN_ERROR_MSG("runtime_put failed: %d\n", ret); + return 1; + } + + return 0; +} + + +int +shmem_transport_mmap_startup(void) +{ + int ret, peer_num, num_on_node; + char errmsg[256]; + struct share_info_t info; + //struct mmap_addr addr; + long page_size = sysconf(_SC_PAGESIZE); + + num_on_node = shmem_runtime_get_node_size(); + + /* allocate space for local peers */ + shmem_transport_mmap_peers = calloc(num_on_node, + sizeof(struct shmem_transport_mmap_peer_info_t)); + if (NULL == shmem_transport_mmap_peers) return 1; + + /* get local peer info and map into our address space ... */ + for (int i = 0 ; i < shmem_internal_num_pes; ++i) { + peer_num = shmem_runtime_get_node_rank(i); + if (-1 == peer_num) continue; + + if (shmem_internal_my_pe == i) { + shmem_transport_mmap_peers[peer_num].data_ptr = + shmem_internal_data_base; + shmem_transport_mmap_peers[peer_num].heap_ptr = + shmem_internal_heap_base; + } else { + ret = shmem_runtime_get(i, "mmap-segids", &info, sizeof(struct share_info_t)); + if (0 != ret) { + RETURN_ERROR_MSG("runtime_get failed: %d\n", ret); + return 1; + } + + char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; + char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; + int len = 0; + + /* Attach data segment to neighbors: */ + len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 1); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); + shmem_transport_mmap_peers[peer_num].data_attach_ptr = shm_attach_region(NULL, key, len); + + if (shmem_transport_mmap_peers[peer_num].data_attach_ptr == NULL) { + RETURN_ERROR_MSG("could not get data segment: %s\n", + shmem_util_strerror(errno, errmsg, 256)); + return 1; + } + shmem_transport_mmap_peers[peer_num].data_ptr = + (char*) shmem_transport_mmap_peers[peer_num].data_attach_ptr + info.data_off; + + /* Attach heap segment to neighbors: */ + len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 2); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); + shmem_transport_mmap_peers[peer_num].heap_attach_ptr = shm_attach_region(NULL, key, len); + + if (shmem_transport_mmap_peers[peer_num].heap_attach_ptr == NULL) { + RETURN_ERROR_MSG("could not get heap segment: %s\n", + shmem_util_strerror(errno, errmsg, 256)); + return 1; + } + shmem_transport_mmap_peers[peer_num].heap_ptr = + (char*) shmem_transport_mmap_peers[peer_num].heap_attach_ptr + info.heap_off; + } + } + + return 0; +} + + +int +shmem_transport_mmap_fini(void) +{ + int i, peer_num; + size_t data_len, heap_len; + long page_size = sysconf(_SC_PAGESIZE); + + data_len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); + heap_len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + + if (NULL != shmem_transport_mmap_peers) { + for (i = 0 ; i < shmem_internal_num_pes; ++i) { + peer_num = shmem_runtime_get_node_rank(i); + if (-1 == peer_num) continue; + if (shmem_internal_my_pe == i) continue; + + if (NULL != shmem_transport_mmap_peers[peer_num].data_attach_ptr) { + munmap(shmem_transport_mmap_peers[peer_num].data_attach_ptr, data_len); + } + + if (NULL != shmem_transport_mmap_peers[peer_num].heap_attach_ptr) { + munmap(shmem_transport_mmap_peers[peer_num].heap_attach_ptr, heap_len); + } + } + free(shmem_transport_mmap_peers); + } + + return 0; +} diff --git a/src/transport_mmap.h b/src/transport_mmap.h new file mode 100644 index 000000000..e09a7e51c --- /dev/null +++ b/src/transport_mmap.h @@ -0,0 +1,114 @@ +/* -*- C -*- + * + * Copyright 2011 Sandia Corporation. Under the terms of Contract + * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government + * retains certain rights in this software. + * + * Copyright (c) 2023 Intel Corporation. All rights reserved. + * This software is available to you under the BSD license. + * + * This file is part of the Sandia OpenSHMEM software package. For license + * information, see the LICENSE file in the top level directory of the + * distribution. + * + */ + +#ifndef TRANSPORT_MMAP_H +#define TRANSPORT_MMAP_H + +#include +#include + +struct shmem_transport_mmap_peer_info_t { + void *data_attach_ptr; + void *heap_attach_ptr; + void *data_ptr; + void *heap_ptr; +}; + +extern struct shmem_transport_mmap_peer_info_t *shmem_transport_mmap_peers; + +#ifdef ENABLE_ERROR_CHECKING +#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ + do { \ + if (((void*) target > shmem_internal_data_base) && \ + ((char*) target < (char*) shmem_internal_data_base + shmem_internal_data_length)) { \ + ptr = (char*) target - (char*) shmem_internal_data_base + \ + (char*) shmem_transport_mmap_peers[rank].data_ptr; \ + } else if (((void*) target > shmem_internal_heap_base) && \ + ((char*) target < (char*) shmem_internal_heap_base + shmem_internal_heap_length)) { \ + ptr = (char*) target - (char*) shmem_internal_heap_base + \ + (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ + } else { \ + ptr = NULL; \ + } \ + } while (0) +#else +#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ + do { \ + if ((void*) target < shmem_internal_heap_base) { \ + ptr = (char*) target - (char*) shmem_internal_data_base + \ + (char*) shmem_transport_mmap_peers[rank].data_ptr; \ + } else { \ + ptr = (char*) target - (char*) shmem_internal_heap_base + \ + (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ + } \ + } while (0) +#endif + +int shmem_transport_mmap_init(void); + +int shmem_transport_mmap_startup(void); + +int shmem_transport_mmap_fini(void); + + +static inline +void * +shmem_transport_mmap_ptr(const void *target, int pe, int noderank) +{ + char *remote_ptr; + + MMAP_GET_REMOTE_ACCESS(target, noderank, remote_ptr); + return remote_ptr; +} + + +static inline +void +shmem_transport_mmap_put(void *target, const void *source, size_t len, + int pe, int noderank) +{ + char *remote_ptr; + + MMAP_GET_REMOTE_ACCESS(target, noderank, remote_ptr); +#ifdef ENABLE_ERROR_CHECKING + if (NULL == remote_ptr) { + RAISE_ERROR_MSG("target (0x%"PRIXPTR") outside of symmetric areas\n", + (uintptr_t) target); + } +#endif + + memcpy(remote_ptr, source, len); +} + + +static inline +void +shmem_transport_mmap_get(void *target, const void *source, size_t len, + int pe, int noderank) +{ + char *remote_ptr; + + MMAP_GET_REMOTE_ACCESS(source, noderank, remote_ptr); +#ifdef ENABLE_ERROR_CHECKING + if (NULL == remote_ptr) { + RAISE_ERROR_MSG("target (0x%"PRIXPTR") outside of symmetric areas\n", + (uintptr_t) target); + } +#endif + + memcpy(target, remote_ptr, len); +} + +#endif diff --git a/src/transport_xpmem.c b/src/transport_xpmem.c index c6bac0939..4a2bdf129 100644 --- a/src/transport_xpmem.c +++ b/src/transport_xpmem.c @@ -20,8 +20,6 @@ #include #include #include -#include -#include #define SHMEM_INTERNAL_INCLUDE #include "shmem.h" @@ -29,91 +27,6 @@ #include "shmem_comm.h" #include "runtime.h" -#define MPIDI_OFI_SHMGR_NAME_MAXLEN (128) -#define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_colls_area" - -static void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { - struct timeval tv; - gettimeofday(&tv, NULL); - long long ticks = num; - snprintf(key, max_size, "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); -} - -static void *shm_create_region_data_seg(char* base, const char *key, int shm_size) { - if (shm_size == 0) return NULL; - - int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); - if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); - exit(1); - } - - /* Write all current contents of the data segment to the file */ - FILE *fp = fdopen(fd, "w"); - size_t ret = fwrite(base, shm_size, 1, fp); - - if (ret == 0) { - fprintf(stderr, "COLL_comm_init error fwrite\n"); - exit(1); - } - - if (ftruncate(fd, shm_size) == -1) { - fprintf(stderr, "COLL_comm_init error ftruncate with errno(%s)\n",strerror(errno)); - exit(1); - } - - void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); - if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); - exit(1); - } - - fclose(fp); - - return shm_base_addr; -} - - -static void *shm_create_region(char* base, const char *key, int shm_size) { - if (shm_size == 0) return NULL; - - int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); - if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); - exit(0); - } - - if (ftruncate(fd, shm_size) == -1) { - fprintf(stderr, "COLL_comm_init error ftruncate\n"); - exit(0); - } - - void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); - if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); - exit(0); - } - - return shm_base_addr; -} - -static void *shm_attach_region(char* base, const char *key, int shm_size) { - if (shm_size == 0) return NULL; - - int fd = shm_open(key, O_RDWR, 0); - if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open\n"); - exit(0); - } - void *shm_base_addr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap 2 %s size %d\n", key, shm_size); - exit(0); - } - return shm_base_addr; -} - - struct share_info_t { xpmem_segid_t data_seg; size_t data_len; @@ -138,20 +51,11 @@ shmem_transport_xpmem_init(void) size_t len; int ret; char errmsg[256]; - char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; - char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; /* setup data region */ base = FIND_BASE(shmem_internal_data_base, page_size); len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); - shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 1); - snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); - void* myaddr_data = shm_create_region_data_seg(base, key, len); - if (myaddr_data == NULL) return 1; - my_info.data_seg = 0; - - //my_info.data_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); - + my_info.data_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); if (-1 == my_info.data_seg) { RETURN_ERROR_MSG("xpmem_make failed: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -162,13 +66,8 @@ shmem_transport_xpmem_init(void) /* setup heap region */ base = FIND_BASE(shmem_internal_heap_base, page_size); - len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); - shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 2); - snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); - void* myaddr_heap = shm_create_region(base, key, len); - if (myaddr_heap == NULL) return 1; - my_info.heap_seg = 0; - + len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + my_info.heap_seg = xpmem_make(base, len, XPMEM_PERMIT_MODE, (void*)0666); if (-1 == my_info.heap_seg) { RETURN_ERROR_MSG("xpmem_make failed: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -190,11 +89,10 @@ shmem_transport_xpmem_init(void) int shmem_transport_xpmem_startup(void) { - int ret, peer_num, num_on_node; + int ret, i, peer_num, num_on_node; char errmsg[256]; struct share_info_t info; - //struct xpmem_addr addr; - long page_size = sysconf(_SC_PAGESIZE); + struct xpmem_addr addr; num_on_node = shmem_runtime_get_node_size(); @@ -204,7 +102,7 @@ shmem_transport_xpmem_startup(void) if (NULL == shmem_transport_xpmem_peers) return 1; /* get local peer info and map into our address space ... */ - for (int i = 0 ; i < shmem_internal_num_pes; ++i) { + for (i = 0 ; i < shmem_internal_num_pes; ++i) { peer_num = shmem_runtime_get_node_rank(i); if (-1 == peer_num) continue; @@ -219,42 +117,20 @@ shmem_transport_xpmem_startup(void) RETURN_ERROR_MSG("runtime_get failed: %d\n", ret); return 1; } - - char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; - char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; - int len = 0; - - //shmem_transport_xpmem_peers[peer_num].data_apid = - // xpmem_get(info.data_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); - - //if (shmem_transport_xpmem_peers[peer_num].data_apid < 0) { - // RETURN_ERROR_MSG("could not get data apid: %s\n", - // shmem_util_strerror(errno, errmsg, 256)); - // return 1; - //} - //addr.apid = shmem_transport_xpmem_peers[peer_num].data_apid; - //addr.offset = 0; - - - //shmem_transport_xpmem_peers[peer_num].data_attach_ptr = - // xpmem_attach(addr, info.data_len, NULL); - //if ((size_t) shmem_transport_xpmem_peers[peer_num].data_ptr == XPMEM_MAXADDR_SIZE) { - // RETURN_ERROR_MSG("could not get data segment: %s\n", - // shmem_util_strerror(errno, errmsg, 256)); - // return 1; - //} - //shmem_transport_xpmem_peers[peer_num].data_ptr = - // (char*) shmem_transport_xpmem_peers[peer_num].data_attach_ptr + info.data_off; - - //addr.apid = shmem_transport_xpmem_peers[peer_num].heap_apid; - //addr.offset = 0; - - //Dave attach to neighbors - len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); - shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 1); - snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); - shmem_transport_xpmem_peers[peer_num].data_attach_ptr = shm_attach_region(NULL, key, len); + shmem_transport_xpmem_peers[peer_num].data_apid = + xpmem_get(info.data_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); + if (shmem_transport_xpmem_peers[peer_num].data_apid < 0) { + RETURN_ERROR_MSG("could not get data apid: %s\n", + shmem_util_strerror(errno, errmsg, 256)); + return 1; + } + + addr.apid = shmem_transport_xpmem_peers[peer_num].data_apid; + addr.offset = 0; + + shmem_transport_xpmem_peers[peer_num].data_attach_ptr = + xpmem_attach(addr, info.data_len, NULL); if ((size_t) shmem_transport_xpmem_peers[peer_num].data_ptr == XPMEM_MAXADDR_SIZE) { RETURN_ERROR_MSG("could not get data segment: %s\n", shmem_util_strerror(errno, errmsg, 256)); @@ -262,21 +138,27 @@ shmem_transport_xpmem_startup(void) } shmem_transport_xpmem_peers[peer_num].data_ptr = (char*) shmem_transport_xpmem_peers[peer_num].data_attach_ptr + info.data_off; - - // Halit attach to neighbors - len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); - shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, i, 2); - snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); - shmem_transport_xpmem_peers[peer_num].heap_attach_ptr = shm_attach_region(NULL, key, len); + shmem_transport_xpmem_peers[peer_num].heap_apid = + xpmem_get(info.heap_seg, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void*)0666); + if (shmem_transport_xpmem_peers[peer_num].heap_apid < 0) { + RETURN_ERROR_MSG("could not get heap apid: %s\n", + shmem_util_strerror(errno, errmsg, 256)); + return 1; + } + + addr.apid = shmem_transport_xpmem_peers[peer_num].heap_apid; + addr.offset = 0; + + shmem_transport_xpmem_peers[peer_num].heap_attach_ptr = + xpmem_attach(addr, info.heap_len, NULL); if ((size_t) shmem_transport_xpmem_peers[peer_num].heap_ptr == XPMEM_MAXADDR_SIZE) { - RETURN_ERROR_MSG("could not get heap segment: %s\n", + RETURN_ERROR_MSG("could not get data segment: %s\n", shmem_util_strerror(errno, errmsg, 256)); return 1; } shmem_transport_xpmem_peers[peer_num].heap_ptr = (char*) shmem_transport_xpmem_peers[peer_num].heap_attach_ptr + info.heap_off; - //printf("PE[%d] reads data from PE[%d]: %d\n", shmem_internal_my_pe, peer_num, ((int*) shmem_transport_xpmem_peers[peer_num].heap_ptr)[0]); } } @@ -302,25 +184,24 @@ shmem_transport_xpmem_fini(void) if (0 != shmem_transport_xpmem_peers[peer_num].data_apid) { xpmem_release(shmem_transport_xpmem_peers[peer_num].data_apid); } - /* + if (NULL != shmem_transport_xpmem_peers[peer_num].heap_ptr) { xpmem_detach(shmem_transport_xpmem_peers[peer_num].heap_attach_ptr); } if (0 != shmem_transport_xpmem_peers[peer_num].heap_apid) { xpmem_release(shmem_transport_xpmem_peers[peer_num].heap_apid); - }*/ + } } free(shmem_transport_xpmem_peers); } - /*if (0 != my_info.data_seg) { + if (0 != my_info.data_seg) { xpmem_remove(my_info.data_seg); } - if (0 != my_info.heap_seg) { xpmem_remove(my_info.heap_seg); - }*/ + } return 0; } From d89e6a73ef04365ecf2cefb96f62f43b8852a1e8 Mon Sep 17 00:00:00 2001 From: David Ozog Date: Fri, 10 Mar 2023 15:12:42 -0700 Subject: [PATCH 5/8] MMAP: Cleanup and finalization improvments --- configure.ac | 13 ++++++++--- src/collectives.c | 5 ++-- src/shmem_internal.h | 12 ++++------ src/transport_mmap.c | 55 ++++++++++++++++++++++++++++++-------------- src/transport_ofi.c | 5 ++-- 5 files changed, 56 insertions(+), 34 deletions(-) diff --git a/configure.ac b/configure.ac index 90d61e4cb..1d9b8630e 100755 --- a/configure.ac +++ b/configure.ac @@ -507,17 +507,17 @@ if test -n "$enable_mmap" -a "$enable_mmap" != "no" ; then ((num_shared_transports+=1)) fi -echo "num_shared_transports is..." $num_shared_transports - # If more than one shared memory transport requested, user needs to choose one: if [[ $num_shared_transports -gt 1 ]] ; then - echo "Selected " $num_shared_transports " shared memory transports, please choose one." + echo "Selected" $num_shared_transports "shared memory transports, please choose one." AC_MSG_ERROR([Only one shared memory transport is allowed (mmap, XPMEM, or CMA), see --help for details]) elif test -n "$enable_mmap" -a "$enable_mmap" != "no" ; then transport_mmap="yes" transport_cma="no" transport_xpmem="no" AC_DEFINE([USE_MMAP], [1], [Define if mmap transport is active]) + AC_CHECK_LIB(rt, shm_open, [MMAP_LDFLAGS="-lrt"]) + AC_SUBST([MMAP_LDFLAGS]) elif test -n "$with_xpmem" -a "$with_xpmem" != "no" ; then transport_mmap="no" transport_cma="no" @@ -959,6 +959,13 @@ fi if test -n "opal_hwloc_LDFLAGS"; then WRAPPER_COMPILER_EXTRA_LDFLAGS="$WRAPPER_COMPILER_EXTRA_LDFLAGS $opal_hwloc_LDFLAGS" fi +if test "$transport_mmap" = "yes" ; then + CPPFLAGS="$CPPFLAGS $MMAP_CPPFLAGS" + LDFLAGS="$LDFLAGS $MMAP_LDFLAGS" + LIBS="$LIBS $MMAP_LIBS" + WRAPPER_COMPILER_EXTRA_LDFLAGS="$MMAP_LDFLAGS" + WRAPPER_COMPILER_EXTRA_LIBS="$MMAP_LIBS" +fi CPPFLAGS="$CPPFLAGS $pmi_CPPFLAGS" LDFLAGS="$LDFLAGS $pmi_LDFLAGS $aslr_LDFLAGS" diff --git a/src/collectives.c b/src/collectives.c index 83fed6028..ee51f869e 100644 --- a/src/collectives.c +++ b/src/collectives.c @@ -677,8 +677,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si * PE 1 sends chunks 1, 0, 3. At the end, each PE has the reduced chunk * corresponding to its PE id + 1. */ - int i = 0; - for (i = 0; i < PE_size - 1; i++) { + for (int i = 0; i < PE_size - 1; i++) { size_t chunk_in = (group_rank - i - 1 + PE_size) % PE_size; size_t chunk_out = (group_rank - i + PE_size) % PE_size; @@ -723,7 +722,7 @@ shmem_internal_op_to_all_ring(void *target, const void *source, size_t count, si * Initially, each PE has the reduced chunk for PE id + 1. Forward chunks * around the ring until all PEs have all chunks. */ - for (i = 0; i < PE_size - 1; i++) { + for (int i = 0; i < PE_size - 1; i++) { size_t chunk_out = (group_rank + 1 - i + PE_size) % PE_size; size_t chunk_out_extra = chunk_out < count % PE_size; size_t chunk_out_count = count/PE_size + chunk_out_extra; diff --git a/src/shmem_internal.h b/src/shmem_internal.h index 38ff32f3c..d6fe359be 100644 --- a/src/shmem_internal.h +++ b/src/shmem_internal.h @@ -613,11 +613,9 @@ static inline size_t shmem_internal_bit_1st_nonzero(const unsigned char *ptr, const size_t size) { /* The following ignores endianess: */ - size_t i = 0; - for(i = 0; i < size; i++) { + for(size_t i = 0; i < size; i++) { unsigned char bit_val = ptr[i]; - size_t j = 0; - for (j = 0; bit_val && j < CHAR_BIT; j++) { + for (size_t j = 0; bit_val && j < CHAR_BIT; j++) { if (bit_val & 1) return i * CHAR_BIT + j; bit_val >>= 1; } @@ -634,10 +632,8 @@ void shmem_internal_bit_to_string(char *str, size_t str_size, { size_t off = 0; - size_t i = 0; - for (i = 0; i < ptr_size; i++) { - size_t j = 0; - for (j = 0; j < CHAR_BIT; j++) { + for (size_t i = 0; i < ptr_size; i++) { + for (size_t j = 0; j < CHAR_BIT; j++) { off += snprintf(str+off, str_size-off, "%s", (ptr[i] & (1 << (CHAR_BIT-1-j))) ? "1" : "0"); if (off >= str_size) return; diff --git a/src/transport_mmap.c b/src/transport_mmap.c index f2999c641..d15728a02 100644 --- a/src/transport_mmap.c +++ b/src/transport_mmap.c @@ -4,7 +4,7 @@ * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government * retains certain rights in this software. * - * Copyright (c) 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2023 Intel Corporation. All rights reserved. * This software is available to you under the BSD license. * * This file is part of the Sandia OpenSHMEM software package. For license @@ -30,33 +30,32 @@ #include "transport_mmap.h" #define MPIDI_OFI_SHMGR_NAME_MAXLEN (128) -#define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_colls_area" +#define MPIDI_OFI_SHMGR_NAME_PREFIX "/sos_shm_mmap_area" + static void shm_create_key(char *key, size_t max_size, unsigned pe, size_t num) { - struct timeval tv; - gettimeofday(&tv, NULL); - long long ticks = num; - snprintf(key, max_size, "%s-%u-%llX", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, ticks); + snprintf(key, max_size, "%s-%u-%zu", MPIDI_OFI_SHMGR_NAME_PREFIX, pe, num); } static void *shm_create_region(char* base, const char *key, int shm_size) { if (shm_size == 0) return NULL; + shm_unlink(key); int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + fprintf(stderr, "mmap_init error shm_open with errno(%s)\n", strerror(errno)); exit(0); } if (ftruncate(fd, shm_size) == -1) { - fprintf(stderr, "COLL_comm_init error ftruncate\n"); + fprintf(stderr, "mmap_init error ftruncate\n"); exit(0); } void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + fprintf(stderr, "mmap_init error mmap %s size %d\n", key, shm_size); exit(0); } @@ -67,29 +66,30 @@ static void *shm_create_region(char* base, const char *key, int shm_size) { static void *shm_create_region_data_seg(char* base, const char *key, int shm_size) { if (shm_size == 0) return NULL; + shm_unlink(key); int fd = shm_open(key, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open with errno(%d)\n",errno); + fprintf(stderr, "mmap_init data_seg error shm_open with errno(%s)\n", strerror(errno)); exit(1); } /* Write all current contents of the data segment to the file */ - FILE *fp = fdopen(fd, "w"); + FILE *fp = fdopen(fd, "wb"); size_t ret = fwrite(base, shm_size, 1, fp); if (ret == 0) { - fprintf(stderr, "COLL_comm_init error fwrite\n"); + fprintf(stderr, "mmap_init error fwrite\n"); exit(1); } if (ftruncate(fd, shm_size) == -1) { - fprintf(stderr, "COLL_comm_init error ftruncate with errno(%s)\n",strerror(errno)); + fprintf(stderr, "mmap_init error ftruncate with errno(%s)\n", strerror(errno)); exit(1); } void *shm_base_addr = mmap(base, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0); if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap %s size %d\n", key, shm_size); + fprintf(stderr, "mmap_init error mmap %s size %d\n", key, shm_size); exit(1); } @@ -104,12 +104,12 @@ static void *shm_attach_region(char* base, const char *key, int shm_size) { int fd = shm_open(key, O_RDWR, 0); if (fd == -1) { - fprintf(stderr, "COLL_comm_init error shm_open\n"); + fprintf(stderr, "mmap_init error shm_open\n"); exit(0); } void *shm_base_addr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (MAP_FAILED == shm_base_addr) { - fprintf(stderr, "COLL_comm_init error mmap 2 %s size %d\n", key, shm_size); + fprintf(stderr, "mmap_init error mmap %s size %d\n", key, shm_size); exit(0); } return shm_base_addr; @@ -246,13 +246,34 @@ shmem_transport_mmap_startup(void) int shmem_transport_mmap_fini(void) { - int i, peer_num; + int i, peer_num, ret; + char errmsg[256]; size_t data_len, heap_len; + char key_prefix[MPIDI_OFI_SHMGR_NAME_MAXLEN-10]; + char key[MPIDI_OFI_SHMGR_NAME_MAXLEN]; long page_size = sysconf(_SC_PAGESIZE); data_len = FIND_LEN(shmem_internal_data_base, shmem_internal_data_length, page_size); heap_len = FIND_LEN(shmem_internal_heap_base, shmem_internal_heap_length, page_size); + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 1); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-data", key_prefix); + + ret = shm_unlink(key); + if (ret != 0) { + RETURN_ERROR_MSG("could not get data segment: %s\n", \ + shmem_util_strerror(errno, errmsg, 256)); + } + + shm_create_key(key_prefix, MPIDI_OFI_SHMGR_NAME_MAXLEN-10, shmem_internal_my_pe, 2); + snprintf(key, MPIDI_OFI_SHMGR_NAME_MAXLEN, "%s-heap", key_prefix); + + ret = shm_unlink(key); + if (ret != 0) { + RETURN_ERROR_MSG("could not get heap segment: %s\n", \ + shmem_util_strerror(errno, errmsg, 256)); + } + if (NULL != shmem_transport_mmap_peers) { for (i = 0 ; i < shmem_internal_num_pes; ++i) { peer_num = shmem_runtime_get_node_rank(i); diff --git a/src/transport_ofi.c b/src/transport_ofi.c index c37c23232..4b1f325f4 100644 --- a/src/transport_ofi.c +++ b/src/transport_ofi.c @@ -2142,7 +2142,6 @@ int shmem_transport_fini(void) int ret; shmem_transport_ofi_stx_kvs_t* e; int stx_len = 0; - long i = 0; /* The default context is not inserted into the list of contexts on * SHMEM_TEAM_WORLD, so it must be destroyed here */ @@ -2159,8 +2158,8 @@ int shmem_transport_fini(void) if (stx_len > 0) { RAISE_WARN_MSG("Key/value store contained %d unfreed private contexts\n", stx_len); } - - for (i = 0; i < shmem_transport_ofi_stx_max; ++i) { + + for (long i = 0; i < shmem_transport_ofi_stx_max; ++i) { if (shmem_transport_ofi_stx_pool[i].ref_cnt != 0) RAISE_WARN_MSG("Closing a %s STX (%zu) with nonzero ref. count (%ld)\n", shmem_transport_ofi_stx_pool[i].is_private ? "private" : "shared", From c2b8baafc395359db2c31101a04e587d026a2dec Mon Sep 17 00:00:00 2001 From: David Ozog Date: Fri, 10 Mar 2023 17:19:35 -0700 Subject: [PATCH 6/8] MMAP: add mmap transport unit test to Github CI --- .github/workflows/ci.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index abe3d20ad..624191e24 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -707,3 +707,38 @@ jobs: make check TESTS= -j SHMEM_DEBUG=1 SHMEM_INFO=1 make VERBOSE=1 TEST_RUNNER="${SOS_PM} -np 2" check ${SOS_PM} -np 1 modules/tests-sos/test/unit/hello + + mmap_only: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + - config_name: transport_none + sos_config: [--enable-mmap --enable-shr-atomics --enable-error-checking --enable-pmi-simple] + + steps: + - name: Checking OS version + run: | + echo "OS_NAME=$(lsb_release -si)-$(ls_release -sr)" >> $GITHUB_ENV + - uses: actions/checkout@v2 + - name: Install dependencies + run: | + sudo apt-get install -y gfortran mpich libmpich-dev libev-dev libev-libevent-dev + sudo sysctl -w kernel.yama.ptrace_scope=0 + sudo sysctl -w kernel.randomize_va_space=0 + + # SOS + - name: Build SOS (${{ matrix.name }}) + run: | + ./autogen.sh + mkdir build; cd build + ../configure --prefix=${SOS_INSTALL_DIR} ${{ matrix.sos_config }} + make -j + make install + - name: Test SOS (${{ matrix.name }}) + run: | + cd build + make check TESTS= -j + SHMEM_DEBUG=1 SHMEM_INFO=1 make VERBOSE=1 TEST_RUNNER="${SOS_PM} -np 2" check + cat modules/tests-sos/test/unit/hello.log From 5c5f72f15677a46c0cf9715d8bda6102a625e83a Mon Sep 17 00:00:00 2001 From: David Ozog Date: Fri, 10 Mar 2023 20:04:52 -0700 Subject: [PATCH 7/8] XPMEM: patch configury and include missing header --- configure.ac | 5 +++-- src/transport_xpmem.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 1d9b8630e..82b1bf705 100755 --- a/configure.ac +++ b/configure.ac @@ -480,7 +480,7 @@ else transport_portals4="no" transport_ofi="no" transport_ucx="no" - AC_MSG_WARN([No transport requested]) + AC_MSG_WARN([No network transport requested]) fi AM_CONDITIONAL([USE_PORTALS4], [test "$transport_portals4" = "yes"]) @@ -521,6 +521,7 @@ elif test -n "$enable_mmap" -a "$enable_mmap" != "no" ; then elif test -n "$with_xpmem" -a "$with_xpmem" != "no" ; then transport_mmap="no" transport_cma="no" + AC_DEFINE([USE_XPMEM], [1], [Define if XPMEM transport is active]) elif test -n "$with_cma" -a "$with_cma" != "no" ; then transport_mmap="no" transport_xpmem="no" @@ -1019,7 +1020,7 @@ AS_IF([test "$enable_pmi_mpi" != "yes" -a "$enable_pmi_simple" != "yes" -a "$opa [AC_MSG_ERROR([No PMI client interface was configured, consider --enable-pmi-simple or --with-pmi])]) AS_IF([test -z "$num_transports"], - [AC_MSG_WARN([No transport found, resulting library will be unable to exchange messages])]) + [AC_MSG_WARN([No network transport found, library will be unable to exchange remote messages])]) AS_IF([test "$shmem_cv_c11_works" != "yes"], [AC_MSG_WARN([C compiler does not support _Generic, unable to verify and test C11 bindings])]) diff --git a/src/transport_xpmem.c b/src/transport_xpmem.c index 4a2bdf129..538b3f5fa 100644 --- a/src/transport_xpmem.c +++ b/src/transport_xpmem.c @@ -26,6 +26,7 @@ #include "shmem_internal.h" #include "shmem_comm.h" #include "runtime.h" +#include "transport_xpmem.h" struct share_info_t { xpmem_segid_t data_seg; From cbe7bf661f929506c8462f95adc3153f99f5f1dc Mon Sep 17 00:00:00 2001 From: David Ozog Date: Wed, 5 Jun 2024 15:44:47 -0400 Subject: [PATCH 8/8] shared: enable MMAP shmem_ptr and fix bounds check --- src/shmem_remote_pointer.h | 2 ++ src/transport_mmap.h | 14 +++++++------- src/transport_xpmem.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/shmem_remote_pointer.h b/src/shmem_remote_pointer.h index 9ca618316..e9cacb694 100644 --- a/src/shmem_remote_pointer.h +++ b/src/shmem_remote_pointer.h @@ -34,6 +34,8 @@ shmem_internal_ptr(const void *target, int pe) if (-1 != (node_rank = shmem_internal_get_shr_rank(pe))) { #if USE_XPMEM return shmem_transport_xpmem_ptr(target, pe, node_rank); +#elif USE_MMAP + return shmem_transport_mmap_ptr(target, pe, node_rank); #else return NULL; #endif diff --git a/src/transport_mmap.h b/src/transport_mmap.h index e09a7e51c..cf4b9c373 100644 --- a/src/transport_mmap.h +++ b/src/transport_mmap.h @@ -29,29 +29,29 @@ struct shmem_transport_mmap_peer_info_t { extern struct shmem_transport_mmap_peer_info_t *shmem_transport_mmap_peers; #ifdef ENABLE_ERROR_CHECKING -#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ +#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ do { \ if (((void*) target > shmem_internal_data_base) && \ ((char*) target < (char*) shmem_internal_data_base + shmem_internal_data_length)) { \ ptr = (char*) target - (char*) shmem_internal_data_base + \ - (char*) shmem_transport_mmap_peers[rank].data_ptr; \ - } else if (((void*) target > shmem_internal_heap_base) && \ + (char*) shmem_transport_mmap_peers[rank].data_ptr; \ + } else if (((void*) target >= shmem_internal_heap_base) && \ ((char*) target < (char*) shmem_internal_heap_base + shmem_internal_heap_length)) { \ ptr = (char*) target - (char*) shmem_internal_heap_base + \ - (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ + (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ } else { \ ptr = NULL; \ } \ } while (0) #else -#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ +#define MMAP_GET_REMOTE_ACCESS(target, rank, ptr) \ do { \ if ((void*) target < shmem_internal_heap_base) { \ ptr = (char*) target - (char*) shmem_internal_data_base + \ - (char*) shmem_transport_mmap_peers[rank].data_ptr; \ + (char*) shmem_transport_mmap_peers[rank].data_ptr; \ } else { \ ptr = (char*) target - (char*) shmem_internal_heap_base + \ - (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ + (char*) shmem_transport_mmap_peers[rank].heap_ptr; \ } \ } while (0) #endif diff --git a/src/transport_xpmem.h b/src/transport_xpmem.h index ba795d2f6..8d9fbe8a8 100644 --- a/src/transport_xpmem.h +++ b/src/transport_xpmem.h @@ -38,7 +38,7 @@ extern struct shmem_transport_xpmem_peer_info_t *shmem_transport_xpmem_peers; ((char*) target < (char*) shmem_internal_data_base + shmem_internal_data_length)) { \ ptr = (char*) target - (char*) shmem_internal_data_base + \ (char*) shmem_transport_xpmem_peers[rank].data_ptr; \ - } else if (((void*) target > shmem_internal_heap_base) && \ + } else if (((void*) target >= shmem_internal_heap_base) && \ ((char*) target < (char*) shmem_internal_heap_base + shmem_internal_heap_length)) { \ ptr = (char*) target - (char*) shmem_internal_heap_base + \ (char*) shmem_transport_xpmem_peers[rank].heap_ptr; \