diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index 3fb140f..e8aa377 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(USE_BLIT_KERNEL "Use Blit/Copy Kernel for IPC communication" OFF) option(USE_CALIPER "Enable Caliper benchmarking support" OFF) option(USE_JSONCPP "Enable support for passing a config file" OFF) +option(PRINT_DEBUG "Enable debug output" OFF) # Create a variable for NCCL path @@ -83,15 +84,31 @@ add_subdirectory(CommFunctions) # add subdirectory after all configuration, may # Add your source files and executable here add_executable(${PROJECT_NAME} main.cpp) - -target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags OpenMP::OpenMP_CXX) +add_executable(bench_p2p bench_p2p.cpp) +add_executable(bench_alltoall bench_alltoall.cpp) +add_executable(bench_broadcast bench_broadcast.cpp) +add_executable(bench_gather bench_gather.cpp) + +target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_p2p PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_alltoall PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_broadcast PUBLIC CommFunctions commbench_compiler_flags) +target_link_libraries(bench_gather PUBLIC CommFunctions commbench_compiler_flags) install(TARGETS ${PROJECT_NAME} DESTINATION bin) install(DIRECTORY ${PROJECT_SOURCE_DIR}/CommFunctions DESTINATION include FILES_MATCHING PATTERN "*.h") if (USE_CUDA) set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE CUDA) elseif (USE_HIP) set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE HIP) + set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE HIP) endif() if (USE_JSONCPP) diff --git a/CommFunctions/CMakeLists.txt b/CommFunctions/CMakeLists.txt old mode 100644 new mode 100755 index ccf3da4..1b11657 --- a/CommFunctions/CMakeLists.txt +++ b/CommFunctions/CMakeLists.txt @@ -2,6 +2,15 @@ add_library(CommFunctions INTERFACE commbench.h validate.h comm.h util.h) target_include_directories(CommFunctions INTERFACE ${MPI_CXX_COMPILER_INCLUDE_DIRS}) target_link_libraries(CommFunctions INTERFACE MPI::MPI_CXX OpenMP::OpenMP_CXX) +if (NOT USE_GASNET) + target_compile_definitions(CommFunctions INTERFACE "USE_MPI") +else() + target_compile_definitions(CommFunctions INTERFACE "USE_GASNET") +endif() + +if (PRINT_DEBUG) + target_compile_definitions(CommFunctions INTERFACE "PRINT_DEBUG") +endif() if (USE_GTL) target_link_libraries(CommFunctions INTERFACE mpi_gtl_hsa) diff --git a/CommFunctions/comm.h b/CommFunctions/comm.h old mode 100644 new mode 100755 index b82249a..ea87586 --- a/CommFunctions/comm.h +++ b/CommFunctions/comm.h @@ -131,28 +131,28 @@ numrecv = 0; if(myid == printid) { - printf("printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc); - printf(" Port: "); + fprintf(stderr, "printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc); + fprintf(stderr, " Port: "); #ifdef PORT_CUDA - printf("CUDA, "); + fprintf(stderr, "CUDA, "); #elif defined PORT_HIP - printf("HIP, "); + fprintf(stderr, "HIP, "); #elif defined PORT_ONEAPI - printf("ONEAPI, "); + fprintf(stderr, "ONEAPI, "); #else - printf("CPU, "); + fprintf(stderr, "CPU, "); #endif #ifdef CAP_NCCL - printf("NCCL, "); + fprintf(stderr, "NCCL, "); #elif CAP_ONECCL - printf("ONECCL, "); + fprintf(stderr, "ONECCL, "); #endif #ifdef IPC_kernel - printf("IPC will call a kernel, \n"); + fprintf(stderr, "IPC will call a kernel, \n"); #endif - printf("Library: "); + fprintf(stderr, "Library: "); print_lib(lib); - printf("\n"); + fprintf(stderr, "\n"); } if(lib == NCCL) { #ifdef CAP_NCCL @@ -165,7 +165,7 @@ broadcast(&id); ncclCommInitRank(&comm_nccl, numproc, id, myid); // this is where it the third gpu hangs if(myid == printid) - printf("******************** NCCL COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** NCCL COMMUNICATOR IS CREATED\n"); } #ifdef PORT_CUDA cudaStreamCreate(&stream_nccl); @@ -195,7 +195,7 @@ auto ctx = ccl::create_context(CommBench::q.get_context()); comm_ccl = new ccl::communicator(ccl::create_communicator(numproc, myid, dev, ctx, kvs)); if(myid == printid) - printf("******************** ONECCL COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** ONECCL COMMUNICATOR IS CREATED\n"); stream_ccl = new ccl::stream(ccl::create_stream(CommBench::q)); } #endif @@ -215,7 +215,7 @@ zeDeviceGetCommandQueueGroupProperties(hDevice, &numQueueGroups, queueProperties.data()); int n_commands_lists = 0; if(myid == printid) - printf("device descovery:\n"); + fprintf(stderr, "device descovery:\n"); for (uint32_t i = 0; i < numQueueGroups; i++) { bool isCompute = false; bool isCopy = false; @@ -224,10 +224,10 @@ if ((queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) isCopy = true; if(myid == printid) - printf("group %d isCompute %d isCopy %d\n", i, isCompute, isCopy); + fprintf(stderr, "group %d isCompute %d isCopy %d\n", i, isCompute, isCopy); for (uint32_t j = 0; j < queueProperties[i].numQueues; j++) { if(myid == printid) - printf(" queue: %d\n", j); + fprintf(stderr, " queue: %d\n", j); n_commands_lists++; } } @@ -253,7 +253,7 @@ } } if(myid == printid) - printf("number of command queues: %ld\n", command_queue.size()); + fprintf(stderr, "number of command queues: %ld\n", command_queue.size()); } #endif } @@ -281,7 +281,7 @@ buffer_list.clear(); buffer_count.clear(); if(myid == printid) - printf("memory freed.\n"); + fprintf(stderr, "memory freed.\n"); } template @@ -310,9 +310,9 @@ T *ptr = nullptr; // MPI_Recv(&ptr, sizeof(T*), MPI_BYTE, i, 0, comm_mpi, MPI_STATUS_IGNORE); recv(&ptr, i); - printf("Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count); + fprintf(stderr, "Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count); print_data(count * sizeof(T)); - printf(")\n"); + fprintf(stderr, ")\n"); } } }*/ @@ -334,7 +334,7 @@ // OMIT ZERO MESSAGE SIZE if(count == 0) { if(myid == printid) - printf("Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid); + fprintf(stderr, "Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid); return; } // ADJUST MESSAGE SIZE @@ -365,11 +365,11 @@ pair(&sendoffset, &sendoffset_temp, sendid, printid); pair(&recvoffset, &recvoffset_temp, recvid, printid); if(myid == printid) { - printf("Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count); + fprintf(stderr, "Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count); print_data(count * sizeof(T)); - printf(") "); + fprintf(stderr, ") "); print_lib(lib); - printf("\n"); + fprintf(stderr, "\n"); } } numcomm++; @@ -398,13 +398,13 @@ // PUT (SENDER INITIALIZES) pair(&queue, queue_temp, sendid, printid); if(myid == printid) - printf("selected put queue: %d\n", queue_temp); + fprintf(stderr, "selected put queue: %d\n", queue_temp); } if(lib == IPC_get) { // GET (RECVER INITIALIZES) pair(&queue, queue_temp, recvid, printid); if(myid == printid) - printf("selected get queue: %d\n", queue_temp); + fprintf(stderr, "selected get queue: %d\n", queue_temp); } } } @@ -448,7 +448,7 @@ if(sendid != recvid) { int error = -1; #ifdef PORT_CUDA - printf("trying ipc \n"); + fprintf(stderr, "trying ipc \n"); cudaIpcMemHandle_t memhandle; recv(&memhandle, recvid); error = cudaIpcOpenMemHandle((void**)&remotebuf[numsend], memhandle, cudaIpcMemLazyEnablePeerAccess); @@ -473,7 +473,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numsend]); #endif if(error) - printf("IpcOpenMemHandle error %d\n", error); + fprintf(stderr, "comm.h:476 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numsend], recvid); } #ifdef IPC_ze @@ -508,7 +508,7 @@ // send(&memhandle, sendid); #endif if(error) - printf("IpcGetMemHandle error %d\n", error); + fprintf(stderr, "comm.h:511 IpcGetMemHandle error %d\n", error); send(&sendoffset, recvid); } break; @@ -596,7 +596,7 @@ // send(&memhandle, sendid); #endif if(error) - printf("IpcGetMemHandle error %d\n", error); + fprintf(stderr, "IpcGetMemHandle error %d\n", error); send(&recvoffset, sendid); } break; @@ -642,7 +642,7 @@ error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numrecv]); #endif if(error) - printf("IpcOpenMemHandle error %d\n", error); + fprintf(stderr, "comm.h:645 IpcOpenMemHandle error %d\n", error); recv(&remoteoffset[numrecv], sendid); } #ifdef IPC_ze @@ -702,14 +702,14 @@ double maxTime; double avgTime; CommBench::measure(warmup, numiter, minTime, medTime, maxTime, avgTime, *this); - if(myid == printid) { + if(myid == 0) { size_t data = count * sizeof(T); - printf("data: "); print_data(data); printf("\n"); - printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); - printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); - printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); - printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); - printf("\n"); + // fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n"); + fprintf(stdout, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); + fprintf(stdout, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); + fprintf(stdout, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); + fprintf(stdout, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); + // fprintf(stderr, "\n"); } }; @@ -735,19 +735,19 @@ std::vector matrix = getMatrix(); if(myid == printid) { - printf("\nCommBench %d: ", benchid); + fprintf(stderr, "\nCommBench %d: ", benchid); print_lib(lib); - printf(" communication matrix (reciever x sender) nnz: %d\n", numcomm); + fprintf(stderr, " communication matrix (reciever x sender) nnz: %d\n", numcomm); for(int recver = 0; recver < numproc; recver++) { for(int sender = 0; sender < numproc; sender++) { size_t count = matrix[recver * numproc + sender]; if(count) - printf("%ld ", count); - // printf("1 "); + fprintf(stderr, "%ld ", count); + // fprintf(stderr, "1 "); else - printf(". "); + fprintf(stderr, ". "); } - printf("\n"); + fprintf(stderr, "\n"); } } long sendTotal = 0; @@ -775,24 +775,24 @@ MPI_Allreduce(MPI_IN_PLACE, &total_count, 1, MPI_LONG, MPI_SUM, comm_mpi); if(myid == printid) { for(int p = 0; p < numproc; p++) { - printf("proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]); + fprintf(stderr, "proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]); print_data(total_counts[p] * sizeof(T)); - printf("\n"); + fprintf(stderr, "\n"); } - printf("total pieces: %d count %ld ", total_buff, total_count); + fprintf(stderr, "total pieces: %d count %ld ", total_buff, total_count); print_data(total_count * sizeof(T)); - printf("\n"); + fprintf(stderr, "\n"); } }*/ if(myid == printid) { - printf("send footprint: %ld ", sendTotal); + fprintf(stderr, "send footprint: %ld ", sendTotal); print_data(sendTotal * sizeof(T)); - printf("\n"); - printf("recv footprint: %ld ", recvTotal); + fprintf(stderr, "\n"); + fprintf(stderr, "recv footprint: %ld ", recvTotal); print_data(recvTotal * sizeof(T)); - printf("\n"); - printf("\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); } } template @@ -817,12 +817,12 @@ /* if(myid == printid) { char filename[2048]; - sprintf(filename, "matrix_%d.txt", benchid); + sfprintf(stderr, filename, "matrix_%d.txt", benchid); FILE *matfile = fopen(filename, "w"); for(int recver = 0; recver < numproc; recver++) { for(int sender = 0; sender < numproc; sender++) - fprintf(matfile, "%ld ", matrix[sender * numproc + recver]); - fprintf(matfile, "\n"); + ffprintf(stderr, matfile, "%ld ", matrix[sender * numproc + recver]); + ffprintf(stderr, matfile, "\n"); } fclose(matfile); }*/ @@ -831,10 +831,10 @@ #ifdef IPC_kernel template - __global__ void copy_kernel(T *output, T *input, size_t count) { + __global__ void copy_kernel(T *output, const T *input) { const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - if(tid < count) - output[tid] = input[tid]; + // if(tid < count) + output[tid] = input[tid]; } #endif @@ -902,7 +902,7 @@ for(int send = 0; send < numsend; send++) { #ifdef IPC_kernel #if defined(PORT_CUDA) || defined(PORT_HIP) - copy_kernel<<<(sendcount[send] + 255) / 256, 256, 0, stream_ipc[send]>>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send]); + copy_kernel<<>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send]); #elif defined PORT_ONEAPI && !defined IPC_ze // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T)); #endif @@ -931,7 +931,7 @@ for(int recv = 0; recv < numrecv; recv++) { #ifdef IPC_kernel #if defined(PORT_CUDA) || defined(PORT_HIP) - copy_kernel<<<(recvcount[recv] + 255) / 256, 256, 0, stream_ipc[recv]>>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv], recvcount[recv]); + copy_kernel<<>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv]); #elif defined PORT_ONEAPI && !defined IPC_ze // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T)); #endif @@ -969,7 +969,7 @@ #endif default: print_lib(lib); - printf(" option is not implemented!\n"); + fprintf(stderr, " option is not implemented!\n"); break; } } @@ -1038,7 +1038,7 @@ #endif default: print_lib(lib); - printf(" option is not implemented!\n"); + fprintf(stderr, " option is not implemented!\n"); break; } } diff --git a/CommFunctions/commbench.h b/CommFunctions/commbench.h old mode 100644 new mode 100755 index 7078bc0..20f0016 --- a/CommFunctions/commbench.h +++ b/CommFunctions/commbench.h @@ -92,7 +92,11 @@ namespace CommBench { +#ifdef PRINT_DEBUG static int printid = 0; +#else + static int printid = -1; +#endif static int numbench = 0; static std::vector benchlist; static int mydevice = -1; @@ -124,26 +128,26 @@ namespace CommBench static void print_data(size_t data) { if (data < 1e3) - printf("%d bytes", (int)data); + fprintf(stderr, "%d bytes", (int)data); else if (data < 1e6) - printf("%.4f KB", data / 1e3); + fprintf(stderr, "%.4f KB", data / 1e3); else if (data < 1e9) - printf("%.4f MB", data / 1e6); + fprintf(stderr, "%.4f MB", data / 1e6); else if (data < 1e12) - printf("%.4f GB", data / 1e9); + fprintf(stderr, "%.4f GB", data / 1e9); else - printf("%.4f TB", data / 1e12); + fprintf(stderr, "%.4f TB", data / 1e12); } static void print_lib(library lib) { switch(lib) { - case dummy : printf("dummy"); break; - case IPC : printf("IPC (PUT)"); break; - case IPC_get : printf("IPC (GET)"); break; - case MPI : printf("MPI"); break; - case NCCL : printf("NCCL"); break; - case GEX : printf("GASNET (PUT)"); break; - case GEX_get : printf("GASNET (GET)"); break; - case numlib : printf("numlib"); break; + case dummy : fprintf(stderr, "dummy"); break; + case IPC : fprintf(stderr, "IPC (PUT)"); break; + case IPC_get : fprintf(stderr, "IPC (GET)"); break; + case MPI : fprintf(stderr, "MPI"); break; + case NCCL : fprintf(stderr, "NCCL"); break; + case GEX : fprintf(stderr, "GASNET (PUT)"); break; + case GEX_get : fprintf(stderr, "GASNET (GET)"); break; + case numlib : fprintf(stderr, "numlib"); break; } } @@ -289,12 +293,12 @@ namespace CommBench MPI_Comm_size(comm_mpi, &numproc); if(myid == printid) { if(!init_mpi) { - printf("#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n"); + fprintf(stderr, "#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n"); int provided; MPI_Query_thread(&provided); - printf("provided thread support: %d\n", provided); + fprintf(stderr, "provided thread support: %d\n", provided); } - printf("******************** MPI COMMUNICATOR IS CREATED\n"); + fprintf(stderr, "******************** MPI COMMUNICATOR IS CREATED\n"); } } #endif @@ -307,7 +311,7 @@ namespace CommBench myep.push_back(ep_primordial); // primordial is index 0 myep_ptr.push_back(nullptr); // primordial segment is 0 if(myid == printid) - printf("******************** GASNET CLIENT IS CREATED\n"); + fprintf(stderr, "******************** GASNET CLIENT IS CREATED\n"); #ifdef USE_GASNET gex_AM_Entry_t handlers[] = { {am_recv_index, (gex_AM_Fn_t)am_recv, GEX_FLAG_AM_REQUEST | GEX_FLAG_AM_SHORT, 0}, @@ -371,19 +375,19 @@ namespace CommBench int numiter = times.size(); if(myid == printid) { - printf("%d measurement iterations (sorted):\n", numiter); + fprintf(stderr, "%d measurement iterations (sorted):\n", numiter); for(int iter = 0; iter < numiter; iter++) { - printf("time: %.4e", times[iter] * 1e6); + fprintf(stderr, "time: %.4e", times[iter] * 1e6); if(iter == 0) - printf(" -> min\n"); + fprintf(stderr, " -> min\n"); else if(iter == numiter / 2) - printf(" -> median\n"); + fprintf(stderr, " -> median\n"); else if(iter == numiter - 1) - printf(" -> max\n"); + fprintf(stderr, " -> max\n"); else - printf("\n"); + fprintf(stderr, "\n"); } - printf("\n"); + fprintf(stderr, "\n"); } double minTime = times[0]; double medTime = times[numiter / 2]; @@ -393,12 +397,12 @@ namespace CommBench avgTime += times[iter]; avgTime /= numiter; if(myid == printid) { - printf("data: "); print_data(data); printf("\n"); - printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); - printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); - printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); - printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); - printf("\n"); + fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n"); + fprintf(stderr, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9); + fprintf(stderr, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9); + fprintf(stderr, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9); + fprintf(stderr, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9); + fprintf(stderr, "\n"); } } @@ -457,7 +461,7 @@ namespace CommBench } //for(int i = 0; i < numproc; i++) - // printf("myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]); + // fprintf(stderr, "myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]); T *sendbuf; T *recvbuf; allocate(sendbuf, senddispl[numproc]); @@ -495,7 +499,7 @@ namespace CommBench double starts[numiter]; if(myid == printid) - printf("%d warmup iterations (in order):\n", warmup); + fprintf(stderr, "%d warmup iterations (in order):\n", warmup); for (int iter = -warmup; iter < numiter; iter++) { for(int send = 0; send < comm.numsend; send++) { #if defined PORT_CUDA @@ -519,7 +523,7 @@ namespace CommBench allreduce_max(&time); if(iter < 0) { if(myid == printid) - printf("startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6); + fprintf(stderr, "startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6); } else { starts[iter] = start; @@ -530,19 +534,19 @@ namespace CommBench std::sort(starts, starts + numiter, [](const double & a, const double & b) -> bool {return a < b;}); if(myid == printid) { - printf("%d measurement iterations (sorted):\n", numiter); + fprintf(stderr, "%d measurement iterations (sorted):\n", numiter); for(int iter = 0; iter < numiter; iter++) { - printf("start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6); + fprintf(stderr, "start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6); if(iter == 0) - printf(" -> min\n"); + fprintf(stderr, " -> min\n"); else if(iter == numiter / 2) - printf(" -> median\n"); + fprintf(stderr, " -> median\n"); else if(iter == numiter - 1) - printf(" -> max\n"); + fprintf(stderr, " -> max\n"); else - printf("\n"); + fprintf(stderr, "\n"); } - printf("\n"); + fprintf(stderr, "\n"); } minTime = times[0]; medTime = times[numiter / 2]; @@ -560,18 +564,18 @@ namespace CommBench allgather(&memory, memory_all.data()); if(myid == printid) { size_t memory_total = 0; - printf("\n"); - printf("CommBench memory report:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "CommBench memory report:\n"); for(int i = 0; i < numproc; i++) { - printf("proc: %d memory ", i); + fprintf(stderr, "proc: %d memory ", i); print_data(memory_all[i]); - printf("\n"); + fprintf(stderr, "\n"); memory_total += memory_all[i]; } - printf("total memory: "); + fprintf(stderr, "total memory: "); print_data(memory_total); - printf("\n"); - printf("\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "\n"); } } diff --git a/CommFunctions/util.h b/CommFunctions/util.h old mode 100644 new mode 100755 index 4c94f05..36d28f1 --- a/CommFunctions/util.h +++ b/CommFunctions/util.h @@ -29,7 +29,7 @@ void setup_gpu() { cudaGetDeviceCount(&deviceCount); #ifdef CAP_NCCL if (numproc > deviceCount) { - printf("Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n"); + fprintf(stderr, "Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n"); } #endif int device = myid % deviceCount; @@ -37,10 +37,10 @@ void setup_gpu() { set_device(device); if(!init) { if(myid == printid) - printf("CUDA PORT\n"); + fprintf(stderr, "CUDA PORT\n"); // SET DEVICE if(myid == printid) - printf("deviceCount: %d\n", deviceCount); + fprintf(stderr, "deviceCount: %d\n", deviceCount); // REPORT if(myid == printid){ int error = system("nvidia-smi"); @@ -48,21 +48,21 @@ void setup_gpu() { int device; cudaGetDevice(&device); cudaGetDeviceCount(&deviceCount); - printf("Device %d Count: %d\n", device, deviceCount); + fprintf(stderr, "Device %d Count: %d\n", device, deviceCount); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp,0); - printf("Device %d name: %s\n",0,deviceProp.name); - printf("Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9); - printf("Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor); - printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); - printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem); - printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); - printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); - printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); - printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); - printf("Warp size: %d\n",deviceProp.warpSize); - printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); - printf("\n"); + fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name); + fprintf(stderr, "Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9); + fprintf(stderr, "Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor); + fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); + fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem); + fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); + fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize); + fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); + fprintf(stderr, "\n"); } } #elif defined PORT_HIP @@ -73,10 +73,10 @@ void setup_gpu() { set_device(device); if(!init) { if(myid == printid) - printf("HIP PORT\n"); + fprintf(stderr, "HIP PORT\n"); //DEVICE MANAGEMENT if(myid == printid) - printf("deviceCount: %d\n", deviceCount); + fprintf(stderr, "deviceCount: %d\n", deviceCount); // REPORT if(myid == printid) { system("rocm-smi"); @@ -84,34 +84,34 @@ void setup_gpu() { int device; hipGetDevice(&device); hipGetDeviceCount(&deviceCount); - printf("Device %d Count: %d\n", device, deviceCount); + fprintf(stderr, "Device %d Count: %d\n", device, deviceCount); hipDeviceProp_t deviceProp; hipGetDeviceProperties(&deviceProp,0); - printf("Device %d name: %s\n",0,deviceProp.name); - printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); - printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); - printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); - printf("Warp size: %d\n",deviceProp.warpSize); - printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); - printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); - printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); - printf("Clock frequency: %d khz\n",deviceProp.clockRate); - printf("Global memory frequency: %d khz\n", deviceProp.memoryClockRate); - printf("Global memory bus width: %d bits\n", deviceProp.memoryBusWidth); - printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem); - printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor); - printf("Number of multi-processors: %d\n", deviceProp.multiProcessorCount); - printf("L2 cache size: %d\n", deviceProp.l2CacheSize); - printf("Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor); - printf("Compute mode: %d\n", deviceProp.computeMode); - printf("Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate); - printf("\n"); + fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name); + fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem); + fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock); + fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock); + fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize); + fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock); + fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); + fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]); + fprintf(stderr, "Clock frequency: %d khz\n",deviceProp.clockRate); + fprintf(stderr, "Global memory frequency: %d khz\n", deviceProp.memoryClockRate); + fprintf(stderr, "Global memory bus width: %d bits\n", deviceProp.memoryBusWidth); + fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem); + fprintf(stderr, "Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor); + fprintf(stderr, "Number of multi-processors: %d\n", deviceProp.multiProcessorCount); + fprintf(stderr, "L2 cache size: %d\n", deviceProp.l2CacheSize); + fprintf(stderr, "Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor); + fprintf(stderr, "Compute mode: %d\n", deviceProp.computeMode); + fprintf(stderr, "Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate); + fprintf(stderr, "\n"); } } #elif defined PORT_SYCL if(!init) { if(CommBench::myid == CommBench::printid) - printf("SYCL PORT\n"); + fprintf(stderr, "SYCL PORT\n"); // Initialize the driver zeInit(0); // Discover all the driver instances @@ -134,34 +134,34 @@ void setup_gpu() { if(CommBench::myid == CommBench::printid) { if(ZE_DEVICE_TYPE_GPU == device_properties.type) - printf("driverCount %d deviceCount %d GPU\n", driverCount, deviceCount); + fprintf(stderr, "driverCount %d deviceCount %d GPU\n", driverCount, deviceCount); else - printf("GPU not found!\n"); - printf("type %d\n", device_properties.type); - printf("vendorId %d\n", device_properties.vendorId); - printf("deviceId %d\n", device_properties.deviceId); - printf("flags %d\n", device_properties.flags); - printf("subdeviceId %d\n", device_properties.subdeviceId); - printf("coreClockRate %d\n", device_properties.coreClockRate); - printf("maxMemAllocSize %ld\n", device_properties.maxMemAllocSize); - printf("maxHardwareContexts %d\n", device_properties.maxHardwareContexts); - printf("maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority); - printf("numThreadsPerEU %d\n", device_properties.numThreadsPerEU); - printf("physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth); - printf("numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice); - printf("numSlices %d\n", device_properties.numSlices); - printf("timerResolution %ld\n", device_properties.timerResolution); - printf("timestampValidBits %d\n", device_properties.timestampValidBits); - printf("kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits); + fprintf(stderr, "GPU not found!\n"); + fprintf(stderr, "type %d\n", device_properties.type); + fprintf(stderr, "vendorId %d\n", device_properties.vendorId); + fprintf(stderr, "deviceId %d\n", device_properties.deviceId); + fprintf(stderr, "flags %d\n", device_properties.flags); + fprintf(stderr, "subdeviceId %d\n", device_properties.subdeviceId); + fprintf(stderr, "coreClockRate %d\n", device_properties.coreClockRate); + fprintf(stderr, "maxMemAllocSize %ld\n", device_properties.maxMemAllocSize); + fprintf(stderr, "maxHardwareContexts %d\n", device_properties.maxHardwareContexts); + fprintf(stderr, "maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority); + fprintf(stderr, "numThreadsPerEU %d\n", device_properties.numThreadsPerEU); + fprintf(stderr, "physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth); + fprintf(stderr, "numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice); + fprintf(stderr, "numSlices %d\n", device_properties.numSlices); + fprintf(stderr, "timerResolution %ld\n", device_properties.timerResolution); + fprintf(stderr, "timestampValidBits %d\n", device_properties.timestampValidBits); + fprintf(stderr, "kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits); //for(int j = 0; j < ZE_MAX_DEVICE_UUID_SIZE; j++) - // printf("uuid %d\n", device_properties.uuid.id[j]); - printf("name %s\n", device_properties.name); - printf("\n"); + // fprintf(stderr, "uuid %d\n", device_properties.uuid.id[j]); + fprintf(stderr, "name %s\n", device_properties.name); + fprintf(stderr, "\n"); } } /*ze_bool_t test = false; zeDeviceCanAccessPeer(allDevices[0], allDevices[1], &test); - printf("can access peer %d\n", test);*/ + fprintf(stderr, "can access peer %d\n", test);*/ delete[] allDevices; } delete[] allDrivers; @@ -169,7 +169,7 @@ void setup_gpu() { #else if(!init) if(CommBench::myid == CommBench::printid) - printf("CPU VERSION\n"); + fprintf(stderr, "CPU VERSION\n"); #endif init = true; } diff --git a/CommFunctions/validate.h b/CommFunctions/validate.h old mode 100644 new mode 100755 index 8d03cd7..7dbb31c --- a/CommFunctions/validate.h +++ b/CommFunctions/validate.h @@ -1,5 +1,5 @@ template -void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll) { +void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll, int source = 0, int dest = 1) { int myid = CommBench::myid; int numproc = CommBench::numproc; @@ -26,10 +26,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c switch(pattern) { case 0: { - if(myid == 0) printf("VERIFY P2P\n"); - if(myid == 1) { + if(myid == source) fprintf(stderr, "VERIFY P2P\n"); + if(myid == dest) { for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != i) pass = false; } @@ -38,11 +38,11 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 1: { - if(myid == ROOT) printf("VERIFY GATHER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY GATHER\n"); if(myid == ROOT) { for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != i) pass = false; } @@ -51,9 +51,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 2: { - if(myid == ROOT) printf("VERIFY SCATTER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY SCATTER\n"); for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != myid * count + i) pass = false; } @@ -61,9 +61,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 3: { - if(myid == ROOT) printf("VERIFY BCAST\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY BCAST\n"); for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[i] != i) pass = false; } @@ -71,16 +71,16 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 4: { - if(myid == ROOT) printf("REDUCE IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "REDUCE IS NOT TESTED\n"); pass = false; } break; case 5: { - if(myid == ROOT) printf("VERIFY ALL-TO-ALL\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY ALL-TO-ALL\n"); for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != myid * count + i) pass = false; } @@ -88,10 +88,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 6: { - if(myid == ROOT) printf("VERIFY ALL-GATHER\n"); + if(myid == ROOT) fprintf(stderr, "VERIFY ALL-GATHER\n"); for(int p = 0; p < numproc; p++) for(size_t i = 0; i < count; i++) { - // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); + // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]); if(recvbuf[p * count + i] != i) pass = false; } @@ -99,13 +99,13 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c break; case 7: { - if(myid == ROOT) printf("REDUCE-SCATTER IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "REDUCE-SCATTER IS NOT TESTED\n"); pass = false; } break; case 8: { - if(myid == ROOT) printf("ALL-REDUCE IS NOT TESTED\n"); + if(myid == ROOT) fprintf(stderr, "ALL-REDUCE IS NOT TESTED\n"); pass = false; } break; @@ -113,9 +113,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c pass = CommBench::allreduce_land(pass); if(myid == ROOT) { if(pass) - printf("PASSED!\n"); + fprintf(stderr, "PASSED!\n"); else - printf("FAILED!!!\n"); + fprintf(stderr, "FAILED!!!\n"); } CommBench::freeHost(sendbuf); diff --git a/bench.cpp b/bench.cpp new file mode 100644 index 0000000..fca92d9 --- /dev/null +++ b/bench.cpp @@ -0,0 +1,70 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 31; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int source = 0; source < gpus; source++) { + for (int dest = 0; dest < gpus; dest++) { + if (myid == 0) + fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); + // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder, + lib_str, source, dest, 1ull << size); + freopen(file_name, "w", stdout); + } + bench.add(sendbuf, recvbuf, 1ull << size, source, dest); + bench.measure(5, 10); + // bench.clear(); + } + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + // fprintf(stderr, "Completed testing library %s\n", libs_str[i].c_str()); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_alltoall.cpp b/bench_alltoall.cpp new file mode 100755 index 0000000..1ada1ce --- /dev/null +++ b/bench_alltoall.cpp @@ -0,0 +1,66 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 25; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, (1ull << max_num_bytes) * numproc); + allocate(recvbuf, (1ull << max_num_bytes) * numproc); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rTesting size: %llu bytes", 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%llu.out", folder, + lib_str, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int sender = 0; sender < numproc; sender++) + for (int recver = 0; recver < numproc; recver++) + bench.add(sendbuf, recver * (1ull << size), recvbuf, sender * (1ull << size), + 1ull << size, sender, recver); + bench.measure(5, 10); + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_broadcast.cpp b/bench_broadcast.cpp new file mode 100755 index 0000000..8da3cfe --- /dev/null +++ b/bench_broadcast.cpp @@ -0,0 +1,66 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 31; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int source = 0; source < gpus; source++) { + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rBroadcaster: %d, size: %llu", source, 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%llu.out", folder, + lib_str, source, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int p = 0; p < numproc; p++) + bench.add(sendbuf, 0, recvbuf, 0, 1ull << size, source, p); + bench.measure(5, 10); + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_gather.cpp b/bench_gather.cpp new file mode 100755 index 0000000..447631b --- /dev/null +++ b/bench_gather.cpp @@ -0,0 +1,67 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 27; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, (1ull << max_num_bytes) * numproc); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int dest = 0; dest < gpus; dest++) { + // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + if (myid == 0) + fprintf(stderr, "\033[2K\rGatherer: %d, size: %llu", dest, 1ull << size); + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%llu.out", folder, + lib_str, dest, 1ull << size); + freopen(file_name, "w", stdout); + } + for (int p = 0; p < numproc; p++) + bench.add(sendbuf, 0, recvbuf, p * (1ull << size), 1ull << size, p, dest); + bench.measure(5, 10); + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/bench_p2p.cpp b/bench_p2p.cpp new file mode 100755 index 0000000..c848960 --- /dev/null +++ b/bench_p2p.cpp @@ -0,0 +1,68 @@ +#include "commbench.h" +#include +#include +#include + +using namespace CommBench; + +int main(int argc, char *argv[]) { + init(); + + if (argc < 3) { + if (myid == 0) + fprintf(stderr, "Invalid arguments, usage is ./bench \n"); + return 1; + } + char* folder = argv[1]; + + int *sendbuf, *recvbuf; + int max_num_bytes = 31; + char file_name[100]; + int gpus = numproc; + + allocate(sendbuf, 1ull << max_num_bytes); + allocate(recvbuf, 1ull << max_num_bytes); + + for (int i = 2; i < argc; i++) { + char* lib_str = argv[i]; + library lib; + if (strcmp(lib_str, "mpi") == 0) + lib = library::MPI; + else if (strcmp(lib_str, "ipc_put") == 0) + lib = library::IPC; + else if (strcmp(lib_str, "ipc_get") == 0) + lib = library::IPC_get; + else if (strcmp(lib_str, "xccl") == 0) + lib = library::NCCL; + else { + if (myid == 0) + fprintf(stderr, "Invalid library %s\n", lib_str); + return 1; + } + if (myid == 0) + fprintf(stderr, "Testing library %s\n", lib_str); + for (int source = 0; source < gpus; source++) { + for (int dest = 0; dest < gpus; dest++) { + if (myid == 0) + fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest); + for (int size = 2; size < max_num_bytes; size+=2) { + Comm bench(lib); + if (myid == 0) { + sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder, + lib_str, source, dest, 1ull << size); + freopen(file_name, "w", stdout); + } + bench.add(sendbuf, recvbuf, 1ull << size, source, dest); + // bench.add(sendbuf, recvbuf, 1ull << size, dest, source); + bench.measure(5, 10); + } + } + } + + if (myid == 0) + fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str); + } + free(sendbuf); + free(recvbuf); + MPI_Finalize(); +} diff --git a/data_final.tar.gz b/data_final.tar.gz new file mode 100644 index 0000000..2d8710e Binary files /dev/null and b/data_final.tar.gz differ diff --git a/instructions.md b/instructions.md new file mode 100644 index 0000000..1246bdb --- /dev/null +++ b/instructions.md @@ -0,0 +1,85 @@ +# Overview of Usage + +This branch provides the same `CommBench` executable with the standard usage flags, but also adds the `--source` and `--dest` flags that can be used for P2P, gather, and broadcast patterns. Gather only supports `--dest` and broadcast only supports `--source`. + +In addition, we provide four new targets. `bench_p2p`, `bench_gather`, `bench_broadcast` and `bench_alltoall` for these respective patterns. These provide support for "mass benchmarking tests", testing all combinations of the available processors. The usage for these is `bench_ ` where you can provide a space separated list of libraries (out of `ipc_get`, `ipc_put`, `mpi`, and `xccl`) to test. + +This will then dump the output for each of these benchmarks into a lot of files in the directories `output_folder/` for each specified library. These can be parsed and visualized later. These DO NOT create the directories themselves, so make sure that the directory `output_folder/` exists for every library you want to test. Otherwise the benchmarks will silently go on + +# Building CommBench for El Dorado Benchmarking + +In this branch, most of the "debug" output is disabled by default and when enabled is directed to `stderr`. To enable this output, configure CMake with the `-DPRINT_DEBUG=ON` flag, but by default it is off so that only the necessary output is shown. + +To build, run the following in the CommBench directory: +```sh +cmake -S . -B build -DUSE_HIP=ON -DUSE_XCCL=ON -DXCCL_PATH=/opt/rocm-6.3.1 -DUSE_GTL=ON -DGTL_PATH=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0 -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE -DMPI_C_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicc -DMPI_CXX_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicxx +``` +and if you want to use CommBench's `IPC_kernel` for the `ipc_put` and `ipc_get` communications, add the flag `-DUSE_BLIT_KERNEL=ON`. It may be helpful to have two separate build folders, one with and one without this flag. Adjust `-B ` accordingly. + +Make sure to have the `PrgEnv-amd/8.6.0`, `craype-accel-amd-gfx942`, `cmake/3.24.2` and `rocm/6.3.1` modules loaded. To build for `rocm/6.4.1`, load `amd/6.4.1` and `rocm/6.4.1`. + +# Running Benchmarks + +Make sure to have to have `MPICH_GPU_SUPPORT_ENABLED=1` set. Experiment with the `HSA_ENABLE_SDMA=0` environment variable as well. + +Set up the output folders before running the benchmark. The scripts expect this file structure: +``` +pattern +├── CPX +│ ├── lib1 +│ ├── lib2 +│ └── lib3 +├── SPX +│ ├── lib1 +│ ├── lib2 +│ └── lib3 +└── TPX + ├── lib1 + ├── lib2 + └── lib3 +``` +You can technically have different libraries specified for the partitions, but as mentioned before, make sure the folders are created inside the partition. + +Allocate an instance with +```sh +flux alloc -N1 --setattr=gpumode= --conf=resource.rediscover=true --time-limit=