diff --git a/CMakeLists.txt b/CMakeLists.txt
old mode 100644
new mode 100755
index 3fb140f..e8aa377
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,6 +53,7 @@ option(USE_BLIT_KERNEL "Use Blit/Copy Kernel for IPC communication" OFF)
 
 option(USE_CALIPER "Enable Caliper benchmarking support" OFF)
 option(USE_JSONCPP "Enable support for passing a config file" OFF)
+option(PRINT_DEBUG "Enable debug output" OFF)
 
 
 # Create a variable for NCCL path
@@ -83,15 +84,31 @@ add_subdirectory(CommFunctions) # add subdirectory after all configuration, may
 
 # Add your source files and executable here
 add_executable(${PROJECT_NAME} main.cpp)
-
-target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags OpenMP::OpenMP_CXX)
+add_executable(bench_p2p bench_p2p.cpp)
+add_executable(bench_alltoall bench_alltoall.cpp)
+add_executable(bench_broadcast bench_broadcast.cpp)
+add_executable(bench_gather bench_gather.cpp)
+
+target_link_libraries(${PROJECT_NAME} PUBLIC CommFunctions commbench_compiler_flags)
+target_link_libraries(bench_p2p PUBLIC CommFunctions commbench_compiler_flags)
+target_link_libraries(bench_alltoall PUBLIC CommFunctions commbench_compiler_flags)
+target_link_libraries(bench_broadcast PUBLIC CommFunctions commbench_compiler_flags)
+target_link_libraries(bench_gather PUBLIC CommFunctions commbench_compiler_flags)
 install(TARGETS ${PROJECT_NAME} DESTINATION bin)
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/CommFunctions DESTINATION include FILES_MATCHING PATTERN "*.h")
 
 if (USE_CUDA)
     set_source_files_properties(main.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE CUDA)
 elseif (USE_HIP)
     set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(bench_p2p.cpp PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(bench_alltoall.cpp PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(bench_broadcast.cpp PROPERTIES LANGUAGE HIP)
+    set_source_files_properties(bench_gather.cpp PROPERTIES LANGUAGE HIP)
 endif()
 
 if (USE_JSONCPP)
diff --git a/CommFunctions/CMakeLists.txt b/CommFunctions/CMakeLists.txt
old mode 100644
new mode 100755
index ccf3da4..1b11657
--- a/CommFunctions/CMakeLists.txt
+++ b/CommFunctions/CMakeLists.txt
@@ -2,6 +2,15 @@
 add_library(CommFunctions INTERFACE commbench.h validate.h comm.h util.h)
 target_include_directories(CommFunctions INTERFACE ${MPI_CXX_COMPILER_INCLUDE_DIRS})
 target_link_libraries(CommFunctions INTERFACE MPI::MPI_CXX OpenMP::OpenMP_CXX)
+if (NOT USE_GASNET)
+    target_compile_definitions(CommFunctions INTERFACE "USE_MPI")
+else()
+    target_compile_definitions(CommFunctions INTERFACE "USE_GASNET")
+endif()
+
+if (PRINT_DEBUG)
+    target_compile_definitions(CommFunctions INTERFACE "PRINT_DEBUG")
+endif()
 
 if (USE_GTL)
     target_link_libraries(CommFunctions INTERFACE mpi_gtl_hsa)
diff --git a/CommFunctions/comm.h b/CommFunctions/comm.h
old mode 100644
new mode 100755
index b82249a..ea87586
--- a/CommFunctions/comm.h
+++ b/CommFunctions/comm.h
@@ -131,28 +131,28 @@
     numrecv = 0;
 
     if(myid == printid) {
-      printf("printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc);
-      printf("  Port: ");
+      fprintf(stderr, "printid: %d Create Bench %d with %d processors\n", printid, benchid, numproc);
+      fprintf(stderr, "  Port: ");
 #ifdef PORT_CUDA
-      printf("CUDA, ");
+      fprintf(stderr, "CUDA, ");
 #elif defined PORT_HIP
-      printf("HIP, ");
+      fprintf(stderr, "HIP, ");
 #elif defined PORT_ONEAPI
-      printf("ONEAPI, ");
+      fprintf(stderr, "ONEAPI, ");
 #else
-      printf("CPU, ");
+      fprintf(stderr, "CPU, ");
 #endif
 #ifdef CAP_NCCL
-      printf("NCCL, ");
+      fprintf(stderr, "NCCL, ");
 #elif CAP_ONECCL
-      printf("ONECCL, ");
+      fprintf(stderr, "ONECCL, ");
 #endif
 #ifdef IPC_kernel
-      printf("IPC will call a kernel, \n");
+      fprintf(stderr, "IPC will call a kernel, \n");
 #endif
-      printf("Library: ");
+      fprintf(stderr, "Library: ");
       print_lib(lib);
-      printf("\n");
+      fprintf(stderr, "\n");
     }
     if(lib == NCCL) {
 #ifdef CAP_NCCL
@@ -165,7 +165,7 @@
         broadcast(&id);
         ncclCommInitRank(&comm_nccl, numproc, id, myid); // this is where it the third gpu hangs
         if(myid == printid)
-          printf("******************** NCCL COMMUNICATOR IS CREATED\n");
+          fprintf(stderr, "******************** NCCL COMMUNICATOR IS CREATED\n");
       }
 #ifdef PORT_CUDA
       cudaStreamCreate(&stream_nccl);
@@ -195,7 +195,7 @@
         auto ctx = ccl::create_context(CommBench::q.get_context());
         comm_ccl = new ccl::communicator(ccl::create_communicator(numproc, myid, dev, ctx, kvs));
         if(myid == printid)
-          printf("******************** ONECCL COMMUNICATOR IS CREATED\n");
+          fprintf(stderr, "******************** ONECCL COMMUNICATOR IS CREATED\n");
         stream_ccl = new ccl::stream(ccl::create_stream(CommBench::q));
       }
 #endif
@@ -215,7 +215,7 @@
       zeDeviceGetCommandQueueGroupProperties(hDevice, &numQueueGroups, queueProperties.data());
       int n_commands_lists = 0;
       if(myid == printid)
-        printf("device descovery:\n");
+        fprintf(stderr, "device descovery:\n");
       for (uint32_t i = 0; i < numQueueGroups; i++) {
         bool isCompute = false;
         bool isCopy = false;
@@ -224,10 +224,10 @@
         if ((queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY))
           isCopy = true;
         if(myid == printid)
-          printf("group %d isCompute %d isCopy %d\n", i, isCompute, isCopy);
+          fprintf(stderr, "group %d isCompute %d isCopy %d\n", i, isCompute, isCopy);
         for (uint32_t j = 0; j < queueProperties[i].numQueues; j++) {
           if(myid == printid)
-            printf("  queue: %d\n", j);
+            fprintf(stderr, "  queue: %d\n", j);
           n_commands_lists++;
         }
       }
@@ -253,7 +253,7 @@
           }
         }
         if(myid == printid)
-          printf("number of command queues: %ld\n", command_queue.size());
+          fprintf(stderr, "number of command queues: %ld\n", command_queue.size());
       }
 #endif
     }
@@ -281,7 +281,7 @@
     buffer_list.clear();
     buffer_count.clear();
     if(myid == printid)
-      printf("memory freed.\n");
+      fprintf(stderr, "memory freed.\n");
   }
 
   template <typename T>
@@ -310,9 +310,9 @@
         T *ptr = nullptr;
         // MPI_Recv(&ptr, sizeof(T*), MPI_BYTE, i, 0, comm_mpi, MPI_STATUS_IGNORE);
         recv(&ptr, i);
-        printf("Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count);
+        fprintf(stderr, "Bench %d proc %d allocate %p count %ld (", benchid, i, ptr, count);
         print_data(count * sizeof(T));
-        printf(")\n");
+        fprintf(stderr, ")\n");
       }
     }
   }*/
@@ -334,7 +334,7 @@
     // OMIT ZERO MESSAGE SIZE
     if(count == 0) {
       if(myid == printid)
-        printf("Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid);
+        fprintf(stderr, "Bench %d communication (%d->%d) count = 0 (skipped)\n", benchid, sendid, recvid);
       return;
     }
     // ADJUST MESSAGE SIZE
@@ -365,11 +365,11 @@
       pair(&sendoffset, &sendoffset_temp, sendid, printid);
       pair(&recvoffset, &recvoffset_temp, recvid, printid);
       if(myid == printid) {
-        printf("Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count);
+        fprintf(stderr, "Bench %d comm %d (%d->%d) sendbuf %p sendoffset %zu recvbuf %p recvoffset %zu count %zu (", benchid, numcomm, sendid, recvid, sendbuf_temp, sendoffset_temp, recvbuf_temp, recvoffset_temp, count);
         print_data(count * sizeof(T));
-        printf(") ");
+        fprintf(stderr, ") ");
         print_lib(lib);
-        printf("\n");
+        fprintf(stderr, "\n");
       }
     }
     numcomm++;
@@ -398,13 +398,13 @@
           // PUT (SENDER INITIALIZES)
           pair(&queue, queue_temp, sendid, printid);
           if(myid == printid)
-            printf("selected put queue: %d\n", queue_temp);
+            fprintf(stderr, "selected put queue: %d\n", queue_temp);
         }
         if(lib == IPC_get) {
           // GET (RECVER INITIALIZES)
           pair(&queue, queue_temp, recvid, printid);
           if(myid == printid)
-            printf("selected get queue: %d\n", queue_temp);
+            fprintf(stderr, "selected get queue: %d\n", queue_temp);
         }
       }
     }
@@ -448,7 +448,7 @@
           if(sendid != recvid) {
             int error = -1;
 #ifdef PORT_CUDA
-            printf("trying ipc \n");
+            fprintf(stderr, "trying ipc \n");
             cudaIpcMemHandle_t memhandle;
             recv(&memhandle, recvid);
             error = cudaIpcOpenMemHandle((void**)&remotebuf[numsend], memhandle, cudaIpcMemLazyEnablePeerAccess);
@@ -473,7 +473,7 @@
 	    error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numsend]);
 #endif
             if(error)
-              printf("IpcOpenMemHandle error %d\n", error);
+              fprintf(stderr, "comm.h:476 IpcOpenMemHandle error %d\n", error);
             recv(&remoteoffset[numsend], recvid);
           }
 #ifdef IPC_ze
@@ -508,7 +508,7 @@
             // send(&memhandle, sendid);
 #endif
             if(error)
-              printf("IpcGetMemHandle error %d\n", error);
+              fprintf(stderr, "comm.h:511 IpcGetMemHandle error %d\n", error);
             send(&sendoffset, recvid);
           }
           break;
@@ -596,7 +596,7 @@
             // send(&memhandle, sendid);
 #endif
             if(error)
-              printf("IpcGetMemHandle error %d\n", error);
+              fprintf(stderr, "IpcGetMemHandle error %d\n", error);
             send(&recvoffset, sendid);
           }
           break;
@@ -642,7 +642,7 @@
             error = zeMemOpenIpcHandle(zeContext, zeDevice, memhandle, 0, (void**)&remotebuf[numrecv]);
 #endif
             if(error)
-              printf("IpcOpenMemHandle error %d\n", error);
+              fprintf(stderr, "comm.h:645 IpcOpenMemHandle error %d\n", error);
             recv(&remoteoffset[numrecv], sendid);
           }
 #ifdef IPC_ze
@@ -702,14 +702,14 @@
     double maxTime;
     double avgTime;
     CommBench::measure(warmup, numiter, minTime, medTime, maxTime, avgTime, *this);
-    if(myid == printid) {
+    if(myid == 0) {
       size_t data = count * sizeof(T);
-      printf("data: "); print_data(data); printf("\n");
-      printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
-      printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
-      printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
-      printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
-      printf("\n");
+      // fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n");
+      fprintf(stdout, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
+      fprintf(stdout, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
+      fprintf(stdout, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
+      fprintf(stdout, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
+      // fprintf(stderr, "\n");
     }
   };
 
@@ -735,19 +735,19 @@
     std::vector<size_t> matrix = getMatrix();
 
     if(myid == printid) {
-      printf("\nCommBench %d: ", benchid);
+      fprintf(stderr, "\nCommBench %d: ", benchid);
       print_lib(lib);
-      printf(" communication matrix (reciever x sender) nnz: %d\n", numcomm);
+      fprintf(stderr, " communication matrix (reciever x sender) nnz: %d\n", numcomm);
       for(int recver = 0; recver < numproc; recver++) {
         for(int sender = 0; sender < numproc; sender++) {
           size_t count = matrix[recver * numproc + sender];
           if(count)
-            printf("%ld ", count);
-            // printf("1 ");
+            fprintf(stderr, "%ld ", count);
+            // fprintf(stderr, "1 ");
           else
-            printf(". ");
+            fprintf(stderr, ". ");
         }
-        printf("\n");
+        fprintf(stderr, "\n");
       }
     }
     long sendTotal = 0;
@@ -775,24 +775,24 @@
       MPI_Allreduce(MPI_IN_PLACE, &total_count, 1, MPI_LONG, MPI_SUM, comm_mpi);
       if(myid == printid) {
         for(int p = 0; p < numproc; p++) {
-          printf("proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]);
+          fprintf(stderr, "proc %d: %d pieces count %ld ", p, total_buffs[p], total_counts[p]);
           print_data(total_counts[p] * sizeof(T));
-          printf("\n");
+          fprintf(stderr, "\n");
         }
-        printf("total pieces: %d count %ld ", total_buff, total_count);
+        fprintf(stderr, "total pieces: %d count %ld ", total_buff, total_count);
         print_data(total_count * sizeof(T));
-        printf("\n");
+        fprintf(stderr, "\n");
       }
     }*/
 
     if(myid == printid) {
-      printf("send footprint: %ld ", sendTotal);
+      fprintf(stderr, "send footprint: %ld ", sendTotal);
       print_data(sendTotal * sizeof(T));
-      printf("\n");
-      printf("recv footprint: %ld ", recvTotal);
+      fprintf(stderr, "\n");
+      fprintf(stderr, "recv footprint: %ld ", recvTotal);
       print_data(recvTotal * sizeof(T));
-      printf("\n");
-      printf("\n");
+      fprintf(stderr, "\n");
+      fprintf(stderr, "\n");
     }
   }
   template <typename T>
@@ -817,12 +817,12 @@
 
     /* if(myid == printid) {
       char filename[2048];
-      sprintf(filename, "matrix_%d.txt", benchid);
+      sfprintf(stderr, filename, "matrix_%d.txt", benchid);
       FILE *matfile = fopen(filename, "w");
       for(int recver = 0; recver < numproc; recver++) {
         for(int sender = 0; sender < numproc; sender++)
-          fprintf(matfile, "%ld ", matrix[sender * numproc + recver]);
-        fprintf(matfile, "\n");
+          ffprintf(stderr, matfile, "%ld ", matrix[sender * numproc + recver]);
+        ffprintf(stderr, matfile, "\n");
       }
       fclose(matfile);
     }*/
@@ -831,10 +831,10 @@
 
 #ifdef IPC_kernel
   template <typename T>
-  __global__ void copy_kernel(T *output, T *input, size_t count) {
+  __global__ void copy_kernel(T *output, const T *input) {
     const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if(tid < count)
-      output[tid] = input[tid];
+    // if(tid < count)
+    output[tid] = input[tid];
   }
 #endif
 
@@ -902,7 +902,7 @@
         for(int send = 0; send < numsend; send++) {
 #ifdef IPC_kernel
   #if defined(PORT_CUDA) || defined(PORT_HIP)
-          copy_kernel<T><<<(sendcount[send] + 255) / 256, 256, 0, stream_ipc[send]>>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send]);
+          copy_kernel<T><<<dim3((sendcount[send] + 1023) / 1024), dim3(1024), 0, stream_ipc[send]>>>(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send]);
   #elif defined PORT_ONEAPI && !defined IPC_ze
           // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T));
   #endif
@@ -931,7 +931,7 @@
         for(int recv = 0; recv < numrecv; recv++) {
 #ifdef IPC_kernel
   #if defined(PORT_CUDA) || defined(PORT_HIP)
-          copy_kernel<T><<<(recvcount[recv] + 255) / 256, 256, 0, stream_ipc[recv]>>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv], recvcount[recv]);
+          copy_kernel<T><<<dim3((recvcount[recv] + 1023) / 1024), dim3(1024), 0, stream_ipc[recv]>>>(recvbuf[recv] + recvoffset[recv], remotebuf[recv] + remoteoffset[recv]);
   #elif defined PORT_ONEAPI && !defined IPC_ze
           // q_ipc[send].memcpy(remotebuf[send] + remoteoffset[send], sendbuf[send] + sendoffset[send], sendcount[send] * sizeof(T));
   #endif
@@ -969,7 +969,7 @@
 #endif
       default:
         print_lib(lib);
-        printf(" option is not implemented!\n");
+        fprintf(stderr, " option is not implemented!\n");
         break;
     }
   }
@@ -1038,7 +1038,7 @@
 #endif
       default:
         print_lib(lib);
-        printf(" option is not implemented!\n");
+        fprintf(stderr, " option is not implemented!\n");
         break;
     }
   }
diff --git a/CommFunctions/commbench.h b/CommFunctions/commbench.h
old mode 100644
new mode 100755
index 7078bc0..20f0016
--- a/CommFunctions/commbench.h
+++ b/CommFunctions/commbench.h
@@ -92,7 +92,11 @@
 
 namespace CommBench
 {
+#ifdef PRINT_DEBUG
   static int printid = 0;
+#else
+  static int printid = -1;
+#endif
   static int numbench = 0;
   static std::vector<void*> benchlist;
   static int mydevice = -1;
@@ -124,26 +128,26 @@ namespace CommBench
 
   static void print_data(size_t data) {
     if (data < 1e3)
-      printf("%d bytes", (int)data);
+      fprintf(stderr, "%d bytes", (int)data);
     else if (data < 1e6)
-      printf("%.4f KB", data / 1e3);
+      fprintf(stderr, "%.4f KB", data / 1e3);
     else if (data < 1e9)
-      printf("%.4f MB", data / 1e6);
+      fprintf(stderr, "%.4f MB", data / 1e6);
     else if (data < 1e12)
-      printf("%.4f GB", data / 1e9);
+      fprintf(stderr, "%.4f GB", data / 1e9);
     else
-      printf("%.4f TB", data / 1e12);
+      fprintf(stderr, "%.4f TB", data / 1e12);
   }
   static void print_lib(library lib) {
     switch(lib) {
-      case dummy      : printf("dummy");        break;
-      case IPC        : printf("IPC (PUT)");    break;
-      case IPC_get    : printf("IPC (GET)");    break;
-      case MPI        : printf("MPI");          break;
-      case NCCL       : printf("NCCL");         break;
-      case GEX        : printf("GASNET (PUT)"); break;
-      case GEX_get    : printf("GASNET (GET)"); break;
-      case numlib     : printf("numlib");       break;
+      case dummy      : fprintf(stderr, "dummy");        break;
+      case IPC        : fprintf(stderr, "IPC (PUT)");    break;
+      case IPC_get    : fprintf(stderr, "IPC (GET)");    break;
+      case MPI        : fprintf(stderr, "MPI");          break;
+      case NCCL       : fprintf(stderr, "NCCL");         break;
+      case GEX        : fprintf(stderr, "GASNET (PUT)"); break;
+      case GEX_get    : fprintf(stderr, "GASNET (GET)"); break;
+      case numlib     : fprintf(stderr, "numlib");       break;
     }
   }
 
@@ -289,12 +293,12 @@ namespace CommBench
       MPI_Comm_size(comm_mpi, &numproc);
       if(myid == printid) {
         if(!init_mpi) {
-          printf("#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n");
+          fprintf(stderr, "#################### MPI IS INITIALIZED, it is user's responsibility to finalize.\n");
           int provided;
           MPI_Query_thread(&provided);
-          printf("provided thread support: %d\n", provided);
+          fprintf(stderr, "provided thread support: %d\n", provided);
         }
-        printf("******************** MPI COMMUNICATOR IS CREATED\n");
+        fprintf(stderr, "******************** MPI COMMUNICATOR IS CREATED\n");
       }
     }
 #endif
@@ -307,7 +311,7 @@ namespace CommBench
       myep.push_back(ep_primordial); // primordial is index 0
       myep_ptr.push_back(nullptr); // primordial segment is 0
       if(myid == printid)
-        printf("******************** GASNET CLIENT IS CREATED\n");
+        fprintf(stderr, "******************** GASNET CLIENT IS CREATED\n");
 #ifdef USE_GASNET
       gex_AM_Entry_t handlers[] = {
         {am_recv_index, (gex_AM_Fn_t)am_recv, GEX_FLAG_AM_REQUEST | GEX_FLAG_AM_SHORT, 0},
@@ -371,19 +375,19 @@ namespace CommBench
     int numiter = times.size();
 
     if(myid == printid) {
-      printf("%d measurement iterations (sorted):\n", numiter);
+      fprintf(stderr, "%d measurement iterations (sorted):\n", numiter);
       for(int iter = 0; iter < numiter; iter++) {
-        printf("time: %.4e", times[iter] * 1e6);
+        fprintf(stderr, "time: %.4e", times[iter] * 1e6);
         if(iter == 0)
-          printf(" -> min\n");
+          fprintf(stderr, " -> min\n");
         else if(iter == numiter / 2)
-          printf(" -> median\n");
+          fprintf(stderr, " -> median\n");
         else if(iter == numiter - 1)
-          printf(" -> max\n");
+          fprintf(stderr, " -> max\n");
         else
-          printf("\n");
+          fprintf(stderr, "\n");
       }
-      printf("\n");
+      fprintf(stderr, "\n");
     }
     double minTime = times[0];
     double medTime = times[numiter / 2];
@@ -393,12 +397,12 @@ namespace CommBench
       avgTime += times[iter];
     avgTime /= numiter;
     if(myid == printid) {
-      printf("data: "); print_data(data); printf("\n");
-      printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
-      printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
-      printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
-      printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
-      printf("\n");
+      fprintf(stderr, "data: "); print_data(data); fprintf(stderr, "\n");
+      fprintf(stderr, "minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
+      fprintf(stderr, "medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
+      fprintf(stderr, "maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
+      fprintf(stderr, "avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
+      fprintf(stderr, "\n");
     }
   }
 
@@ -457,7 +461,7 @@ namespace CommBench
 
     }
     //for(int i = 0; i < numproc; i++)
-    //  printf("myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]);
+    //  fprintf(stderr, "myid %d i: %d sendcount %d senddispl %d recvcount %d recvdispl %d\n", myid, i, sendcount[i], senddispl[i], recvcount[i], recvdispl[i]);
     T *sendbuf;
     T *recvbuf;
     allocate(sendbuf, senddispl[numproc]);
@@ -495,7 +499,7 @@ namespace CommBench
     double starts[numiter];
 
     if(myid == printid)
-      printf("%d warmup iterations (in order):\n", warmup);
+      fprintf(stderr, "%d warmup iterations (in order):\n", warmup);
     for (int iter = -warmup; iter < numiter; iter++) {
       for(int send = 0; send < comm.numsend; send++) {
 #if defined PORT_CUDA
@@ -519,7 +523,7 @@ namespace CommBench
       allreduce_max(&time);
       if(iter < 0) {
         if(myid == printid)
-          printf("startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6);
+          fprintf(stderr, "startup %.2e warmup: %.2e\n", start * 1e6, time * 1e6);
       }
       else {
         starts[iter] = start;
@@ -530,19 +534,19 @@ namespace CommBench
     std::sort(starts, starts + numiter,  [](const double & a, const double & b) -> bool {return a < b;});
 
     if(myid == printid) {
-      printf("%d measurement iterations (sorted):\n", numiter);
+      fprintf(stderr, "%d measurement iterations (sorted):\n", numiter);
       for(int iter = 0; iter < numiter; iter++) {
-        printf("start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6);
+        fprintf(stderr, "start: %.4e time: %.4e", starts[iter] * 1e6, times[iter] * 1e6);
         if(iter == 0)
-          printf(" -> min\n");
+          fprintf(stderr, " -> min\n");
         else if(iter == numiter / 2)
-          printf(" -> median\n");
+          fprintf(stderr, " -> median\n");
         else if(iter == numiter - 1)
-          printf(" -> max\n");
+          fprintf(stderr, " -> max\n");
         else
-          printf("\n");
+          fprintf(stderr, "\n");
       }
-      printf("\n");
+      fprintf(stderr, "\n");
     }
     minTime = times[0];
     medTime = times[numiter / 2];
@@ -560,18 +564,18 @@ namespace CommBench
     allgather(&memory, memory_all.data());
     if(myid == printid) {
       size_t memory_total = 0;
-      printf("\n");
-      printf("CommBench memory report:\n");
+      fprintf(stderr, "\n");
+      fprintf(stderr, "CommBench memory report:\n");
       for(int i = 0; i < numproc; i++) {
-        printf("proc: %d memory ", i);
+        fprintf(stderr, "proc: %d memory ", i);
         print_data(memory_all[i]);
-        printf("\n");
+        fprintf(stderr, "\n");
         memory_total += memory_all[i];
       }
-      printf("total memory: ");
+      fprintf(stderr, "total memory: ");
       print_data(memory_total);
-      printf("\n");
-      printf("\n");
+      fprintf(stderr, "\n");
+      fprintf(stderr, "\n");
     }
   }
 
diff --git a/CommFunctions/util.h b/CommFunctions/util.h
old mode 100644
new mode 100755
index 4c94f05..36d28f1
--- a/CommFunctions/util.h
+++ b/CommFunctions/util.h
@@ -29,7 +29,7 @@ void setup_gpu() {
   cudaGetDeviceCount(&deviceCount);
   #ifdef CAP_NCCL
     if (numproc > deviceCount) {
-      printf("Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n");
+      fprintf(stderr, "Warning: Using the same device for different ranks of a communicator for NCCL is not supported\n");
     }
   #endif
   int device = myid % deviceCount;
@@ -37,10 +37,10 @@ void setup_gpu() {
   set_device(device);
   if(!init) {
     if(myid == printid)
-      printf("CUDA PORT\n");
+      fprintf(stderr, "CUDA PORT\n");
     // SET DEVICE
     if(myid == printid)
-      printf("deviceCount: %d\n", deviceCount);
+      fprintf(stderr, "deviceCount: %d\n", deviceCount);
     // REPORT
     if(myid == printid){
       int error = system("nvidia-smi");
@@ -48,21 +48,21 @@ void setup_gpu() {
       int device;
       cudaGetDevice(&device);
       cudaGetDeviceCount(&deviceCount);
-      printf("Device %d Count: %d\n", device, deviceCount);
+      fprintf(stderr, "Device %d Count: %d\n", device, deviceCount);
       cudaDeviceProp deviceProp;
       cudaGetDeviceProperties(&deviceProp,0);
-      printf("Device %d name: %s\n",0,deviceProp.name);
-      printf("Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9);
-      printf("Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor);
-      printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
-      printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
-      printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
-      printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
-      printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
-      printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
-      printf("Warp size: %d\n",deviceProp.warpSize);
-      printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock);
-      printf("\n");
+      fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name);
+      fprintf(stderr, "Clock Frequency: %f GHz\n",deviceProp.clockRate/1.e9);
+      fprintf(stderr, "Computational Capabilities: %d, %d\n",deviceProp.major,deviceProp.minor);
+      fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
+      fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
+      fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
+      fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+      fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+      fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+      fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize);
+      fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock);
+      fprintf(stderr, "\n");
     }
   }
 #elif defined PORT_HIP
@@ -73,10 +73,10 @@ void setup_gpu() {
   set_device(device);
   if(!init) {
     if(myid == printid)
-      printf("HIP PORT\n");
+      fprintf(stderr, "HIP PORT\n");
     //DEVICE MANAGEMENT
     if(myid == printid)
-      printf("deviceCount: %d\n", deviceCount);
+      fprintf(stderr, "deviceCount: %d\n", deviceCount);
     // REPORT
     if(myid == printid) {
       system("rocm-smi");
@@ -84,34 +84,34 @@ void setup_gpu() {
       int device;
       hipGetDevice(&device);
       hipGetDeviceCount(&deviceCount);
-      printf("Device %d Count: %d\n", device, deviceCount);
+      fprintf(stderr, "Device %d Count: %d\n", device, deviceCount);
       hipDeviceProp_t deviceProp;
       hipGetDeviceProperties(&deviceProp,0);
-      printf("Device %d name: %s\n",0,deviceProp.name);
-      printf("Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
-      printf("Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
-      printf("32-bit Reg. per block: %d\n",deviceProp.regsPerBlock);
-      printf("Warp size: %d\n",deviceProp.warpSize);
-      printf("Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
-      printf("Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
-      printf("Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
-      printf("Clock frequency: %d khz\n",deviceProp.clockRate);
-      printf("Global memory frequency: %d khz\n", deviceProp.memoryClockRate);
-      printf("Global memory bus width: %d bits\n", deviceProp.memoryBusWidth);
-      printf("Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
-      printf("Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
-      printf("Number of multi-processors: %d\n", deviceProp.multiProcessorCount);
-      printf("L2 cache size: %d\n", deviceProp.l2CacheSize);
-      printf("Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
-      printf("Compute mode: %d\n", deviceProp.computeMode);
-      printf("Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate);
-      printf("\n");
+      fprintf(stderr, "Device %d name: %s\n",0,deviceProp.name);
+      fprintf(stderr, "Maximum global memory size: %lu\n",deviceProp.totalGlobalMem);
+      fprintf(stderr, "Maximum shared memory size per block: %lu\n",deviceProp.sharedMemPerBlock);
+      fprintf(stderr, "32-bit Reg. per block: %d\n",deviceProp.regsPerBlock);
+      fprintf(stderr, "Warp size: %d\n",deviceProp.warpSize);
+      fprintf(stderr, "Maximum threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+      fprintf(stderr, "Maximum block dimensions: %dx%dx%d\n",deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+      fprintf(stderr, "Maximum grid dimensions: %dx%dx%d\n",deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+      fprintf(stderr, "Clock frequency: %d khz\n",deviceProp.clockRate);
+      fprintf(stderr, "Global memory frequency: %d khz\n", deviceProp.memoryClockRate);
+      fprintf(stderr, "Global memory bus width: %d bits\n", deviceProp.memoryBusWidth);
+      fprintf(stderr, "Maximum constant memory size: %lu\n",deviceProp.totalConstMem);
+      fprintf(stderr, "Compute capability: %d.%d\n", deviceProp.major, deviceProp.minor);
+      fprintf(stderr, "Number of multi-processors: %d\n", deviceProp.multiProcessorCount);
+      fprintf(stderr, "L2 cache size: %d\n", deviceProp.l2CacheSize);
+      fprintf(stderr, "Max. threads per multi-processor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
+      fprintf(stderr, "Compute mode: %d\n", deviceProp.computeMode);
+      fprintf(stderr, "Device-side clock instruction rate: %d khz\n", deviceProp.clockInstructionRate);
+      fprintf(stderr, "\n");
     }
   }
 #elif defined PORT_SYCL
   if(!init) {
     if(CommBench::myid == CommBench::printid)
-      printf("SYCL PORT\n");
+      fprintf(stderr, "SYCL PORT\n");
     // Initialize the driver
     zeInit(0);
     // Discover all the driver instances
@@ -134,34 +134,34 @@ void setup_gpu() {
         if(CommBench::myid == CommBench::printid)
         {
           if(ZE_DEVICE_TYPE_GPU == device_properties.type)
-            printf("driverCount %d deviceCount %d GPU\n", driverCount, deviceCount);
+            fprintf(stderr, "driverCount %d deviceCount %d GPU\n", driverCount, deviceCount);
           else
-            printf("GPU not found!\n");
-          printf("type %d\n", device_properties.type);
-          printf("vendorId %d\n", device_properties.vendorId);
-          printf("deviceId %d\n", device_properties.deviceId);
-          printf("flags %d\n", device_properties.flags);
-          printf("subdeviceId %d\n", device_properties.subdeviceId);
-          printf("coreClockRate %d\n", device_properties.coreClockRate);
-          printf("maxMemAllocSize %ld\n", device_properties.maxMemAllocSize);
-          printf("maxHardwareContexts %d\n", device_properties.maxHardwareContexts);
-          printf("maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority);
-          printf("numThreadsPerEU %d\n", device_properties.numThreadsPerEU);
-          printf("physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth);
-          printf("numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice);
-          printf("numSlices %d\n", device_properties.numSlices);
-          printf("timerResolution %ld\n", device_properties.timerResolution);
-          printf("timestampValidBits %d\n", device_properties.timestampValidBits);
-          printf("kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits);
+            fprintf(stderr, "GPU not found!\n");
+          fprintf(stderr, "type %d\n", device_properties.type);
+          fprintf(stderr, "vendorId %d\n", device_properties.vendorId);
+          fprintf(stderr, "deviceId %d\n", device_properties.deviceId);
+          fprintf(stderr, "flags %d\n", device_properties.flags);
+          fprintf(stderr, "subdeviceId %d\n", device_properties.subdeviceId);
+          fprintf(stderr, "coreClockRate %d\n", device_properties.coreClockRate);
+          fprintf(stderr, "maxMemAllocSize %ld\n", device_properties.maxMemAllocSize);
+          fprintf(stderr, "maxHardwareContexts %d\n", device_properties.maxHardwareContexts);
+          fprintf(stderr, "maxCommandQueuePriority %d\n", device_properties.maxCommandQueuePriority);
+          fprintf(stderr, "numThreadsPerEU %d\n", device_properties.numThreadsPerEU);
+          fprintf(stderr, "physicalEUSimdWidth %d\n", device_properties.physicalEUSimdWidth);
+          fprintf(stderr, "numSubslicesPerSlice %d\n", device_properties.numEUsPerSubslice);
+          fprintf(stderr, "numSlices %d\n", device_properties.numSlices);
+          fprintf(stderr, "timerResolution %ld\n", device_properties.timerResolution);
+          fprintf(stderr, "timestampValidBits %d\n", device_properties.timestampValidBits);
+          fprintf(stderr, "kernelTimestampValidBits %d\n", device_properties.kernelTimestampValidBits);
           //for(int j = 0; j < ZE_MAX_DEVICE_UUID_SIZE; j++)
-          //  printf("uuid %d\n", device_properties.uuid.id[j]);
-          printf("name %s\n", device_properties.name);
-          printf("\n");
+          //  fprintf(stderr, "uuid %d\n", device_properties.uuid.id[j]);
+          fprintf(stderr, "name %s\n", device_properties.name);
+          fprintf(stderr, "\n");
         }
       }
       /*ze_bool_t test = false;
       zeDeviceCanAccessPeer(allDevices[0], allDevices[1], &test);
-      printf("can access peer %d\n", test);*/
+      fprintf(stderr, "can access peer %d\n", test);*/
       delete[] allDevices;
     }
     delete[] allDrivers;
@@ -169,7 +169,7 @@ void setup_gpu() {
 #else
   if(!init)
     if(CommBench::myid == CommBench::printid)
-      printf("CPU VERSION\n");
+      fprintf(stderr, "CPU VERSION\n");
 #endif
   init = true;
 }
diff --git a/CommFunctions/validate.h b/CommFunctions/validate.h
old mode 100644
new mode 100755
index 8d03cd7..7dbb31c
--- a/CommFunctions/validate.h
+++ b/CommFunctions/validate.h
@@ -1,5 +1,5 @@
 template <class Coll>
-void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll) {
+void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &coll, int source = 0, int dest = 1) {
 
   int myid = CommBench::myid;
   int numproc = CommBench::numproc;
@@ -26,10 +26,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
   switch(pattern) {
     case 0:
       {
-        if(myid == 0) printf("VERIFY P2P\n");
-        if(myid == 1) {
+        if(myid == source) fprintf(stderr, "VERIFY P2P\n");
+        if(myid == dest) {
           for(size_t i = 0; i < count; i++) {
-            // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+            // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
             if(recvbuf[i] != i)
               pass = false;
           }
@@ -38,11 +38,11 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 1:
       {
-        if(myid == ROOT) printf("VERIFY GATHER\n");
+        if(myid == ROOT) fprintf(stderr, "VERIFY GATHER\n");
         if(myid == ROOT) {
           for(int p = 0; p < numproc; p++)
             for(size_t i = 0; i < count; i++) {
-              // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+              // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
               if(recvbuf[p * count + i] != i)
                 pass = false;
             }
@@ -51,9 +51,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 2:
       {
-        if(myid == ROOT) printf("VERIFY SCATTER\n");
+        if(myid == ROOT) fprintf(stderr, "VERIFY SCATTER\n");
         for(size_t i = 0; i < count; i++) {
-          // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+          // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
           if(recvbuf[i] != myid * count + i)
             pass = false;
         }
@@ -61,9 +61,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 3:
       {
-        if(myid == ROOT) printf("VERIFY BCAST\n");
+        if(myid == ROOT) fprintf(stderr, "VERIFY BCAST\n");
         for(size_t i = 0; i < count; i++) {
-          // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+          // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
           if(recvbuf[i] != i)
             pass = false;
         }
@@ -71,16 +71,16 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 4:
       {
-        if(myid == ROOT) printf("REDUCE IS NOT TESTED\n");
+        if(myid == ROOT) fprintf(stderr, "REDUCE IS NOT TESTED\n");
           pass = false;
       }
       break;
     case 5:
       {
-        if(myid == ROOT) printf("VERIFY ALL-TO-ALL\n");
+        if(myid == ROOT) fprintf(stderr, "VERIFY ALL-TO-ALL\n");
         for(int p = 0; p < numproc; p++)
           for(size_t i = 0; i < count; i++) {
-            // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+            // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
             if(recvbuf[p * count + i] != myid * count + i)
               pass = false;
           }
@@ -88,10 +88,10 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 6:
       {
-        if(myid == ROOT) printf("VERIFY ALL-GATHER\n");
+        if(myid == ROOT) fprintf(stderr, "VERIFY ALL-GATHER\n");
         for(int p = 0; p < numproc; p++)
           for(size_t i = 0; i < count; i++) {
-            // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
+            // fprintf(stderr, "myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
             if(recvbuf[p * count + i] != i)
               pass = false;
           }
@@ -99,13 +99,13 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
       break;
     case 7:
       {
-        if(myid == ROOT) printf("REDUCE-SCATTER IS NOT TESTED\n");
+        if(myid == ROOT) fprintf(stderr, "REDUCE-SCATTER IS NOT TESTED\n");
           pass = false;
       }
       break;
     case 8: 
       { 
-        if(myid == ROOT) printf("ALL-REDUCE IS NOT TESTED\n");
+        if(myid == ROOT) fprintf(stderr, "ALL-REDUCE IS NOT TESTED\n");
           pass = false;
       }
       break;
@@ -113,9 +113,9 @@ void validate(int *sendbuf_d, int *recvbuf_d, size_t count, int pattern, Coll &c
   pass = CommBench::allreduce_land(pass);
   if(myid == ROOT) {
     if(pass) 
-      printf("PASSED!\n");
+      fprintf(stderr, "PASSED!\n");
     else 
-      printf("FAILED!!!\n");
+      fprintf(stderr, "FAILED!!!\n");
   }
 
   CommBench::freeHost(sendbuf);
diff --git a/bench.cpp b/bench.cpp
new file mode 100644
index 0000000..fca92d9
--- /dev/null
+++ b/bench.cpp
@@ -0,0 +1,70 @@
+#include "commbench.h"
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+using namespace CommBench;
+
+int main(int argc, char *argv[]) {
+  init();
+
+  if (argc < 3) {
+    if (myid == 0)
+      fprintf(stderr, "Invalid arguments, usage is ./bench <folder> <libs>\n");
+    return 1;
+  }
+  char* folder = argv[1];
+
+  int *sendbuf, *recvbuf;
+  int max_num_bytes = 31;
+  char file_name[100];
+  int gpus = numproc;
+
+  allocate(sendbuf, 1ull << max_num_bytes);
+  allocate(recvbuf, 1ull << max_num_bytes);
+
+  for (int i = 2; i < argc; i++) {
+    char* lib_str = argv[i];
+    library lib;
+    if (strcmp(lib_str, "mpi") == 0)
+      lib = library::MPI;
+    else if (strcmp(lib_str, "ipc_put") == 0)
+      lib = library::IPC;
+    else if (strcmp(lib_str, "ipc_get") == 0)
+      lib = library::IPC_get;
+    else if (strcmp(lib_str, "xccl") == 0)
+      lib = library::NCCL;
+    else {
+      if (myid == 0)
+        fprintf(stderr, "Invalid library %s\n", lib_str);
+      return 1;
+    }
+    if (myid == 0)
+      fprintf(stderr, "Testing library %s\n", lib_str);
+    for (int source = 0; source < gpus; source++) {
+      for (int dest = 0; dest < gpus; dest++) {
+        if (myid == 0)
+          fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest);
+          // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest);
+        for (int size = 2; size < max_num_bytes; size+=2) {
+          Comm<int> bench(lib);
+          if (myid == 0) {
+            sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder,
+                    lib_str, source, dest, 1ull << size);
+            freopen(file_name, "w", stdout);
+          }
+          bench.add(sendbuf, recvbuf, 1ull << size, source, dest);
+          bench.measure(5, 10);
+          // bench.clear();
+        }
+      }
+    }
+
+    if (myid == 0)
+      fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str);
+      // fprintf(stderr, "Completed testing library %s\n", libs_str[i].c_str());
+  }
+  free(sendbuf);
+  free(recvbuf);
+  MPI_Finalize();
+}
diff --git a/bench_alltoall.cpp b/bench_alltoall.cpp
new file mode 100755
index 0000000..1ada1ce
--- /dev/null
+++ b/bench_alltoall.cpp
@@ -0,0 +1,66 @@
+#include "commbench.h"
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+using namespace CommBench;
+
+int main(int argc, char *argv[]) {
+  init();
+
+  if (argc < 3) {
+    if (myid == 0)
+      fprintf(stderr, "Invalid arguments, usage is ./bench <folder> <libs>\n");
+    return 1;
+  }
+  char* folder = argv[1];
+
+  int *sendbuf, *recvbuf;
+  int max_num_bytes = 25;
+  char file_name[100];
+  int gpus = numproc;
+
+  allocate(sendbuf, (1ull << max_num_bytes) * numproc);
+  allocate(recvbuf, (1ull << max_num_bytes) * numproc);
+
+  for (int i = 2; i < argc; i++) {
+    char* lib_str = argv[i];
+    library lib;
+    if (strcmp(lib_str, "mpi") == 0)
+      lib = library::MPI;
+    else if (strcmp(lib_str, "ipc_put") == 0)
+      lib = library::IPC;
+    else if (strcmp(lib_str, "ipc_get") == 0)
+      lib = library::IPC_get;
+    else if (strcmp(lib_str, "xccl") == 0)
+      lib = library::NCCL;
+    else {
+      if (myid == 0)
+        fprintf(stderr, "Invalid library %s\n", lib_str);
+      return 1;
+    }
+    if (myid == 0)
+      fprintf(stderr, "Testing library %s\n", lib_str);
+        for (int size = 2; size < max_num_bytes; size+=2) {
+          if (myid == 0)
+            fprintf(stderr, "\033[2K\rTesting size: %llu bytes", 1ull << size);
+          Comm<int> bench(lib);
+          if (myid == 0) {
+            sprintf(file_name, "%s/%s/%llu.out", folder,
+                    lib_str, 1ull << size);
+            freopen(file_name, "w", stdout);
+          }
+          for (int sender = 0; sender < numproc; sender++)
+            for (int recver = 0; recver < numproc; recver++)
+              bench.add(sendbuf, recver * (1ull << size), recvbuf, sender * (1ull << size),
+                     1ull << size, sender, recver);
+          bench.measure(5, 10);
+    }
+
+    if (myid == 0)
+      fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str);
+  }
+  free(sendbuf);
+  free(recvbuf);
+  MPI_Finalize();
+}
diff --git a/bench_broadcast.cpp b/bench_broadcast.cpp
new file mode 100755
index 0000000..8da3cfe
--- /dev/null
+++ b/bench_broadcast.cpp
@@ -0,0 +1,66 @@
+#include "commbench.h"
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+using namespace CommBench;
+
+int main(int argc, char *argv[]) {
+  init();
+
+  if (argc < 3) {
+    if (myid == 0)
+      fprintf(stderr, "Invalid arguments, usage is ./bench <folder> <libs>\n");
+    return 1;
+  }
+  char* folder = argv[1];
+
+  int *sendbuf, *recvbuf;
+  int max_num_bytes = 31;
+  char file_name[100];
+  int gpus = numproc;
+
+  allocate(sendbuf, 1ull << max_num_bytes);
+  allocate(recvbuf, 1ull << max_num_bytes);
+
+  for (int i = 2; i < argc; i++) {
+    char* lib_str = argv[i];
+    library lib;
+    if (strcmp(lib_str, "mpi") == 0)
+      lib = library::MPI;
+    else if (strcmp(lib_str, "ipc_put") == 0)
+      lib = library::IPC;
+    else if (strcmp(lib_str, "ipc_get") == 0)
+      lib = library::IPC_get;
+    else if (strcmp(lib_str, "xccl") == 0)
+      lib = library::NCCL;
+    else {
+      if (myid == 0)
+        fprintf(stderr, "Invalid library %s\n", lib_str);
+      return 1;
+    }
+    if (myid == 0)
+      fprintf(stderr, "Testing library %s\n", lib_str);
+    for (int source = 0; source < gpus; source++) {
+        for (int size = 2; size < max_num_bytes; size+=2) {
+          if (myid == 0)
+            fprintf(stderr, "\033[2K\rBroadcaster: %d, size: %llu", source, 1ull << size);
+          Comm<int> bench(lib);
+          if (myid == 0) {
+            sprintf(file_name, "%s/%s/%d_%llu.out", folder,
+                    lib_str, source, 1ull << size);
+            freopen(file_name, "w", stdout);
+          }
+          for (int p = 0; p < numproc; p++)
+            bench.add(sendbuf, 0, recvbuf, 0, 1ull << size, source, p);
+          bench.measure(5, 10);
+        }
+    }
+
+    if (myid == 0)
+      fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str);
+  }
+  free(sendbuf);
+  free(recvbuf);
+  MPI_Finalize();
+}
diff --git a/bench_gather.cpp b/bench_gather.cpp
new file mode 100755
index 0000000..447631b
--- /dev/null
+++ b/bench_gather.cpp
@@ -0,0 +1,67 @@
+#include "commbench.h"
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+using namespace CommBench;
+
+int main(int argc, char *argv[]) {
+  init();
+
+  if (argc < 3) {
+    if (myid == 0)
+      fprintf(stderr, "Invalid arguments, usage is ./bench <folder> <libs>\n");
+    return 1;
+  }
+  char* folder = argv[1];
+
+  int *sendbuf, *recvbuf;
+  int max_num_bytes = 27;
+  char file_name[100];
+  int gpus = numproc;
+
+  allocate(sendbuf, 1ull << max_num_bytes);
+  allocate(recvbuf, (1ull << max_num_bytes) * numproc);
+
+  for (int i = 2; i < argc; i++) {
+    char* lib_str = argv[i];
+    library lib;
+    if (strcmp(lib_str, "mpi") == 0)
+      lib = library::MPI;
+    else if (strcmp(lib_str, "ipc_put") == 0)
+      lib = library::IPC;
+    else if (strcmp(lib_str, "ipc_get") == 0)
+      lib = library::IPC_get;
+    else if (strcmp(lib_str, "xccl") == 0)
+      lib = library::NCCL;
+    else {
+      if (myid == 0)
+        fprintf(stderr, "Invalid library %s\n", lib_str);
+      return 1;
+    }
+    if (myid == 0)
+      fprintf(stderr, "Testing library %s\n", lib_str);
+      for (int dest = 0; dest < gpus; dest++) {
+          // fprintf(stderr, "P2P Comm: %d->%d\n", source, dest);
+        for (int size = 2; size < max_num_bytes; size+=2) {
+          if (myid == 0)
+            fprintf(stderr, "\033[2K\rGatherer: %d, size: %llu", dest, 1ull << size);
+          Comm<int> bench(lib);
+          if (myid == 0) {
+            sprintf(file_name, "%s/%s/%d_%llu.out", folder,
+                    lib_str, dest, 1ull << size);
+            freopen(file_name, "w", stdout);
+          }
+          for (int p = 0; p < numproc; p++)
+            bench.add(sendbuf, 0, recvbuf, p * (1ull << size), 1ull << size, p, dest);
+          bench.measure(5, 10);
+        }
+    }
+
+    if (myid == 0)
+      fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str);
+  }
+  free(sendbuf);
+  free(recvbuf);
+  MPI_Finalize();
+}
diff --git a/bench_p2p.cpp b/bench_p2p.cpp
new file mode 100755
index 0000000..c848960
--- /dev/null
+++ b/bench_p2p.cpp
@@ -0,0 +1,68 @@
+#include "commbench.h"
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+using namespace CommBench;
+
+int main(int argc, char *argv[]) {
+  init();
+
+  if (argc < 3) {
+    if (myid == 0)
+      fprintf(stderr, "Invalid arguments, usage is ./bench <folder> <libs>\n");
+    return 1;
+  }
+  char* folder = argv[1];
+
+  int *sendbuf, *recvbuf;
+  int max_num_bytes = 31;
+  char file_name[100];
+  int gpus = numproc;
+
+  allocate(sendbuf, 1ull << max_num_bytes);
+  allocate(recvbuf, 1ull << max_num_bytes);
+
+  for (int i = 2; i < argc; i++) {
+    char* lib_str = argv[i];
+    library lib;
+    if (strcmp(lib_str, "mpi") == 0)
+      lib = library::MPI;
+    else if (strcmp(lib_str, "ipc_put") == 0)
+      lib = library::IPC;
+    else if (strcmp(lib_str, "ipc_get") == 0)
+      lib = library::IPC_get;
+    else if (strcmp(lib_str, "xccl") == 0)
+      lib = library::NCCL;
+    else {
+      if (myid == 0)
+        fprintf(stderr, "Invalid library %s\n", lib_str);
+      return 1;
+    }
+    if (myid == 0)
+      fprintf(stderr, "Testing library %s\n", lib_str);
+    for (int source = 0; source < gpus; source++) {
+      for (int dest = 0; dest < gpus; dest++) {
+        if (myid == 0)
+          fprintf(stderr, "\033[2K\rP2P Comm: %d->%d", source, dest);
+        for (int size = 2; size < max_num_bytes; size+=2) {
+          Comm<int> bench(lib);
+          if (myid == 0) {
+            sprintf(file_name, "%s/%s/%d_%d_%llu.out", folder,
+                    lib_str, source, dest, 1ull << size);
+            freopen(file_name, "w", stdout);
+          }
+          bench.add(sendbuf, recvbuf, 1ull << size, source, dest);
+          // bench.add(sendbuf, recvbuf, 1ull << size, dest, source);
+          bench.measure(5, 10);
+        }
+      }
+    }
+
+    if (myid == 0)
+      fprintf(stderr, "\033[2K\rCompleted testing library %s\n", lib_str);
+  }
+  free(sendbuf);
+  free(recvbuf);
+  MPI_Finalize();
+}
diff --git a/data_final.tar.gz b/data_final.tar.gz
new file mode 100644
index 0000000..2d8710e
Binary files /dev/null and b/data_final.tar.gz differ
diff --git a/instructions.md b/instructions.md
new file mode 100644
index 0000000..1246bdb
--- /dev/null
+++ b/instructions.md
@@ -0,0 +1,85 @@
+# Overview of Usage
+
+This branch provides the same `CommBench` executable with the standard usage flags, but also adds the `--source` and `--dest` flags that can be used for P2P, gather, and broadcast patterns. Gather only supports `--dest` and broadcast only supports `--source`.
+
+In addition, we provide four new targets. `bench_p2p`, `bench_gather`, `bench_broadcast` and `bench_alltoall` for these respective patterns. These provide support for "mass benchmarking tests", testing all combinations of the available processors. The usage for these is `bench_<pattern> <output_folder> <libs>` where you can provide a space separated list of libraries (out of `ipc_get`, `ipc_put`, `mpi`, and `xccl`) to test.
+
+This will then dump the output for each of these benchmarks into a lot of files in the directories `output_folder/<lib>` for each specified library. These can be parsed and visualized later. These DO NOT create the directories themselves, so make sure that the directory `output_folder/<lib>` exists for every library you want to test. Otherwise the benchmarks will silently go on 
+
+# Building CommBench for El Dorado Benchmarking
+
+In this branch, most of the "debug" output is disabled by default and when enabled is directed to `stderr`. To enable this output, configure CMake with the `-DPRINT_DEBUG=ON` flag, but by default it is off so that only the necessary output is shown.
+
+To build, run the following in the CommBench directory:
+```sh
+cmake -S . -B build -DUSE_HIP=ON -DUSE_XCCL=ON -DXCCL_PATH=/opt/rocm-6.3.1 -DUSE_GTL=ON -DGTL_PATH=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0 -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE -DMPI_C_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicc -DMPI_CXX_COMPILER=/opt/cray/pe/mpich/8.1.32/ofi/amd/6.0/bin/mpicxx
+```
+and if you want to use CommBench's `IPC_kernel` for the `ipc_put` and `ipc_get` communications, add the flag `-DUSE_BLIT_KERNEL=ON`. It may be helpful to have two separate build folders, one with and one without this flag. Adjust `-B <build_folder>` accordingly.
+
+Make sure to have the `PrgEnv-amd/8.6.0`, `craype-accel-amd-gfx942`, `cmake/3.24.2` and `rocm/6.3.1` modules loaded. To build for `rocm/6.4.1`, load `amd/6.4.1` and `rocm/6.4.1`.
+
+# Running Benchmarks
+
+Make sure to have to have `MPICH_GPU_SUPPORT_ENABLED=1` set. Experiment with the `HSA_ENABLE_SDMA=0` environment variable as well.
+
+Set up the output folders before running the benchmark. The scripts expect this file structure:
+```
+pattern
+├── CPX
+│   ├── lib1
+│   ├── lib2
+│   └── lib3
+├── SPX
+│   ├── lib1
+│   ├── lib2
+│   └── lib3
+└── TPX
+    ├── lib1
+    ├── lib2
+    └── lib3
+```
+You can technically have different libraries specified for the partitions, but as mentioned before, make sure the folders are created inside the partition.
+
+Allocate an instance with
+```sh
+flux alloc -N1 --setattr=gpumode=<modality> --conf=resource.rediscover=true --time-limit=<time>
+```
+
+To run the benchmark, use the command with 4 for SPX, 12 for TPX, and 24 for CPX.
+```sh
+flux run -N1 -n<proc> -g1 -x -o mpibind=off ./build/bench_<pattern> <output_folder> <libs>
+```
+
+For example, you might specify the output folder to be `pattern/SPX` and you want to test the `ipc_get` pattern, then you would need to make sure the directory `pattern/SPX/ipc_get` exists. Then the command is 
+
+```sh
+flux run -N1 -n4 -g1 -x -o mpibind=off ./build/bench_<pattern> pattern/SPX ipc_get
+```
+
+Once this is done, you run the script `parse.py`. At the bottom of the script file, you can adjust the folder to `pattern` and then change the list to the modalities that you have. This script takes the text files and stores them in a pandas dataframe for visualization later.
+
+Since CommBench does not work properly with MPI for the pattern benchmarks, we are supplementing the output with the OSU benchmarks. Run it like so:
+
+```sh
+flux run -N1 -n<proc> -g1 -x -o mpibind=off ./get_local_rank ./mpi/collective/osu_<pattern> -f -m 16:<max_bytes> -d rocm -x 5 -i 10 > <pattern>/<modality>/mpi/osu.out
+```
+
+since for MPI, the parse scripts expect a file `mpi/osu.out` to load that data for graphing. 
+
+Then the `visualize_df.py` script generates the graphs from this dataframe which would be stored in `pattern/SPX_data.pkl`. Adjust the function calls at the bottom for plotting different patterns with input folder corresponding to `pattern` where it can find `pattern/<modality>_data.pkl` and it will output to your desired folder. These scripts will create the folders as needed.
+
+The bandwidth as graphed is not taken from the file outputs, but calculated from the latency. If you modify any of the benchmarks, make sure to update how the size is calculated in either `parse.py` or `visualize_df.py` so that the number of bytes used for the bandwidth calculation is correct.
+
+# Known Issues
+
+## MPI
+
+Since MPI does not play nicely for the TPX and CPX modalities, you need to "segment" the runs. Run the `bench_p2p` for `mpi` by itself and adjust the for loop in `bench_p2p.cpp` to be `for (int dest = 0; dest < 6; dest++)` and run again like that `for (int dest = 6; dest < 12; dest++)` to get the rest of the data. For CPX mode, do the same thing, but with 0 to 8, 8 to 16, and 16, 24.
+
+## XCCL/Flux
+
+Typically this error is seen with XCCL, but it may occur other times where Flux gives some sqlite errors and says the disk/database is malformed. You need to exit out of the currently allocated session and start a new one first and then also you will need to "buffer" the runs like with MPI. It may work better to buffer the outer for loop through the sources in `bench_p2p.cpp` or you might need to buffer both. 
+
+## Flux Batch
+
+It would be easier to submit these jobs to Flux as batch jobs, but sometimes you might get strange errors like an "alarm clock error". This doesn't happen in interactive sessions, so it might be advisable to avoid Flux Batch.
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
old mode 100644
new mode 100755
index 4f183d9..29bbaa7
--- a/main.cpp
+++ b/main.cpp
@@ -1,4 +1,5 @@
 #include "commbench.h"
+#include <cstdlib>
 #define ROOT 0
 #include "validate.h"
 #include <array>
@@ -9,11 +10,21 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <iostream>
 
 using namespace CommBench;
 
-enum pattern { p2p, gather, scatter, broadcast, reduce, alltoall, allgather };
+enum pattern {
+  p2p,
+  gather,
+  scatter,
+  broadcast,
+  reduce,
+  alltoall,
+  allgather,
+  rail,
+  asymmetric_fill,
+  symmetric_fill
+};
 int myid_loc;
 
 struct step {
@@ -21,11 +32,11 @@ struct step {
   library lib;
 };
 
-/*template <typename... Args> void FATAL_ERROR(const char *fmt, Args... args) {
+template <typename... Args> void FATAL_ERROR(const char *fmt, Args... args) {
   if (myid_loc == printid) {
     fprintf(stderr, "FATAL ERROR: ");
     fprintf(stderr, fmt, args...);
-    std::abort();
+    std::exit(EXIT_FAILURE);
   }
 }
 
@@ -41,12 +52,12 @@ template <typename... Args> void WARNING(const char *fmt, Args... args) {
     fprintf(stderr, "WARNING: ");
     fprintf(stderr, fmt, args...);
   }
-}*/
+}
 
 std::unordered_map<std::string, std::vector<std::string>>
 parseArgs(int argc, char *argv[]) {
-  static const std::array<std::string, 5> valid_args = {"use", "pattern", "validate",
-                                           "nbytes", "file"};
+  static const std::array<std::string, 7> valid_args = {
+      "use", "pattern", "validate", "nbytes", "file", "dest", "source"};
   int i = 1;
   std::unordered_map<std::string, std::vector<std::string>> args;
   std::string prev = "";
@@ -63,9 +74,9 @@ parseArgs(int argc, char *argv[]) {
           break;
         }
       if (!valid) {
-//FATAL_ERROR("unknown argument \"%s\"\n", argv[i]);
-	std::cout << "unknown argument \"%s\"\n" << std::endl;
-	std::exit(EXIT_FAILURE);
+        FATAL_ERROR("unknown argument \"%s\"\n", argv[i]);
+        // std::cout << "unknown argument \"%s\"\n" << std::endl;
+        // std::exit(EXIT_FAILURE);
       }
       if (args.find(arg) == args.end())
         args.insert({arg, {}});
@@ -75,9 +86,9 @@ parseArgs(int argc, char *argv[]) {
 
       args[prev].push_back(cur);
     } else {
-//FATAL_ERROR("unknown argument", argv[i]);
-      std::cout << "unknown argument" << std::endl;
-      std::exit(EXIT_FAILURE);
+      FATAL_ERROR("unknown argument", argv[i]);
+      // std::cout << "unknown argument" << std::endl;
+      // std::exit(EXIT_FAILURE);
     }
     i++;
   }
@@ -89,39 +100,40 @@ library parseLib(const std::string &libStr) {
     return library::MPI;
   else if (libStr == "ipc_put") {
 #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI))
-    //FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
-      //          "ROCm, or OneAPI\n");
-    std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl;
-    std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
+                "ROCm, or OneAPI\n");
+    // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI"
+    // << std::endl; std::exit(EXIT_FAILURE);
 #endif
     return library::IPC;
   } else if (libStr == "ipc_get") {
 #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI))
-    //FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
-      //          "ROCm, or OneAPI\n");
-    std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl;
-    std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
+                "ROCm, or OneAPI\n");
+    // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI"
+    // << std::endl; std::exit(EXIT_FAILURE);
 #endif
     return library::IPC_get;
   } else if (libStr == "xccl") {
 #if !(defined(PORT_CUDA) || defined(PORT_HIP) || defined(PORT_ONEAPI))
-    //FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
-    //            "ROCm, or OneAPI\n");
-    std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI" << std::endl;
-    std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Cannot use IPC without compiling for CUDA, "
+                "ROCm, or OneAPI\n");
+    // std::cout << "Cannot use IPC without compiling for CUDA, ROCm, or OneAPI"
+    // << std::endl; std::exit(EXIT_FAILURE);
 #endif
 #ifndef CAP_NCCL
-    //FATAL_ERROR("Not compiled for using XCCL\n");
-    std::cout << "Not compiled for using XCCL" << std::endl;
-    std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Not compiled for using XCCL\n");
+    // std::cout << "Not compiled for using XCCL" << std::endl;
+    // std::exit(EXIT_FAILURE);
 #endif
     return library::NCCL;
   } else {
-    //FATAL_ERROR("Unknown communication library option \"%s\". Please "
-    //            "specify one of: mpi, ipc_put, ipc_get, or xccl.\n",
-    //            libStr.c_str());
-    std::cout << "Unknown communication library option. Please specify one of: mpi, ipc_put, ipc_get, or xccl." << std::endl;
-    std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Unknown communication library option \"%s\". Please "
+                "specify one of: mpi, ipc_put, ipc_get, or xccl.\n",
+                libStr.c_str());
+    // std::cout << "Unknown communication library option. Please specify one
+    // of: mpi, ipc_put, ipc_get, or xccl." << std::endl;
+    // std::exit(EXIT_FAILURE);
   }
 }
 
@@ -138,13 +150,20 @@ pattern parsePattern(const std::string &patStr) {
     return pattern::alltoall;
   else if (patStr == "allgather")
     return pattern::allgather;
+  else if (patStr == "rail")
+    return pattern::rail;
+  else if (patStr == "asymmetric_fill")
+    return pattern::asymmetric_fill;
+  else if (patStr == "symmetric_fill")
+    return pattern::symmetric_fill;
   else {
-    std::cout << "Unknown communication pattern. Please use one of: p2p, broadcast, gather, scatter, alltoall, or allgather." << std::endl;
-    std::exit(EXIT_FAILURE);
-    //FATAL_ERROR(
-    //    "Unknown communication pattern \"%s\". Please use one "
-    //    "of: p2p, broadcast, gather, scatter, alltoall, or allgather.\n",
-    //    patStr.c_str());
+    // std::cout << "Unknown communication pattern. Please use one of: p2p,
+    // broadcast, gather, scatter, alltoall, or allgather." << std::endl;
+    // std::exit(EXIT_FAILURE);
+    FATAL_ERROR(
+        "Unknown communication pattern \"%s\". Please use one "
+        "of: p2p, broadcast, gather, scatter, alltoall, or allgather.\n",
+        patStr.c_str());
   }
 }
 
@@ -158,144 +177,168 @@ int main(int argc, char *argv[]) {
       parseArgs(argc, argv);
 
   if (args.find("pattern") != args.end() && args.find("file") != args.end()) {
-    std::cout << "Cannot use both the --file and --pattern options." << std::endl;
-    std::exit(EXIT_FAILURE);
+    // std::cout << "Cannot use both the --file and --pattern options." <<
+    // std::endl; std::exit(EXIT_FAILURE);
+    FATAL_ERROR("Cannot use both the --file and --pattern options.\n");
   }
-    //FATAL_ERROR("Cannot use both the --file and --pattern options.\n");
 
   library lib_def = library::MPI;
   if (args["use"].size() != 0) {
     lib_def = parseLib(args["use"][0]);
   } else {
-    std::cout << "No communication library specified, using MPI by default" << std::endl;
-    //WARNING("No communication library specified, using MPI by default\n");
+    // std::cout << "No communication library specified, using MPI by default"
+    // << std::endl;
+    WARNING("No communication library specified, using MPI by default\n");
   }
 
   std::vector<step> steps;
   if (args.find("file") != args.end()) {
-    #ifndef CONFIG_FILE 
-      std::cout << "Cannot use the --file flag unless compiled with jsoncpp support." << std::endl;
-      std::exit(EXIT_FAILURE);
-      //FATAL_ERROR("Cannot use the --file flag unless compiled with jsoncpp support.\n");
-    #else
+#ifndef CONFIG_FILE
+    // std::cout << "Cannot use the --file flag unless compiled with jsoncpp
+    // support." << std::endl; std::exit(EXIT_FAILURE);
+    FATAL_ERROR(
+        "Cannot use the --file flag unless compiled with jsoncpp support.\n");
+#else
     std::ifstream file(args["file"][0], std::ifstream::binary);
     if (!file.is_open())
-      std::cout << "Could not open file." << std::endl;
-      std::exit(EXIT_FAILURE);
-      //FATAL_ERROR("Could not open file \"%s\"\n", args["file"][0]);
+      // std::cout << "Could not open file." << std::endl;
+      // std::exit(EXIT_FAILURE);
+      FATAL_ERROR("Could not open file \"%s\"\n", args["file"][0]);
 
-      Json::Value root;
-      Json::CharReaderBuilder builder;
-      std::string errs;
-      int step = 1;
+    Json::Value root;
+    Json::CharReaderBuilder builder;
+    std::string errs;
+    int step = 1;
 
-      if (!Json::parseFromStream(builder, file, &root, &errs))
-	std::cout << "Failed to parse JSON" << std::endl;
-	std::exit(EXIT_FAILURE);
-        //FATAL_ERROR("Failed to parse JSON\n");
+    if (!Json::parseFromStream(builder, file, &root, &errs))
+      // std::cout << "Failed to parse JSON" << std::endl;
+      // std::exit(EXIT_FAILURE);
+      FATAL_ERROR("Failed to parse JSON\n");
 
-      if (root.isMember("steps") && root["steps"].isArray()) {
-        Json::Value &stepsArray = root["steps"];
-        for (const auto &stepNumber : stepsArray) {
-          std::cout << "STEP " << step << std::endl;
-          step = step + 1;
-          library lib = lib_def;
-          if (stepNumber.isMember("library"))
-            lib = parseLib(stepNumber["library"].asString());
-          std::vector<pattern> patterns;
-          Json::Value patternsArray = stepNumber["patterns"];
-          for (const auto &testType : patternsArray)
-            patterns.push_back(parsePattern(testType.asString()));
-          steps.push_back({patterns, lib});
-        }
-      }
-      #endif
-    } else {
-      std::vector<pattern> patterns;
-      for (int i = 0; i < args["pattern"].size(); i++) {
-        patterns.push_back(parsePattern(args["pattern"][i]));
+    if (root.isMember("steps") && root["steps"].isArray()) {
+      Json::Value &stepsArray = root["steps"];
+      for (const auto &stepNumber : stepsArray) {
+        std::cout << "STEP " << step << std::endl;
+        step = step + 1;
+        library lib = lib_def;
+        if (stepNumber.isMember("library"))
+          lib = parseLib(stepNumber["library"].asString());
+        std::vector<pattern> patterns;
+        Json::Value patternsArray = stepNumber["patterns"];
+        for (const auto &testType : patternsArray)
+          patterns.push_back(parsePattern(testType.asString()));
+        steps.push_back({patterns, lib});
       }
+    }
+#endif
+  } else {
+    std::vector<pattern> patterns;
+    for (int i = 0; i < args["pattern"].size(); i++) {
+      patterns.push_back(parsePattern(args["pattern"][i]));
+    }
 
-      if (patterns.size() == 0) {
-	std::cout << "No communication pattern specified, using P2P by default." << std::endl;
-        //WARNING("No communication pattern specified, using P2P by default\n");
-        patterns.push_back(p2p);
-      }
-      steps.push_back({patterns, lib_def});
+    if (patterns.size() == 0) {
+      // std::cout << "No communication pattern specified, using P2P by
+      // default." << std::endl;
+      WARNING("No communication pattern specified, using P2P by default\n");
+      patterns.push_back(p2p);
     }
+    steps.push_back({patterns, lib_def});
+  }
 
-    bool run_validate = false;
-    if (args.find("validate") != args.end())
-      run_validate = true;
+  bool run_validate = false;
+  if (args.find("validate") != args.end())
+    run_validate = true;
 
-    int *sendbuf;
-    int *recvbuf;
-    size_t numbytes = 1e8;
-    if (args.find("nbytes") != args.end()) {
-      if (args["nbytes"].size() == 0)
-	std::cout << "Missing number of bytes argument for --nbytes" << std::endl;
-        std::exit(EXIT_FAILURE);
-      	//FATAL_ERROR("Missing number of bytes argument for --nbytes");
-      try {
-        numbytes = std::stoull(args["nbytes"][0]);
-      } catch (...) {
-	std::cout << "Invalid input to --nbytes" << std::endl;
-	std::exit(EXIT_FAILURE);
-        //FATAL_ERROR("Invalid input \"%s\" to --nbytes.", args["nbytes"][0]);
-      }
+  int *sendbuf;
+  int *recvbuf;
+  size_t numbytes = 1e8;
+  if (args.find("nbytes") != args.end()) {
+    if (args["nbytes"].size() == 0)
+      // std::cout << "Missing number of bytes argument for --nbytes" <<
+      // std::endl;
+      // std::exit(EXIT_FAILURE);
+      FATAL_ERROR("Missing number of bytes argument for --nbytes");
+    try {
+      numbytes = std::stoull(args["nbytes"][0]);
+    } catch (...) {
+      // std::cout << "Invalid input to --nbytes" << std::endl;
+      // std::exit(EXIT_FAILURE);
+      FATAL_ERROR("Invalid input \"%s\" to --nbytes.",
+                  args["nbytes"][0].c_str());
     }
+  }
 
-    allocate(sendbuf, numbytes * numproc);
-    allocate(recvbuf, numbytes * numproc);
-    for (int j = 0; j < steps.size(); j++) {
-      const auto &patterns = steps[j].patterns;
-      for (int i = 0; i < patterns.size(); i++) {
-        Comm<int> test(steps[j].lib);
-        switch (patterns[i]) {
-        case pattern::p2p:
-          test.add(sendbuf, recvbuf, numbytes, 0, 1);
-          break;
-        case pattern::broadcast:
-          for (int p = 0; p < numproc; p++)
-            test.add(sendbuf, 0, recvbuf, 0, numbytes, ROOT, p);
-          break;
-        case pattern::gather:
-          for (int p = 0; p < numproc; p++)
-            test.add(sendbuf, 0, recvbuf, p * numbytes, numbytes, p, ROOT);
-          break;
-        case pattern::scatter:
-          for (int p = 0; p < numproc; p++)
-            test.add(sendbuf, p * numbytes, recvbuf, 0, numbytes, ROOT, p);
-          break;
-        case pattern::alltoall:
-          for (int sender = 0; sender < numproc; sender++)
-            for (int recver = 0; recver < numproc; recver++)
-              test.add(sendbuf, recver * numbytes, recvbuf, sender * numbytes,
-                       numbytes, sender, recver);
-          break;
-        case pattern::allgather:
-          for (int sender = 0; sender < numproc; sender++)
-            for (int recver = 0; recver < numproc; recver++)
-              test.add(sendbuf, 0, recvbuf, sender * numbytes, numbytes, sender,
-                       recver);
-          break;
-        default:; // error?
-        }
-        if (run_validate)
-          validate(sendbuf, recvbuf, numbytes, patterns[i], test);
-        else {
-#ifndef BENCH_CALIPER
-          test.measure(5, 10, numbytes * numproc);
+  int source = 0, dest = 1;
+
+  if (args.find("source") != args.end()) {
+    try {
+      source = std::stoi(args["source"][0]);
+    } catch (...) {
+      FATAL_ERROR("invalid input \"%s\" to --source.", args["source"][0].c_str());
+    }
+  }
+
+  if (args.find("dest") != args.end()) {
+    try {
+      dest = std::stoi(args["dest"][0]);
+    } catch (...) {
+      FATAL_ERROR("invalid input \"%s\" to --dest.", args["dest"][0].c_str());
+    }
+  }
+  
+  allocate(sendbuf, numbytes * numproc);
+  allocate(recvbuf, numbytes * numproc);
+
+  for (int j = 0; j < steps.size(); j++) {
+    const auto &patterns = steps[j].patterns;
+    for (int i = 0; i < patterns.size(); i++) {
+      Comm<int> test(steps[j].lib);
+      switch (patterns[i]) {
+      case pattern::p2p:
+        test.add(sendbuf, recvbuf, numbytes, source, dest);
+        break;
+      case pattern::broadcast:
+        for (int p = 0; p < numproc; p++)
+          test.add(sendbuf, 0, recvbuf, 0, numbytes, source, p);
+        break;
+      case pattern::gather:
+        for (int p = 0; p < numproc; p++)
+          test.add(sendbuf, 0, recvbuf, p * numbytes, numbytes, p, dest);
+        break;
+      case pattern::scatter:
+        for (int p = 0; p < numproc; p++)
+          test.add(sendbuf, p * numbytes, recvbuf, 0, numbytes, ROOT, p);
+        break;
+      case pattern::alltoall:
+        for (int sender = 0; sender < numproc; sender++)
+          for (int recver = 0; recver < numproc; recver++)
+            test.add(sendbuf, recver * numbytes, recvbuf, sender * numbytes,
+                     numbytes, sender, recver);
+        break;
+      case pattern::allgather:
+        for (int sender = 0; sender < numproc; sender++)
+          for (int recver = 0; recver < numproc; recver++)
+            test.add(sendbuf, 0, recvbuf, sender * numbytes, numbytes, sender,
+                     recver);
+        break;
+      default:; // error?
+      }
+      if (run_validate)
+        validate(sendbuf, recvbuf, numbytes, patterns[i], test, source, dest);
+      else {
+#ifdef BENCH_CALIPER
+        test.measure_caliper(5, 10);
 #else
-          test.measure_caliper(5, 10);
+        test.measure(5, 10, numbytes);
 #endif
-        }
       }
     }
-    free(sendbuf);
-    free(recvbuf);
+  }
+  free(sendbuf);
+  free(recvbuf);
 
-    MPI_Finalize();
+  MPI_Finalize();
 
-    return 0;
-  }
+  return 0;
+}
diff --git a/parse.py b/parse.py
new file mode 100755
index 0000000..c4f4fec
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,82 @@
+import os
+import pickle as pkl
+import pandas as pd
+import numpy as np
+
+def parse_osu(file_path: str) -> pd.DataFrame:
+    dtypes = {"size": np.int64, "avg": np.float64, "min": np.float64, "max": np.float64}
+    with open(file_path, mode="r") as file:
+        for _ in range(4):
+            file.readline()
+        
+        data = [line.split() for line in file]
+        df = pd.DataFrame(data, columns=("size", "avg", "min", "max", "iter")).drop(["iter"], axis=1)
+        for col in df.columns:
+            df[col] = df[col].astype(dtypes[col])
+
+        df = df[np.log2(df["size"]) % 2 == 0].reset_index(drop=True)
+
+    return df
+
+def parse_p2p(folder: str, out_file: str) -> dict:
+    dfs: dict[str, pd.DataFrame] = {}
+
+    for comm in os.listdir(folder):
+        data = []
+        print(comm)
+        for file_name in os.listdir(f"{folder}/{comm}"):
+            parts = file_name.removesuffix(".out").split(sep='_')
+            with open(f"{folder}/{comm}/{file_name}", "r") as file:
+                times = [np.float64(line.split(sep=' ')[1]) for line in file]
+            data.append([np.int64(parts[2]) * 4, np.int32(parts[0]), np.int32(parts[1]), *times])
+        dfs[comm] = pd.DataFrame(data, columns=("size", "source", "dest", "min", "med", "max", "avg"))
+
+    with open(out_file, mode="wb") as file:
+        pkl.dump(dfs, file)
+
+def parse_alltoall(folder: str, out_file: str) -> dict:
+    dfs: dict[str, pd.DataFrame] = {}
+
+    for comm in os.listdir(folder):
+        if comm == "mpi":
+            dfs[comm] = parse_osu(f"{folder}/{comm}/osu.out")
+            continue
+        data = []
+        print(comm)
+        for file_name in os.listdir(f"{folder}/{comm}"):
+            size = np.int64(file_name.removesuffix(".out")) * 4
+            with open(f"{folder}/{comm}/{file_name}", "r") as file:
+                times = [np.float64(line.split(sep=' ')[1]) for line in file]
+            data.append([size, *times])
+        dfs[comm] = pd.DataFrame(data, columns=("size", "min", "med", "max", "avg"))
+
+    with open(out_file, mode="wb") as file:
+        pkl.dump(dfs, file)
+
+def parse_gather_broadcast(folder: str, out_file: str) -> dict:
+    dfs: dict[str, pd.DataFrame] = {}
+
+    for comm in os.listdir(folder):
+        if comm == "mpi":
+            dfs[comm] = parse_osu(f"{folder}/{comm}/osu.out")
+            continue
+        print(comm)
+        data = []
+        for file_name in os.listdir(f"{folder}/{comm}"):
+            parts = file_name.removesuffix(".out").split(sep='_')
+            with open(f"{folder}/{comm}/{file_name}", "r") as file:
+                times = [np.float64(line.split(sep=' ')[1]) for line in file] # min, med, max, avg
+            data.append([np.int64(parts[1]) * 4, np.int32(parts[0]), *times])
+        dfs[comm] = pd.DataFrame(data, columns=("size", "root", "min", "med", "max", "avg")) # size, root, min, med, max, avg
+
+    # print(dfs)
+
+    with open(out_file, mode="wb") as file:
+        pkl.dump(dfs, file)
+
+folder = "data/bidir"
+partitions = ["SPX", "TPX", "CPX"]
+ 
+for modality in partitions:
+    parse_p2p(f"{folder}/{modality}", f"{folder}/{modality}_data.pkl")
+# parse_osu("/pscratch/agatram/gather/SPX/mpi/osu.out")
\ No newline at end of file
diff --git a/tests/ipc/CMakeLists.txt b/tests/ipc/CMakeLists.txt
index d03e1f0..f745f85 100644
--- a/tests/ipc/CMakeLists.txt
+++ b/tests/ipc/CMakeLists.txt
@@ -93,4 +93,4 @@ set_tests_properties(IPC_GET_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "P
 add_test(NAME IPC_GET_ALLGATHER_Test
          COMMAND ${MPIEXEC} -n 3 $<TARGET_FILE:ipc_get_allgather_test>
          )
-set_tests_properties(IPC_GET_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
\ No newline at end of file
+set_tests_properties(IPC_GET_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
diff --git a/tests/mpi/CMakeLists.txt b/tests/mpi/CMakeLists.txt
index 3470b4f..dbc6253 100644
--- a/tests/mpi/CMakeLists.txt
+++ b/tests/mpi/CMakeLists.txt
@@ -49,4 +49,4 @@ set_tests_properties(MPI_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSE
 add_test(NAME MPI_ALLGATHER_Test
          COMMAND ${MPIEXEC} -n 3 $<TARGET_FILE:mpi_allgather_test>
          )
-set_tests_properties(MPI_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
\ No newline at end of file
+set_tests_properties(MPI_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
diff --git a/tests/xccl/CMakeLists.txt b/tests/xccl/CMakeLists.txt
index 1fb1f7e..94a34fc 100644
--- a/tests/xccl/CMakeLists.txt
+++ b/tests/xccl/CMakeLists.txt
@@ -48,4 +48,4 @@ set_tests_properties(XCCL_ALLTOALL_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASS
 add_test(NAME XCCL_ALLGATHER_Test
          	COMMAND ${MPIEXEC} -n 3 $<TARGET_FILE:xccl_allgather_test>
          )
-set_tests_properties(XCCL_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
\ No newline at end of file
+set_tests_properties(XCCL_ALLGATHER_Test PROPERTIES PASS_REGULAR_EXPRESSION "PASSED!")
diff --git a/visualize.py b/visualize.py
new file mode 100644
index 0000000..afd0e17
--- /dev/null
+++ b/visualize.py
@@ -0,0 +1,44 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+
+def get_data(folder: str, gpus: int, size: int) -> np.ndarray:
+    arr = np.empty((gpus, gpus))
+
+    for file_name in os.listdir(folder):
+        parts = file_name.removesuffix(".out").split(sep='_')
+        if parts[2] == str(size):
+            with open(f"{folder}/{file_name}", "r") as file:
+                min_time = file.readline().split(sep=' ')[1]
+            arr[int(parts[0])][int(parts[1])] = min_time
+
+    return arr
+
+nproc = 12
+fig, axes = plt.subplots(2, 2, figsize=(24, 24), constrained_layout=True)
+folder = "data_aws_xccl"
+lib = "xccl"
+lib_title = "XCCL"
+
+for i in range(4):
+    ax = axes[i // 2][i % 2]
+    nbytes = 2 ** (8 * (i + 1) - 2)
+    arr = get_data(f"{folder}/{lib}", nproc, nbytes) 
+
+    ax.imshow(arr, origin="lower")
+    ax.set_xticks(np.arange(nproc))
+    ax.set_yticks(np.arange(nproc))
+
+    ax.set_xlabel("Source")
+    ax.set_ylabel("Destination")
+    ax.set_title(f"{nbytes * 4} bytes")
+
+    # Loop over data dimensions and create text annotations.
+    for i in range(nproc):
+        for j in range(nproc):
+            entry = arr[i, j]
+            text = ax.text(j, i, f"{entry:.1f}",
+                        ha="center", va="center", color="w", fontsize=6)
+
+fig.suptitle(f"{lib_title} Latency")
+fig.savefig(f"vis/{lib}.png", dpi=400)
diff --git a/visualize_df.py b/visualize_df.py
new file mode 100755
index 0000000..a9c361d
--- /dev/null
+++ b/visualize_df.py
@@ -0,0 +1,227 @@
+import pickle as pkl
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+
+col_names = { "ipc_get": "IPC Get", "ipc_put": "IPC Put", "ipc_get_blit": "IPC Get BLIT Kernel", "ipc_put_blit": "IPC Put BLIT Kernel", "xccl": "XCCL", "mpi": "MPI", "ipc_get_no_sdma": "IPC Get SDMA Disabled",  "ipc_put_no_sdma": "IPC Put SDMA Disabled"}
+
+def heatmap(ax, arr, nbytes, nproc, bandwidth=False):
+    ax.imshow(arr, origin="lower", vmin=np.nanmin(arr), vmax=np.nanmax(arr))
+    ax.set_xticks(np.arange(nproc))
+    ax.set_yticks(np.arange(nproc))
+
+    ax.set_xlabel("Source")
+    ax.set_ylabel("Destination")
+    ax.set_title(f"{nbytes} bytes")
+
+    # Loop over data dimensions and create text annotations.
+    for i in range(nproc):
+        for j in range(nproc):
+            if i == j:
+                continue
+            entry = arr[i, j]
+            if bandwidth:
+                entry_text = f"{entry:.4f}"
+            else:
+                entry_text = f"{entry:.1f}"
+            text = ax.text(j, i, entry_text,
+                        ha="center", va="center", color="w", fontsize=6)
+
+def plot_p2p(data_folder, graph_folder, modality):
+    cols = ("size", "source", "dest", "min", "med", "max", "avg")
+    # modalities = [("SPX", 4, 8), ("TPX", 12, 16), ("CPX", 24, 24)]
+    modalities = {"SPX": (4, 8), "TPX": (12, 6), "CPX": (24, 24)}
+    sizes_bw = (2 ** 20, 2 ** 24, 2 ** 28, 2 ** 32)
+    sizes_lat = (2 ** 8, 2 ** 12, 2 ** 16, 2 ** 20)
+    nproc, figsize = modalities[modality]
+    with open(f"{data_folder}/{modality}_data.pkl", mode="rb") as file:
+        dfs = pkl.load(file)
+
+    os.makedirs(os.path.join(graph_folder, f"{modality}_bandwidth"), exist_ok=True)
+    os.makedirs(os.path.join(graph_folder, f"{modality}_latency"), exist_ok=True)
+
+    for col, df_loc in dfs.items():
+        groups = df_loc.groupby(["size"])
+
+        fig_bw, axes_bw = plt.subplots(2, 2, figsize=(figsize, figsize), constrained_layout=True)
+        for plot_i, size in enumerate(sizes_bw):
+            df_size = groups.get_group(size)
+
+            arr_bw = np.empty((nproc, nproc), dtype=np.float64)
+
+            for i, j, val in df_size[["source", "dest", "min"]].itertuples(index=False):
+                arr_bw[i][j] = size / val * 1e-3 if i != j else np.nan
+            
+            ax_bw = axes_bw[plot_i // 2][plot_i % 2]
+
+            heatmap(ax_bw, arr_bw, size, nproc, True)
+
+        fig_lat, axes_lat = plt.subplots(2, 2, figsize=(figsize, figsize), constrained_layout=True)
+        for plot_i, size in enumerate(sizes_lat):
+            df_size = groups.get_group(size)
+
+            arr_lat = np.empty((nproc, nproc), dtype=np.float64)
+
+            for i, j, val in df_size[["source", "dest", "min"]].itertuples(index=False):
+                arr_lat[i][j] = val if i != j else np.nan
+            ax_lat = axes_lat[plot_i // 2][plot_i % 2]
+            heatmap(ax_lat, arr_lat, size, nproc, False)
+        
+        fig_bw.suptitle(f"{col_names[col]} Bandwidth in GB/s")
+        fig_bw.savefig(f"{graph_folder}/{modality}_bandwidth/{col}.png", dpi=400)
+        fig_lat.suptitle(f"{col_names[col]} Latency in µs")
+        fig_lat.savefig(f"{graph_folder}/{modality}_latency/{col}.png", dpi=400)
+
+        plt.close(fig_bw)
+        plt.close(fig_lat)
+
+
+def plot_p2p_alt(data_folder, graph_folder, modality):
+    cols = ("size", "source", "dest", "min", "med", "max", "avg")
+    # modalities = [("SPX", 4, [1, 2]), ("TPX", 12, [1, 2, 8]), ("CPX", 24, [1, 2, 14])]
+    modalities = {"SPX": (6, [2]), "TPX": (12, [2, 8]), "CPX": (12, [2, 14])}
+    sizes_bw = (2 ** 18, 2 ** 22, 2 ** 26, 2 ** 30)
+    sizes_lat = (2 ** 6, 2 ** 10, 2 ** 14, 2 ** 18)
+    width, dests = modalities[modality]
+    with open(f"{data_folder}/{modality}_data.pkl", mode="rb") as file:
+        dfs = pkl.load(file)
+
+    os.makedirs(graph_folder, exist_ok=True)
+
+    fig_lat, axes_lat = plt.subplots(1, len(dests), figsize=(width, 5), constrained_layout=True)
+    fig_bw, axes_bw = plt.subplots(1, len(dests), figsize=(width, 5), constrained_layout=True)
+    if len(dests) == 1:
+        axes_lat = [axes_lat]
+        axes_bw = [axes_bw]
+    for col, df_loc in dfs.items():
+        df_loc = df_loc.sort_values(by=["size"], ignore_index=True)
+        groups = df_loc.groupby(["source", "dest"])
+        for ax_lat, ax_bw, dest in zip(axes_lat, axes_bw, dests):
+            data = groups.get_group((1, dest)).reset_index(drop=True)
+            # print(col, dest, data)
+            ax_lat.plot(data["size"], data["min"], label=col_names[col])
+            ax_bw.plot(data["size"], data["size"] / data["min"] * 1e-3, label=col_names[col])
+
+    for ax_lat, ax_bw, dest in zip(axes_lat, axes_bw, dests):
+        ax_lat.set_xscale("log")
+        ax_lat.set_yscale("log")
+        ax_lat.set_xlabel("Size (bytes)")
+        ax_lat.set_ylabel("Latency in µs")
+        ax_lat.set_title(f"P2P from 1 to {dest}")
+        ax_lat.legend()
+
+        ax_bw.set_xscale("log")
+        ax_bw.set_xlabel("Size (bytes)")
+        ax_bw.set_ylabel("Bandwidth in GB/s")
+        ax_bw.set_title(f"P2P from 1 to {dest}")
+        ax_bw.legend()
+
+    fig_lat.suptitle(f"Latency of P2P Connections")
+    fig_lat.savefig(f"{graph_folder}/{modality}_latency.png", dpi=400)
+
+    fig_bw.suptitle(f"Bandwidth of P2P Connections")
+    fig_bw.savefig(f"{graph_folder}/{modality}_bandwidth.png", dpi=400)
+
+    plt.close(fig_bw)
+    plt.close(fig_lat)
+
+
+def plot_alltoall(data_folder, graph_folder, modality):
+    cols = ("size", "min", "med", "max", "avg")
+    # modalities = [("SPX", 4), ("TPX", 12), ("CPX", 24)]
+    modalities = {"SPX": 4, "TPX": 4, "CPX": 24}
+    nproc = modalities[modality]
+    with open(f"{data_folder}/{modality}_data.pkl", mode="rb") as file:
+        dfs = pkl.load(file)
+
+    os.makedirs(graph_folder, exist_ok=True)
+
+    fig_lat, ax_lat = plt.subplots(constrained_layout=True)
+    fig_bw, ax_bw = plt.subplots(constrained_layout=True)
+    for col, df_loc in dfs.items():
+        df_loc = df_loc.sort_values(by=["size"], ignore_index=True)
+        ax_lat.plot(df_loc["size"], df_loc["min"], label=col_names[col])
+        ax_bw.plot(df_loc["size"], df_loc["size"] / df_loc["min"] * 1e-3 * nproc * nproc, label=col_names[col])
+
+    ax_lat.set_xscale("log")
+    ax_lat.set_yscale("log")
+    ax_lat.set_xlabel("Size (bytes)")
+    ax_lat.set_ylabel("Latency in µs")
+    ax_lat.legend()
+    fig_lat.suptitle(f"Latency of All-to-All Collective")
+    fig_lat.savefig(f"{graph_folder}/{modality}_latency.png", dpi=400)
+
+
+    ax_bw.set_xscale("log")
+    ax_bw.set_xlabel("Size (bytes)")
+    ax_bw.set_ylabel("Bandwidth in GB/s")
+    ax_bw.legend()
+
+    fig_bw.suptitle(f"Bandwidth of All-to-All Collective")
+    fig_bw.savefig(f"{graph_folder}/{modality}_bandwidth.png", dpi=400)
+
+    plt.close(fig_bw)
+    plt.close(fig_lat)
+
+
+def plot_broadcast_gather(data_folder, graph_folder, modality, gather=False):
+    cols = ("size", "root", "min", "med", "max", "avg")
+    # modalities = [("SPX", 4), ("TPX", 12), ("CPX", 24)]
+
+    modalities = {"SPX": 4, "TPX": 4, "CPX": 24}
+    nproc = modalities[modality]
+
+    title = "Gather" if gather else "Broadcast"
+
+    with open(f"{data_folder}/{modality}_data.pkl", mode="rb") as file:
+        dfs = pkl.load(file)
+
+    os.makedirs(graph_folder, exist_ok=True)
+
+    fig_lat, ax_lat = plt.subplots(constrained_layout=True)
+    fig_bw, ax_bw = plt.subplots(constrained_layout=True)
+
+    bins, ref_ind = max([(df_loc["size"].nunique(), name) for name, df_loc in dfs.items()])
+    sizes = dfs[ref_ind]["size"].astype(np.uint64).unique()
+    sizes.sort()
+    groups = len(dfs)
+
+    for i, (col, df_loc) in enumerate(dfs.items()):
+        # need to create bin labels and scatter in that area
+        df_loc["size"] = df_loc["size"].astype(np.uint64)
+        df_loc = df_loc.sort_values(by=["size"], ignore_index=True)
+        pos = np.searchsorted(sizes, df_loc["size"]) * (groups * 2 + 1) + i + groups / 2
+        ax_lat.scatter(pos, df_loc["min"], label=col_names[col], s=15.0, marker="x", alpha=0.75, linewidths=1.0)
+        ax_bw.scatter(pos, df_loc["size"] / df_loc["min"] * 1e-3 * nproc, label=col_names[col], s=15.0, marker="x", alpha=0.75, linewidths=1.0)
+
+    for i in range(bins):
+        ax_lat.axvline((i + 1) * (2 * groups + 1) - 1, color="gray", linestyle="--", linewidth=0.5)
+        ax_bw.axvline((i + 1) * (2 * groups + 1) - 1, color="gray", linestyle="--", linewidth=0.5)
+    ax_lat.set_xticks(np.arange(0, bins) * (groups * 2 + 1) + groups, sizes, rotation=-60.0, size="small")
+    ax_bw.set_xticks(np.arange(0, bins) * (groups * 2 + 1) + groups, sizes, rotation=-60.0, size="small")
+
+    ax_lat.set_yscale("log")
+    ax_lat.set_xlabel("Size (bytes)")
+    ax_lat.set_ylabel("Latency in µs")
+    ax_lat.legend()
+    fig_lat.suptitle(f"Latency of {title} Collective")
+    fig_lat.savefig(f"{graph_folder}/{modality}_latency.png", dpi=400)
+
+
+    # ax_bw.set_xscale("log")
+    ax_bw.set_xlabel("Size (bytes)")
+    ax_bw.set_ylabel("Bandwidth in GB/s")
+    ax_bw.legend()
+
+    fig_bw.suptitle(f"Bandwidth of {title} Collective")
+    fig_bw.savefig(f"{graph_folder}/{modality}_bandwidth.png", dpi=400)
+
+    plt.close(fig_bw)
+    plt.close(fig_lat)
+
+# plot_alltoall("/pscratch/agatram/alltoall-6.4.1", "vis_new/alltoall-6.4.1", "SPX")
+plot_p2p("data/p2p", "vis/p2p", "SPX")
+# plot_broadcast_gather("/pscratch/agatram/gather-6.4.1", "vis_new/gather-6.4.1", "SPX", True)
+# plot_broadcast_gather("data/broadcast", "vis/broadcast", "SPX", False)
+plot_p2p_alt("data/p2p", "vis/p2p", "SPX")
\ No newline at end of file