From b21468cfa6fc00f90df0157c87de7c5a4d11ee6d Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Wed, 20 Nov 2024 14:31:46 +0100
Subject: [PATCH 01/15] Changed runtime to support multiple devices

---
 lib/runtime/CusanRuntime.cpp              | 107 +++++++++++++---------
 lib/runtime/CusanRuntime.h                |  39 +++++---
 lib/runtime/CusanRuntime_cudaSpecific.cpp |   9 ++
 3 files changed, 101 insertions(+), 54 deletions(-)
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index a859af4..54de591 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -76,14 +76,27 @@ struct PointerAccess {
   AccessState mode{AccessState::kRW};
 };
 
-class Runtime {
+class Runtime;
+
+class Device {
+  friend Runtime;
   // NOTE: assumed to be a ordered map so we can iterate in ascending order
   std::map<const void*, AllocationInfo> allocations_;
   std::map<Stream, TsanFiber> streams_;
   std::map<Event, Stream> events_;
+
+  Device(){
+    //every device has a default stream
+
+  }
+};
+
+class Runtime {
+  std::map<int32_t, Device> devices_;
+  int32_t current_gpu_;
   TsanFiber cpu_fiber_;
   TsanFiber curr_fiber_;
-  bool init_ = false;
+  bool init_;
 
  public:
   static constexpr Stream kDefaultStream = Stream();
@@ -92,15 +105,10 @@ class Runtime {
   static Runtime& get() {
     static Runtime run_t;
     if (!run_t.init_) {
-#ifdef CUSAN_FIBERPOOL
-      TsanFiberPoolInit();
-#endif
-      run_t.cpu_fiber_  = TsanGetCurrentFiber();
-      run_t.curr_fiber_ = run_t.cpu_fiber_;
-
-      // default '0' cuda stream
+      run_t.current_gpu_ = get_current_device();
+      run_t.cpu_fiber_   = TsanGetCurrentFiber();
+      run_t.devices_.insert({run_t.current_gpu_, {}});
       { run_t.register_stream(kDefaultStream); }
-
       run_t.init_ = true;
     }
     return run_t;
@@ -110,26 +118,33 @@ class Runtime {
 
   void operator=(const Runtime&) = delete;
 
-  [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() const {
-    return allocations_;
+  Device& get_device(DeviceID id) {
+    if (devices_.find(id) == devices_.end()) {
+      devices_.insert({id, {}});
+    }
+    return devices_.at(id);
+  }
+
+  [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() {
+    return get_device(current_gpu_).allocations_;
   }
 
   void happens_before() {
     LOG_TRACE("[cusan]    HappensBefore of curr fiber")
+    auto& gpu = get_device(current_gpu_);
     TsanHappensBefore(curr_fiber_);
     stats_recorder.inc_TsanHappensBefore();
   }
 
   void switch_to_cpu() {
     LOG_TRACE("[cusan]    Switch to cpu")
-    // if we where one a default stream we should also post sync
-    // meaning that all work submitted after from the cpu should also be run after the default kernels are done
-    // TODO: double check with blocking
-    auto search_result = streams_.find(Runtime::kDefaultStream);
-    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
+    auto& gpu = get_device(current_gpu_);
+
+    auto search_result = gpu.streams_.find(Runtime::kDefaultStream);
+    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
     if (curr_fiber_ == search_result->second) {
       LOG_TRACE("[cusan]        syncing all other blocking GPU streams to run after since its default stream")
-      for (const auto& [s, sync_var] : streams_) {
+      for (const auto& [s, sync_var] : gpu.streams_) {
         if (s.isBlocking && !s.isDefaultStream()) {
           LOG_TRACE("[cusan]        happens before " << s.handle)
           TsanHappensBefore(sync_var);
@@ -145,28 +160,31 @@ class Runtime {
   }
 
   void register_stream(Stream stream) {
+    auto& gpu = get_device(current_gpu_);
+
     static uint32_t n_streams = 0;
-    auto search_result        = streams_.find(stream);
-    assert(search_result == streams_.end() && "Registered stream twice");
+    auto search_result        = gpu.streams_.find(stream);
+    assert(search_result == gpu.streams_.end() && "Registered stream twice");
     TsanFiber fiber = TsanCreateFiber(0);
     stats_recorder.inc_TsanCreateFiber();
     char name[32];
     snprintf(name, 32, "cuda_stream %u", n_streams++);
     TsanSetFiberName(fiber, name);
-    streams_.insert({stream, fiber});
+    gpu.streams_.insert({stream, fiber});
   }
 
   void switch_to_stream(Stream stream) {
     LOG_TRACE("[cusan]    Switching to stream: " << stream.handle)
-    auto search_result = streams_.find(stream);
-    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
+    auto& gpu          = get_device(current_gpu_);
+    auto search_result = gpu.streams_.find(stream);
+    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
     TsanSwitchToFiber(search_result->second, 0);
     stats_recorder.inc_TsanSwitchToFiber();
     if (search_result->first.isDefaultStream()) {
       LOG_TRACE("[cusan]        syncing all other blocking GPU streams since its default stream")
       // then we are on the default stream and as such want to synchronize behind all other streams
       // unless they are nonBlocking
-      for (auto& [s, sync_var] : streams_) {
+      for (auto& [s, sync_var] : gpu.streams_) {
         if (s.isBlocking && !s.isDefaultStream()) {
           LOG_TRACE("[cusan]        happens after " << s.handle)
           TsanHappensAfter(sync_var);
@@ -179,7 +197,8 @@ class Runtime {
 
   void happens_after_all_streams(bool onlyBlockingStreams = false) {
     LOG_TRACE("[cusan]    happens_after_all_streams but only blocking ones: " << onlyBlockingStreams)
-    for (const auto& [stream, fiber] : streams_) {
+    auto& gpu = get_device(current_gpu_);
+    for (const auto& [stream, fiber] : gpu.streams_) {
       if (!onlyBlockingStreams || stream.isBlocking) {
         LOG_TRACE("[cusan]        happens after " << stream.handle)
         TsanHappensAfter(fiber);
@@ -189,43 +208,51 @@ class Runtime {
   }
 
   void happens_after_stream(Stream stream) {
-    auto search_result = streams_.find(stream);
-    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
+    auto& gpu = get_device(current_gpu_);
+
+    auto search_result = gpu.streams_.find(stream);
+    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
     TsanHappensAfter(search_result->second);
     stats_recorder.inc_TsanHappensAfter();
   }
 
   void record_event(Event event, Stream stream) {
     LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
-    events_[event] = stream;
+    auto& gpu          = get_device(current_gpu_);
+    gpu.events_[event] = stream;
   }
 
   // Sync the event on the current stream
   void sync_event(Event event) {
-    auto search_result = events_.find(event);
-    assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior");
-    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << events_[event].handle)
-    happens_after_stream(events_[event]);
+    auto& gpu = get_device(current_gpu_);
+
+    auto search_result = gpu.events_.find(event);
+    assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior");
+    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << gpu.events_[event].handle)
+    happens_after_stream(gpu.events_[event]);
   }
 
   void insert_allocation(void* ptr, AllocationInfo info) {
-    assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times");
-    allocations_[ptr] = info;
+    auto& gpu = get_device(current_gpu_);
+    assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times");
+    gpu.allocations_[ptr] = info;
   }
 
   void free_allocation(void* ptr, bool must_exist = true) {
-    bool found = allocations_.find(ptr) != allocations_.end();
+    auto& gpu  = get_device(current_gpu_);
+    bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end();
     if (must_exist) {
       assert(found && "Tried to delete a non existent allocation");
     }
     if (found) {
-      allocations_.erase(ptr);
+      gpu.allocations_.erase(ptr);
     }
   }
 
   AllocationInfo* get_allocation_info(const void* ptr) {
-    auto res = allocations_.find(ptr);
-    if (res == allocations_.end()) {
+    auto& gpu = get_device(current_gpu_);
+    auto res  = gpu.allocations_.find(ptr);
+    if (res == gpu.allocations_.end()) {
       // fallback find if it lies within a region
       // for(auto [alloc_ptr, alloc_info]: allocations_){
       //   if(alloc_ptr < ptr && ((const char*)alloc_ptr) + alloc_info.size > ptr){
@@ -269,15 +296,13 @@ class Runtime {
   }
 };
 
-cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from);
-
 }  // namespace cusan::runtime
 
 using namespace cusan::runtime;
 
 namespace helper {
 #ifndef CUSAN_TYPEART
-inline std::optional<size_t> find_memory_alloc_size(const Runtime& runtime, const void* ptr) {
+inline std::optional<size_t> find_memory_alloc_size(Runtime& runtime, const void* ptr) {
   const auto& allocs = runtime.get_allocations();
 
   // if there exists any allocation
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index 8dcaef2..7210bcc 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -8,19 +8,45 @@
 #define LIB_RUNTIME_CUSAN_H_
 #include <cstddef>
 
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+typedef enum cusan_memcpy_kind_t : unsigned int {
+  cusan_MemcpyHostToHost     = 0,
+  cusan_MemcpyHostToDevice   = 1,
+  cusan_MemcpyDeviceToHost   = 2,
+  cusan_MemcpyDeviceToDevice = 3,
+  cusan_MemcpyDefault        = 4,
+} cusan_MemcpyKind;
+
+typedef enum cusan_stream_create_flags_t : unsigned int {
+  cusan_StreamFlagsDefault     = 0,
+  cusan_StreamFlagsNonBlocking = 1,
+} cusan_StreamCreateFlags;
+#ifdef __cplusplus
+}
+#endif
+
 #ifdef __cplusplus
 namespace cusan::runtime {
 using TsanFiber = void*;
 using Event     = const void*;
 using RawStream = const void*;
+using DeviceID = int;
+cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from);
+DeviceID get_current_device();
 }  // namespace cusan::runtime
 using cusan::runtime::Event;
 using cusan::runtime::RawStream;
 using cusan::runtime::TsanFiber;
+using cusan::runtime::DeviceID;
 #else
 #define TsanFiber void*
 #define Event const void*
 #define RawStream const void*
+#define DeviceID int
 #endif
 
 #ifdef __cplusplus
@@ -28,19 +54,6 @@ using cusan::runtime::TsanFiber;
 extern "C" {
 #endif
 
-typedef enum cusan_memcpy_kind_t : unsigned int {
-  cusan_MemcpyHostToHost     = 0,
-  cusan_MemcpyHostToDevice   = 1,
-  cusan_MemcpyDeviceToHost   = 2,
-  cusan_MemcpyDeviceToDevice = 3,
-  cusan_MemcpyDefault        = 4,
-} cusan_MemcpyKind;
-
-typedef enum cusan_stream_create_flags_t : unsigned int {
-  cusan_StreamFlagsDefault     = 0,
-  cusan_StreamFlagsNonBlocking = 1,
-} cusan_StreamCreateFlags;
-
 void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream);
 void _cusan_sync_device();
 void _cusan_event_record(Event event, RawStream stream);
diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp
index 6958db3..8d8f3ec 100644
--- a/lib/runtime/CusanRuntime_cudaSpecific.cpp
+++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp
@@ -10,6 +10,15 @@
 #include <cuda_runtime_api.h>
 
 namespace cusan::runtime {
+
+
+
+DeviceID get_current_device(){
+  DeviceID res;
+  cudaGetDevice(&res);
+  return res;
+}
+
 cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);

From d60e1b41c35999e7098b42d1f7b7808bcc547dc8 Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Wed, 20 Nov 2024 14:32:10 +0100
Subject: [PATCH 02/15] Added suport for setDevice and chooseDevice

---
 lib/pass/AnalysisTransform.cpp | 23 +++++++++++++++
 lib/pass/AnalysisTransform.h   |  2 ++
 lib/pass/CusanPass.cpp         |  2 ++
 lib/pass/FunctionDecl.cpp      |  9 ++++++
 lib/pass/FunctionDecl.h        |  2 ++
 lib/runtime/CusanRuntime.cpp   | 53 ++++++++++++++++++++++------------
 lib/runtime/CusanRuntime.h     |  4 +++
 7 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp
index d539dae..808686d 100644
--- a/lib/pass/AnalysisTransform.cpp
+++ b/lib/pass/AnalysisTransform.cpp
@@ -589,6 +589,29 @@ llvm::SmallVector<Value*> CudaMallocPitch::map_arguments(IRBuilder<>& irb, llvm:
   return {ptr, real_size};
 }
 
+
+// CudaSetDevice
+
+CudaSetDevice::CudaSetDevice(callback::FunctionDecl* decls) {
+  setup("cudaSetDevice", &decls->cusan_set_device.f);
+}
+llvm::SmallVector<Value*> CudaSetDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef<Value*> args) {
+  //cudaSetDevice ( int  device )
+  assert(args.size() == 1);
+  return {args[0]};
+}
+
+// CudaCHooseDevice
+
+CudaChooseDevice::CudaChooseDevice(callback::FunctionDecl* decls) {
+  setup("cudaChooseDevice", &decls->cusan_choose_device.f);
+}
+llvm::SmallVector<Value*> CudaChooseDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef<Value*> args) {
+  //cudaChooseDevice ( int* device, const cudaDeviceProp* prop )
+  assert(args.size() == 2);
+  return {args[0]};
+}
+
 // CudaStreamQuery
 
 CudaStreamQuery::CudaStreamQuery(callback::FunctionDecl* decls) {
diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h
index 97c1bc1..76b49f1 100644
--- a/lib/pass/AnalysisTransform.h
+++ b/lib/pass/AnalysisTransform.h
@@ -212,6 +212,8 @@ BasicInstrumenterDecl(CudaMallocManaged);
 BasicInstrumenterDecl(CudaMalloc);
 BasicInstrumenterDecl(CudaFree);
 BasicInstrumenterDecl(CudaMallocPitch);
+BasicInstrumenterDecl(CudaSetDevice);
+BasicInstrumenterDecl(CudaChooseDevice);
 
 class CudaStreamQuery : public SimpleInstrumenter<CudaStreamQuery> {
  public:
diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp
index 00bc33c..6af51e9 100644
--- a/lib/pass/CusanPass.cpp
+++ b/lib/pass/CusanPass.cpp
@@ -163,6 +163,8 @@ bool CusanPass::runOnFunc(llvm::Function& function) {
   modified |= transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function);
   modified |= transform::StreamCreateWithPriorityInstrumenter(&cusan_decls_).instrument(function);
   modified |= transform::CudaMallocPitch(&cusan_decls_).instrument(function);
+  modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function);
+  modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function);
 
   auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_);
   if (data_for_host) {
diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp
index f58c828..91e029d 100644
--- a/lib/pass/FunctionDecl.cpp
+++ b/lib/pass/FunctionDecl.cpp
@@ -1,4 +1,5 @@
 #include "FunctionDecl.h"
+#include <cstdint>
 
 namespace cusan::callback {
 
@@ -124,6 +125,14 @@ void FunctionDecl::initialize(llvm::Module& module) {
   //  void* devPtr, size_t pitch, size_t width, size_t height
   ArgTypes arg_types_2d_memset = {void_ptr, size_t_ty, size_t_ty, size_t_ty};
   make_function(cusan_memset_2d, arg_types_2d_memset);
+
+  //  int device
+  ArgTypes arg_types_set_device = {Type::getInt32Ty(c)};
+  make_function(cusan_set_device, arg_types_set_device);
+
+  //  void* device
+  ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()};
+  make_function(cusan_choose_device, arg_types_choose_device);
 }
 
 }  // namespace cusan::callback
diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h
index d40e80a..91eabe5 100644
--- a/lib/pass/FunctionDecl.h
+++ b/lib/pass/FunctionDecl.h
@@ -41,6 +41,8 @@ struct FunctionDecl {
   CusanFunction cusan_host_register{"_cusan_host_register"};
   CusanFunction cusan_host_unregister{"_cusan_host_unregister"};
   CusanFunction cusan_device_alloc{"_cusan_device_alloc"};
+  CusanFunction cusan_set_device{"_cusan_set_device"};
+  CusanFunction cusan_choose_device{"_cusan_choose_device"};
   CusanFunction cusan_device_free{"_cusan_device_free"};
   CusanFunction cusan_stream_query{"_cusan_stream_query"};
   CusanFunction cusan_event_query{"_cusan_event_query"};
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 54de591..3d0a00b 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -85,15 +85,14 @@ class Device {
   std::map<Stream, TsanFiber> streams_;
   std::map<Event, Stream> events_;
 
-  Device(){
-    //every device has a default stream
-
+  Device() {
+    // every device has a default stream
   }
 };
 
 class Runtime {
   std::map<int32_t, Device> devices_;
-  int32_t current_gpu_;
+  int32_t current_device_;
   TsanFiber cpu_fiber_;
   TsanFiber curr_fiber_;
   bool init_;
@@ -105,9 +104,9 @@ class Runtime {
   static Runtime& get() {
     static Runtime run_t;
     if (!run_t.init_) {
-      run_t.current_gpu_ = get_current_device();
+      run_t.current_device_ = get_current_device();
       run_t.cpu_fiber_   = TsanGetCurrentFiber();
-      run_t.devices_.insert({run_t.current_gpu_, {}});
+      run_t.devices_.insert({run_t.current_device_, {}});
       { run_t.register_stream(kDefaultStream); }
       run_t.init_ = true;
     }
@@ -125,20 +124,27 @@ class Runtime {
     return devices_.at(id);
   }
 
+  void set_device(DeviceID device){
+    if (devices_.find(device) == devices_.end()) {
+      devices_.insert({device, {}});
+    }
+    current_device_ = device;
+  }
+
   [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() {
-    return get_device(current_gpu_).allocations_;
+    return get_device(current_device_).allocations_;
   }
 
   void happens_before() {
     LOG_TRACE("[cusan]    HappensBefore of curr fiber")
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
     TsanHappensBefore(curr_fiber_);
     stats_recorder.inc_TsanHappensBefore();
   }
 
   void switch_to_cpu() {
     LOG_TRACE("[cusan]    Switch to cpu")
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
 
     auto search_result = gpu.streams_.find(Runtime::kDefaultStream);
     assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
@@ -160,7 +166,7 @@ class Runtime {
   }
 
   void register_stream(Stream stream) {
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
 
     static uint32_t n_streams = 0;
     auto search_result        = gpu.streams_.find(stream);
@@ -175,7 +181,7 @@ class Runtime {
 
   void switch_to_stream(Stream stream) {
     LOG_TRACE("[cusan]    Switching to stream: " << stream.handle)
-    auto& gpu          = get_device(current_gpu_);
+    auto& gpu          = get_device(current_device_);
     auto search_result = gpu.streams_.find(stream);
     assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
     TsanSwitchToFiber(search_result->second, 0);
@@ -197,7 +203,7 @@ class Runtime {
 
   void happens_after_all_streams(bool onlyBlockingStreams = false) {
     LOG_TRACE("[cusan]    happens_after_all_streams but only blocking ones: " << onlyBlockingStreams)
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
     for (const auto& [stream, fiber] : gpu.streams_) {
       if (!onlyBlockingStreams || stream.isBlocking) {
         LOG_TRACE("[cusan]        happens after " << stream.handle)
@@ -208,7 +214,7 @@ class Runtime {
   }
 
   void happens_after_stream(Stream stream) {
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
 
     auto search_result = gpu.streams_.find(stream);
     assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
@@ -218,13 +224,13 @@ class Runtime {
 
   void record_event(Event event, Stream stream) {
     LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
-    auto& gpu          = get_device(current_gpu_);
+    auto& gpu          = get_device(current_device_);
     gpu.events_[event] = stream;
   }
 
   // Sync the event on the current stream
   void sync_event(Event event) {
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
 
     auto search_result = gpu.events_.find(event);
     assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior");
@@ -233,13 +239,13 @@ class Runtime {
   }
 
   void insert_allocation(void* ptr, AllocationInfo info) {
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
     assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times");
     gpu.allocations_[ptr] = info;
   }
 
   void free_allocation(void* ptr, bool must_exist = true) {
-    auto& gpu  = get_device(current_gpu_);
+    auto& gpu  = get_device(current_device_);
     bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end();
     if (must_exist) {
       assert(found && "Tried to delete a non existent allocation");
@@ -250,7 +256,7 @@ class Runtime {
   }
 
   AllocationInfo* get_allocation_info(const void* ptr) {
-    auto& gpu = get_device(current_gpu_);
+    auto& gpu = get_device(current_device_);
     auto res  = gpu.allocations_.find(ptr);
     if (res == gpu.allocations_.end()) {
       // fallback find if it lies within a region
@@ -536,6 +542,17 @@ void _cusan_event_query(Event event, unsigned int err) {
   }
 }
 
+void _cusan_set_device(DeviceID device) {
+  Runtime::get().set_device(device);
+}
+
+void _cusan_choose_device(DeviceID* device) {
+  // does this function ever return a nullptr?
+  // and what would that mean
+  assert(device);
+  Runtime::get().set_device(*device);
+}
+
 void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {
   // The Async versions are always asynchronous with respect to the host.
   auto& runtime = Runtime::get();
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index 7210bcc..c970f41 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -85,6 +85,10 @@ void _cusan_device_alloc(void** ptr, size_t size);
 void _cusan_device_free(void* ptr);
 void _cusan_stream_query(RawStream stream, unsigned int err);
 void _cusan_event_query(Event event, unsigned int err);
+
+void _cusan_set_device(DeviceID device);
+void _cusan_choose_device(DeviceID* device);
+
 #ifdef __cplusplus
 }
 #endif

From f6d6d03e9cd0e2e3bd1c1a3872134272e880121f Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Fri, 13 Dec 2024 17:09:24 +0100
Subject: [PATCH 03/15] Initial multi gpu

---
 lib/runtime/CusanRuntime.cpp              | 231 +++++++++---------
 lib/runtime/CusanRuntime.h                |   2 +-
 lib/runtime/CusanRuntime_cudaSpecific.cpp |   2 +-
 test/CMakeLists.txt                       |   4 +-
 test/multi_gpu/01_device_sync.c           |  89 +++++++
 test/multi_gpu/02_device_sync_event.c     |  75 ++++++
 test/multi_gpu/TSan_External.h            | 276 ++++++++++++++++++++++
 test/multi_gpu/suppressions.txt           |  29 +++
 8 files changed, 588 insertions(+), 120 deletions(-)
 create mode 100644 test/multi_gpu/01_device_sync.c
 create mode 100644 test/multi_gpu/02_device_sync_event.c
 create mode 100644 test/multi_gpu/TSan_External.h
 create mode 100644 test/multi_gpu/suppressions.txt

diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 3d0a00b..bca4839 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -79,78 +79,40 @@ struct PointerAccess {
 class Runtime;
 
 class Device {
-  friend Runtime;
   // NOTE: assumed to be a ordered map so we can iterate in ascending order
   std::map<const void*, AllocationInfo> allocations_;
   std::map<Stream, TsanFiber> streams_;
   std::map<Event, Stream> events_;
-
-  Device() {
-    // every device has a default stream
-  }
-};
-
-class Runtime {
-  std::map<int32_t, Device> devices_;
-  int32_t current_device_;
   TsanFiber cpu_fiber_;
   TsanFiber curr_fiber_;
-  bool init_;
 
  public:
   static constexpr Stream kDefaultStream = Stream();
   Recorder stats_recorder;
 
-  static Runtime& get() {
-    static Runtime run_t;
-    if (!run_t.init_) {
-      run_t.current_device_ = get_current_device();
-      run_t.cpu_fiber_   = TsanGetCurrentFiber();
-      run_t.devices_.insert({run_t.current_device_, {}});
-      { run_t.register_stream(kDefaultStream); }
-      run_t.init_ = true;
-    }
-    return run_t;
-  }
-
-  Runtime(const Runtime&) = delete;
-
-  void operator=(const Runtime&) = delete;
-
-  Device& get_device(DeviceID id) {
-    if (devices_.find(id) == devices_.end()) {
-      devices_.insert({id, {}});
-    }
-    return devices_.at(id);
-  }
-
-  void set_device(DeviceID device){
-    if (devices_.find(device) == devices_.end()) {
-      devices_.insert({device, {}});
-    }
-    current_device_ = device;
+  Device() {
+    // every device has a default stream
+    { register_stream(Device::kDefaultStream); }
+    cpu_fiber_ = TsanGetCurrentFiber();
   }
 
   [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() {
-    return get_device(current_device_).allocations_;
+    return allocations_;
   }
 
   void happens_before() {
     LOG_TRACE("[cusan]    HappensBefore of curr fiber")
-    auto& gpu = get_device(current_device_);
     TsanHappensBefore(curr_fiber_);
     stats_recorder.inc_TsanHappensBefore();
   }
 
   void switch_to_cpu() {
     LOG_TRACE("[cusan]    Switch to cpu")
-    auto& gpu = get_device(current_device_);
-
-    auto search_result = gpu.streams_.find(Runtime::kDefaultStream);
-    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
+    auto search_result = streams_.find(Device::kDefaultStream);
+    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
     if (curr_fiber_ == search_result->second) {
       LOG_TRACE("[cusan]        syncing all other blocking GPU streams to run after since its default stream")
-      for (const auto& [s, sync_var] : gpu.streams_) {
+      for (const auto& [s, sync_var] : streams_) {
         if (s.isBlocking && !s.isDefaultStream()) {
           LOG_TRACE("[cusan]        happens before " << s.handle)
           TsanHappensBefore(sync_var);
@@ -166,31 +128,28 @@ class Runtime {
   }
 
   void register_stream(Stream stream) {
-    auto& gpu = get_device(current_device_);
-
     static uint32_t n_streams = 0;
-    auto search_result        = gpu.streams_.find(stream);
-    assert(search_result == gpu.streams_.end() && "Registered stream twice");
+    auto search_result        = streams_.find(stream);
+    assert(search_result == streams_.end() && "Registered stream twice");
     TsanFiber fiber = TsanCreateFiber(0);
     stats_recorder.inc_TsanCreateFiber();
     char name[32];
     snprintf(name, 32, "cuda_stream %u", n_streams++);
     TsanSetFiberName(fiber, name);
-    gpu.streams_.insert({stream, fiber});
+    streams_.insert({stream, fiber});
   }
 
   void switch_to_stream(Stream stream) {
     LOG_TRACE("[cusan]    Switching to stream: " << stream.handle)
-    auto& gpu          = get_device(current_device_);
-    auto search_result = gpu.streams_.find(stream);
-    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
+    auto search_result = streams_.find(stream);
+    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
     TsanSwitchToFiber(search_result->second, 0);
     stats_recorder.inc_TsanSwitchToFiber();
     if (search_result->first.isDefaultStream()) {
       LOG_TRACE("[cusan]        syncing all other blocking GPU streams since its default stream")
       // then we are on the default stream and as such want to synchronize behind all other streams
       // unless they are nonBlocking
-      for (auto& [s, sync_var] : gpu.streams_) {
+      for (auto& [s, sync_var] : streams_) {
         if (s.isBlocking && !s.isDefaultStream()) {
           LOG_TRACE("[cusan]        happens after " << s.handle)
           TsanHappensAfter(sync_var);
@@ -203,8 +162,7 @@ class Runtime {
 
   void happens_after_all_streams(bool onlyBlockingStreams = false) {
     LOG_TRACE("[cusan]    happens_after_all_streams but only blocking ones: " << onlyBlockingStreams)
-    auto& gpu = get_device(current_device_);
-    for (const auto& [stream, fiber] : gpu.streams_) {
+    for (const auto& [stream, fiber] : streams_) {
       if (!onlyBlockingStreams || stream.isBlocking) {
         LOG_TRACE("[cusan]        happens after " << stream.handle)
         TsanHappensAfter(fiber);
@@ -214,51 +172,43 @@ class Runtime {
   }
 
   void happens_after_stream(Stream stream) {
-    auto& gpu = get_device(current_device_);
-
-    auto search_result = gpu.streams_.find(stream);
-    assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior");
+    auto search_result = streams_.find(stream);
+    assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
     TsanHappensAfter(search_result->second);
     stats_recorder.inc_TsanHappensAfter();
   }
 
   void record_event(Event event, Stream stream) {
     LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
-    auto& gpu          = get_device(current_device_);
-    gpu.events_[event] = stream;
+    events_[event] = stream;
   }
 
   // Sync the event on the current stream
   void sync_event(Event event) {
-    auto& gpu = get_device(current_device_);
-
-    auto search_result = gpu.events_.find(event);
-    assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior");
-    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << gpu.events_[event].handle)
-    happens_after_stream(gpu.events_[event]);
+    auto search_result = events_.find(event);
+    assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior");
+    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << events_[event].handle)
+    happens_after_stream(events_[event]);
   }
 
   void insert_allocation(void* ptr, AllocationInfo info) {
-    auto& gpu = get_device(current_device_);
-    assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times");
-    gpu.allocations_[ptr] = info;
+    assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times");
+    allocations_[ptr] = info;
   }
 
   void free_allocation(void* ptr, bool must_exist = true) {
-    auto& gpu  = get_device(current_device_);
-    bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end();
+    bool found = allocations_.find(ptr) != allocations_.end();
     if (must_exist) {
       assert(found && "Tried to delete a non existent allocation");
     }
     if (found) {
-      gpu.allocations_.erase(ptr);
+      allocations_.erase(ptr);
     }
   }
 
   AllocationInfo* get_allocation_info(const void* ptr) {
-    auto& gpu = get_device(current_device_);
-    auto res  = gpu.allocations_.find(ptr);
-    if (res == gpu.allocations_.end()) {
+    auto res = allocations_.find(ptr);
+    if (res == allocations_.end()) {
       // fallback find if it lies within a region
       // for(auto [alloc_ptr, alloc_info]: allocations_){
       //   if(alloc_ptr < ptr && ((const char*)alloc_ptr) + alloc_info.size > ptr){
@@ -270,10 +220,7 @@ class Runtime {
     return &res->second;
   }
 
- private:
-  Runtime() = default;
-
-  ~Runtime() {
+  void output_statistics() {
 #undef cusan_stat_handle
 #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name()));
 #if CUSAN_SOFTCOUNTER
@@ -295,6 +242,56 @@ class Runtime {
 #endif
 #undef cusan_stat_handle
 #undef CUSAN_CUDA_EVENT_LIST
+  }
+};
+
+class Runtime {
+  std::map<int32_t, Device> devices_;
+  int32_t current_device_;
+  bool init_;
+
+ public:
+  static Runtime& get() {
+    static Runtime run_t;
+    if (!run_t.init_) {
+      run_t.current_device_ = get_current_device_id();
+      run_t.devices_.insert({run_t.current_device_, {}});
+      run_t.init_ = true;
+    }
+    return run_t;
+  }
+
+  Runtime(const Runtime&) = delete;
+
+  void operator=(const Runtime&) = delete;
+
+  Device& get_current_device() {
+    return devices_.at(current_device_);
+  }
+
+  Device& get_device(DeviceID id) {
+    if (devices_.find(id) == devices_.end()) {
+      devices_.insert({id, {}});
+    }
+    return devices_.at(id);
+  }
+
+  void set_device(DeviceID device) {
+    if (devices_.find(device) == devices_.end()) {
+      devices_.insert({device, {}});
+    }
+    current_device_ = device;
+  }
+
+ private:
+  Runtime() = default;
+
+  ~Runtime() {
+#if CUSAN_SOFTCOUNTER
+  for(auto& [_, device]: devices_){
+    device.output_statistics();
+  }
+#endif
 
 #ifdef CUSAN_FIBERPOOL
     // TsanFiberPoolFini();
@@ -308,7 +305,7 @@ using namespace cusan::runtime;
 
 namespace helper {
 #ifndef CUSAN_TYPEART
-inline std::optional<size_t> find_memory_alloc_size(Runtime& runtime, const void* ptr) {
+inline std::optional<size_t> find_memory_alloc_size(Device& runtime, const void* ptr) {
   const auto& allocs = runtime.get_allocations();
 
   // if there exists any allocation
@@ -335,7 +332,7 @@ inline std::optional<size_t> find_memory_alloc_size(Runtime& runtime, const void
   return {};
 }
 #else
-inline std::optional<size_t> find_memory_alloc_size(const Runtime&, const void* ptr) {
+inline std::optional<size_t> find_memory_alloc_size(const Device&, const void* ptr) {
   size_t alloc_size{0};
   int alloc_id{0};
   auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size);
@@ -354,7 +351,7 @@ inline std::optional<size_t> find_memory_alloc_size(const Runtime&, const void*
 
 void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream) {
   LOG_TRACE("[cusan]Kernel Register with " << n << " Args and on stream:" << stream)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
 
   llvm::SmallVector<size_t, 4> sizes;
   for (int i = 0; i < n; ++i) {
@@ -403,21 +400,21 @@ void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream s
 
 void _cusan_sync_device() {
   LOG_TRACE("[cusan]Sync Device\n")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_sync_device_calls();
   runtime.happens_after_all_streams();
 }
 
 void _cusan_event_record(Event event, RawStream stream) {
   LOG_TRACE("[cusan]Event Record")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_event_record_calls();
   runtime.record_event(event, Stream(stream));
 }
 
 void _cusan_sync_stream(RawStream raw_stream) {
   LOG_TRACE("[cusan]Sync Stream" << raw_stream)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_sync_stream_calls();
   const auto stream = Stream(raw_stream);
   if (stream.isDefaultStream()) {
@@ -431,28 +428,28 @@ void _cusan_sync_stream(RawStream raw_stream) {
 
 void _cusan_sync_event(Event event) {
   LOG_TRACE("[cusan]Sync Event" << event)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_sync_event_calls();
   runtime.sync_event(event);
 }
 
 void _cusan_create_event(Event*) {
   LOG_TRACE("[cusan]create event")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_create_event_calls();
 }
 
 void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) {
   LOG_TRACE("[cusan]create stream with flags: " << flags
                                                 << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking))
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_create_stream_calls();
   runtime.register_stream(Stream(*stream, !(bool)(flags & cusan_StreamFlagsNonBlocking)));
 }
 
 void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) {
   LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_stream_wait_event_calls();
   runtime.switch_to_stream(Stream(stream));
   runtime.sync_event(event);
@@ -464,7 +461,7 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) {
   // at least based of this presentation and some comments in the cuda forums this syncs the whole device
   //  https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf
   LOG_TRACE("[cusan]host alloc " << *ptr << " with size " << size)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_alloc_calls();
   // runtime.happens_after_all_streams();
 
@@ -473,27 +470,27 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) {
 
 void _cusan_host_free(void* ptr) {
   LOG_TRACE("[cusan]host free")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_free_calls();
   runtime.free_allocation(ptr);
 }
 
 void _cusan_host_register(void* ptr, size_t size, unsigned int) {
   LOG_TRACE("[cusan]host register " << ptr << " with size:" << size);
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_register_calls();
   runtime.insert_allocation(ptr, AllocationInfo::Pinned(size));
 }
 void _cusan_host_unregister(void* ptr) {
   LOG_TRACE("[cusan]host unregister " << ptr);
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_unregister_calls();
   runtime.free_allocation(ptr);
 }
 
 void _cusan_managed_alloc(void** ptr, size_t size, unsigned int) {
   LOG_TRACE("[cusan]Managed host alloc " << *ptr << " with size " << size << " -> implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_managed_alloc_calls();
   runtime.happens_after_all_streams();
   runtime.insert_allocation(*ptr, AllocationInfo::Managed(size));
@@ -503,7 +500,7 @@ void _cusan_device_alloc(void** ptr, size_t size) {
   // implicit syncs device
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator
   LOG_TRACE("[cusan]Device alloc " << *ptr << " with size " << size << " -> implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_device_alloc_calls();
 
   runtime.insert_allocation(*ptr, AllocationInfo::Device(size));
@@ -514,7 +511,7 @@ void _cusan_device_free(void* ptr) {
   // implicit syncs device
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator
   LOG_TRACE("[cusan]Device free " << ptr << " -> TODO maybe implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_device_free_calls();
   runtime.happens_after_all_streams();
 }
@@ -522,7 +519,7 @@ void _cusan_device_free(void* ptr) {
 // TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_stream_query(RawStream stream, unsigned int err) {
   LOG_TRACE("[cusan] Stream query " << stream << " -> " << err)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_stream_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
@@ -534,7 +531,7 @@ void _cusan_stream_query(RawStream stream, unsigned int err) {
 // TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_event_query(Event event, unsigned int err) {
   LOG_TRACE("[cusan] Event query " << event << " -> " << err)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_event_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
@@ -555,7 +552,7 @@ void _cusan_choose_device(DeviceID* device) {
 
 void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {
   // The Async versions are always asynchronous with respect to the host.
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memset_async_calls();
   runtime.switch_to_stream(Stream(stream));
   TsanMemoryWritePC(target, count, __builtin_return_address(0));
@@ -566,9 +563,9 @@ void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {
 void _cusan_memset_impl(void* target, size_t count) {
   // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host
   // memory.
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memset_calls();
-  runtime.switch_to_stream(Runtime::kDefaultStream);
+  runtime.switch_to_stream(Device::kDefaultStream);
   LOG_TRACE("[cusan]    " << "Write to " << target << " with size: " << count)
   TsanMemoryWritePC(target, count, __builtin_return_address(0));
   runtime.stats_recorder.inc_TsanMemoryWrite();
@@ -579,7 +576,7 @@ void _cusan_memset_impl(void* target, size_t count) {
   // if we couldn't find alloc info we just assume the worst and don't sync
   if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) {
     LOG_TRACE("[cusan]    " << "Memset is blocking")
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else {
     LOG_TRACE("[cusan]    " << "Memset is not blocking")
     if (!alloc_info) {
@@ -611,7 +608,7 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) {
 
 void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size,
                               cusan_MemcpyKind kind, RawStream stream) {
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memcpy_async_calls();
   if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) {
     // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the
@@ -649,23 +646,23 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     kind = infer_memcpy_direction(target, from);
   }
 
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memcpy_calls();
   if (CUSAN_SYNC_DETAIL_LEVEL == 0) {
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
     // In this mode: Memcpy always blocks, no detailed view w.r.t. memory direction
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyDeviceToDevice) {
     // 4. For transfers from device memory to device memory, no host-side synchronization is performed.
     LOG_TRACE("[cusan]   DefaultStream")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
@@ -675,14 +672,14 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy
     // has completed.
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyHostToDevice) {
     // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is
     // initiated.
@@ -690,7 +687,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     auto* alloc_info = runtime.get_allocation_info(from);
     // if we couldn't find alloc info we just assume the worst and don't sync
     if (alloc_info && !alloc_info->is_pinned) {
-      runtime.happens_after_stream(Runtime::kDefaultStream);
+      runtime.happens_after_stream(Device::kDefaultStream);
       LOG_TRACE("[cusan]   DefaultStream+Blocking")
     } else {
       LOG_TRACE("[cusan]   DefaultStream")
@@ -699,20 +696,20 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     //   device memory
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyHostToHost) {
     // 5. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the
     // host.
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index c970f41..784f805 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -36,7 +36,7 @@ using Event     = const void*;
 using RawStream = const void*;
 using DeviceID = int;
 cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from);
-DeviceID get_current_device();
+DeviceID get_current_device_id();
 }  // namespace cusan::runtime
 using cusan::runtime::Event;
 using cusan::runtime::RawStream;
diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp
index 8d8f3ec..fbe7d26 100644
--- a/lib/runtime/CusanRuntime_cudaSpecific.cpp
+++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp
@@ -13,7 +13,7 @@ namespace cusan::runtime {
 
 
 
-DeviceID get_current_device(){
+DeviceID get_current_device_id(){
   DeviceID res;
   cudaGetDevice(&res);
   return res;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 34234fe..047ed65 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -72,7 +72,7 @@ function(cusan_add_lit_target)
       set(TARGET_NAME check-cusan-${suite})
     endif()
 
-    if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging")
+    if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging" AND NOT ${suite} STREQUAL "multi_gpu")
       message(WARNING "Could not find suitable lit test target at ${SUITE_PATH}")
       continue()
     endif()
@@ -105,6 +105,7 @@ set(CUSAN_SUITES
   pass
   kernel_analysis
   staging
+  multi_gpu
 )
 
 include(ProcessorCount)
@@ -119,6 +120,7 @@ set(CUSAN_SUITES_WORKERS
   ${NUM_CPU}
   ${NUM_CPU}
   1
+  1
 )
 
 cusan_add_lit_target(SUITES ${CUSAN_SUITES} WORKERS ${CUSAN_SUITES_WORKERS})
diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c
new file mode 100644
index 0000000..319b6f4
--- /dev/null
+++ b/test/multi_gpu/01_device_sync.c
@@ -0,0 +1,89 @@
+// clang-format off
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include <cstdio>
+#include <cuda_runtime.h>
+
+__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(delay);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  if (tid < N) {
+    arr[tid] = (tid + 1);
+  }
+}
+
+int main() {
+  const int size            = 256;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid   = (size + threadsPerBlock - 1) / threadsPerBlock;
+  int* managed_data;
+  int* managed_data2;
+
+  int n_devices = 0;
+  cudaGetDeviceCount(&n_devices);
+  if (n_devices < 2) {
+    printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n");
+    return 1;
+  }
+
+
+  cudaSetDevice(0);
+  cudaMallocManaged(&managed_data, size * sizeof(int));
+  cudaMemset(managed_data, 0, size * sizeof(int));
+
+
+  cudaSetDevice(1);
+  cudaMallocManaged(&managed_data2, size * sizeof(int));
+  cudaMemset(managed_data2, 0, size * sizeof(int));
+
+
+
+  cudaSetDevice(0);
+  write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data, size, 1316134912);
+
+  //if we only have the later synchronize we will only syncrhonize the second device
+#ifdef CUSAN_SYNC
+  cudaDeviceSynchronize();
+#endif
+
+  cudaSetDevice(1);
+  write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data2, size, 1);
+  cudaDeviceSynchronize();
+
+  for (int i = 0; i < size; i++) {
+    if (managed_data[i] == 0) {
+      printf("[Error] sync managed_data %i\n", managed_data[i]);
+      break;
+    }
+  }
+    for (int i = 0; i < size; i++) {
+    if (managed_data2[i] == 0) {
+      printf("[Error] sync managed_data2 %i\n", managed_data[i]);
+      break;
+    }
+  }
+
+
+
+  cudaFree(managed_data);
+  cudaFree(managed_data2);
+  return 0;
+}
diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c
new file mode 100644
index 0000000..2317e29
--- /dev/null
+++ b/test/multi_gpu/02_device_sync_event.c
@@ -0,0 +1,75 @@
+// clang-format off
+// RUN: %wrapper-cc %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+// XFAIL: *
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void kernel(int* data) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(1000000U);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  data[tid] = (tid + 1);
+}
+
+int main() {
+  const int size            = 256;
+  const int threadsPerBlock = 256;
+  const int blocksPerGrid   = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int n_devices = 0;
+  cudaGetDeviceCount(&n_devices);
+  if (n_devices < 2) {
+    printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n");
+    return 1;
+  }
+
+  cudaSetDevice(0);
+
+  int* d_data;  // Unified Memory pointer
+
+  // Allocate Unified Memory
+  cudaMallocManaged(&d_data, size * sizeof(int));
+  cudaMemset(d_data, 0, size * sizeof(int));
+
+  cudaEvent_t endEvent;
+  cudaEventCreate(&endEvent);
+  kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data);
+  cudaEventRecord(endEvent);
+
+  cudaSetDevice(1);
+
+#ifdef CUSAN_SYNC
+  cudaEventSynchronize(endEvent);
+#endif
+
+  for (int i = 0; i < size; i++) {
+    if (d_data[i] < 1) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  cudaEventDestroy(endEvent);
+  cudaFree(d_data);
+
+  return 0;
+}
diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h
new file mode 100644
index 0000000..36849c9
--- /dev/null
+++ b/test/multi_gpu/TSan_External.h
@@ -0,0 +1,276 @@
+/* Part of the MUST Project, under BSD-3-Clause License
+ * See https://hpc.rwth-aachen.de/must/LICENSE for license information.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file TSan_External.h
+ * Functions exported from ThreadSaniziter that can be used for dynamic annotations.
+ * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface.cpp
+ * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
+ */
+
+#ifndef TSAN_EXTERNAL_H
+#define TSAN_EXTERNAL_H
+
+typedef unsigned long uptr;
+
+typedef enum { mo_relaxed, mo_consume, mo_acquire, mo_release, mo_acq_rel, mo_seq_cst } morder;
+
+typedef unsigned char a8;
+typedef unsigned short a16;
+typedef unsigned int a32;
+typedef unsigned long long a64;
+
+#ifdef MUST_DEBUG
+// Print an error message *once* if an annotation function is used that is not overwritten by the
+// TSan runtime
+#define FALLBACK_PRINT(func_name)                                                  \
+  {                                                                                \
+    static bool once = false;                                                      \
+    if (!once) {                                                                   \
+      printf(                                                                      \
+          "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \
+          "signature\n",                                                           \
+          func_name);                                                              \
+      once = true;                                                                 \
+    }                                                                              \
+  }
+#else
+#define FALLBACK_PRINT(func_name)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ThreadSanitizer defines the following functions that can be used in MUST for dynamic annotations.
+void __attribute__((weak)) AnnotateHappensAfter(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateHappensBefore(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateNewMemory(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateMemoryRead(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateMemoryWrite(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateIgnoreReadsBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreReadsEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreWritesBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreSyncBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreSyncEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak))
+AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak))
+AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) __tsan_read_range(void* addr, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_write_range(void* addr, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) __tsan_read1_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write1_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read2_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write2_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read4_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write4_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read8_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write8_pc(void* addr, void* pc) {
+}
+
+#define annotateHelper(rw, s)                                            \
+  if (len >= s) {                                                        \
+    printf("annotateHelper(%s, %i, %p, %li)\n", #rw, s, addr, (uptr)pc); \
+    len -= s;                                                            \
+    __tsan_##rw##s##_pc(addr, pc);                                       \
+    addr += s;                                                           \
+    size -= s;                                                           \
+  }
+
+void __attribute__((weak)) __tsan_read_range_pc(void* a, uptr size, void* pc) {
+  char* addr = (char*)a;
+  uptr len   = ((uptr)addr) % 8;
+  if (size < len)
+    len = size;
+  annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) for (; size > 7; size -= 8) {
+    __tsan_read8_pc(addr, pc);
+    addr += 8;
+  }
+  len = size;
+  annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1)
+}
+
+void __attribute__((weak)) __tsan_write_range_pc(void* a, uptr size, void* pc) {
+  char* addr = (char*)a;
+  uptr len   = ((uptr)addr) % 8;
+  // printf("__tsan_write_range_pc(%p, %li, %li), %li\n", addr, size, pc, len);
+  if (size < len)
+    len = size;
+  annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) for (; size > 7; size -= 8) {
+    // printf("__tsan_write8_pc(%p, %li)\n", addr, pc);
+    __tsan_write8_pc(addr, pc);
+    addr += 8;
+  }
+  len = size;
+  annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1)
+}
+
+void __attribute__((weak)) __tsan_func_entry(void* call_pc) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_func_exit() {
+  FALLBACK_PRINT(__func__);
+}
+
+a8 __attribute__((weak)) __tsan_atomic8_load(const volatile a8* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a16 __attribute__((weak)) __tsan_atomic16_load(const volatile a16* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a32 __attribute__((weak)) __tsan_atomic32_load(const volatile a32* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a64 __attribute__((weak)) __tsan_atomic64_load(const volatile a64* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+void __attribute__((weak)) __tsan_atomic8_store(volatile a8* a, a8 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic16_store(volatile a16* a, a16 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic32_store(volatile a32* a, a32 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic64_store(volatile a64* a, a64 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+
+// TLC extension
+void __attribute__((weak)) AnnotateInitTLC(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+  AnnotateHappensBefore(file, line, cv);
+}
+void __attribute__((weak)) AnnotateStartTLC(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+  AnnotateHappensAfter(file, line, cv);
+}
+
+// Fibers
+void __attribute__((weak)) * __tsan_get_current_fiber() {
+  FALLBACK_PRINT(__func__);
+  return nullptr;
+}
+void __attribute__((weak)) * __tsan_create_fiber(unsigned flags) {
+  FALLBACK_PRINT(__func__);
+  return nullptr;
+}
+void __attribute__((weak)) __tsan_destroy_fiber(void* fiber) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_switch_to_fiber(void* fiber, unsigned flags) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_set_fiber_name(void* fiber, const char* name) {
+  FALLBACK_PRINT(__func__);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)
+#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)
+
+#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+#define TsanIgnoreReadsBegin() AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+#define TsanIgnoreReadsEnd() AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+#define TsanIgnoreSyncBegin() AnnotateIgnoreSyncBegin(__FILE__, __LINE__)
+#define TsanIgnoreSyncEnd() AnnotateIgnoreSyncEnd(__FILE__, __LINE__)
+
+#define TsanInitTLC(cv) AnnotateInitTLC(__FILE__, __LINE__, cv)
+#define TsanStartTLC(cv) AnnotateStartTLC(__FILE__, __LINE__, cv)
+
+#define TsanCreateFiber(flags) __tsan_create_fiber(flags)
+#define TsanDestroyFiber(fiber) __tsan_destroy_fiber(fiber)
+#define TsanSwitchToFiber(fiber, flags) __tsan_switch_to_fiber(fiber, flags)
+#define TsanGetCurrentFiber() __tsan_get_current_fiber()
+#define TsanSetFiberName(fiber, name) __tsan_set_fiber_name(fiber, name)
+
+#define TsanMemoryRead(addr, size) __tsan_read_range(addr, size)
+#define TsanMemoryWrite(addr, size) __tsan_write_range(addr, size)
+#define TsanMemoryReadPC(addr, size, pc) __tsan_read_range_pc(addr, size, pc)
+#define TsanMemoryWritePC(addr, size, pc) __tsan_write_range_pc(addr, size, pc)
+
+#define TsanFuncEntry(pc) __tsan_func_entry(pc)
+#define TsanFuncExit() __tsan_func_exit()
+
+#define TsanPCMemoryRead(pc, addr, size) AnnotatePCMemoryRead(pc, __FILE__, __LINE__, addr, size)
+#define TsanPCMemoryWrite(pc, addr, size) AnnotatePCMemoryWrite(pc, __FILE__, __LINE__, addr, size)
+
+#define TsanNewMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size)
+#define TsanFreeMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size)
+
+#define TsanRWLockCreate(cv) AnnotateRWLockCreate(__FILE__, __LINE__, cv)
+#define TsanRWLockDestroy(cv) AnnotateRWLockDestroy(__FILE__, __LINE__, cv)
+#define TsanRWLockAcquired(cv, is_w) AnnotateRWLockAcquired(__FILE__, __LINE__, cv, is_w)
+#define TsanRWLockReleased(cv, is_w) AnnotateRWLockReleased(__FILE__, __LINE__, cv, is_w)
+
+#define TsanAtomic8Load(a8, mo) __tsan_atomic8_load(a8, mo)
+#define TsanAtomic8Store(a8, v, mo) __tsan_atomic8_store(a8, v, mo)
+#define TsanAtomic16Load(a16, mo) __tsan_atomic16_load(a16, mo)
+#define TsanAtomic16Store(a16, v, mo) __tsan_atomic16_store(a16, v, mo)
+#define TsanAtomic32Load(a32, mo) __tsan_atomic32_load(a32, mo)
+#define TsanAtomic32Store(a32, v, mo) __tsan_atomic32_store(a32, v, mo)
+#define TsanAtomic64Load(a64, mo) __tsan_atomic64_load(a64, mo)
+#define TsanAtomic64Store(a64, v, mo) __tsan_atomic64_store(a64, v, mo)
+
+#endif /*TSAN_EXTERNAL_H*/
\ No newline at end of file
diff --git a/test/multi_gpu/suppressions.txt b/test/multi_gpu/suppressions.txt
new file mode 100644
index 0000000..3292e71
--- /dev/null
+++ b/test/multi_gpu/suppressions.txt
@@ -0,0 +1,29 @@
+called_from_lib:libmpi_cxx*
+called_from_lib:libmpi.so*
+called_from_lib:libcuda.so*
+called_from_lib:libtypeartRuntime.so*
+
+# Probably not required, from previous experiments
+#called_from_lib:libucx*
+called_from_lib:libucp*
+called_from_lib:libucs*
+#called_from_lib:libucm_cuda.so*
+called_from_lib:libucm.so*
+called_from_lib:libuct_ib.so.0
+called_from_lib:libuct.so.0
+called_from_lib:mca_pmix_pmix3x*
+called_from_lib:libibverbs.so*
+#called_from_lib:libevent_pthreads-2.1.so*
+called_from_lib:libevent_core-2.1.so*
+#called_from_lib:libhfi1verbs-rdmav34.so*
+called_from_lib:libopen-pal.so.40*
+#called_from_lib:mca_errmgr_default_app.so*
+#called_from_lib:mca_gds_hash.so*
+#called_from_lib:mca_ptl_tcp.so*
+#called_from_lib:mca_bfrops_v20.so*
+#called_from_lib:mca_psec_munge.so*
+#called_from_lib:mca_gds_ds21.so*
+#called_from_lib:libmca_common_dstore.so*
+called_from_lib:libopen-rte.so.40*
+called_from_lib:ld-linux-x86-64.so*
+called_from_lib:libmlx5-rdma*

From 7073b159c3ccc5488e49ad33966ac88b92dd9fef Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Fri, 27 Dec 2024 18:48:03 +0100
Subject: [PATCH 04/15] Added per device runtime statistics

---
 lib/runtime/CusanRuntime.cpp          | 78 ++++++++++++++-------------
 test/multi_gpu/02_device_sync_event.c |  1 -
 2 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index bca4839..5070131 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -82,7 +82,6 @@ class Device {
   // NOTE: assumed to be a ordered map so we can iterate in ascending order
   std::map<const void*, AllocationInfo> allocations_;
   std::map<Stream, TsanFiber> streams_;
-  std::map<Event, Stream> events_;
   TsanFiber cpu_fiber_;
   TsanFiber curr_fiber_;
 
@@ -90,7 +89,7 @@ class Device {
   static constexpr Stream kDefaultStream = Stream();
   Recorder stats_recorder;
 
-  Device() {
+  Device() : stats_recorder() {
     // every device has a default stream
     { register_stream(Device::kDefaultStream); }
     cpu_fiber_ = TsanGetCurrentFiber();
@@ -100,6 +99,10 @@ class Device {
     return allocations_;
   }
 
+  [[nodiscard]] TsanFiber get_stream_fiber(Stream stream) {
+    return streams_[stream];
+  }
+
   void happens_before() {
     LOG_TRACE("[cusan]    HappensBefore of curr fiber")
     TsanHappensBefore(curr_fiber_);
@@ -178,19 +181,6 @@ class Device {
     stats_recorder.inc_TsanHappensAfter();
   }
 
-  void record_event(Event event, Stream stream) {
-    LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
-    events_[event] = stream;
-  }
-
-  // Sync the event on the current stream
-  void sync_event(Event event) {
-    auto search_result = events_.find(event);
-    assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior");
-    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << events_[event].handle)
-    happens_after_stream(events_[event]);
-  }
-
   void insert_allocation(void* ptr, AllocationInfo info) {
     assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times");
     allocations_[ptr] = info;
@@ -224,7 +214,7 @@ class Device {
 #undef cusan_stat_handle
 #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name()));
 #if CUSAN_SOFTCOUNTER
-    Table table{"Cusan runtime statistics"};
+    Table table{"Cusan device statistics"};
 #ifdef CUSAN_FIBERPOOL
     table.put(Row::make("Fiberpool", 1));
 #else
@@ -247,6 +237,7 @@ class Device {
 
 class Runtime {
   std::map<int32_t, Device> devices_;
+  std::map<Event, std::pair<TsanFiber, Device*>> events_;
   int32_t current_device_;
   bool init_;
 
@@ -255,7 +246,7 @@ class Runtime {
     static Runtime run_t;
     if (!run_t.init_) {
       run_t.current_device_ = get_current_device_id();
-      run_t.devices_.insert({run_t.current_device_, {}});
+      run_t.devices_[run_t.current_device_];
       run_t.init_ = true;
     }
     return run_t;
@@ -271,26 +262,40 @@ class Runtime {
 
   Device& get_device(DeviceID id) {
     if (devices_.find(id) == devices_.end()) {
-      devices_.insert({id, {}});
+      devices_[id];
     }
     return devices_.at(id);
   }
 
   void set_device(DeviceID device) {
     if (devices_.find(device) == devices_.end()) {
-      devices_.insert({device, {}});
+      devices_[device];
     }
     current_device_ = device;
   }
 
+  void record_event(Event event, Stream stream) {
+    LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
+    auto& current_device = get_current_device();
+    auto search_result   = current_device.get_stream_fiber(stream);
+    events_.insert({event, {search_result, &current_device}});
+  }
+
+  // Sync the event on the current stream
+  void sync_event(Event event) {
+    auto [stream_fiber, device] = events_[event];
+    TsanHappensAfter(stream_fiber);
+    device->stats_recorder.inc_TsanHappensAfter();
+  }
+
  private:
   Runtime() = default;
 
   ~Runtime() {
 #if CUSAN_SOFTCOUNTER
-  for(auto& [_, device]: devices_){
-    device.output_statistics();
-  }
+    for (auto& [_, device] : devices_) {
+      device.output_statistics();
+    }
 #endif
 
 #ifdef CUSAN_FIBERPOOL
@@ -407,8 +412,9 @@ void _cusan_sync_device() {
 
 void _cusan_event_record(Event event, RawStream stream) {
   LOG_TRACE("[cusan]Event Record")
-  auto& runtime = Runtime::get().get_current_device();
-  runtime.stats_recorder.inc_event_record_calls();
+  auto& runtime = Runtime::get();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_event_record_calls();
   runtime.record_event(event, Stream(stream));
 }
 
@@ -428,8 +434,9 @@ void _cusan_sync_stream(RawStream raw_stream) {
 
 void _cusan_sync_event(Event event) {
   LOG_TRACE("[cusan]Sync Event" << event)
-  auto& runtime = Runtime::get().get_current_device();
-  runtime.stats_recorder.inc_sync_event_calls();
+  auto& runtime = Runtime::get();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_sync_event_calls();
   runtime.sync_event(event);
 }
 
@@ -449,12 +456,13 @@ void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) {
 
 void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) {
   LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event)
-  auto& runtime = Runtime::get().get_current_device();
-  runtime.stats_recorder.inc_stream_wait_event_calls();
-  runtime.switch_to_stream(Stream(stream));
+  auto& runtime = Runtime::get();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_stream_wait_event_calls();
+  device.switch_to_stream(Stream(stream));
   runtime.sync_event(event);
-  runtime.happens_before();
-  runtime.switch_to_cpu();
+  device.happens_before();
+  device.switch_to_cpu();
 }
 
 void _cusan_host_alloc(void** ptr, size_t size, unsigned int) {
@@ -516,23 +524,21 @@ void _cusan_device_free(void* ptr) {
   runtime.happens_after_all_streams();
 }
 
-// TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_stream_query(RawStream stream, unsigned int err) {
   LOG_TRACE("[cusan] Stream query " << stream << " -> " << err)
   auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_stream_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
-
     runtime.happens_after_stream(Stream{stream});
   }
 }
 
-// TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_event_query(Event event, unsigned int err) {
   LOG_TRACE("[cusan] Event query " << event << " -> " << err)
-  auto& runtime = Runtime::get().get_current_device();
-  runtime.stats_recorder.inc_event_query_calls();
+  auto& runtime = Runtime::get();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_event_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
     runtime.sync_event(event);
diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c
index 2317e29..000f05d 100644
--- a/test/multi_gpu/02_device_sync_event.c
+++ b/test/multi_gpu/02_device_sync_event.c
@@ -13,7 +13,6 @@
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-// XFAIL: *
 
 #include <cuda_runtime.h>
 #include <stdio.h>

From 8b8cea120a664354a47a70bff4aff4760ee955b6 Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Fri, 31 Jan 2025 16:47:48 +0100
Subject: [PATCH 05/15] Added additional statistic for device switches

---
 lib/runtime/CusanRuntime.cpp | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 5070131..60c4f14 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -95,6 +95,10 @@ class Device {
     cpu_fiber_ = TsanGetCurrentFiber();
   }
 
+  bool operator==(const Device& other) const {
+    return curr_fiber_ == other.curr_fiber_;
+  }
+
   [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() {
     return allocations_;
   }
@@ -240,7 +244,9 @@ class Runtime {
   std::map<Event, std::pair<TsanFiber, Device*>> events_;
   int32_t current_device_;
   bool init_;
-
+#if CUSAN_SOFTCOUNTER
+  softcounter::AtomicCounter device_switches = 0;
+#endif
  public:
   static Runtime& get() {
     static Runtime run_t;
@@ -256,6 +262,15 @@ class Runtime {
 
   void operator=(const Runtime&) = delete;
 
+#if CUSAN_SOFTCOUNTER
+  inline void inc_device_switches() {
+    this->device_switches++;
+  }
+  inline softcounter::Counter get_device_switches() {
+    return this->device_switches;
+  }
+#endif
+
   Device& get_current_device() {
     return devices_.at(current_device_);
   }
@@ -271,6 +286,11 @@ class Runtime {
     if (devices_.find(device) == devices_.end()) {
       devices_[device];
     }
+#if CUSAN_SOFTCOUNTER
+    if (current_device_ != device) {
+      inc_device_switches();
+    }
+#endif
     current_device_ = device;
   }
 
@@ -296,10 +316,11 @@ class Runtime {
     for (auto& [_, device] : devices_) {
       device.output_statistics();
     }
-#endif
 
-#ifdef CUSAN_FIBERPOOL
-    // TsanFiberPoolFini();
+    Table table{"Cusan runtime statistics"};
+    table.put(Row::make("Device Switches ", get_device_switches()));
+    table.print(std::cout);
+
 #endif
   }
 };
@@ -546,14 +567,16 @@ void _cusan_event_query(Event event, unsigned int err) {
 }
 
 void _cusan_set_device(DeviceID device) {
-  Runtime::get().set_device(device);
+  auto& r = Runtime::get();
+  r.set_device(device);
 }
 
 void _cusan_choose_device(DeviceID* device) {
   // does this function ever return a nullptr?
   // and what would that mean
   assert(device);
-  Runtime::get().set_device(*device);
+  auto& r = Runtime::get();
+  r.set_device(*device);
 }
 
 void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {

From 4c2def3d9c3ec59d24b4355f829ead34a88559e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Mon, 10 Mar 2025 13:25:14 +0100
Subject: [PATCH 06/15] Adapt tests for wrapper

---
 test/multi_gpu/01_device_sync.c            | 16 +++++-----------
 test/multi_gpu/02_device_sync_event.c      |  5 ++---
 test/runtime/11_struct_of_buff.c           |  2 +-
 test/runtime/20_cuda_default_stream_sync.c |  7 ++++---
 test/runtime/31_tsan_cuda_event.c          |  2 +-
 test/runtime/32_tsan_async_copy.c          | 16 +++++-----------
 test/runtime/33_tsan_wait_event.c          |  2 +-
 7 files changed, 19 insertions(+), 31 deletions(-)

diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c
index 319b6f4..cc0b7b8 100644
--- a/test/multi_gpu/01_device_sync.c
+++ b/test/multi_gpu/01_device_sync.c
@@ -1,8 +1,8 @@
 // clang-format off
-// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
 
-// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
 // clang-format on
@@ -13,8 +13,8 @@
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-#include <cstdio>
 #include <cuda_runtime.h>
+#include <stdio.h>
 
 __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
@@ -44,22 +44,18 @@ int main() {
     return 1;
   }
 
-
   cudaSetDevice(0);
   cudaMallocManaged(&managed_data, size * sizeof(int));
   cudaMemset(managed_data, 0, size * sizeof(int));
 
-
   cudaSetDevice(1);
   cudaMallocManaged(&managed_data2, size * sizeof(int));
   cudaMemset(managed_data2, 0, size * sizeof(int));
 
-
-
   cudaSetDevice(0);
   write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data, size, 1316134912);
 
-  //if we only have the later synchronize we will only syncrhonize the second device
+  // if we only have the later synchronize we will only syncrhonize the second device
 #ifdef CUSAN_SYNC
   cudaDeviceSynchronize();
 #endif
@@ -74,15 +70,13 @@ int main() {
       break;
     }
   }
-    for (int i = 0; i < size; i++) {
+  for (int i = 0; i < size; i++) {
     if (managed_data2[i] == 0) {
       printf("[Error] sync managed_data2 %i\n", managed_data[i]);
       break;
     }
   }
 
-
-
   cudaFree(managed_data);
   cudaFree(managed_data2);
   return 0;
diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c
index 000f05d..e0e5c23 100644
--- a/test/multi_gpu/02_device_sync_event.c
+++ b/test/multi_gpu/02_device_sync_event.c
@@ -1,8 +1,8 @@
 // clang-format off
-// RUN: %wrapper-cc %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
 
-// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
 // clang-format on
@@ -13,7 +13,6 @@
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-
 #include <cuda_runtime.h>
 #include <stdio.h>
 
diff --git a/test/runtime/11_struct_of_buff.c b/test/runtime/11_struct_of_buff.c
index eb594f1..121c3a7 100644
--- a/test/runtime/11_struct_of_buff.c
+++ b/test/runtime/11_struct_of_buff.c
@@ -10,7 +10,7 @@
 
 // CHECK-SYNC-NOT: data race
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 struct BufferStorage {
   int* buff1;
diff --git a/test/runtime/20_cuda_default_stream_sync.c b/test/runtime/20_cuda_default_stream_sync.c
index 78aedfc..c232483 100644
--- a/test/runtime/20_cuda_default_stream_sync.c
+++ b/test/runtime/20_cuda_default_stream_sync.c
@@ -1,8 +1,8 @@
 // clang-format off
-// RUN: %wrapper-cxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
 // clang-format on
@@ -13,8 +13,9 @@
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
+#include <stdio.h>
 #include <unistd.h>
 
 __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) {
diff --git a/test/runtime/31_tsan_cuda_event.c b/test/runtime/31_tsan_cuda_event.c
index beacaeb..aa213fa 100644
--- a/test/runtime/31_tsan_cuda_event.c
+++ b/test/runtime/31_tsan_cuda_event.c
@@ -11,7 +11,7 @@
 
 // CHECK-SYNC-NOT: data race
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 #include <unistd.h>
 
diff --git a/test/runtime/32_tsan_async_copy.c b/test/runtime/32_tsan_async_copy.c
index 5a6f7f7..14cfcd5 100644
--- a/test/runtime/32_tsan_async_copy.c
+++ b/test/runtime/32_tsan_async_copy.c
@@ -1,9 +1,9 @@
 // clang-format off
-// RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
-// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
-// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
 // CHECK-DAG: data race
 // CHECK-DAG: [Error] sync
@@ -13,8 +13,7 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
-
+#include <stdio.h>
 #include <unistd.h>
 
 __global__ void kernel(int* arr, const int N) {
@@ -32,11 +31,6 @@ __global__ void kernel(int* arr, const int N) {
 }
 
 int main(int argc, char* argv[]) {
-  if (!has_gpu_aware_mpi()) {
-    printf("This example is designed for CUDA-aware MPI. Exiting.\n");
-    return 1;
-  }
-
   cudaEvent_t first_finished_event;
   cudaEventCreate(&first_finished_event);
   cudaStream_t stream1;
diff --git a/test/runtime/33_tsan_wait_event.c b/test/runtime/33_tsan_wait_event.c
index c675758..af2a5a1 100644
--- a/test/runtime/33_tsan_wait_event.c
+++ b/test/runtime/33_tsan_wait_event.c
@@ -11,7 +11,7 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 #include <unistd.h>
 

From 335b4f1f2db593b69779f33a09b25294a981ce69 Mon Sep 17 00:00:00 2001
From: ahueck <hueck.alexander@gmail.com>
Date: Mon, 10 Mar 2025 14:26:09 +0100
Subject: [PATCH 07/15] Added callback for cuda synchronizations events (#5)

Co-authored-by: Tim Ziegler <timziegler1604@gmail.com>
---
 .github/workflows/ci.yml                  | 32 +++++--------
 CMakeLists.txt                            |  2 +
 lib/pass/AnalysisTransform.cpp            | 36 +++++++++++++++
 lib/pass/AnalysisTransform.h              | 19 ++++++++
 lib/pass/CusanPass.cpp                    |  7 ++-
 lib/pass/FunctionDecl.cpp                 |  5 ++
 lib/pass/FunctionDecl.h                   |  2 +-
 lib/runtime/CusanRuntime.cpp              | 25 +++++-----
 lib/runtime/CusanRuntime.h                | 56 ++++++++++++++---------
 lib/runtime/CusanRuntime_cudaSpecific.cpp |  6 +--
 lib/support/Util.h                        |  2 +-
 test/lit.cfg                              |  4 +-
 test/multi_gpu/01_device_sync.c           |  2 +-
 test/multi_gpu/TSan_External.h            |  8 ++--
 test/pass/TSan_External.h                 |  8 ++--
 test/runtime/TSan_External.h              |  8 ++--
 16 files changed, 146 insertions(+), 76 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d121748..014567f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,25 +10,6 @@ env:
   CC: clang
 
 jobs:
-  format-check:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Format source code
-        run: |
-          find lib test \
-            -type f \
-            -a \( -name "*.c" -o -name "*.cpp" -o -name "*.h" \) \
-            -print0 \
-            | xargs -0 clang-format-14 -i
-
-      - name: Format check
-        run: |
-          git status --porcelain --untracked-files=no
-          git status --porcelain --untracked-files=no | xargs -o -I {} test -z \"{}\"
-
   codespell:
     runs-on: ubuntu-22.04
 
@@ -44,19 +25,30 @@ jobs:
           - llvm-version: 14
             os: ubuntu-22.04
             preset: release
+            cuda: 11.8.0
+          - llvm-version: 19
+            os: ubuntu-24.04
+            preset: release
+            cuda: 12.6.0
 
     runs-on: ${{ matrix.os }}
 
     steps:
       - uses: actions/checkout@v4
 
+      - name: LLVM apt
+        if: ${{ matrix.llvm-version == 19 }}
+        run: |
+          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+          echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main" | sudo tee /etc/apt/sources.list.d/llvm-19.list
+
       - name: Update apt
         run: sudo apt-get update
 
       - uses: Jimver/cuda-toolkit@v0.2.19
         id: cuda-toolkit
         with:
-          cuda: '11.8.0'
+          cuda: '${{ matrix.cuda }}'
           method: network
           sub-packages: '["nvcc", "cudart", "cudart-dev"]'
           non-cuda-sub-packages: '["libcurand", "libcurand-dev"]'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 191bca8..58f070b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,8 @@ project(
 )
 
 set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 list(
   APPEND
diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp
index b6864e2..001e3f5 100644
--- a/lib/pass/AnalysisTransform.cpp
+++ b/lib/pass/AnalysisTransform.cpp
@@ -656,4 +656,40 @@ llvm::SmallVector<Value*, 1> CudaEventQuery::map_return_value(IRBuilder<>& irb,
   return {result};
 }
 
+CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaStreamSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( void* stream)
+  assert(args.size() == 1);
+  return {irb.getInt8(1)};
+}
+llvm::SmallVector<Value*, 1> CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+  return {result};
+}
+
+CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaEventSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( void* event)
+  assert(args.size() == 1);
+  return {irb.getInt8(2)};
+}
+llvm::SmallVector<Value*, 1> CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+  return {result};
+}
+
+CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaDeviceSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( )
+  assert(args.size() == 0);
+  return {irb.getInt8(0)};
+}
+llvm::SmallVector<Value*, 1> CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+  return {result};
+}
+
 }  // namespace cusan::transform
diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h
index 453f359..55e3f1b 100644
--- a/lib/pass/AnalysisTransform.h
+++ b/lib/pass/AnalysisTransform.h
@@ -229,6 +229,25 @@ class CudaEventQuery : public SimpleInstrumenter<CudaEventQuery> {
   static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
 };
 
+class CudaStreamSyncCallback : public SimpleInstrumenter<CudaStreamSyncCallback> {
+ public:
+  CudaStreamSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+class CudaEventSyncCallback : public SimpleInstrumenter<CudaEventSyncCallback> {
+ public:
+  CudaEventSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+class CudaDeviceSyncCallback : public SimpleInstrumenter<CudaDeviceSyncCallback> {
+ public:
+  CudaDeviceSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+
 }  // namespace transform
 }  // namespace cusan
 
diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp
index ee0f0d8..aebe65f 100644
--- a/lib/pass/CusanPass.cpp
+++ b/lib/pass/CusanPass.cpp
@@ -60,7 +60,7 @@ class LegacyCusanPass : public llvm::ModulePass {
  public:
   static char ID;  // NOLINT
 
-  LegacyCusanPass() : ModulePass(ID){};
+  LegacyCusanPass() : ModulePass(ID) {};
 
   bool runOnModule(llvm::Module& module) override;
 
@@ -183,6 +183,11 @@ bool CusanPass::runOnFunc(llvm::Function& function) {
   modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function);
   modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function);
 
+  // callbacks
+  modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function);
+  modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function);
+  modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function);
+
   auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_);
   if (data_for_host) {
     LOG_FATAL("Found kernel data for " << util::try_demangle_fully(function) << ": "
diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp
index 91e029d..9e8477f 100644
--- a/lib/pass/FunctionDecl.cpp
+++ b/lib/pass/FunctionDecl.cpp
@@ -1,4 +1,5 @@
 #include "FunctionDecl.h"
+
 #include <cstdint>
 
 namespace cusan::callback {
@@ -133,6 +134,10 @@ void FunctionDecl::initialize(llvm::Module& module) {
   //  void* device
   ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()};
   make_function(cusan_choose_device, arg_types_choose_device);
+
+  //  u8 evenType, u32 returnValue
+  ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), Type::getInt32Ty(c)};
+  make_function(cusan_sync_callback, arg_types_sync_callback);
 }
 
 }  // namespace cusan::callback
diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h
index 91eabe5..f6457c1 100644
--- a/lib/pass/FunctionDecl.h
+++ b/lib/pass/FunctionDecl.h
@@ -46,7 +46,7 @@ struct FunctionDecl {
   CusanFunction cusan_device_free{"_cusan_device_free"};
   CusanFunction cusan_stream_query{"_cusan_stream_query"};
   CusanFunction cusan_event_query{"_cusan_event_query"};
-
+  CusanFunction cusan_sync_callback{"cusan_sync_callback"};
   void initialize(llvm::Module& m);
 };
 
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 6fd6358..be31ff1 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -91,9 +91,7 @@ class Device {
 
   Device() : stats_recorder() {
     // every device has a default stream
-    {
-      register_stream(Device::kDefaultStream);
-    }
+    { register_stream(Device::kDefaultStream); }
     cpu_fiber_ = TsanGetCurrentFiber();
   }
 
@@ -469,7 +467,7 @@ void _cusan_create_event(Event*) {
   runtime.stats_recorder.inc_create_event_calls();
 }
 
-void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) {
+void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags) {
   LOG_TRACE("[cusan]create stream with flags: " << flags
                                                 << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking))
   auto& runtime = Runtime::get().get_current_device();
@@ -620,10 +618,10 @@ void _cusan_memset_impl(void* target, size_t count) {
   // r.happens_after_stream(Runtime::default_stream));
 }
 
-void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind) {
+void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind) {
   _cusan_memset_impl(target, pitch * height);
 }
-void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind, RawStream stream) {
+void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind, RawStream stream) {
   _cusan_memset_async_impl(target, pitch * height, stream);
 }
 
@@ -638,7 +636,7 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) {
 }
 
 void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size,
-                              cusan_MemcpyKind kind, RawStream stream) {
+                              cusan_memcpy_kind kind, RawStream stream) {
   auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memcpy_async_calls();
   if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) {
@@ -670,7 +668,7 @@ void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from,
   }
 }
 
-void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind) {
+void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_memcpy_kind kind) {
   // TODO verify that the memcpy2d beheaviour is actually the same as normal memcpy
 
   if (kind == cusan_MemcpyDefault) {
@@ -751,7 +749,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
 }
 
 void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                            cusan_MemcpyKind kind, RawStream stream) {
+                            cusan_memcpy_kind kind, RawStream stream) {
   LOG_TRACE("[cusan]MemcpyAsync" << width * height << " bytes to:" << target)
 
   size_t read_size  = spitch * height;
@@ -759,20 +757,23 @@ void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_
   _cusan_memcpy_async_impl(target, write_size, from, read_size, kind, stream);
 }
 
-void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) {
+void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream) {
   LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target)
   _cusan_memcpy_async_impl(target, count, from, count, kind, stream);
 }
 
 void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                      cusan_MemcpyKind kind) {
+                      cusan_memcpy_kind kind) {
   LOG_TRACE("[cusan]Memcpy2d " << width * height << " from:" << from << " to:" << target);
   size_t read_size  = spitch * height;
   size_t write_size = dpitch * height;
   _cusan_memcpy_impl(target, write_size, from, read_size, kind);
 }
 
-void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) {
+void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind kind) {
   LOG_TRACE("[cusan]Memcpy " << count << " from:" << from << " to:" << target);
   _cusan_memcpy_impl(target, count, from, count, kind);
 }
+
+void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/) {
+}
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index 784f805..0b5fb60 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -19,12 +19,12 @@ typedef enum cusan_memcpy_kind_t : unsigned int {
   cusan_MemcpyDeviceToHost   = 2,
   cusan_MemcpyDeviceToDevice = 3,
   cusan_MemcpyDefault        = 4,
-} cusan_MemcpyKind;
+} cusan_memcpy_kind;
 
 typedef enum cusan_stream_create_flags_t : unsigned int {
   cusan_StreamFlagsDefault     = 0,
   cusan_StreamFlagsNonBlocking = 1,
-} cusan_StreamCreateFlags;
+} cusan_stream_create_flags;
 #ifdef __cplusplus
 }
 #endif
@@ -34,14 +34,14 @@ namespace cusan::runtime {
 using TsanFiber = void*;
 using Event     = const void*;
 using RawStream = const void*;
-using DeviceID = int;
-cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from);
+using DeviceID  = int;
+cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from);
 DeviceID get_current_device_id();
 }  // namespace cusan::runtime
+using cusan::runtime::DeviceID;
 using cusan::runtime::Event;
 using cusan::runtime::RawStream;
 using cusan::runtime::TsanFiber;
-using cusan::runtime::DeviceID;
 #else
 #define TsanFiber void*
 #define Event const void*
@@ -55,26 +55,35 @@ extern "C" {
 #endif
 
 void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream);
+
 void _cusan_sync_device();
-void _cusan_event_record(Event event, RawStream stream);
+void _cusan_set_device(DeviceID device);
+void _cusan_choose_device(DeviceID* device);
+
 void _cusan_sync_stream(RawStream stream);
+void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags);
+void _cusan_stream_query(RawStream stream, unsigned int err);
+
 void _cusan_sync_event(Event event);
-void _cusan_stream_event(Event event);
+void _cusan_event_record(Event event, RawStream stream);
 void _cusan_create_event(RawStream* event);
-void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags);
-void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream);
-void _cusan_memset_async(void* target, size_t count, RawStream stream);
-void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind);
+void _cusan_event_query(Event event, unsigned int err);
+void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+
+void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind);
+void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream);
 void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                      cusan_MemcpyKind);
+                      cusan_memcpy_kind);
 void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                            cusan_MemcpyKind, RawStream stream);
-void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind);
-void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind,
-                            RawStream stream);
+                            cusan_memcpy_kind, RawStream stream);
+
 void _cusan_memset(void* target, size_t count);
-void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
-void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+void _cusan_memset_async(void* target, size_t count, RawStream stream);
+void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind);
+void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind,
+                            RawStream stream);
+
 void _cusan_host_alloc(void** ptr, size_t size, unsigned int flags);
 void _cusan_host_free(void* ptr);
 void _cusan_managed_alloc(void** ptr, size_t size, unsigned int flags);
@@ -83,11 +92,14 @@ void _cusan_host_register(void* ptr, size_t size, unsigned int flags);
 void _cusan_host_unregister(void* ptr);
 void _cusan_device_alloc(void** ptr, size_t size);
 void _cusan_device_free(void* ptr);
-void _cusan_stream_query(RawStream stream, unsigned int err);
-void _cusan_event_query(Event event, unsigned int err);
 
-void _cusan_set_device(DeviceID device);
-void _cusan_choose_device(DeviceID* device);
+typedef enum cusan_sync_type_t : unsigned char {
+  cusan_Device = 0,
+  cusan_Stream = 1,
+  cusan_Event  = 2,
+} cusan_sync_type;
+
+void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/);
 
 #ifdef __cplusplus
 }
diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp
index fbe7d26..970401d 100644
--- a/lib/runtime/CusanRuntime_cudaSpecific.cpp
+++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp
@@ -11,15 +11,13 @@
 
 namespace cusan::runtime {
 
-
-
-DeviceID get_current_device_id(){
+DeviceID get_current_device_id() {
   DeviceID res;
   cudaGetDevice(&res);
   return res;
 }
 
-cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) {
+cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
   assert(prop.unifiedAddressing && "Can only use default direction for memcpy when Unified memory is supported.");
diff --git a/lib/support/Util.h b/lib/support/Util.h
index 54014a9..081c6fc 100644
--- a/lib/support/Util.h
+++ b/lib/support/Util.h
@@ -34,7 +34,7 @@ inline std::string demangle(String&& s) {
 #if LLVM_VERSION_MAJOR >= 15
   auto demangle = llvm::itaniumDemangle(name.data(), false);
 #else
-  auto* demangle      = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr);
+  auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr);
 #endif
   if (demangle && !std::string(demangle).empty()) {
     return {demangle};
diff --git a/test/lit.cfg b/test/lit.cfg
index 9836a64..8146092 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -46,7 +46,7 @@ if config.cusan_typeart:
 config.substitutions.append(("%clang-cpp", clang_cpp))
 config.substitutions.append(("%clang-cc", clang_cc))
 config.substitutions.append(("%opt", opt))
-config.substitutions.append(("%filecheck", filecheck + " --dump-input fail"))
+config.substitutions.append(("%filecheck", filecheck + " --dump-input fail --dump-input-context=40"))
 
 config.substitutions.append(("%lib_dir", cusan_lib_dir))
 config.substitutions.append(("%pass_dir", cusan_pass_dir))
@@ -77,4 +77,4 @@ config.substitutions.append(('%clang_args', '-D__STRICT_ANSI__ -fPIC -O1 -g0'))
 
 config.substitutions.append(('%rm-file', 'rm -f '))
 
-config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -O1 -c'))
+config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -std=c++17 -O1 -c'))
diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c
index cc0b7b8..d069a70 100644
--- a/test/multi_gpu/01_device_sync.c
+++ b/test/multi_gpu/01_device_sync.c
@@ -55,7 +55,7 @@ int main() {
   cudaSetDevice(0);
   write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data, size, 1316134912);
 
-  // if we only have the later synchronize we will only syncrhonize the second device
+  // if we only have the later synchronize we will only synchronize the second device
 #ifdef CUSAN_SYNC
   cudaDeviceSynchronize();
 #endif
diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h
index 36849c9..c7b5796 100644
--- a/test/multi_gpu/TSan_External.h
+++ b/test/multi_gpu/TSan_External.h
@@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons
 void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
 
diff --git a/test/pass/TSan_External.h b/test/pass/TSan_External.h
index 574f0db..c543fdc 100644
--- a/test/pass/TSan_External.h
+++ b/test/pass/TSan_External.h
@@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons
 void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
 
diff --git a/test/runtime/TSan_External.h b/test/runtime/TSan_External.h
index 36849c9..c7b5796 100644
--- a/test/runtime/TSan_External.h
+++ b/test/runtime/TSan_External.h
@@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons
 void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
 

From 2cd2b54299227d06badd087209dee30b97b4d00f Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Wed, 12 Mar 2025 10:10:26 +0100
Subject: [PATCH 08/15] Put mpiexec tests behind a requires

---
 test/runtime/03_cuda_to_mpi.c                        |  2 ++
 test/runtime/04_mpi_to_cuda.c                        |  2 ++
 test/runtime/05_cuda_to_mpi_stream.c                 |  2 ++
 test/runtime/06_cuda_to_mpi_event.c                  |  2 ++
 test/runtime/07_cuda_to_mpi_read.c                   |  2 ++
 test/runtime/08_cudamemcpy_to_mpi.c                  |  2 ++
 test/runtime/11_cuda_to_mpi_struct_of_buff.c         |  2 ++
 test/runtime/12_struct_ptr.c                         |  2 ++
 test/runtime/18_cuda_to_mpi_event_query_busy_loop.c  |  2 ++
 ...da_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c |  2 ++
 test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c       |  3 +++
 test/runtime/21_chunked_streams_example.c            |  2 ++
 test/runtime/22_cuda_to_mpi_partial_buff_write.c     |  2 ++
 test/runtime/26_malloc_pitch.c                       |  2 ++
 test/runtime/29_tsan_cuda_to_mpi.c                   |  2 ++
 test/runtime/30_tsan_annotate_cuda_to_mpi.c          |  3 +++
 test/runtime/lit.local.cfg                           | 12 ++++++++++++
 17 files changed, 46 insertions(+)

diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c
index bbb61e6..d7f11b6 100644
--- a/test/runtime/03_cuda_to_mpi.c
+++ b/test/runtime/03_cuda_to_mpi.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c
index 3b54714..370f48f 100644
--- a/test/runtime/04_mpi_to_cuda.c
+++ b/test/runtime/04_mpi_to_cuda.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c
index e07b35c..af827b9 100644
--- a/test/runtime/05_cuda_to_mpi_stream.c
+++ b/test/runtime/05_cuda_to_mpi_stream.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c
index b44e4a3..0585479 100644
--- a/test/runtime/06_cuda_to_mpi_event.c
+++ b/test/runtime/06_cuda_to_mpi_event.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c
index 8d36e6b..e367e2a 100644
--- a/test/runtime/07_cuda_to_mpi_read.c
+++ b/test/runtime/07_cuda_to_mpi_read.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c
index 18c6671..b8fa5bf 100644
--- a/test/runtime/08_cudamemcpy_to_mpi.c
+++ b/test/runtime/08_cudamemcpy_to_mpi.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
index ea875dc..4a3141d 100644
--- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c
+++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c
index 3792d13..5a71312 100644
--- a/test/runtime/12_struct_ptr.c
+++ b/test/runtime/12_struct_ptr.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // CHECK-DAG: data race
 
 // CHECK-SYNC-NOT: data race
diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
index cf613ca..73674a7 100644
--- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
+++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
index c5b221e..61eee40 100644
--- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
+++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
index 5d72ad7..820912b 100644
--- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
+++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
@@ -4,6 +4,9 @@
 
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c
index 90b21e0..fa4f5d0 100644
--- a/test/runtime/21_chunked_streams_example.c
+++ b/test/runtime/21_chunked_streams_example.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
index 1c8ffcd..8a29dd9 100644
--- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c
+++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c
index b2b12da..3d7524f 100644
--- a/test/runtime/26_malloc_pitch.c
+++ b/test/runtime/26_malloc_pitch.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c
index a39b5a0..14ebfa1 100644
--- a/test/runtime/29_tsan_cuda_to_mpi.c
+++ b/test/runtime/29_tsan_cuda_to_mpi.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK: [Error] sync
diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
index 13246ef..24e59ce 100644
--- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c
+++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
@@ -4,6 +4,9 @@
 
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: mpiexec
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/lit.local.cfg b/test/runtime/lit.local.cfg
index 3b823ec..f681f81 100644
--- a/test/runtime/lit.local.cfg
+++ b/test/runtime/lit.local.cfg
@@ -1,4 +1,16 @@
+import subprocess
+
 if config.cusan_use_workarounds:
     config.environment['NEOReadDebugKeys'] = '1'
     config.environment['DisableDeepBind'] = '1'
     config.environment['OMPI_MCA_memory'] = '^patcher'
+
+def has_mpirun():
+    try:
+        subprocess.check_call(['mpirun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+config.available_features.add('openmpi' if has_mpirun() else '')
\ No newline at end of file

From c791956bb1ef7feaa45fb22cf93ebd3851f1d65e Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Wed, 12 Mar 2025 10:17:56 +0100
Subject: [PATCH 09/15] Put multi gpu tests behind a requires

---
 test/multi_gpu/01_device_sync.c       |  2 ++
 test/multi_gpu/02_device_sync_event.c |  2 ++
 test/multi_gpu/lit.local.cfg          | 16 ++++++++++++++++
 3 files changed, 20 insertions(+)
 create mode 100644 test/multi_gpu/lit.local.cfg

diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c
index d069a70..52747c3 100644
--- a/test/multi_gpu/01_device_sync.c
+++ b/test/multi_gpu/01_device_sync.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: multigpu
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c
index e0e5c23..a72a7fe 100644
--- a/test/multi_gpu/02_device_sync_event.c
+++ b/test/multi_gpu/02_device_sync_event.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: multigpu
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/multi_gpu/lit.local.cfg b/test/multi_gpu/lit.local.cfg
new file mode 100644
index 0000000..0f360f0
--- /dev/null
+++ b/test/multi_gpu/lit.local.cfg
@@ -0,0 +1,16 @@
+import subprocess
+
+if config.cusan_use_workarounds:
+    config.environment['NEOReadDebugKeys'] = '1'
+    config.environment['DisableDeepBind'] = '1'
+    config.environment['OMPI_MCA_memory'] = '^patcher'
+
+def has_multigpu():
+    try:
+        subprocess.check_call(['nvidia-smi', '-i=1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+config.available_features.add('multigpu' if has_multigpu() else '')
\ No newline at end of file

From 96b619974a29cf5557d39c5dfbe502028e2f4df8 Mon Sep 17 00:00:00 2001
From: Tim Ziegler <timziegler1604@gmail.com>
Date: Thu, 13 Mar 2025 10:49:55 +0100
Subject: [PATCH 10/15] Added stream/event to sync callback

---
 lib/pass/AnalysisTransform.cpp |  6 +++---
 lib/pass/FunctionDecl.cpp      |  2 +-
 lib/runtime/CusanRuntime.cpp   | 13 ++++++++++++-
 lib/runtime/CusanRuntime.h     |  8 ++++----
 4 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp
index 001e3f5..e84c7eb 100644
--- a/lib/pass/AnalysisTransform.cpp
+++ b/lib/pass/AnalysisTransform.cpp
@@ -662,7 +662,7 @@ CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) {
 llvm::SmallVector<Value*> CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
   //( void* stream)
   assert(args.size() == 1);
-  return {irb.getInt8(1)};
+  return {irb.getInt8(1), args[0]};
 }
 llvm::SmallVector<Value*, 1> CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
   return {result};
@@ -674,7 +674,7 @@ CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) {
 llvm::SmallVector<Value*> CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
   //( void* event)
   assert(args.size() == 1);
-  return {irb.getInt8(2)};
+  return {irb.getInt8(2), args[0]};
 }
 llvm::SmallVector<Value*, 1> CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
   return {result};
@@ -686,7 +686,7 @@ CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) {
 llvm::SmallVector<Value*> CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
   //( )
   assert(args.size() == 0);
-  return {irb.getInt8(0)};
+  return {irb.getInt8(0), ConstantPointerNull::get(irb.getPtrTy())};
 }
 llvm::SmallVector<Value*, 1> CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
   return {result};
diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp
index 9e8477f..cefd273 100644
--- a/lib/pass/FunctionDecl.cpp
+++ b/lib/pass/FunctionDecl.cpp
@@ -136,7 +136,7 @@ void FunctionDecl::initialize(llvm::Module& module) {
   make_function(cusan_choose_device, arg_types_choose_device);
 
   //  u8 evenType, u32 returnValue
-  ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), Type::getInt32Ty(c)};
+  ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), void_ptr, Type::getInt32Ty(c)};
   make_function(cusan_sync_callback, arg_types_sync_callback);
 }
 
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index be31ff1..0ec5213 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -775,5 +775,16 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki
   _cusan_memcpy_impl(target, count, from, count, kind);
 }
 
-void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/) {
+void cusan_sync_callback(cusan_sync_type type, const void* event_or_stream, unsigned int return_value) {
+  //switch (type) {
+  //  case cusan_Device:
+  //    printf("Device sync return value %i\n", return_value);
+  //    break;
+  //  case cusan_Stream:
+  //    printf("Stream %#x sync return value %i\n", event_or_stream, return_value);
+  //    break;
+  //  case cusan_Event:
+  //    printf("Event %#x sync return value %i\n", event_or_stream, return_value);
+  //    break;
+  //}
 }
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index 0b5fb60..048e566 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -94,12 +94,12 @@ void _cusan_device_alloc(void** ptr, size_t size);
 void _cusan_device_free(void* ptr);
 
 typedef enum cusan_sync_type_t : unsigned char {
-  cusan_Device = 0,
-  cusan_Stream = 1,
-  cusan_Event  = 2,
+  cusan_Device = 0, //second argument is a nullptr
+  cusan_Stream = 1, //second argument is the stream pointer
+  cusan_Event  = 2, //second argument is the event pointer
 } cusan_sync_type;
 
-void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/);
+void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event or stream*/, unsigned int /*return_value*/);
 
 #ifdef __cplusplus
 }

From 7b517c3ea123a0022bea70fde5135ba6244a7db5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Sun, 20 Apr 2025 13:43:32 +0200
Subject: [PATCH 11/15] Fix LLVM-14 compile error, fix some warnings

---
 cmake/cusanToolchain.cmake       |  2 ++
 cmake/modules/cusan-format.cmake |  7 ++---
 externals/CMakeLists.txt         |  6 +++++
 lib/pass/AnalysisTransform.cpp   | 13 ++++++---
 lib/pass/CMakeLists.txt          |  1 +
 lib/pass/CusanPass.cpp           |  4 ++-
 lib/runtime/CusanRuntime.cpp     | 45 +++++++++++++++++---------------
 lib/runtime/StatsCounter.h       |  4 +--
 lib/runtime/TSanInterface.h      |  3 +++
 9 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake
index 9e98131..449a0ff 100644
--- a/cmake/cusanToolchain.cmake
+++ b/cmake/cusanToolchain.cmake
@@ -41,6 +41,8 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers
 option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF)
 option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON)
 
+option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" ON)
+
 option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON)
 mark_as_advanced(CUSAN_TEST_WORKAROUNDS)
 
diff --git a/cmake/modules/cusan-format.cmake b/cmake/modules/cusan-format.cmake
index bacdf5d..96c393d 100644
--- a/cmake/modules/cusan-format.cmake
+++ b/cmake/modules/cusan-format.cmake
@@ -20,11 +20,12 @@ function(cusan_target_format target comment)
     filter_dir(${exclude})
   endforeach()
 
-  find_program(FORMAT_COMMAND
+  find_program(CLANG_FORMAT_COMMAND
                NAMES clang-format-${LLVM_VERSION_MAJOR} clang-format)
-  if(FORMAT_COMMAND)
+  mark_as_advanced(CLANG_FORMAT_COMMAND)
+  if(CLANG_FORMAT_COMMAND)
     add_custom_target(${target}
-      COMMAND ${FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS}
+      COMMAND ${CLANG_FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS}
               ${ALL_CXX_FILES}
       WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
       COMMENT "${comment}"
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 863ccc1..4dc9ddf 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -42,3 +42,9 @@ if(CUSAN_TYPEART)
     )
     FetchContent_MakeAvailable(typeart)
 endif()
+
+mark_as_advanced(
+    FETCHCONTENT_BASE_DIR
+    FETCHCONTENT_FULLY_DISCONNECTED
+    FETCHCONTENT_QUIET
+    FETCHCONTENT_UPDATES_DISCONNECTED)
\ No newline at end of file
diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp
index e84c7eb..6da40e1 100644
--- a/lib/pass/AnalysisTransform.cpp
+++ b/lib/pass/AnalysisTransform.cpp
@@ -664,7 +664,7 @@ llvm::SmallVector<Value*> CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb
   assert(args.size() == 1);
   return {irb.getInt8(1), args[0]};
 }
-llvm::SmallVector<Value*, 1> CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+llvm::SmallVector<Value*, 1> CudaStreamSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
   return {result};
 }
 
@@ -676,7 +676,7 @@ llvm::SmallVector<Value*> CudaEventSyncCallback::map_arguments(IRBuilder<>& irb,
   assert(args.size() == 1);
   return {irb.getInt8(2), args[0]};
 }
-llvm::SmallVector<Value*, 1> CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+llvm::SmallVector<Value*, 1> CudaEventSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
   return {result};
 }
 
@@ -686,9 +686,14 @@ CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) {
 llvm::SmallVector<Value*> CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
   //( )
   assert(args.size() == 0);
-  return {irb.getInt8(0), ConstantPointerNull::get(irb.getPtrTy())};
+#if LLVM_VERSION_MAJOR < 15
+  auto* ptr_type = PointerType::getUnqual(irb.getContext());
+#else
+  auto* ptr_type = irb.getPtrTy();
+#endif
+  return {irb.getInt8(0), ConstantPointerNull::get(ptr_type)};
 }
-llvm::SmallVector<Value*, 1> CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) {
+llvm::SmallVector<Value*, 1> CudaDeviceSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
   return {result};
 }
 
diff --git a/lib/pass/CMakeLists.txt b/lib/pass/CMakeLists.txt
index f52baa5..8fbefd9 100644
--- a/lib/pass/CMakeLists.txt
+++ b/lib/pass/CMakeLists.txt
@@ -14,6 +14,7 @@ mark_as_advanced(LLVM_CUSAN_TRANSFORMPASS_LINK_INTO_TOOLS)
 target_compile_definitions(cusan_TransformPass
   PRIVATE
   $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
+  $<$<BOOL:${CUSAN_DEVICE_SYNC_CALLBACKS}>:CUSAN_DEVICE_SYNC_CALLBACKS=1>
 )
 
 set_target_properties(
diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp
index aebe65f..48cbf50 100644
--- a/lib/pass/CusanPass.cpp
+++ b/lib/pass/CusanPass.cpp
@@ -60,7 +60,7 @@ class LegacyCusanPass : public llvm::ModulePass {
  public:
   static char ID;  // NOLINT
 
-  LegacyCusanPass() : ModulePass(ID) {};
+  LegacyCusanPass() : ModulePass(ID){};
 
   bool runOnModule(llvm::Module& module) override;
 
@@ -184,9 +184,11 @@ bool CusanPass::runOnFunc(llvm::Function& function) {
   modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function);
 
   // callbacks
+#ifdef CUSAN_DEVICE_SYNC_CALLBACKS
   modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function);
   modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function);
   modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function);
+#endif
 
   auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_);
   if (data_for_host) {
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 0ec5213..da2488c 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -12,7 +12,7 @@
 #include "analysis/KernelModel.h"
 #include "support/Logger.h"
 #include "StatsCounter.h"
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
 #include "support/Table.h"
 #endif
 #include "TSanInterface.h"
@@ -217,7 +217,7 @@ class Device {
   void output_statistics() {
 #undef cusan_stat_handle
 #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name()));
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
     Table table{"Cusan device statistics"};
 #ifdef CUSAN_FIBERPOOL
     table.put(Row::make("Fiberpool", 1));
@@ -244,7 +244,7 @@ class Runtime {
   std::map<Event, std::pair<TsanFiber, Device*>> events_;
   int32_t current_device_;
   bool init_;
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
   softcounter::AtomicCounter device_switches = 0;
 #endif
  public:
@@ -262,7 +262,7 @@ class Runtime {
 
   void operator=(const Runtime&) = delete;
 
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
   inline void inc_device_switches() {
     this->device_switches++;
   }
@@ -286,7 +286,7 @@ class Runtime {
     if (devices_.find(device) == devices_.end()) {
       devices_[device];
     }
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
     if (current_device_ != device) {
       inc_device_switches();
     }
@@ -312,7 +312,7 @@ class Runtime {
   Runtime() = default;
 
   ~Runtime() {
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
     for (auto& [_, device] : devices_) {
       device.output_statistics();
     }
@@ -595,7 +595,8 @@ void _cusan_memset_impl(void* target, size_t count) {
   auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memset_calls();
   runtime.switch_to_stream(Device::kDefaultStream);
-  LOG_TRACE("[cusan]    " << "Write to " << target << " with size: " << count)
+  LOG_TRACE("[cusan]    "
+            << "Write to " << target << " with size: " << count)
   TsanMemoryWritePC(target, count, __builtin_return_address(0));
   runtime.stats_recorder.inc_TsanMemoryWrite();
   runtime.happens_before();
@@ -604,10 +605,12 @@ void _cusan_memset_impl(void* target, size_t count) {
   auto* alloc_info = runtime.get_allocation_info(target);
   // if we couldn't find alloc info we just assume the worst and don't sync
   if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) {
-    LOG_TRACE("[cusan]    " << "Memset is blocking")
+    LOG_TRACE("[cusan]    "
+              << "Memset is blocking")
     runtime.happens_after_stream(Device::kDefaultStream);
   } else {
-    LOG_TRACE("[cusan]    " << "Memset is not blocking")
+    LOG_TRACE("[cusan]    "
+              << "Memset is not blocking")
     if (!alloc_info) {
       LOG_DEBUG("[cusan]    Failed to get alloc info " << target);
     } else if (!alloc_info->is_pinned && !alloc_info->is_managed) {
@@ -775,16 +778,16 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki
   _cusan_memcpy_impl(target, count, from, count, kind);
 }
 
-void cusan_sync_callback(cusan_sync_type type, const void* event_or_stream, unsigned int return_value) {
-  //switch (type) {
-  //  case cusan_Device:
-  //    printf("Device sync return value %i\n", return_value);
-  //    break;
-  //  case cusan_Stream:
-  //    printf("Stream %#x sync return value %i\n", event_or_stream, return_value);
-  //    break;
-  //  case cusan_Event:
-  //    printf("Event %#x sync return value %i\n", event_or_stream, return_value);
-  //    break;
-  //}
+void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) {
+  // switch (type) {
+  //   case cusan_Device:
+  //     printf("Device sync return value %i\n", return_value);
+  //     break;
+  //   case cusan_Stream:
+  //     printf("Stream %#x sync return value %i\n", event_or_stream, return_value);
+  //     break;
+  //   case cusan_Event:
+  //     printf("Event %#x sync return value %i\n", event_or_stream, return_value);
+  //     break;
+  // }
 }
diff --git a/lib/runtime/StatsCounter.h b/lib/runtime/StatsCounter.h
index 2b00377..5e66b6e 100644
--- a/lib/runtime/StatsCounter.h
+++ b/lib/runtime/StatsCounter.h
@@ -107,9 +107,9 @@ class NoneRecorder {
   Statistics stats_r;
   CUSAN_CUDA_EVENT_LIST
 #include "TsanEvents.inc"
-  void inc_TsanMemoryReadCount(unsigned count) {
+  void inc_TsanMemoryReadCount(unsigned) {
   }
-  void inc_TsanMemoryWriteCount(unsigned count) {
+  void inc_TsanMemoryWriteCount(unsigned) {
   }
 };
 
diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h
index 88d0f48..324b688 100644
--- a/lib/runtime/TSanInterface.h
+++ b/lib/runtime/TSanInterface.h
@@ -1,6 +1,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wformat"
 #pragma GCC diagnostic ignored "-Wmacro-redefined"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wformat-pedantic"
+#pragma GCC diagnostic ignored "-Wcast-qual"
 
 #include "TSan_External.h"
 

From 3a838df11e1c815471419517166150fa4544852d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Sun, 20 Apr 2025 15:10:58 +0200
Subject: [PATCH 12/15] Fix MPI check for tests

---
 test/lit.cfg               |  3 +++
 test/runtime/lit.local.cfg | 12 ------------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/test/lit.cfg b/test/lit.cfg
index 8146092..cf956ce 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -43,6 +43,9 @@ else:
 if config.cusan_typeart:
   config.available_features.add('typeart')
 
+if config.mpiexec:
+  config.available_features.add('mpiexec')
+
 config.substitutions.append(("%clang-cpp", clang_cpp))
 config.substitutions.append(("%clang-cc", clang_cc))
 config.substitutions.append(("%opt", opt))
diff --git a/test/runtime/lit.local.cfg b/test/runtime/lit.local.cfg
index f681f81..3b823ec 100644
--- a/test/runtime/lit.local.cfg
+++ b/test/runtime/lit.local.cfg
@@ -1,16 +1,4 @@
-import subprocess
-
 if config.cusan_use_workarounds:
     config.environment['NEOReadDebugKeys'] = '1'
     config.environment['DisableDeepBind'] = '1'
     config.environment['OMPI_MCA_memory'] = '^patcher'
-
-def has_mpirun():
-    try:
-        subprocess.check_call(['mpirun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        return True
-    except subprocess.CalledProcessError:
-        return False
-
-
-config.available_features.add('openmpi' if has_mpirun() else '')
\ No newline at end of file

From 616f8619970df1c282193a1f1c1f6f253236c217 Mon Sep 17 00:00:00 2001
From: ahueck <hueck.alexander@gmail.com>
Date: Sun, 20 Apr 2025 18:14:05 +0200
Subject: [PATCH 13/15] MPI dependency optional (#7)

---
 CMakeLists.txt                                |  10 +-
 cmake/cusanToolchain.cmake                    |   2 +-
 lib/runtime/CMakeLists.txt                    | 126 +++++++++---------
 lib/runtime/TSanInterface.h                   |   6 +-
 scripts/CMakeLists.txt                        |   1 -
 test/CMakeLists.txt                           |  23 +++-
 test/kernel_analysis/03_struct_write.c        |   2 +
 test/kernel_analysis/08_big_struct_write.c    |   2 +
 test/lit.cfg                                  |   2 +
 test/lit.site.cfg.in                          |   1 +
 test/pass/03_cuda_to_mpi.c                    |   4 +-
 test/pass/04_mpi_to_cuda.c                    |   2 +
 test/pass/05_cuda_to_mpi_stream.c             |   2 +
 test/pass/06_cuda_to_mpi_event.c              |   2 +
 test/pass/07_cuda_to_mpi_read.c               |   2 +
 test/pass/08_cudamemcpy_to_mpi.c              |   2 +
 test/pass/11_cuda_to_mpi_struct_of_buff.c     |   2 +
 .../18_cuda_to_mpi_event_query_busy_loop.c    |   1 +
 ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c |   1 +
 test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c   |   2 +
 test/pass/21_chunked_streams_example.c        |   2 +
 test/pass/22_cuda_to_mpi_partial_buff_write.c |   2 +
 test/pass/26_malloc_pitch.c                   |   2 +
 test/pass/29_tsan_cuda_to_mpi.c               |   1 +
 test/pass/30_tsan_annotate_cuda_to_mpi.c      |   2 +
 test/pass/31_tsan_cuda_event.c                |   9 +-
 test/pass/32_tsan_async_copy.c                |   2 +-
 test/runtime/03_cuda_to_mpi.c                 |   2 +-
 test/runtime/04_mpi_to_cuda.c                 |   2 +-
 test/runtime/05_cuda_to_mpi_stream.c          |   2 +-
 test/runtime/06_cuda_to_mpi_event.c           |   2 +-
 test/runtime/07_cuda_to_mpi_read.c            |   2 +-
 test/runtime/08_cudamemcpy_to_mpi.c           |   2 +-
 test/runtime/11_cuda_to_mpi_struct_of_buff.c  |   2 +-
 test/runtime/12_struct_ptr.c                  |   2 +-
 .../18_cuda_to_mpi_event_query_busy_loop.c    |   2 +-
 ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c |   2 +-
 .../runtime/20_cuda_to_mpi_send_ds_sync_w_r.c |   2 +-
 test/runtime/21_chunked_streams_example.c     |   2 +-
 .../22_cuda_to_mpi_partial_buff_write.c       |   2 +-
 test/runtime/26_malloc_pitch.c                |   2 +-
 test/runtime/29_tsan_cuda_to_mpi.c            |   2 +-
 test/runtime/30_tsan_annotate_cuda_to_mpi.c   |   2 +-
 43 files changed, 146 insertions(+), 101 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 58f070b..69051f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,11 +30,6 @@ add_subdirectory(externals)
 add_subdirectory(lib)
 add_subdirectory(scripts)
 
-if(PROJECT_IS_TOP_LEVEL)
-  enable_testing()
-  add_subdirectory(test)
-endif()
-
 write_basic_package_version_file(
   ${CMAKE_CURRENT_BINARY_DIR}/cusanConfigVersion.cmake
   VERSION ${PROJECT_VERSION}
@@ -68,3 +63,8 @@ if(PROJECT_IS_TOP_LEVEL)
       QUIET_ON_EMPTY
     )
 endif()
+
+if(PROJECT_IS_TOP_LEVEL)
+  enable_testing()
+  add_subdirectory(test)
+endif()
diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake
index 449a0ff..cf2a55a 100644
--- a/cmake/cusanToolchain.cmake
+++ b/cmake/cusanToolchain.cmake
@@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}"
 )
 
 find_package(CUDAToolkit REQUIRED)
-find_package(MPI REQUIRED)
+find_package(MPI)
 
 option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON)
 mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE)
diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt
index 741bbc2..5397584 100644
--- a/lib/runtime/CMakeLists.txt
+++ b/lib/runtime/CMakeLists.txt
@@ -1,67 +1,69 @@
-find_package(MPI)
-add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp)
-target_link_libraries(cusan_mpi_interceptor
-  PRIVATE MPI::MPI_CXX
-)
-set_target_properties(
-  cusan_mpi_interceptor
-  PROPERTIES
-  OUTPUT_NAME "CusanMPIInterceptor"
-  EXPORT_NAME "MPIInterceptor"
-)
-add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor)
-target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17)
-target_compile_definitions(
-  cusan_mpi_interceptor
-  PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT}
-  LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}
-   $<$<BOOL:${CUSAN_FIBERPOOL}>:CUSAN_FIBERPOOL=1>
-   $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
-   $<$<BOOL:${CUSAN_SOFTCOUNTER}>:CUSAN_SOFTCOUNTER>
-)
-
-target_include_directories(cusan_mpi_interceptor ${warning_guard}
-  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-  PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
-)
-
-target_include_directories(cusan_mpi_interceptor
-  SYSTEM
-  PRIVATE
-  ${LLVM_INCLUDE_DIRS}
-)
-
-target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport)
-
-if(CUSAN_FIBERPOOL)
-  target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool)
-  set_target_properties(cusan_mpi_interceptor PROPERTIES
-        BUILD_WITH_INSTALL_RPATH FALSE
-        LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+if (MPI_FOUND)
+  add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp)
+  target_link_libraries(cusan_mpi_interceptor
+    PRIVATE MPI::MPI_CXX
+  )
+  set_target_properties(
+    cusan_mpi_interceptor
+    PROPERTIES
+    OUTPUT_NAME "CusanMPIInterceptor"
+    EXPORT_NAME "MPIInterceptor"
+  )
+  add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor)
+  
+  target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17)
+  target_compile_definitions(
+    cusan_mpi_interceptor
+    PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT}
+    LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}
+    $<$<BOOL:${CUSAN_FIBERPOOL}>:CUSAN_FIBERPOOL=1>
+    $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
+    $<$<BOOL:${CUSAN_SOFTCOUNTER}>:CUSAN_SOFTCOUNTER>
+  )
+
+  target_include_directories(cusan_mpi_interceptor ${warning_guard}
+    PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+    PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
+  )
+
+  target_include_directories(cusan_mpi_interceptor
+    SYSTEM
+    PRIVATE
+    ${LLVM_INCLUDE_DIRS}
+  )
+
+  target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport)
+
+  if(CUSAN_FIBERPOOL)
+    target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool)
+    set_target_properties(cusan_mpi_interceptor PROPERTIES
+          BUILD_WITH_INSTALL_RPATH FALSE
+          LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+  endif()
+
+  set(CONFIG_NAME cusanMPIInterceptor)
+  set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets)
+
+  install(
+    TARGETS cusan_mpi_interceptor
+    EXPORT ${TARGETS_EXPORT_NAME}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  )
+
+  install(
+    EXPORT ${TARGETS_EXPORT_NAME}
+    NAMESPACE cusan::
+    DESTINATION ${CUSAN_INSTALL_CONFIGDIR}
+  )
+
+  export(
+    EXPORT ${TARGETS_EXPORT_NAME}
+    FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake
+    NAMESPACE cusan::
+  )
 endif()
 
-set(CONFIG_NAME cusanMPIInterceptor)
-set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets)
-
-install(
-  TARGETS cusan_mpi_interceptor
-  EXPORT ${TARGETS_EXPORT_NAME}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-)
-
-install(
-  EXPORT ${TARGETS_EXPORT_NAME}
-  NAMESPACE cusan::
-  DESTINATION ${CUSAN_INSTALL_CONFIGDIR}
-)
-
-export(
-  EXPORT ${TARGETS_EXPORT_NAME}
-  FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake
-  NAMESPACE cusan::
-)
-
 add_library(cusan_Runtime SHARED CusanRuntime.cpp CusanRuntime_cudaSpecific.cpp)
 set_target_properties(
   cusan_Runtime
diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h
index 324b688..3bfe367 100644
--- a/lib/runtime/TSanInterface.h
+++ b/lib/runtime/TSanInterface.h
@@ -1,9 +1,11 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wformat"
-#pragma GCC diagnostic ignored "-Wmacro-redefined"
 #pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wformat-pedantic"
 #pragma GCC diagnostic ignored "-Wcast-qual"
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#pragma clang diagnostic ignored "-Wformat-pedantic"
+#endif
 
 #include "TSan_External.h"
 
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index 74d956c..b27f7fb 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -175,7 +175,6 @@ if(NOT CUSAN_TYPEART)
   )
 endif()
 
-find_package(MPI)
 if(MPI_C_FOUND)
   configure_cusan_script(
     ${CUSAN_WRAPPER} cusan-mpicc${CMAKE_DEBUG_POSTFIX}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 64266d6..d13fe37 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -36,6 +36,7 @@ function(configure_cusan_lit_site_cfg input output)
 
   pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND)
   pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART)
+  pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI)
 
   cusan_target_generate_file(${input} ${output})
 endfunction()
@@ -90,16 +91,32 @@ function(cusan_add_lit_target)
   endforeach()
 endfunction()
 
+
+if(NOT MPI_FOUND)
+  # mock target to make test generation work
+  add_library(cusan_mpi_interceptor_mock SHARED IMPORTED)
+  set_target_properties(
+    cusan_mpi_interceptor_mock
+    PROPERTIES
+    OUTPUT_NAME "CusanMPIInterceptorPlaceholder"
+    IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/lib/runtime"
+  )
+  set(_cusan_interceptor_target cusan_mpi_interceptor_mock)
+  add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor_mock)
+endif()
+
 set(CUSAN_TEST_DEPENDS
   cusan::TransformPass
-  cusan::MPI_Interceptor
   cusan::Runtime
   cusan::Analysis
-  
 )
 
+if(FOUND_MPI)
+  list(APPEND CUSAN_TEST_DEPENDS cusan::MPI_Interceptor)
+endif()
+
 if(CUSAN_TYPEART)
-list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass)
+  list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass)
 endif()
 
 set(CUSAN_SUITES
diff --git a/test/kernel_analysis/03_struct_write.c b/test/kernel_analysis/03_struct_write.c
index c001e03..a6edeef 100644
--- a/test/kernel_analysis/03_struct_write.c
+++ b/test/kernel_analysis/03_struct_write.c
@@ -22,6 +22,8 @@
 // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite
 // CHECK-NOT: Handling Arg:
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/kernel_analysis/08_big_struct_write.c b/test/kernel_analysis/08_big_struct_write.c
index 5d34cd5..3df5664 100644
--- a/test/kernel_analysis/08_big_struct_write.c
+++ b/test/kernel_analysis/08_big_struct_write.c
@@ -22,6 +22,8 @@
 // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite
 // CHECK-NOT: Handling Arg:
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/lit.cfg b/test/lit.cfg
index cf956ce..909e384 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -45,6 +45,8 @@ if config.cusan_typeart:
 
 if config.mpiexec:
   config.available_features.add('mpiexec')
+if config.cusan_mpi:
+  config.available_features.add('mpi')
 
 config.substitutions.append(("%clang-cpp", clang_cpp))
 config.substitutions.append(("%clang-cc", clang_cc))
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 70030c3..d6cde3f 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -31,6 +31,7 @@ config.cusan_pass = "$<TARGET_FILE_NAME:cusan::TransformPass>"
 config.cusan_mpi_interceptor = "$<TARGET_FILE:cusan::MPI_Interceptor>"
 config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@
 config.cusan_typeart = @CUSAN_HAS_TYPEART@
+config.cusan_mpi = @CUSAN_HAS_MPI@
 
 # Let the main config do the real work.
 config.loaded_site_config = True
diff --git a/test/pass/03_cuda_to_mpi.c b/test/pass/03_cuda_to_mpi.c
index 82d6a98..5186403 100644
--- a/test/pass/03_cuda_to_mpi.c
+++ b/test/pass/03_cuda_to_mpi.c
@@ -2,11 +2,11 @@
 // RUN: %rm-file %t.yaml 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
-
-
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], 
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/04_mpi_to_cuda.c b/test/pass/04_mpi_to_cuda.c
index e27f41a..6196dcc 100644
--- a/test/pass/04_mpi_to_cuda.c
+++ b/test/pass/04_mpi_to_cuda.c
@@ -8,6 +8,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]],
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/05_cuda_to_mpi_stream.c b/test/pass/05_cuda_to_mpi_stream.c
index bdb304f..6a22f3e 100644
--- a/test/pass/05_cuda_to_mpi_stream.c
+++ b/test/pass/05_cuda_to_mpi_stream.c
@@ -10,6 +10,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/06_cuda_to_mpi_event.c b/test/pass/06_cuda_to_mpi_event.c
index 2253965..50bdd34 100644
--- a/test/pass/06_cuda_to_mpi_event.c
+++ b/test/pass/06_cuda_to_mpi_event.c
@@ -10,6 +10,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], 
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/07_cuda_to_mpi_read.c b/test/pass/07_cuda_to_mpi_read.c
index c1ce739..f379fb7 100644
--- a/test/pass/07_cuda_to_mpi_read.c
+++ b/test/pass/07_cuda_to_mpi_read.c
@@ -11,6 +11,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]],
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/08_cudamemcpy_to_mpi.c b/test/pass/08_cudamemcpy_to_mpi.c
index 3d1139c..1d553a2 100644
--- a/test/pass/08_cudamemcpy_to_mpi.c
+++ b/test/pass/08_cudamemcpy_to_mpi.c
@@ -15,6 +15,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // Tsan sometimes crashes with this test it seems
diff --git a/test/pass/11_cuda_to_mpi_struct_of_buff.c b/test/pass/11_cuda_to_mpi_struct_of_buff.c
index 2dcea96..3a2001d 100644
--- a/test/pass/11_cuda_to_mpi_struct_of_buff.c
+++ b/test/pass/11_cuda_to_mpi_struct_of_buff.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
diff --git a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
index cd00aae..8c590cc 100644
--- a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
+++ b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream
diff --git a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
index 1c5d299..248c3a6 100644
--- a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
+++ b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream
diff --git a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
index f86f3a0..871b3ab 100644
--- a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
+++ b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset
diff --git a/test/pass/21_chunked_streams_example.c b/test/pass/21_chunked_streams_example.c
index 13562f7..03774ce 100644
--- a/test/pass/21_chunked_streams_example.c
+++ b/test/pass/21_chunked_streams_example.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
diff --git a/test/pass/22_cuda_to_mpi_partial_buff_write.c b/test/pass/22_cuda_to_mpi_partial_buff_write.c
index c037c8a..2f23ae8 100644
--- a/test/pass/22_cuda_to_mpi_partial_buff_write.c
+++ b/test/pass/22_cuda_to_mpi_partial_buff_write.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize
diff --git a/test/pass/26_malloc_pitch.c b/test/pass/26_malloc_pitch.c
index b71af1a..7a0a97c 100644
--- a/test/pass/26_malloc_pitch.c
+++ b/test/pass/26_malloc_pitch.c
@@ -6,6 +6,8 @@
 // {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]],
 // {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/29_tsan_cuda_to_mpi.c b/test/pass/29_tsan_cuda_to_mpi.c
index 7608db6..9ab6fdc 100644
--- a/test/pass/29_tsan_cuda_to_mpi.c
+++ b/test/pass/29_tsan_cuda_to_mpi.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy
diff --git a/test/pass/30_tsan_annotate_cuda_to_mpi.c b/test/pass/30_tsan_annotate_cuda_to_mpi.c
index 506eff1..335e372 100644
--- a/test/pass/30_tsan_annotate_cuda_to_mpi.c
+++ b/test/pass/30_tsan_annotate_cuda_to_mpi.c
@@ -8,6 +8,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/31_tsan_cuda_event.c b/test/pass/31_tsan_cuda_event.c
index 918ba99..eb819ea 100644
--- a/test/pass/31_tsan_cuda_event.c
+++ b/test/pass/31_tsan_cuda_event.c
@@ -1,8 +1,7 @@
 // clang-format off
 // RUN: %rm-file %t.yaml 
 
-// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
-
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event
@@ -15,8 +14,6 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
-
 #include <unistd.h>
 
 __global__ void kernel(int* arr, const int N) {  // CHECK-DAG: [[FILENAME]]:[[@LINE]]
@@ -27,10 +24,6 @@ __global__ void kernel(int* arr, const int N) {  // CHECK-DAG: [[FILENAME]]:[[@L
 }
 
 int main(int argc, char* argv[]) {
-  if (!has_gpu_aware_mpi()) {
-    printf("This example is designed for CUDA-aware MPI. Exiting.\n");
-    return 1;
-  }
   cudaEvent_t first_finished_event;
   cudaEventCreate(&first_finished_event);
   cudaStream_t stream1;
diff --git a/test/pass/32_tsan_async_copy.c b/test/pass/32_tsan_async_copy.c
index cc7c12f..d30220d 100644
--- a/test/pass/32_tsan_async_copy.c
+++ b/test/pass/32_tsan_async_copy.c
@@ -1,7 +1,7 @@
 // clang-format off
 // RUN: %rm-file %t.yaml 
 
-// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate 
diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c
index d7f11b6..e2dcd9d 100644
--- a/test/runtime/03_cuda_to_mpi.c
+++ b/test/runtime/03_cuda_to_mpi.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c
index 370f48f..77f8915 100644
--- a/test/runtime/04_mpi_to_cuda.c
+++ b/test/runtime/04_mpi_to_cuda.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c
index af827b9..4ff4400 100644
--- a/test/runtime/05_cuda_to_mpi_stream.c
+++ b/test/runtime/05_cuda_to_mpi_stream.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c
index 0585479..65ef15e 100644
--- a/test/runtime/06_cuda_to_mpi_event.c
+++ b/test/runtime/06_cuda_to_mpi_event.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c
index e367e2a..d1ee1fc 100644
--- a/test/runtime/07_cuda_to_mpi_read.c
+++ b/test/runtime/07_cuda_to_mpi_read.c
@@ -2,7 +2,7 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c
index b8fa5bf..4aedbcc 100644
--- a/test/runtime/08_cudamemcpy_to_mpi.c
+++ b/test/runtime/08_cudamemcpy_to_mpi.c
@@ -2,7 +2,7 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
index 4a3141d..8455f74 100644
--- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c
+++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c
index 5a71312..41ab21d 100644
--- a/test/runtime/12_struct_ptr.c
+++ b/test/runtime/12_struct_ptr.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // CHECK-DAG: data race
 
diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
index 73674a7..0b82829 100644
--- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
+++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
index 61eee40..23d9a9c 100644
--- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
+++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
index 820912b..7f88a11 100644
--- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
+++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c
index fa4f5d0..6b7dee5 100644
--- a/test/runtime/21_chunked_streams_example.c
+++ b/test/runtime/21_chunked_streams_example.c
@@ -2,7 +2,7 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
index 8a29dd9..2639ce8 100644
--- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c
+++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
@@ -2,7 +2,7 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c
index 3d7524f..0dac955 100644
--- a/test/runtime/26_malloc_pitch.c
+++ b/test/runtime/26_malloc_pitch.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c
index 14ebfa1..2ac700f 100644
--- a/test/runtime/29_tsan_cuda_to_mpi.c
+++ b/test/runtime/29_tsan_cuda_to_mpi.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 
diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
index 24e59ce..19c6d97 100644
--- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c
+++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
@@ -5,7 +5,7 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
-// REQUIRES: mpiexec
+// REQUIRES: mpi
 
 // clang-format on
 

From 53986fd5baadcc878bbdb461fcbc962944a32198 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Mon, 21 Apr 2025 12:09:19 +0200
Subject: [PATCH 14/15] Find MPI quiet

---
 cmake/cusanToolchain.cmake | 2 +-
 scripts/CMakeLists.txt     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake
index cf2a55a..63cf466 100644
--- a/cmake/cusanToolchain.cmake
+++ b/cmake/cusanToolchain.cmake
@@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}"
 )
 
 find_package(CUDAToolkit REQUIRED)
-find_package(MPI)
+find_package(MPI QUIET)
 
 option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON)
 mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE)
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index b27f7fb..6dffaf8 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -92,7 +92,7 @@ function(configure_cusan_script input output)
 endfunction()
 
 function(cusan_find_mpi_vendor_helper symbol ret_value)
-  find_package(MPI)
+  find_package(MPI QUIET)
 
   if(NOT MPI_FOUND)
     set(${ret_value} false)

From 3d4ff1b02396f8c1072d73da9db5c413e8ae5605 Mon Sep 17 00:00:00 2001
From: ahueck <hueck.alexander@gmail.com>
Date: Mon, 21 Apr 2025 12:50:20 +0200
Subject: [PATCH 15/15] Sync callback default OFF (#9)

---
 .github/workflows/ci.yml                 |  7 ++++++-
 cmake/cusanToolchain.cmake               |  2 +-
 lib/runtime/CusanRuntime.cpp             |  1 +
 test/CMakeLists.txt                      |  1 +
 test/lit.cfg                             |  3 +++
 test/lit.site.cfg.in                     |  1 +
 test/pass/34_test_sync_callback.c        | 14 ++++++++++++++
 test/pass/35_test_sync_callback_stream.c | 15 +++++++++++++++
 test/pass/36_test_sync_callback_event.c  | 14 ++++++++++++++
 9 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 test/pass/34_test_sync_callback.c
 create mode 100644 test/pass/35_test_sync_callback_stream.c
 create mode 100644 test/pass/36_test_sync_callback_event.c

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 014567f..6ae9856 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,6 +30,11 @@ jobs:
             os: ubuntu-24.04
             preset: release
             cuda: 12.6.0
+          - llvm-version: 19
+            os: ubuntu-24.04
+            preset: release
+            cuda: 12.6.0
+            cusan-option: -DCUSAN_DEVICE_SYNC_CALLBACKS=ON
 
     runs-on: ${{ matrix.os }}
 
@@ -70,7 +75,7 @@ jobs:
           echo "EXTERNAL_LIT=/usr/lib/llvm-${{ matrix.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV
 
       - name: Configure CuSan
-        run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
+        run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} ${{ matrix.cusan-option }}
 
       - name: Build CuSan
         run: cmake --build build --parallel 2
diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake
index 63cf466..4a8d132 100644
--- a/cmake/cusanToolchain.cmake
+++ b/cmake/cusanToolchain.cmake
@@ -41,7 +41,7 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers
 option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF)
 option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON)
 
-option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" ON)
+option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" OFF)
 
 option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON)
 mark_as_advanced(CUSAN_TEST_WORKAROUNDS)
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index da2488c..9c00736 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -779,6 +779,7 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki
 }
 
 void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) {
+  LOG_TRACE("[cusan]Callback");
   // switch (type) {
   //   case cusan_Device:
   //     printf("Device sync return value %i\n", return_value);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d13fe37..42e693a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -37,6 +37,7 @@ function(configure_cusan_lit_site_cfg input output)
   pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND)
   pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART)
   pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI)
+  pythonize_bool(${CUSAN_DEVICE_SYNC_CALLBACKS} CUSAN_HAS_SYNC_CALLBACKS)
 
   cusan_target_generate_file(${input} ${output})
 endfunction()
diff --git a/test/lit.cfg b/test/lit.cfg
index 909e384..bfea736 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -48,6 +48,9 @@ if config.mpiexec:
 if config.cusan_mpi:
   config.available_features.add('mpi')
 
+if config.cusan_sync_callback:
+  config.available_features.add('sync_callback')
+
 config.substitutions.append(("%clang-cpp", clang_cpp))
 config.substitutions.append(("%clang-cc", clang_cc))
 config.substitutions.append(("%opt", opt))
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index d6cde3f..14e1304 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -32,6 +32,7 @@ config.cusan_mpi_interceptor = "$<TARGET_FILE:cusan::MPI_Interceptor>"
 config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@
 config.cusan_typeart = @CUSAN_HAS_TYPEART@
 config.cusan_mpi = @CUSAN_HAS_MPI@
+config.cusan_sync_callback = @CUSAN_HAS_SYNC_CALLBACKS@
 
 # Let the main config do the real work.
 config.loaded_site_config = True
diff --git a/test/pass/34_test_sync_callback.c b/test/pass/34_test_sync_callback.c
new file mode 100644
index 0000000..2015bfe
--- /dev/null
+++ b/test/pass/34_test_sync_callback.c
@@ -0,0 +1,14 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaDeviceSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaDeviceSynchronize();
+
+  return 0;
+}
diff --git a/test/pass/35_test_sync_callback_stream.c b/test/pass/35_test_sync_callback_stream.c
new file mode 100644
index 0000000..d8c5b68
--- /dev/null
+++ b/test/pass/35_test_sync_callback_stream.c
@@ -0,0 +1,15 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaStreamSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaStream_t stream = NULL;
+  cudaStreamSynchronize(stream);
+
+  return 0;
+}
diff --git a/test/pass/36_test_sync_callback_event.c b/test/pass/36_test_sync_callback_event.c
new file mode 100644
index 0000000..6ee4679
--- /dev/null
+++ b/test/pass/36_test_sync_callback_event.c
@@ -0,0 +1,14 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaEventSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaEvent_t event;
+  cudaEventSynchronize(event);
+  return 0;
+}