diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d121748..6ae9856 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,25 +10,6 @@ env:
   CC: clang
 
 jobs:
-  format-check:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Format source code
-        run: |
-          find lib test \
-            -type f \
-            -a \( -name "*.c" -o -name "*.cpp" -o -name "*.h" \) \
-            -print0 \
-            | xargs -0 clang-format-14 -i
-
-      - name: Format check
-        run: |
-          git status --porcelain --untracked-files=no
-          git status --porcelain --untracked-files=no | xargs -o -I {} test -z \"{}\"
-
   codespell:
     runs-on: ubuntu-22.04
 
@@ -44,19 +25,35 @@ jobs:
           - llvm-version: 14
             os: ubuntu-22.04
             preset: release
+            cuda: 11.8.0
+          - llvm-version: 19
+            os: ubuntu-24.04
+            preset: release
+            cuda: 12.6.0
+          - llvm-version: 19
+            os: ubuntu-24.04
+            preset: release
+            cuda: 12.6.0
+            cusan-option: -DCUSAN_DEVICE_SYNC_CALLBACKS=ON
 
     runs-on: ${{ matrix.os }}
 
     steps:
       - uses: actions/checkout@v4
 
+      - name: LLVM apt
+        if: ${{ matrix.llvm-version == 19 }}
+        run: |
+          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+          echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main" | sudo tee /etc/apt/sources.list.d/llvm-19.list
+
       - name: Update apt
         run: sudo apt-get update
 
       - uses: Jimver/cuda-toolkit@v0.2.19
         id: cuda-toolkit
         with:
-          cuda: '11.8.0'
+          cuda: '${{ matrix.cuda }}'
           method: network
           sub-packages: '["nvcc", "cudart", "cudart-dev"]'
           non-cuda-sub-packages: '["libcurand", "libcurand-dev"]'
@@ -78,7 +75,7 @@ jobs:
           echo "EXTERNAL_LIT=/usr/lib/llvm-${{ matrix.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV
 
       - name: Configure CuSan
-        run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
+        run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} ${{ matrix.cusan-option }}
 
       - name: Build CuSan
         run: cmake --build build --parallel 2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 191bca8..69051f5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,8 @@ project(
 )
 
 set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 list(
   APPEND
@@ -28,11 +30,6 @@ add_subdirectory(externals)
 add_subdirectory(lib)
 add_subdirectory(scripts)
 
-if(PROJECT_IS_TOP_LEVEL)
-  enable_testing()
-  add_subdirectory(test)
-endif()
-
 write_basic_package_version_file(
   ${CMAKE_CURRENT_BINARY_DIR}/cusanConfigVersion.cmake
   VERSION ${PROJECT_VERSION}
@@ -66,3 +63,8 @@ if(PROJECT_IS_TOP_LEVEL)
       QUIET_ON_EMPTY
     )
 endif()
+
+if(PROJECT_IS_TOP_LEVEL)
+  enable_testing()
+  add_subdirectory(test)
+endif()
diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake
index 9e98131..4a8d132 100644
--- a/cmake/cusanToolchain.cmake
+++ b/cmake/cusanToolchain.cmake
@@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}"
 )
 
 find_package(CUDAToolkit REQUIRED)
-find_package(MPI REQUIRED)
+find_package(MPI QUIET)
 
 option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON)
 mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE)
@@ -41,6 +41,8 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers
 option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF)
 option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON)
 
+option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" OFF)
+
 option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON)
 mark_as_advanced(CUSAN_TEST_WORKAROUNDS)
 
diff --git a/cmake/modules/cusan-format.cmake b/cmake/modules/cusan-format.cmake
index bacdf5d..96c393d 100644
--- a/cmake/modules/cusan-format.cmake
+++ b/cmake/modules/cusan-format.cmake
@@ -20,11 +20,12 @@ function(cusan_target_format target comment)
     filter_dir(${exclude})
   endforeach()
 
-  find_program(FORMAT_COMMAND
+  find_program(CLANG_FORMAT_COMMAND
                NAMES clang-format-${LLVM_VERSION_MAJOR} clang-format)
-  if(FORMAT_COMMAND)
+  mark_as_advanced(CLANG_FORMAT_COMMAND)
+  if(CLANG_FORMAT_COMMAND)
     add_custom_target(${target}
-      COMMAND ${FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS}
+      COMMAND ${CLANG_FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS}
               ${ALL_CXX_FILES}
       WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
       COMMENT "${comment}"
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index 863ccc1..4dc9ddf 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -42,3 +42,9 @@ if(CUSAN_TYPEART)
     )
     FetchContent_MakeAvailable(typeart)
 endif()
+
+mark_as_advanced(
+    FETCHCONTENT_BASE_DIR
+    FETCHCONTENT_FULLY_DISCONNECTED
+    FETCHCONTENT_QUIET
+    FETCHCONTENT_UPDATES_DISCONNECTED)
\ No newline at end of file
diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp
index f2a9983..6da40e1 100644
--- a/lib/pass/AnalysisTransform.cpp
+++ b/lib/pass/AnalysisTransform.cpp
@@ -582,6 +582,48 @@ llvm::SmallVector<Value*> CudaFree::map_arguments(IRBuilder<>& irb, llvm::ArrayR
   return {ptr};
 }
 
+// CudaMallocPitch
+
+CudaMallocPitch::CudaMallocPitch(callback::FunctionDecl* decls) {
+  setup("cudaMallocPitch", &decls->cusan_device_alloc.f);
+}
+llvm::SmallVector<Value*> CudaMallocPitch::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //(void** devPtr, size_t* pitch, size_t width, size_t height )
+  assert(args.size() == 4);
+  auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb));
+
+  //"The function may pad the allocation"
+  //"*pitch by cudaMallocPitch() is the width in bytes of the allocation"
+  auto* pitch = irb.CreateLoad(irb.getIntPtrTy(irb.GetInsertBlock()->getModule()->getDataLayout()), args[1]);
+  // auto* width = args[2];
+  auto* height = args[3];
+
+  auto* real_size = irb.CreateMul(pitch, height);
+  return {ptr, real_size};
+}
+
+// CudaSetDevice
+
+CudaSetDevice::CudaSetDevice(callback::FunctionDecl* decls) {
+  setup("cudaSetDevice", &decls->cusan_set_device.f);
+}
+llvm::SmallVector<Value*> CudaSetDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef<Value*> args) {
+  // cudaSetDevice ( int  device )
+  assert(args.size() == 1);
+  return {args[0]};
+}
+
+// CudaCHooseDevice
+
+CudaChooseDevice::CudaChooseDevice(callback::FunctionDecl* decls) {
+  setup("cudaChooseDevice", &decls->cusan_choose_device.f);
+}
+llvm::SmallVector<Value*> CudaChooseDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef<Value*> args) {
+  // cudaChooseDevice ( int* device, const cudaDeviceProp* prop )
+  assert(args.size() == 2);
+  return {args[0]};
+}
+
 // CudaStreamQuery
 
 CudaStreamQuery::CudaStreamQuery(callback::FunctionDecl* decls) {
@@ -614,4 +656,45 @@ llvm::SmallVector<Value*, 1> CudaEventQuery::map_return_value(IRBuilder<>& irb,
   return {result};
 }
 
+CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaStreamSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( void* stream)
+  assert(args.size() == 1);
+  return {irb.getInt8(1), args[0]};
+}
+llvm::SmallVector<Value*, 1> CudaStreamSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
+  return {result};
+}
+
+CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaEventSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( void* event)
+  assert(args.size() == 1);
+  return {irb.getInt8(2), args[0]};
+}
+llvm::SmallVector<Value*, 1> CudaEventSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
+  return {result};
+}
+
+CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) {
+  setup("cudaDeviceSynchronize", &decls->cusan_sync_callback.f);
+}
+llvm::SmallVector<Value*> CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
+  //( )
+  assert(args.size() == 0);
+#if LLVM_VERSION_MAJOR < 15
+  auto* ptr_type = PointerType::getUnqual(irb.getContext());
+#else
+  auto* ptr_type = irb.getPtrTy();
+#endif
+  return {irb.getInt8(0), ConstantPointerNull::get(ptr_type)};
+}
+llvm::SmallVector<Value*, 1> CudaDeviceSyncCallback::map_return_value(IRBuilder<>&, Value* result) {
+  return {result};
+}
+
 }  // namespace cusan::transform
diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h
index fbb5cef..55e3f1b 100644
--- a/lib/pass/AnalysisTransform.h
+++ b/lib/pass/AnalysisTransform.h
@@ -211,27 +211,9 @@ BasicInstrumenterDecl(CudaHostFree);
 BasicInstrumenterDecl(CudaMallocManaged);
 BasicInstrumenterDecl(CudaMalloc);
 BasicInstrumenterDecl(CudaFree);
-
-class CudaMallocPitch : public SimpleInstrumenter<CudaMallocPitch> {
- public:
-  CudaMallocPitch(callback::FunctionDecl* decls) {
-    setup("cudaMallocPitch", &decls->cusan_device_alloc.f);
-  }
-  static llvm::SmallVector<Value*, 2> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args) {
-    //(void** devPtr, size_t* pitch, size_t width, size_t height )
-    assert(args.size() == 4);
-    auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8Ty()->getPointerTo());
-
-    //"The function may pad the allocation"
-    //"*pitch by cudaMallocPitch() is the width in bytes of the allocation"
-    auto* pitch = irb.CreateLoad(irb.getIntPtrTy(irb.GetInsertBlock()->getModule()->getDataLayout()), args[1]);
-    // auto* width = args[2];
-    auto* height = args[3];
-
-    auto* real_size = irb.CreateMul(pitch, height);
-    return {ptr, real_size};
-  }
-};
+BasicInstrumenterDecl(CudaMallocPitch);
+BasicInstrumenterDecl(CudaSetDevice);
+BasicInstrumenterDecl(CudaChooseDevice);
 
 class CudaStreamQuery : public SimpleInstrumenter<CudaStreamQuery> {
  public:
@@ -247,6 +229,25 @@ class CudaEventQuery : public SimpleInstrumenter<CudaEventQuery> {
   static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
 };
 
+class CudaStreamSyncCallback : public SimpleInstrumenter<CudaStreamSyncCallback> {
+ public:
+  CudaStreamSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+class CudaEventSyncCallback : public SimpleInstrumenter<CudaEventSyncCallback> {
+ public:
+  CudaEventSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+class CudaDeviceSyncCallback : public SimpleInstrumenter<CudaDeviceSyncCallback> {
+ public:
+  CudaDeviceSyncCallback(callback::FunctionDecl* decls);
+  static llvm::SmallVector<Value*> map_arguments(IRBuilder<>& irb, llvm::ArrayRef<Value*> args);
+  static llvm::SmallVector<Value*, 1> map_return_value(IRBuilder<>& irb, Value* result);
+};
+
 }  // namespace transform
 }  // namespace cusan
 
diff --git a/lib/pass/CMakeLists.txt b/lib/pass/CMakeLists.txt
index f52baa5..8fbefd9 100644
--- a/lib/pass/CMakeLists.txt
+++ b/lib/pass/CMakeLists.txt
@@ -14,6 +14,7 @@ mark_as_advanced(LLVM_CUSAN_TRANSFORMPASS_LINK_INTO_TOOLS)
 target_compile_definitions(cusan_TransformPass
   PRIVATE
   $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
+  $<$<BOOL:${CUSAN_DEVICE_SYNC_CALLBACKS}>:CUSAN_DEVICE_SYNC_CALLBACKS=1>
 )
 
 set_target_properties(
diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp
index d205ddd..48cbf50 100644
--- a/lib/pass/CusanPass.cpp
+++ b/lib/pass/CusanPass.cpp
@@ -180,6 +180,15 @@ bool CusanPass::runOnFunc(llvm::Function& function) {
   modified |= transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function);
   modified |= transform::StreamCreateWithPriorityInstrumenter(&cusan_decls_).instrument(function);
   modified |= transform::CudaMallocPitch(&cusan_decls_).instrument(function);
+  modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function);
+  modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function);
+
+  // callbacks
+#ifdef CUSAN_DEVICE_SYNC_CALLBACKS
+  modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function);
+  modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function);
+  modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function);
+#endif
 
   auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_);
   if (data_for_host) {
diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp
index f58c828..cefd273 100644
--- a/lib/pass/FunctionDecl.cpp
+++ b/lib/pass/FunctionDecl.cpp
@@ -1,5 +1,7 @@
 #include "FunctionDecl.h"
 
+#include <cstdint>
+
 namespace cusan::callback {
 
 void FunctionDecl::initialize(llvm::Module& module) {
@@ -124,6 +126,18 @@ void FunctionDecl::initialize(llvm::Module& module) {
   //  void* devPtr, size_t pitch, size_t width, size_t height
   ArgTypes arg_types_2d_memset = {void_ptr, size_t_ty, size_t_ty, size_t_ty};
   make_function(cusan_memset_2d, arg_types_2d_memset);
+
+  //  int device
+  ArgTypes arg_types_set_device = {Type::getInt32Ty(c)};
+  make_function(cusan_set_device, arg_types_set_device);
+
+  //  void* device
+  ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()};
+  make_function(cusan_choose_device, arg_types_choose_device);
+
+  //  u8 evenType, u32 returnValue
+  ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), void_ptr, Type::getInt32Ty(c)};
+  make_function(cusan_sync_callback, arg_types_sync_callback);
 }
 
 }  // namespace cusan::callback
diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h
index d40e80a..f6457c1 100644
--- a/lib/pass/FunctionDecl.h
+++ b/lib/pass/FunctionDecl.h
@@ -41,10 +41,12 @@ struct FunctionDecl {
   CusanFunction cusan_host_register{"_cusan_host_register"};
   CusanFunction cusan_host_unregister{"_cusan_host_unregister"};
   CusanFunction cusan_device_alloc{"_cusan_device_alloc"};
+  CusanFunction cusan_set_device{"_cusan_set_device"};
+  CusanFunction cusan_choose_device{"_cusan_choose_device"};
   CusanFunction cusan_device_free{"_cusan_device_free"};
   CusanFunction cusan_stream_query{"_cusan_stream_query"};
   CusanFunction cusan_event_query{"_cusan_event_query"};
-
+  CusanFunction cusan_sync_callback{"cusan_sync_callback"};
   void initialize(llvm::Module& m);
 };
 
diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt
index 741bbc2..5397584 100644
--- a/lib/runtime/CMakeLists.txt
+++ b/lib/runtime/CMakeLists.txt
@@ -1,67 +1,69 @@
-find_package(MPI)
-add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp)
-target_link_libraries(cusan_mpi_interceptor
-  PRIVATE MPI::MPI_CXX
-)
-set_target_properties(
-  cusan_mpi_interceptor
-  PROPERTIES
-  OUTPUT_NAME "CusanMPIInterceptor"
-  EXPORT_NAME "MPIInterceptor"
-)
-add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor)
-target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17)
-target_compile_definitions(
-  cusan_mpi_interceptor
-  PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT}
-  LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}
-   $<$<BOOL:${CUSAN_FIBERPOOL}>:CUSAN_FIBERPOOL=1>
-   $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
-   $<$<BOOL:${CUSAN_SOFTCOUNTER}>:CUSAN_SOFTCOUNTER>
-)
-
-target_include_directories(cusan_mpi_interceptor ${warning_guard}
-  PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-  PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
-)
-
-target_include_directories(cusan_mpi_interceptor
-  SYSTEM
-  PRIVATE
-  ${LLVM_INCLUDE_DIRS}
-)
-
-target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport)
-
-if(CUSAN_FIBERPOOL)
-  target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool)
-  set_target_properties(cusan_mpi_interceptor PROPERTIES
-        BUILD_WITH_INSTALL_RPATH FALSE
-        LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+if (MPI_FOUND)
+  add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp)
+  target_link_libraries(cusan_mpi_interceptor
+    PRIVATE MPI::MPI_CXX
+  )
+  set_target_properties(
+    cusan_mpi_interceptor
+    PROPERTIES
+    OUTPUT_NAME "CusanMPIInterceptor"
+    EXPORT_NAME "MPIInterceptor"
+  )
+  add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor)
+  
+  target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17)
+  target_compile_definitions(
+    cusan_mpi_interceptor
+    PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT}
+    LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}
+    $<$<BOOL:${CUSAN_FIBERPOOL}>:CUSAN_FIBERPOOL=1>
+    $<$<BOOL:${CUSAN_TYPEART}>:CUSAN_TYPEART=1>
+    $<$<BOOL:${CUSAN_SOFTCOUNTER}>:CUSAN_SOFTCOUNTER>
+  )
+
+  target_include_directories(cusan_mpi_interceptor ${warning_guard}
+    PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+    PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
+  )
+
+  target_include_directories(cusan_mpi_interceptor
+    SYSTEM
+    PRIVATE
+    ${LLVM_INCLUDE_DIRS}
+  )
+
+  target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport)
+
+  if(CUSAN_FIBERPOOL)
+    target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool)
+    set_target_properties(cusan_mpi_interceptor PROPERTIES
+          BUILD_WITH_INSTALL_RPATH FALSE
+          LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+  endif()
+
+  set(CONFIG_NAME cusanMPIInterceptor)
+  set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets)
+
+  install(
+    TARGETS cusan_mpi_interceptor
+    EXPORT ${TARGETS_EXPORT_NAME}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  )
+
+  install(
+    EXPORT ${TARGETS_EXPORT_NAME}
+    NAMESPACE cusan::
+    DESTINATION ${CUSAN_INSTALL_CONFIGDIR}
+  )
+
+  export(
+    EXPORT ${TARGETS_EXPORT_NAME}
+    FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake
+    NAMESPACE cusan::
+  )
 endif()
 
-set(CONFIG_NAME cusanMPIInterceptor)
-set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets)
-
-install(
-  TARGETS cusan_mpi_interceptor
-  EXPORT ${TARGETS_EXPORT_NAME}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-)
-
-install(
-  EXPORT ${TARGETS_EXPORT_NAME}
-  NAMESPACE cusan::
-  DESTINATION ${CUSAN_INSTALL_CONFIGDIR}
-)
-
-export(
-  EXPORT ${TARGETS_EXPORT_NAME}
-  FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake
-  NAMESPACE cusan::
-)
-
 add_library(cusan_Runtime SHARED CusanRuntime.cpp CusanRuntime_cudaSpecific.cpp)
 set_target_properties(
   cusan_Runtime
diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp
index 25ed8d0..9c00736 100644
--- a/lib/runtime/CusanRuntime.cpp
+++ b/lib/runtime/CusanRuntime.cpp
@@ -12,7 +12,7 @@
 #include "analysis/KernelModel.h"
 #include "support/Logger.h"
 #include "StatsCounter.h"
-#if CUSAN_SOFTCOUNTER
+#ifdef CUSAN_SOFTCOUNTER
 #include "support/Table.h"
 #endif
 #include "TSanInterface.h"
@@ -76,44 +76,37 @@ struct PointerAccess {
   AccessState mode{AccessState::kRW};
 };
 
-class Runtime {
+class Runtime;
+
+class Device {
   // NOTE: assumed to be a ordered map so we can iterate in ascending order
   std::map<const void*, AllocationInfo> allocations_;
   std::map<Stream, TsanFiber> streams_;
-  std::map<Event, Stream> events_;
   TsanFiber cpu_fiber_;
   TsanFiber curr_fiber_;
-  bool init_ = false;
 
  public:
   static constexpr Stream kDefaultStream = Stream();
   Recorder stats_recorder;
 
-  static Runtime& get() {
-    static Runtime run_t;
-    if (!run_t.init_) {
-#ifdef CUSAN_FIBERPOOL
-      TsanFiberPoolInit();
-#endif
-      run_t.cpu_fiber_  = TsanGetCurrentFiber();
-      run_t.curr_fiber_ = run_t.cpu_fiber_;
-
-      // default '0' cuda stream
-      { run_t.register_stream(kDefaultStream); }
-
-      run_t.init_ = true;
-    }
-    return run_t;
+  Device() : stats_recorder() {
+    // every device has a default stream
+    { register_stream(Device::kDefaultStream); }
+    cpu_fiber_ = TsanGetCurrentFiber();
   }
 
-  Runtime(const Runtime&) = delete;
-
-  void operator=(const Runtime&) = delete;
+  bool operator==(const Device& other) const {
+    return curr_fiber_ == other.curr_fiber_;
+  }
 
-  [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() const {
+  [[nodiscard]] const std::map<const void*, AllocationInfo>& get_allocations() {
     return allocations_;
   }
 
+  [[nodiscard]] TsanFiber get_stream_fiber(Stream stream) {
+    return streams_[stream];
+  }
+
   void happens_before() {
     LOG_TRACE("[cusan]    HappensBefore of curr fiber")
     TsanHappensBefore(curr_fiber_);
@@ -122,10 +115,7 @@ class Runtime {
 
   void switch_to_cpu() {
     LOG_TRACE("[cusan]    Switch to cpu")
-    // if we where one a default stream we should also post sync
-    // meaning that all work submitted after from the cpu should also be run after the default kernels are done
-    // TODO: double check with blocking
-    auto search_result = streams_.find(Runtime::kDefaultStream);
+    auto search_result = streams_.find(Device::kDefaultStream);
     assert(search_result != streams_.end() && "Tried using stream that wasn't created prior");
     if (curr_fiber_ == search_result->second) {
       LOG_TRACE("[cusan]        syncing all other blocking GPU streams to run after since its default stream")
@@ -195,19 +185,6 @@ class Runtime {
     stats_recorder.inc_TsanHappensAfter();
   }
 
-  void record_event(Event event, Stream stream) {
-    LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
-    events_[event] = stream;
-  }
-
-  // Sync the event on the current stream
-  void sync_event(Event event) {
-    auto search_result = events_.find(event);
-    assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior");
-    LOG_TRACE("[cusan]    Sync event: " << event << " recorded on stream:" << events_[event].handle)
-    happens_after_stream(events_[event]);
-  }
-
   void insert_allocation(void* ptr, AllocationInfo info) {
     assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times");
     allocations_[ptr] = info;
@@ -237,14 +214,11 @@ class Runtime {
     return &res->second;
   }
 
- private:
-  Runtime() = default;
-
-  ~Runtime() {
+  void output_statistics() {
 #undef cusan_stat_handle
 #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name()));
-#if CUSAN_SOFTCOUNTER
-    Table table{"Cusan runtime statistics"};
+#ifdef CUSAN_SOFTCOUNTER
+    Table table{"Cusan device statistics"};
 #ifdef CUSAN_FIBERPOOL
     table.put(Row::make("Fiberpool", 1));
 #else
@@ -262,14 +236,94 @@ class Runtime {
 #endif
 #undef cusan_stat_handle
 #undef CUSAN_CUDA_EVENT_LIST
+  }
+};
 
-#ifdef CUSAN_FIBERPOOL
-    // TsanFiberPoolFini();
+class Runtime {
+  std::map<int32_t, Device> devices_;
+  std::map<Event, std::pair<TsanFiber, Device*>> events_;
+  int32_t current_device_;
+  bool init_;
+#ifdef CUSAN_SOFTCOUNTER
+  softcounter::AtomicCounter device_switches = 0;
 #endif
+ public:
+  static Runtime& get() {
+    static Runtime run_t;
+    if (!run_t.init_) {
+      run_t.current_device_ = get_current_device_id();
+      run_t.devices_[run_t.current_device_];
+      run_t.init_ = true;
+    }
+    return run_t;
+  }
+
+  Runtime(const Runtime&) = delete;
+
+  void operator=(const Runtime&) = delete;
+
+#ifdef CUSAN_SOFTCOUNTER
+  inline void inc_device_switches() {
+    this->device_switches++;
+  }
+  inline softcounter::Counter get_device_switches() {
+    return this->device_switches;
+  }
+#endif
+
+  Device& get_current_device() {
+    return devices_.at(current_device_);
+  }
+
+  Device& get_device(DeviceID id) {
+    if (devices_.find(id) == devices_.end()) {
+      devices_[id];
+    }
+    return devices_.at(id);
+  }
+
+  void set_device(DeviceID device) {
+    if (devices_.find(device) == devices_.end()) {
+      devices_[device];
+    }
+#ifdef CUSAN_SOFTCOUNTER
+    if (current_device_ != device) {
+      inc_device_switches();
+    }
+#endif
+    current_device_ = device;
   }
-};
 
-cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from);
+  void record_event(Event event, Stream stream) {
+    LOG_TRACE("[cusan]    Record event: " << event << " stream:" << stream.handle);
+    auto& current_device = get_current_device();
+    auto search_result   = current_device.get_stream_fiber(stream);
+    events_.insert({event, {search_result, &current_device}});
+  }
+
+  // Sync the event on the current stream
+  void sync_event(Event event) {
+    auto [stream_fiber, device] = events_[event];
+    TsanHappensAfter(stream_fiber);
+    device->stats_recorder.inc_TsanHappensAfter();
+  }
+
+ private:
+  Runtime() = default;
+
+  ~Runtime() {
+#ifdef CUSAN_SOFTCOUNTER
+    for (auto& [_, device] : devices_) {
+      device.output_statistics();
+    }
+
+    Table table{"Cusan runtime statistics"};
+    table.put(Row::make("Device Switches ", get_device_switches()));
+    table.print(std::cout);
+
+#endif
+  }
+};
 
 }  // namespace cusan::runtime
 
@@ -277,7 +331,7 @@ using namespace cusan::runtime;
 
 namespace helper {
 #ifndef CUSAN_TYPEART
-inline std::optional<size_t> find_memory_alloc_size(const Runtime& runtime, const void* ptr) {
+inline std::optional<size_t> find_memory_alloc_size(Device& runtime, const void* ptr) {
   const auto& allocs = runtime.get_allocations();
 
   // if there exists any allocation
@@ -304,7 +358,7 @@ inline std::optional<size_t> find_memory_alloc_size(const Runtime& runtime, cons
   return {};
 }
 #else
-inline std::optional<size_t> find_memory_alloc_size(const Runtime&, const void* ptr) {
+inline std::optional<size_t> find_memory_alloc_size(const Device&, const void* ptr) {
   size_t alloc_size{0};
   int alloc_id{0};
   auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size);
@@ -323,7 +377,7 @@ inline std::optional<size_t> find_memory_alloc_size(const Runtime&, const void*
 
 void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream) {
   LOG_TRACE("[cusan]Kernel Register with " << n << " Args and on stream:" << stream)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
 
   llvm::SmallVector<size_t, 4> sizes;
   for (int i = 0; i < n; ++i) {
@@ -372,7 +426,7 @@ void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream s
 
 void _cusan_sync_device() {
   LOG_TRACE("[cusan]Sync Device\n")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_sync_device_calls();
   runtime.happens_after_all_streams();
 }
@@ -380,13 +434,14 @@ void _cusan_sync_device() {
 void _cusan_event_record(Event event, RawStream stream) {
   LOG_TRACE("[cusan]Event Record")
   auto& runtime = Runtime::get();
-  runtime.stats_recorder.inc_event_record_calls();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_event_record_calls();
   runtime.record_event(event, Stream(stream));
 }
 
 void _cusan_sync_stream(RawStream raw_stream) {
   LOG_TRACE("[cusan]Sync Stream" << raw_stream)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_sync_stream_calls();
   const auto stream = Stream(raw_stream);
   if (stream.isDefaultStream()) {
@@ -401,20 +456,21 @@ void _cusan_sync_stream(RawStream raw_stream) {
 void _cusan_sync_event(Event event) {
   LOG_TRACE("[cusan]Sync Event" << event)
   auto& runtime = Runtime::get();
-  runtime.stats_recorder.inc_sync_event_calls();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_sync_event_calls();
   runtime.sync_event(event);
 }
 
 void _cusan_create_event(Event*) {
   LOG_TRACE("[cusan]create event")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_create_event_calls();
 }
 
-void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) {
+void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags) {
   LOG_TRACE("[cusan]create stream with flags: " << flags
                                                 << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking))
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_create_stream_calls();
   runtime.register_stream(Stream(*stream, !(bool)(flags & cusan_StreamFlagsNonBlocking)));
 }
@@ -422,18 +478,19 @@ void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) {
 void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) {
   LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event)
   auto& runtime = Runtime::get();
-  runtime.stats_recorder.inc_stream_wait_event_calls();
-  runtime.switch_to_stream(Stream(stream));
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_stream_wait_event_calls();
+  device.switch_to_stream(Stream(stream));
   runtime.sync_event(event);
-  runtime.happens_before();
-  runtime.switch_to_cpu();
+  device.happens_before();
+  device.switch_to_cpu();
 }
 
 void _cusan_host_alloc(void** ptr, size_t size, unsigned int) {
   // at least based of this presentation and some comments in the cuda forums this syncs the whole device
   //  https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf
   LOG_TRACE("[cusan]host alloc " << *ptr << " with size " << size)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_alloc_calls();
   // runtime.happens_after_all_streams();
 
@@ -442,27 +499,27 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) {
 
 void _cusan_host_free(void* ptr) {
   LOG_TRACE("[cusan]host free")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_free_calls();
   runtime.free_allocation(ptr);
 }
 
 void _cusan_host_register(void* ptr, size_t size, unsigned int) {
   LOG_TRACE("[cusan]host register " << ptr << " with size:" << size);
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_register_calls();
   runtime.insert_allocation(ptr, AllocationInfo::Pinned(size));
 }
 void _cusan_host_unregister(void* ptr) {
   LOG_TRACE("[cusan]host unregister " << ptr);
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_host_unregister_calls();
   runtime.free_allocation(ptr);
 }
 
 void _cusan_managed_alloc(void** ptr, size_t size, unsigned int) {
   LOG_TRACE("[cusan]Managed host alloc " << *ptr << " with size " << size << " -> implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_managed_alloc_calls();
   runtime.happens_after_all_streams();
   runtime.insert_allocation(*ptr, AllocationInfo::Managed(size));
@@ -472,7 +529,7 @@ void _cusan_device_alloc(void** ptr, size_t size) {
   // implicit syncs device
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator
   LOG_TRACE("[cusan]Device alloc " << *ptr << " with size " << size << " -> implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_device_alloc_calls();
 
   runtime.insert_allocation(*ptr, AllocationInfo::Device(size));
@@ -483,37 +540,48 @@ void _cusan_device_free(void* ptr) {
   // implicit syncs device
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator
   LOG_TRACE("[cusan]Device free " << ptr << " -> TODO maybe implicit device sync")
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_device_free_calls();
   runtime.happens_after_all_streams();
 }
 
-// TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_stream_query(RawStream stream, unsigned int err) {
   LOG_TRACE("[cusan] Stream query " << stream << " -> " << err)
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_stream_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
-
     runtime.happens_after_stream(Stream{stream});
   }
 }
 
-// TODO: get rid of cudaSpecifc check for cudaSuccess 0
 void _cusan_event_query(Event event, unsigned int err) {
   LOG_TRACE("[cusan] Event query " << event << " -> " << err)
   auto& runtime = Runtime::get();
-  runtime.stats_recorder.inc_event_query_calls();
+  auto& device  = runtime.get_current_device();
+  device.stats_recorder.inc_event_query_calls();
   if (err == 0) {
     LOG_TRACE("[cusan]    syncing")
     runtime.sync_event(event);
   }
 }
 
+void _cusan_set_device(DeviceID device) {
+  auto& r = Runtime::get();
+  r.set_device(device);
+}
+
+void _cusan_choose_device(DeviceID* device) {
+  // does this function ever return a nullptr?
+  // and what would that mean
+  assert(device);
+  auto& r = Runtime::get();
+  r.set_device(*device);
+}
+
 void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {
   // The Async versions are always asynchronous with respect to the host.
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memset_async_calls();
   runtime.switch_to_stream(Stream(stream));
   TsanMemoryWritePC(target, count, __builtin_return_address(0));
@@ -524,9 +592,9 @@ void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) {
 void _cusan_memset_impl(void* target, size_t count) {
   // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host
   // memory.
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memset_calls();
-  runtime.switch_to_stream(Runtime::kDefaultStream);
+  runtime.switch_to_stream(Device::kDefaultStream);
   LOG_TRACE("[cusan]    "
             << "Write to " << target << " with size: " << count)
   TsanMemoryWritePC(target, count, __builtin_return_address(0));
@@ -539,7 +607,7 @@ void _cusan_memset_impl(void* target, size_t count) {
   if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) {
     LOG_TRACE("[cusan]    "
               << "Memset is blocking")
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else {
     LOG_TRACE("[cusan]    "
               << "Memset is not blocking")
@@ -553,10 +621,10 @@ void _cusan_memset_impl(void* target, size_t count) {
   // r.happens_after_stream(Runtime::default_stream));
 }
 
-void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind) {
+void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind) {
   _cusan_memset_impl(target, pitch * height);
 }
-void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind, RawStream stream) {
+void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind, RawStream stream) {
   _cusan_memset_async_impl(target, pitch * height, stream);
 }
 
@@ -571,8 +639,8 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) {
 }
 
 void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size,
-                              cusan_MemcpyKind kind, RawStream stream) {
-  auto& runtime = Runtime::get();
+                              cusan_memcpy_kind kind, RawStream stream) {
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memcpy_async_calls();
   if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) {
     // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the
@@ -603,30 +671,30 @@ void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from,
   }
 }
 
-void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind) {
+void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_memcpy_kind kind) {
   // TODO verify that the memcpy2d beheaviour is actually the same as normal memcpy
 
   if (kind == cusan_MemcpyDefault) {
     kind = infer_memcpy_direction(target, from);
   }
 
-  auto& runtime = Runtime::get();
+  auto& runtime = Runtime::get().get_current_device();
   runtime.stats_recorder.inc_memcpy_calls();
   if (CUSAN_SYNC_DETAIL_LEVEL == 0) {
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
     // In this mode: Memcpy always blocks, no detailed view w.r.t. memory direction
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyDeviceToDevice) {
     // 4. For transfers from device memory to device memory, no host-side synchronization is performed.
     LOG_TRACE("[cusan]   DefaultStream")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
@@ -636,14 +704,14 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy
     // has completed.
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyHostToDevice) {
     // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is
     // initiated.
@@ -651,7 +719,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     auto* alloc_info = runtime.get_allocation_info(from);
     // if we couldn't find alloc info we just assume the worst and don't sync
     if (alloc_info && !alloc_info->is_pinned) {
-      runtime.happens_after_stream(Runtime::kDefaultStream);
+      runtime.happens_after_stream(Device::kDefaultStream);
       LOG_TRACE("[cusan]   DefaultStream+Blocking")
     } else {
       LOG_TRACE("[cusan]   DefaultStream")
@@ -660,20 +728,20 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
     //   device memory
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryWrite();
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
   } else if (kind == cusan_MemcpyHostToHost) {
     // 5. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the
     // host.
     LOG_TRACE("[cusan]   DefaultStream+Blocking")
-    runtime.switch_to_stream(Runtime::kDefaultStream);
+    runtime.switch_to_stream(Device::kDefaultStream);
     runtime.happens_before();
     runtime.switch_to_cpu();
-    runtime.happens_after_stream(Runtime::kDefaultStream);
+    runtime.happens_after_stream(Device::kDefaultStream);
     TsanMemoryReadPC(from, read_size, __builtin_return_address(0));
     runtime.stats_recorder.inc_TsanMemoryRead();
     TsanMemoryWritePC(target, write_size, __builtin_return_address(0));
@@ -684,7 +752,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_
 }
 
 void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                            cusan_MemcpyKind kind, RawStream stream) {
+                            cusan_memcpy_kind kind, RawStream stream) {
   LOG_TRACE("[cusan]MemcpyAsync" << width * height << " bytes to:" << target)
 
   size_t read_size  = spitch * height;
@@ -692,20 +760,35 @@ void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_
   _cusan_memcpy_async_impl(target, write_size, from, read_size, kind, stream);
 }
 
-void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) {
+void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream) {
   LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target)
   _cusan_memcpy_async_impl(target, count, from, count, kind, stream);
 }
 
 void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                      cusan_MemcpyKind kind) {
+                      cusan_memcpy_kind kind) {
   LOG_TRACE("[cusan]Memcpy2d " << width * height << " from:" << from << " to:" << target);
   size_t read_size  = spitch * height;
   size_t write_size = dpitch * height;
   _cusan_memcpy_impl(target, write_size, from, read_size, kind);
 }
 
-void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) {
+void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind kind) {
   LOG_TRACE("[cusan]Memcpy " << count << " from:" << from << " to:" << target);
   _cusan_memcpy_impl(target, count, from, count, kind);
 }
+
+void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) {
+  LOG_TRACE("[cusan]Callback");
+  // switch (type) {
+  //   case cusan_Device:
+  //     printf("Device sync return value %i\n", return_value);
+  //     break;
+  //   case cusan_Stream:
+  //     printf("Stream %#x sync return value %i\n", event_or_stream, return_value);
+  //     break;
+  //   case cusan_Event:
+  //     printf("Event %#x sync return value %i\n", event_or_stream, return_value);
+  //     break;
+  // }
+}
diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h
index 8dcaef2..048e566 100644
--- a/lib/runtime/CusanRuntime.h
+++ b/lib/runtime/CusanRuntime.h
@@ -8,12 +8,37 @@
 #define LIB_RUNTIME_CUSAN_H_
 #include <cstddef>
 
+#ifdef __cplusplus
+
+extern "C" {
+#endif
+
+typedef enum cusan_memcpy_kind_t : unsigned int {
+  cusan_MemcpyHostToHost     = 0,
+  cusan_MemcpyHostToDevice   = 1,
+  cusan_MemcpyDeviceToHost   = 2,
+  cusan_MemcpyDeviceToDevice = 3,
+  cusan_MemcpyDefault        = 4,
+} cusan_memcpy_kind;
+
+typedef enum cusan_stream_create_flags_t : unsigned int {
+  cusan_StreamFlagsDefault     = 0,
+  cusan_StreamFlagsNonBlocking = 1,
+} cusan_stream_create_flags;
+#ifdef __cplusplus
+}
+#endif
+
 #ifdef __cplusplus
 namespace cusan::runtime {
 using TsanFiber = void*;
 using Event     = const void*;
 using RawStream = const void*;
+using DeviceID  = int;
+cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from);
+DeviceID get_current_device_id();
 }  // namespace cusan::runtime
+using cusan::runtime::DeviceID;
 using cusan::runtime::Event;
 using cusan::runtime::RawStream;
 using cusan::runtime::TsanFiber;
@@ -21,6 +46,7 @@ using cusan::runtime::TsanFiber;
 #define TsanFiber void*
 #define Event const void*
 #define RawStream const void*
+#define DeviceID int
 #endif
 
 #ifdef __cplusplus
@@ -28,40 +54,36 @@ using cusan::runtime::TsanFiber;
 extern "C" {
 #endif
 
-typedef enum cusan_memcpy_kind_t : unsigned int {
-  cusan_MemcpyHostToHost     = 0,
-  cusan_MemcpyHostToDevice   = 1,
-  cusan_MemcpyDeviceToHost   = 2,
-  cusan_MemcpyDeviceToDevice = 3,
-  cusan_MemcpyDefault        = 4,
-} cusan_MemcpyKind;
-
-typedef enum cusan_stream_create_flags_t : unsigned int {
-  cusan_StreamFlagsDefault     = 0,
-  cusan_StreamFlagsNonBlocking = 1,
-} cusan_StreamCreateFlags;
-
 void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream);
+
 void _cusan_sync_device();
-void _cusan_event_record(Event event, RawStream stream);
+void _cusan_set_device(DeviceID device);
+void _cusan_choose_device(DeviceID* device);
+
 void _cusan_sync_stream(RawStream stream);
+void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags);
+void _cusan_stream_query(RawStream stream, unsigned int err);
+
 void _cusan_sync_event(Event event);
-void _cusan_stream_event(Event event);
+void _cusan_event_record(Event event, RawStream stream);
 void _cusan_create_event(RawStream* event);
-void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags);
-void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream);
-void _cusan_memset_async(void* target, size_t count, RawStream stream);
-void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind);
+void _cusan_event_query(Event event, unsigned int err);
+void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+
+void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind);
+void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream);
 void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                      cusan_MemcpyKind);
+                      cusan_memcpy_kind);
 void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height,
-                            cusan_MemcpyKind, RawStream stream);
-void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind);
-void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind,
-                            RawStream stream);
+                            cusan_memcpy_kind, RawStream stream);
+
 void _cusan_memset(void* target, size_t count);
-void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
-void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags);
+void _cusan_memset_async(void* target, size_t count, RawStream stream);
+void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind);
+void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind,
+                            RawStream stream);
+
 void _cusan_host_alloc(void** ptr, size_t size, unsigned int flags);
 void _cusan_host_free(void* ptr);
 void _cusan_managed_alloc(void** ptr, size_t size, unsigned int flags);
@@ -70,8 +92,15 @@ void _cusan_host_register(void* ptr, size_t size, unsigned int flags);
 void _cusan_host_unregister(void* ptr);
 void _cusan_device_alloc(void** ptr, size_t size);
 void _cusan_device_free(void* ptr);
-void _cusan_stream_query(RawStream stream, unsigned int err);
-void _cusan_event_query(Event event, unsigned int err);
+
+typedef enum cusan_sync_type_t : unsigned char {
+  cusan_Device = 0, //second argument is a nullptr
+  cusan_Stream = 1, //second argument is the stream pointer
+  cusan_Event  = 2, //second argument is the event pointer
+} cusan_sync_type;
+
+void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event or stream*/, unsigned int /*return_value*/);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp
index 6958db3..970401d 100644
--- a/lib/runtime/CusanRuntime_cudaSpecific.cpp
+++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp
@@ -10,7 +10,14 @@
 #include <cuda_runtime_api.h>
 
 namespace cusan::runtime {
-cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) {
+
+DeviceID get_current_device_id() {
+  DeviceID res;
+  cudaGetDevice(&res);
+  return res;
+}
+
+cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from) {
   cudaDeviceProp prop;
   cudaGetDeviceProperties(&prop, 0);
   assert(prop.unifiedAddressing && "Can only use default direction for memcpy when Unified memory is supported.");
diff --git a/lib/runtime/StatsCounter.h b/lib/runtime/StatsCounter.h
index 2b00377..5e66b6e 100644
--- a/lib/runtime/StatsCounter.h
+++ b/lib/runtime/StatsCounter.h
@@ -107,9 +107,9 @@ class NoneRecorder {
   Statistics stats_r;
   CUSAN_CUDA_EVENT_LIST
 #include "TsanEvents.inc"
-  void inc_TsanMemoryReadCount(unsigned count) {
+  void inc_TsanMemoryReadCount(unsigned) {
   }
-  void inc_TsanMemoryWriteCount(unsigned count) {
+  void inc_TsanMemoryWriteCount(unsigned) {
   }
 };
 
diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h
index 88d0f48..3bfe367 100644
--- a/lib/runtime/TSanInterface.h
+++ b/lib/runtime/TSanInterface.h
@@ -1,6 +1,11 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wformat"
-#pragma GCC diagnostic ignored "-Wmacro-redefined"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#pragma clang diagnostic ignored "-Wformat-pedantic"
+#endif
 
 #include "TSan_External.h"
 
diff --git a/lib/support/Util.h b/lib/support/Util.h
index 54014a9..081c6fc 100644
--- a/lib/support/Util.h
+++ b/lib/support/Util.h
@@ -34,7 +34,7 @@ inline std::string demangle(String&& s) {
 #if LLVM_VERSION_MAJOR >= 15
   auto demangle = llvm::itaniumDemangle(name.data(), false);
 #else
-  auto* demangle      = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr);
+  auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr);
 #endif
   if (demangle && !std::string(demangle).empty()) {
     return {demangle};
diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt
index 74d956c..6dffaf8 100644
--- a/scripts/CMakeLists.txt
+++ b/scripts/CMakeLists.txt
@@ -92,7 +92,7 @@ function(configure_cusan_script input output)
 endfunction()
 
 function(cusan_find_mpi_vendor_helper symbol ret_value)
-  find_package(MPI)
+  find_package(MPI QUIET)
 
   if(NOT MPI_FOUND)
     set(${ret_value} false)
@@ -175,7 +175,6 @@ if(NOT CUSAN_TYPEART)
   )
 endif()
 
-find_package(MPI)
 if(MPI_C_FOUND)
   configure_cusan_script(
     ${CUSAN_WRAPPER} cusan-mpicc${CMAKE_DEBUG_POSTFIX}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index db0f1df..42e693a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -36,6 +36,8 @@ function(configure_cusan_lit_site_cfg input output)
 
   pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND)
   pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART)
+  pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI)
+  pythonize_bool(${CUSAN_DEVICE_SYNC_CALLBACKS} CUSAN_HAS_SYNC_CALLBACKS)
 
   cusan_target_generate_file(${input} ${output})
 endfunction()
@@ -75,7 +77,7 @@ function(cusan_add_lit_target)
       set(TARGET_NAME check-cusan-${suite})
     endif()
 
-    if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging")
+    if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging" AND NOT ${suite} STREQUAL "multi_gpu")
       message(WARNING "Could not find suitable lit test target at ${SUITE_PATH}")
       continue()
     endif()
@@ -90,16 +92,32 @@ function(cusan_add_lit_target)
   endforeach()
 endfunction()
 
+
+if(NOT MPI_FOUND)
+  # mock target to make test generation work
+  add_library(cusan_mpi_interceptor_mock SHARED IMPORTED)
+  set_target_properties(
+    cusan_mpi_interceptor_mock
+    PROPERTIES
+    OUTPUT_NAME "CusanMPIInterceptorPlaceholder"
+    IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/lib/runtime"
+  )
+  set(_cusan_interceptor_target cusan_mpi_interceptor_mock)
+  add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor_mock)
+endif()
+
 set(CUSAN_TEST_DEPENDS
   cusan::TransformPass
-  cusan::MPI_Interceptor
   cusan::Runtime
   cusan::Analysis
-  
 )
 
+if(FOUND_MPI)
+  list(APPEND CUSAN_TEST_DEPENDS cusan::MPI_Interceptor)
+endif()
+
 if(CUSAN_TYPEART)
-list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass)
+  list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass)
 endif()
 
 set(CUSAN_SUITES
@@ -108,6 +126,7 @@ set(CUSAN_SUITES
   pass
   kernel_analysis
   staging
+  multi_gpu
 )
 
 include(ProcessorCount)
@@ -124,6 +143,7 @@ set(CUSAN_SUITES_WORKERS
   ${NUM_CPU}
   ${NUM_CPU}
   1
+  1
 )
 
 cusan_add_lit_target(SUITES ${CUSAN_SUITES} WORKERS ${CUSAN_SUITES_WORKERS})
diff --git a/test/kernel_analysis/03_struct_write.c b/test/kernel_analysis/03_struct_write.c
index c001e03..a6edeef 100644
--- a/test/kernel_analysis/03_struct_write.c
+++ b/test/kernel_analysis/03_struct_write.c
@@ -22,6 +22,8 @@
 // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite
 // CHECK-NOT: Handling Arg:
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/kernel_analysis/08_big_struct_write.c b/test/kernel_analysis/08_big_struct_write.c
index 5d34cd5..3df5664 100644
--- a/test/kernel_analysis/08_big_struct_write.c
+++ b/test/kernel_analysis/08_big_struct_write.c
@@ -22,6 +22,8 @@
 // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite
 // CHECK-NOT: Handling Arg:
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/lit.cfg b/test/lit.cfg
index 9836a64..bfea736 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -43,10 +43,18 @@ else:
 if config.cusan_typeart:
   config.available_features.add('typeart')
 
+if config.mpiexec:
+  config.available_features.add('mpiexec')
+if config.cusan_mpi:
+  config.available_features.add('mpi')
+
+if config.cusan_sync_callback:
+  config.available_features.add('sync_callback')
+
 config.substitutions.append(("%clang-cpp", clang_cpp))
 config.substitutions.append(("%clang-cc", clang_cc))
 config.substitutions.append(("%opt", opt))
-config.substitutions.append(("%filecheck", filecheck + " --dump-input fail"))
+config.substitutions.append(("%filecheck", filecheck + " --dump-input fail --dump-input-context=40"))
 
 config.substitutions.append(("%lib_dir", cusan_lib_dir))
 config.substitutions.append(("%pass_dir", cusan_pass_dir))
@@ -77,4 +85,4 @@ config.substitutions.append(('%clang_args', '-D__STRICT_ANSI__ -fPIC -O1 -g0'))
 
 config.substitutions.append(('%rm-file', 'rm -f '))
 
-config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -O1 -c'))
+config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -std=c++17 -O1 -c'))
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 70030c3..14e1304 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -31,6 +31,8 @@ config.cusan_pass = "$<TARGET_FILE_NAME:cusan::TransformPass>"
 config.cusan_mpi_interceptor = "$<TARGET_FILE:cusan::MPI_Interceptor>"
 config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@
 config.cusan_typeart = @CUSAN_HAS_TYPEART@
+config.cusan_mpi = @CUSAN_HAS_MPI@
+config.cusan_sync_callback = @CUSAN_HAS_SYNC_CALLBACKS@
 
 # Let the main config do the real work.
 config.loaded_site_config = True
diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c
new file mode 100644
index 0000000..52747c3
--- /dev/null
+++ b/test/multi_gpu/01_device_sync.c
@@ -0,0 +1,85 @@
+// clang-format off
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: multigpu
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(delay);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  if (tid < N) {
+    arr[tid] = (tid + 1);
+  }
+}
+
+int main() {
+  const int size            = 256;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid   = (size + threadsPerBlock - 1) / threadsPerBlock;
+  int* managed_data;
+  int* managed_data2;
+
+  int n_devices = 0;
+  cudaGetDeviceCount(&n_devices);
+  if (n_devices < 2) {
+    printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n");
+    return 1;
+  }
+
+  cudaSetDevice(0);
+  cudaMallocManaged(&managed_data, size * sizeof(int));
+  cudaMemset(managed_data, 0, size * sizeof(int));
+
+  cudaSetDevice(1);
+  cudaMallocManaged(&managed_data2, size * sizeof(int));
+  cudaMemset(managed_data2, 0, size * sizeof(int));
+
+  cudaSetDevice(0);
+  write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data, size, 1316134912);
+
+  // if we only have the later synchronize we will only synchronize the second device
+#ifdef CUSAN_SYNC
+  cudaDeviceSynchronize();
+#endif
+
+  cudaSetDevice(1);
+  write_kernel_delay<<<blocksPerGrid, threadsPerBlock>>>(managed_data2, size, 1);
+  cudaDeviceSynchronize();
+
+  for (int i = 0; i < size; i++) {
+    if (managed_data[i] == 0) {
+      printf("[Error] sync managed_data %i\n", managed_data[i]);
+      break;
+    }
+  }
+  for (int i = 0; i < size; i++) {
+    if (managed_data2[i] == 0) {
+      printf("[Error] sync managed_data2 %i\n", managed_data[i]);
+      break;
+    }
+  }
+
+  cudaFree(managed_data);
+  cudaFree(managed_data2);
+  return 0;
+}
diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c
new file mode 100644
index 0000000..a72a7fe
--- /dev/null
+++ b/test/multi_gpu/02_device_sync_event.c
@@ -0,0 +1,75 @@
+// clang-format off
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: multigpu
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+__global__ void kernel(int* data) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(1000000U);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  data[tid] = (tid + 1);
+}
+
+int main() {
+  const int size            = 256;
+  const int threadsPerBlock = 256;
+  const int blocksPerGrid   = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int n_devices = 0;
+  cudaGetDeviceCount(&n_devices);
+  if (n_devices < 2) {
+    printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n");
+    return 1;
+  }
+
+  cudaSetDevice(0);
+
+  int* d_data;  // Unified Memory pointer
+
+  // Allocate Unified Memory
+  cudaMallocManaged(&d_data, size * sizeof(int));
+  cudaMemset(d_data, 0, size * sizeof(int));
+
+  cudaEvent_t endEvent;
+  cudaEventCreate(&endEvent);
+  kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data);
+  cudaEventRecord(endEvent);
+
+  cudaSetDevice(1);
+
+#ifdef CUSAN_SYNC
+  cudaEventSynchronize(endEvent);
+#endif
+
+  for (int i = 0; i < size; i++) {
+    if (d_data[i] < 1) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  cudaEventDestroy(endEvent);
+  cudaFree(d_data);
+
+  return 0;
+}
diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h
new file mode 100644
index 0000000..c7b5796
--- /dev/null
+++ b/test/multi_gpu/TSan_External.h
@@ -0,0 +1,276 @@
+/* Part of the MUST Project, under BSD-3-Clause License
+ * See https://hpc.rwth-aachen.de/must/LICENSE for license information.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+/**
+ * @file TSan_External.h
+ * Functions exported from ThreadSaniziter that can be used for dynamic annotations.
+ * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface.cpp
+ * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp
+ */
+
+#ifndef TSAN_EXTERNAL_H
+#define TSAN_EXTERNAL_H
+
+typedef unsigned long uptr;
+
+typedef enum { mo_relaxed, mo_consume, mo_acquire, mo_release, mo_acq_rel, mo_seq_cst } morder;
+
+typedef unsigned char a8;
+typedef unsigned short a16;
+typedef unsigned int a32;
+typedef unsigned long long a64;
+
+#ifdef MUST_DEBUG
+// Print an error message *once* if an annotation function is used that is not overwritten by the
+// TSan runtime
+#define FALLBACK_PRINT(func_name)                                                  \
+  {                                                                                \
+    static bool once = false;                                                      \
+    if (!once) {                                                                   \
+      printf(                                                                      \
+          "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \
+          "signature\n",                                                           \
+          func_name);                                                              \
+      once = true;                                                                 \
+    }                                                                              \
+  }
+#else
+#define FALLBACK_PRINT(func_name)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ThreadSanitizer defines the following functions that can be used in MUST for dynamic annotations.
+void __attribute__((weak)) AnnotateHappensAfter(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateHappensBefore(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateNewMemory(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateMemoryRead(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateMemoryWrite(const char* file, int line, const volatile void* cv, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateIgnoreReadsBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreReadsEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreWritesBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreSyncBegin(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateIgnoreSyncEnd(const char* file, int line) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) __tsan_read_range(void* addr, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_write_range(void* addr, uptr size) {
+  FALLBACK_PRINT(__func__);
+}
+
+void __attribute__((weak)) __tsan_read1_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write1_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read2_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write2_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read4_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write4_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_read8_pc(void* addr, void* pc) {
+}
+void __attribute__((weak)) __tsan_write8_pc(void* addr, void* pc) {
+}
+
+#define annotateHelper(rw, s)                                            \
+  if (len >= s) {                                                        \
+    printf("annotateHelper(%s, %i, %p, %li)\n", #rw, s, addr, (uptr)pc); \
+    len -= s;                                                            \
+    __tsan_##rw##s##_pc(addr, pc);                                       \
+    addr += s;                                                           \
+    size -= s;                                                           \
+  }
+
+void __attribute__((weak)) __tsan_read_range_pc(void* a, uptr size, void* pc) {
+  char* addr = (char*)a;
+  uptr len   = ((uptr)addr) % 8;
+  if (size < len)
+    len = size;
+  annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) for (; size > 7; size -= 8) {
+    __tsan_read8_pc(addr, pc);
+    addr += 8;
+  }
+  len = size;
+  annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1)
+}
+
+void __attribute__((weak)) __tsan_write_range_pc(void* a, uptr size, void* pc) {
+  char* addr = (char*)a;
+  uptr len   = ((uptr)addr) % 8;
+  // printf("__tsan_write_range_pc(%p, %li, %li), %li\n", addr, size, pc, len);
+  if (size < len)
+    len = size;
+  annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) for (; size > 7; size -= 8) {
+    // printf("__tsan_write8_pc(%p, %li)\n", addr, pc);
+    __tsan_write8_pc(addr, pc);
+    addr += 8;
+  }
+  len = size;
+  annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1)
+}
+
+void __attribute__((weak)) __tsan_func_entry(void* call_pc) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_func_exit() {
+  FALLBACK_PRINT(__func__);
+}
+
+a8 __attribute__((weak)) __tsan_atomic8_load(const volatile a8* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a16 __attribute__((weak)) __tsan_atomic16_load(const volatile a16* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a32 __attribute__((weak)) __tsan_atomic32_load(const volatile a32* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+a64 __attribute__((weak)) __tsan_atomic64_load(const volatile a64* a, morder mo) {
+  FALLBACK_PRINT(__func__);
+  return 0;
+}
+void __attribute__((weak)) __tsan_atomic8_store(volatile a8* a, a8 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic16_store(volatile a16* a, a16 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic32_store(volatile a32* a, a32 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_atomic64_store(volatile a64* a, a64 v, morder mo) {
+  FALLBACK_PRINT(__func__);
+}
+
+// TLC extension
+void __attribute__((weak)) AnnotateInitTLC(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+  AnnotateHappensBefore(file, line, cv);
+}
+void __attribute__((weak)) AnnotateStartTLC(const char* file, int line, const volatile void* cv) {
+  FALLBACK_PRINT(__func__);
+  AnnotateHappensAfter(file, line, cv);
+}
+
+// Fibers
+void __attribute__((weak)) * __tsan_get_current_fiber() {
+  FALLBACK_PRINT(__func__);
+  return nullptr;
+}
+void __attribute__((weak)) * __tsan_create_fiber(unsigned flags) {
+  FALLBACK_PRINT(__func__);
+  return nullptr;
+}
+void __attribute__((weak)) __tsan_destroy_fiber(void* fiber) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_switch_to_fiber(void* fiber, unsigned flags) {
+  FALLBACK_PRINT(__func__);
+}
+void __attribute__((weak)) __tsan_set_fiber_name(void* fiber, const char* name) {
+  FALLBACK_PRINT(__func__);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)
+#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)
+
+#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+#define TsanIgnoreReadsBegin() AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+#define TsanIgnoreReadsEnd() AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+#define TsanIgnoreSyncBegin() AnnotateIgnoreSyncBegin(__FILE__, __LINE__)
+#define TsanIgnoreSyncEnd() AnnotateIgnoreSyncEnd(__FILE__, __LINE__)
+
+#define TsanInitTLC(cv) AnnotateInitTLC(__FILE__, __LINE__, cv)
+#define TsanStartTLC(cv) AnnotateStartTLC(__FILE__, __LINE__, cv)
+
+#define TsanCreateFiber(flags) __tsan_create_fiber(flags)
+#define TsanDestroyFiber(fiber) __tsan_destroy_fiber(fiber)
+#define TsanSwitchToFiber(fiber, flags) __tsan_switch_to_fiber(fiber, flags)
+#define TsanGetCurrentFiber() __tsan_get_current_fiber()
+#define TsanSetFiberName(fiber, name) __tsan_set_fiber_name(fiber, name)
+
+#define TsanMemoryRead(addr, size) __tsan_read_range(addr, size)
+#define TsanMemoryWrite(addr, size) __tsan_write_range(addr, size)
+#define TsanMemoryReadPC(addr, size, pc) __tsan_read_range_pc(addr, size, pc)
+#define TsanMemoryWritePC(addr, size, pc) __tsan_write_range_pc(addr, size, pc)
+
+#define TsanFuncEntry(pc) __tsan_func_entry(pc)
+#define TsanFuncExit() __tsan_func_exit()
+
+#define TsanPCMemoryRead(pc, addr, size) AnnotatePCMemoryRead(pc, __FILE__, __LINE__, addr, size)
+#define TsanPCMemoryWrite(pc, addr, size) AnnotatePCMemoryWrite(pc, __FILE__, __LINE__, addr, size)
+
+#define TsanNewMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size)
+#define TsanFreeMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size)
+
+#define TsanRWLockCreate(cv) AnnotateRWLockCreate(__FILE__, __LINE__, cv)
+#define TsanRWLockDestroy(cv) AnnotateRWLockDestroy(__FILE__, __LINE__, cv)
+#define TsanRWLockAcquired(cv, is_w) AnnotateRWLockAcquired(__FILE__, __LINE__, cv, is_w)
+#define TsanRWLockReleased(cv, is_w) AnnotateRWLockReleased(__FILE__, __LINE__, cv, is_w)
+
+#define TsanAtomic8Load(a8, mo) __tsan_atomic8_load(a8, mo)
+#define TsanAtomic8Store(a8, v, mo) __tsan_atomic8_store(a8, v, mo)
+#define TsanAtomic16Load(a16, mo) __tsan_atomic16_load(a16, mo)
+#define TsanAtomic16Store(a16, v, mo) __tsan_atomic16_store(a16, v, mo)
+#define TsanAtomic32Load(a32, mo) __tsan_atomic32_load(a32, mo)
+#define TsanAtomic32Store(a32, v, mo) __tsan_atomic32_store(a32, v, mo)
+#define TsanAtomic64Load(a64, mo) __tsan_atomic64_load(a64, mo)
+#define TsanAtomic64Store(a64, v, mo) __tsan_atomic64_store(a64, v, mo)
+
+#endif /*TSAN_EXTERNAL_H*/
\ No newline at end of file
diff --git a/test/multi_gpu/lit.local.cfg b/test/multi_gpu/lit.local.cfg
new file mode 100644
index 0000000..0f360f0
--- /dev/null
+++ b/test/multi_gpu/lit.local.cfg
@@ -0,0 +1,16 @@
+import subprocess
+
+if config.cusan_use_workarounds:
+    config.environment['NEOReadDebugKeys'] = '1'
+    config.environment['DisableDeepBind'] = '1'
+    config.environment['OMPI_MCA_memory'] = '^patcher'
+
+def has_multigpu():
+    try:
+        subprocess.check_call(['nvidia-smi', '-i=1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except subprocess.CalledProcessError:
+        return False
+
+
+config.available_features.add('multigpu' if has_multigpu() else '')
\ No newline at end of file
diff --git a/test/multi_gpu/suppressions.txt b/test/multi_gpu/suppressions.txt
new file mode 100644
index 0000000..3292e71
--- /dev/null
+++ b/test/multi_gpu/suppressions.txt
@@ -0,0 +1,29 @@
+called_from_lib:libmpi_cxx*
+called_from_lib:libmpi.so*
+called_from_lib:libcuda.so*
+called_from_lib:libtypeartRuntime.so*
+
+# Probably not required, from previous experiments
+#called_from_lib:libucx*
+called_from_lib:libucp*
+called_from_lib:libucs*
+#called_from_lib:libucm_cuda.so*
+called_from_lib:libucm.so*
+called_from_lib:libuct_ib.so.0
+called_from_lib:libuct.so.0
+called_from_lib:mca_pmix_pmix3x*
+called_from_lib:libibverbs.so*
+#called_from_lib:libevent_pthreads-2.1.so*
+called_from_lib:libevent_core-2.1.so*
+#called_from_lib:libhfi1verbs-rdmav34.so*
+called_from_lib:libopen-pal.so.40*
+#called_from_lib:mca_errmgr_default_app.so*
+#called_from_lib:mca_gds_hash.so*
+#called_from_lib:mca_ptl_tcp.so*
+#called_from_lib:mca_bfrops_v20.so*
+#called_from_lib:mca_psec_munge.so*
+#called_from_lib:mca_gds_ds21.so*
+#called_from_lib:libmca_common_dstore.so*
+called_from_lib:libopen-rte.so.40*
+called_from_lib:ld-linux-x86-64.so*
+called_from_lib:libmlx5-rdma*
diff --git a/test/pass/03_cuda_to_mpi.c b/test/pass/03_cuda_to_mpi.c
index 82d6a98..5186403 100644
--- a/test/pass/03_cuda_to_mpi.c
+++ b/test/pass/03_cuda_to_mpi.c
@@ -2,11 +2,11 @@
 // RUN: %rm-file %t.yaml 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
-
-
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], 
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/04_mpi_to_cuda.c b/test/pass/04_mpi_to_cuda.c
index e27f41a..6196dcc 100644
--- a/test/pass/04_mpi_to_cuda.c
+++ b/test/pass/04_mpi_to_cuda.c
@@ -8,6 +8,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]],
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/05_cuda_to_mpi_stream.c b/test/pass/05_cuda_to_mpi_stream.c
index bdb304f..6a22f3e 100644
--- a/test/pass/05_cuda_to_mpi_stream.c
+++ b/test/pass/05_cuda_to_mpi_stream.c
@@ -10,6 +10,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/06_cuda_to_mpi_event.c b/test/pass/06_cuda_to_mpi_event.c
index 2253965..50bdd34 100644
--- a/test/pass/06_cuda_to_mpi_event.c
+++ b/test/pass/06_cuda_to_mpi_event.c
@@ -10,6 +10,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], 
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/07_cuda_to_mpi_read.c b/test/pass/07_cuda_to_mpi_read.c
index c1ce739..f379fb7 100644
--- a/test/pass/07_cuda_to_mpi_read.c
+++ b/test/pass/07_cuda_to_mpi_read.c
@@ -11,6 +11,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]],
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/08_cudamemcpy_to_mpi.c b/test/pass/08_cudamemcpy_to_mpi.c
index 3d1139c..1d553a2 100644
--- a/test/pass/08_cudamemcpy_to_mpi.c
+++ b/test/pass/08_cudamemcpy_to_mpi.c
@@ -15,6 +15,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // Tsan sometimes crashes with this test it seems
diff --git a/test/pass/11_cuda_to_mpi_struct_of_buff.c b/test/pass/11_cuda_to_mpi_struct_of_buff.c
index 2dcea96..3a2001d 100644
--- a/test/pass/11_cuda_to_mpi_struct_of_buff.c
+++ b/test/pass/11_cuda_to_mpi_struct_of_buff.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
diff --git a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
index cd00aae..8c590cc 100644
--- a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
+++ b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream
diff --git a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
index 1c5d299..248c3a6 100644
--- a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
+++ b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream
diff --git a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
index f86f3a0..871b3ab 100644
--- a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
+++ b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset
diff --git a/test/pass/21_chunked_streams_example.c b/test/pass/21_chunked_streams_example.c
index 13562f7..03774ce 100644
--- a/test/pass/21_chunked_streams_example.c
+++ b/test/pass/21_chunked_streams_example.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
diff --git a/test/pass/22_cuda_to_mpi_partial_buff_write.c b/test/pass/22_cuda_to_mpi_partial_buff_write.c
index c037c8a..2f23ae8 100644
--- a/test/pass/22_cuda_to_mpi_partial_buff_write.c
+++ b/test/pass/22_cuda_to_mpi_partial_buff_write.c
@@ -3,6 +3,8 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
+
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
 // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize
diff --git a/test/pass/26_malloc_pitch.c b/test/pass/26_malloc_pitch.c
index b71af1a..7a0a97c 100644
--- a/test/pass/26_malloc_pitch.c
+++ b/test/pass/26_malloc_pitch.c
@@ -6,6 +6,8 @@
 // {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]],
 // {{i8\*|ptr}} {{.*}}[[from]],
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/29_tsan_cuda_to_mpi.c b/test/pass/29_tsan_cuda_to_mpi.c
index 7608db6..9ab6fdc 100644
--- a/test/pass/29_tsan_cuda_to_mpi.c
+++ b/test/pass/29_tsan_cuda_to_mpi.c
@@ -3,6 +3,7 @@
 
 // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
+// REQUIRES: mpi
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy
diff --git a/test/pass/30_tsan_annotate_cuda_to_mpi.c b/test/pass/30_tsan_annotate_cuda_to_mpi.c
index 506eff1..335e372 100644
--- a/test/pass/30_tsan_annotate_cuda_to_mpi.c
+++ b/test/pass/30_tsan_annotate_cuda_to_mpi.c
@@ -8,6 +8,8 @@
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree
 // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free
 
+// REQUIRES: mpi
+
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/pass/31_tsan_cuda_event.c b/test/pass/31_tsan_cuda_event.c
index 918ba99..eb819ea 100644
--- a/test/pass/31_tsan_cuda_event.c
+++ b/test/pass/31_tsan_cuda_event.c
@@ -1,8 +1,7 @@
 // clang-format off
 // RUN: %rm-file %t.yaml 
 
-// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
-
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate
 // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event
@@ -15,8 +14,6 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
-
 #include <unistd.h>
 
 __global__ void kernel(int* arr, const int N) {  // CHECK-DAG: [[FILENAME]]:[[@LINE]]
@@ -27,10 +24,6 @@ __global__ void kernel(int* arr, const int N) {  // CHECK-DAG: [[FILENAME]]:[[@L
 }
 
 int main(int argc, char* argv[]) {
-  if (!has_gpu_aware_mpi()) {
-    printf("This example is designed for CUDA-aware MPI. Exiting.\n");
-    return 1;
-  }
   cudaEvent_t first_finished_event;
   cudaEventCreate(&first_finished_event);
   cudaStream_t stream1;
diff --git a/test/pass/32_tsan_async_copy.c b/test/pass/32_tsan_async_copy.c
index cc7c12f..d30220d 100644
--- a/test/pass/32_tsan_async_copy.c
+++ b/test/pass/32_tsan_async_copy.c
@@ -1,7 +1,7 @@
 // clang-format off
 // RUN: %rm-file %t.yaml 
 
-// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
 
 
 // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate 
diff --git a/test/pass/34_test_sync_callback.c b/test/pass/34_test_sync_callback.c
new file mode 100644
index 0000000..2015bfe
--- /dev/null
+++ b/test/pass/34_test_sync_callback.c
@@ -0,0 +1,14 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaDeviceSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaDeviceSynchronize();
+
+  return 0;
+}
diff --git a/test/pass/35_test_sync_callback_stream.c b/test/pass/35_test_sync_callback_stream.c
new file mode 100644
index 0000000..d8c5b68
--- /dev/null
+++ b/test/pass/35_test_sync_callback_stream.c
@@ -0,0 +1,15 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaStreamSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaStream_t stream = NULL;
+  cudaStreamSynchronize(stream);
+
+  return 0;
+}
diff --git a/test/pass/36_test_sync_callback_event.c b/test/pass/36_test_sync_callback_event.c
new file mode 100644
index 0000000..6ee4679
--- /dev/null
+++ b/test/pass/36_test_sync_callback_event.c
@@ -0,0 +1,14 @@
+// clang-format off
+// RUN: %rm-file %t.yaml 
+// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s
+// REQUIRES: sync_callback
+// clang-format on
+
+// CHECK: {{(invoke|call)}} i32 @cudaEventSynchronize
+// CHECK: {{(invoke|call)}} void @cusan_sync_callback
+
+int main(int argc, char* argv[]) {
+  cudaEvent_t event;
+  cudaEventSynchronize(event);
+  return 0;
+}
diff --git a/test/pass/TSan_External.h b/test/pass/TSan_External.h
index 574f0db..c543fdc 100644
--- a/test/pass/TSan_External.h
+++ b/test/pass/TSan_External.h
@@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons
 void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
 
diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c
index bbb61e6..e2dcd9d 100644
--- a/test/runtime/03_cuda_to_mpi.c
+++ b/test/runtime/03_cuda_to_mpi.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c
index 3b54714..77f8915 100644
--- a/test/runtime/04_mpi_to_cuda.c
+++ b/test/runtime/04_mpi_to_cuda.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c
index e07b35c..4ff4400 100644
--- a/test/runtime/05_cuda_to_mpi_stream.c
+++ b/test/runtime/05_cuda_to_mpi_stream.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c
index b44e4a3..65ef15e 100644
--- a/test/runtime/06_cuda_to_mpi_event.c
+++ b/test/runtime/06_cuda_to_mpi_event.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c
index 8d36e6b..d1ee1fc 100644
--- a/test/runtime/07_cuda_to_mpi_read.c
+++ b/test/runtime/07_cuda_to_mpi_read.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c
index 18c6671..4aedbcc 100644
--- a/test/runtime/08_cudamemcpy_to_mpi.c
+++ b/test/runtime/08_cudamemcpy_to_mpi.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
index ea875dc..8455f74 100644
--- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c
+++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/11_struct_of_buff.c b/test/runtime/11_struct_of_buff.c
index eb594f1..121c3a7 100644
--- a/test/runtime/11_struct_of_buff.c
+++ b/test/runtime/11_struct_of_buff.c
@@ -10,7 +10,7 @@
 
 // CHECK-SYNC-NOT: data race
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 struct BufferStorage {
   int* buff1;
diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c
index 3792d13..41ab21d 100644
--- a/test/runtime/12_struct_ptr.c
+++ b/test/runtime/12_struct_ptr.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // CHECK-DAG: data race
 
 // CHECK-SYNC-NOT: data race
diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
index cf613ca..0b82829 100644
--- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
+++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
index c5b221e..23d9a9c 100644
--- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
+++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/20_cuda_default_stream_sync.c b/test/runtime/20_cuda_default_stream_sync.c
index 78aedfc..c232483 100644
--- a/test/runtime/20_cuda_default_stream_sync.c
+++ b/test/runtime/20_cuda_default_stream_sync.c
@@ -1,8 +1,8 @@
 // clang-format off
-// RUN: %wrapper-cxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
 // clang-format on
@@ -13,8 +13,9 @@
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
+#include <stdio.h>
 #include <unistd.h>
 
 __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) {
diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
index 5d72ad7..7f88a11 100644
--- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
+++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c
@@ -4,6 +4,9 @@
 
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c
index 90b21e0..6b7dee5 100644
--- a/test/runtime/21_chunked_streams_example.c
+++ b/test/runtime/21_chunked_streams_example.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
index 1c8ffcd..2639ce8 100644
--- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c
+++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c
@@ -2,6 +2,8 @@
 // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-NOT: data race
diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c
index b2b12da..0dac955 100644
--- a/test/runtime/26_malloc_pitch.c
+++ b/test/runtime/26_malloc_pitch.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c
index a39b5a0..2ac700f 100644
--- a/test/runtime/29_tsan_cuda_to_mpi.c
+++ b/test/runtime/29_tsan_cuda_to_mpi.c
@@ -5,6 +5,8 @@
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
 
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK: [Error] sync
diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
index 13246ef..19c6d97 100644
--- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c
+++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
@@ -4,6 +4,9 @@
 
 // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+
+// REQUIRES: mpi
+
 // clang-format on
 
 // CHECK-DAG: data race
diff --git a/test/runtime/31_tsan_cuda_event.c b/test/runtime/31_tsan_cuda_event.c
index beacaeb..aa213fa 100644
--- a/test/runtime/31_tsan_cuda_event.c
+++ b/test/runtime/31_tsan_cuda_event.c
@@ -11,7 +11,7 @@
 
 // CHECK-SYNC-NOT: data race
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 #include <unistd.h>
 
diff --git a/test/runtime/32_tsan_async_copy.c b/test/runtime/32_tsan_async_copy.c
index 5a6f7f7..14cfcd5 100644
--- a/test/runtime/32_tsan_async_copy.c
+++ b/test/runtime/32_tsan_async_copy.c
@@ -1,9 +1,9 @@
 // clang-format off
-// RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
-// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
+// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
-// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s  -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
 // CHECK-DAG: data race
 // CHECK-DAG: [Error] sync
@@ -13,8 +13,7 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
-
+#include <stdio.h>
 #include <unistd.h>
 
 __global__ void kernel(int* arr, const int N) {
@@ -32,11 +31,6 @@ __global__ void kernel(int* arr, const int N) {
 }
 
 int main(int argc, char* argv[]) {
-  if (!has_gpu_aware_mpi()) {
-    printf("This example is designed for CUDA-aware MPI. Exiting.\n");
-    return 1;
-  }
-
   cudaEvent_t first_finished_event;
   cudaEventCreate(&first_finished_event);
   cudaStream_t stream1;
diff --git a/test/runtime/33_tsan_wait_event.c b/test/runtime/33_tsan_wait_event.c
index c675758..af2a5a1 100644
--- a/test/runtime/33_tsan_wait_event.c
+++ b/test/runtime/33_tsan_wait_event.c
@@ -11,7 +11,7 @@
 
 // clang-format on
 
-#include "../support/gpu_mpi.h"
+// #include "../support/gpu_mpi.h"
 
 #include <unistd.h>
 
diff --git a/test/runtime/TSan_External.h b/test/runtime/TSan_External.h
index 36849c9..c7b5796 100644
--- a/test/runtime/TSan_External.h
+++ b/test/runtime/TSan_External.h
@@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons
 void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }
-void __attribute__((weak))
-AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) {
+void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv,
+                                                  unsigned long long is_w) {
   FALLBACK_PRINT(__func__);
 }