From b21468cfa6fc00f90df0157c87de7c5a4d11ee6d Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Wed, 20 Nov 2024 14:31:46 +0100 Subject: [PATCH 01/15] Changed runtime to support multiple devices --- lib/runtime/CusanRuntime.cpp | 107 +++++++++++++--------- lib/runtime/CusanRuntime.h | 39 +++++--- lib/runtime/CusanRuntime_cudaSpecific.cpp | 9 ++ 3 files changed, 101 insertions(+), 54 deletions(-) diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index a859af4..54de591 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -76,14 +76,27 @@ struct PointerAccess { AccessState mode{AccessState::kRW}; }; -class Runtime { +class Runtime; + +class Device { + friend Runtime; // NOTE: assumed to be a ordered map so we can iterate in ascending order std::map allocations_; std::map streams_; std::map events_; + + Device(){ + //every device has a default stream + + } +}; + +class Runtime { + std::map devices_; + int32_t current_gpu_; TsanFiber cpu_fiber_; TsanFiber curr_fiber_; - bool init_ = false; + bool init_; public: static constexpr Stream kDefaultStream = Stream(); @@ -92,15 +105,10 @@ class Runtime { static Runtime& get() { static Runtime run_t; if (!run_t.init_) { -#ifdef CUSAN_FIBERPOOL - TsanFiberPoolInit(); -#endif - run_t.cpu_fiber_ = TsanGetCurrentFiber(); - run_t.curr_fiber_ = run_t.cpu_fiber_; - - // default '0' cuda stream + run_t.current_gpu_ = get_current_device(); + run_t.cpu_fiber_ = TsanGetCurrentFiber(); + run_t.devices_.insert({run_t.current_gpu_, {}}); { run_t.register_stream(kDefaultStream); } - run_t.init_ = true; } return run_t; @@ -110,26 +118,33 @@ class Runtime { void operator=(const Runtime&) = delete; - [[nodiscard]] const std::map& get_allocations() const { - return allocations_; + Device& get_device(DeviceID id) { + if (devices_.find(id) == devices_.end()) { + devices_.insert({id, {}}); + } + return devices_.at(id); + } + + [[nodiscard]] const std::map& get_allocations() { + return get_device(current_gpu_).allocations_; } void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") + auto& gpu = get_device(current_gpu_); TsanHappensBefore(curr_fiber_); stats_recorder.inc_TsanHappensBefore(); } void switch_to_cpu() { LOG_TRACE("[cusan] Switch to cpu") - // if we where one a default stream we should also post sync - // meaning that all work submitted after from the cpu should also be run after the default kernels are done - // TODO: double check with blocking - auto search_result = streams_.find(Runtime::kDefaultStream); - assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); + auto& gpu = get_device(current_gpu_); + + auto search_result = gpu.streams_.find(Runtime::kDefaultStream); + assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); if (curr_fiber_ == search_result->second) { LOG_TRACE("[cusan] syncing all other blocking GPU streams to run after since its default stream") - for (const auto& [s, sync_var] : streams_) { + for (const auto& [s, sync_var] : gpu.streams_) { if (s.isBlocking && !s.isDefaultStream()) { LOG_TRACE("[cusan] happens before " << s.handle) TsanHappensBefore(sync_var); @@ -145,28 +160,31 @@ class Runtime { } void register_stream(Stream stream) { + auto& gpu = get_device(current_gpu_); + static uint32_t n_streams = 0; - auto search_result = streams_.find(stream); - assert(search_result == streams_.end() && "Registered stream twice"); + auto search_result = gpu.streams_.find(stream); + assert(search_result == gpu.streams_.end() && "Registered stream twice"); TsanFiber fiber = TsanCreateFiber(0); stats_recorder.inc_TsanCreateFiber(); char name[32]; snprintf(name, 32, "cuda_stream %u", n_streams++); TsanSetFiberName(fiber, name); - streams_.insert({stream, fiber}); + gpu.streams_.insert({stream, fiber}); } void switch_to_stream(Stream stream) { LOG_TRACE("[cusan] Switching to stream: " << stream.handle) - auto search_result = streams_.find(stream); - assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); + auto& gpu = get_device(current_gpu_); + auto search_result = gpu.streams_.find(stream); + assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); TsanSwitchToFiber(search_result->second, 0); stats_recorder.inc_TsanSwitchToFiber(); if (search_result->first.isDefaultStream()) { LOG_TRACE("[cusan] syncing all other blocking GPU streams since its default stream") // then we are on the default stream and as such want to synchronize behind all other streams // unless they are nonBlocking - for (auto& [s, sync_var] : streams_) { + for (auto& [s, sync_var] : gpu.streams_) { if (s.isBlocking && !s.isDefaultStream()) { LOG_TRACE("[cusan] happens after " << s.handle) TsanHappensAfter(sync_var); @@ -179,7 +197,8 @@ class Runtime { void happens_after_all_streams(bool onlyBlockingStreams = false) { LOG_TRACE("[cusan] happens_after_all_streams but only blocking ones: " << onlyBlockingStreams) - for (const auto& [stream, fiber] : streams_) { + auto& gpu = get_device(current_gpu_); + for (const auto& [stream, fiber] : gpu.streams_) { if (!onlyBlockingStreams || stream.isBlocking) { LOG_TRACE("[cusan] happens after " << stream.handle) TsanHappensAfter(fiber); @@ -189,43 +208,51 @@ class Runtime { } void happens_after_stream(Stream stream) { - auto search_result = streams_.find(stream); - assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); + auto& gpu = get_device(current_gpu_); + + auto search_result = gpu.streams_.find(stream); + assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); TsanHappensAfter(search_result->second); stats_recorder.inc_TsanHappensAfter(); } void record_event(Event event, Stream stream) { LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); - events_[event] = stream; + auto& gpu = get_device(current_gpu_); + gpu.events_[event] = stream; } // Sync the event on the current stream void sync_event(Event event) { - auto search_result = events_.find(event); - assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior"); - LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << events_[event].handle) - happens_after_stream(events_[event]); + auto& gpu = get_device(current_gpu_); + + auto search_result = gpu.events_.find(event); + assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior"); + LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << gpu.events_[event].handle) + happens_after_stream(gpu.events_[event]); } void insert_allocation(void* ptr, AllocationInfo info) { - assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times"); - allocations_[ptr] = info; + auto& gpu = get_device(current_gpu_); + assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times"); + gpu.allocations_[ptr] = info; } void free_allocation(void* ptr, bool must_exist = true) { - bool found = allocations_.find(ptr) != allocations_.end(); + auto& gpu = get_device(current_gpu_); + bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end(); if (must_exist) { assert(found && "Tried to delete a non existent allocation"); } if (found) { - allocations_.erase(ptr); + gpu.allocations_.erase(ptr); } } AllocationInfo* get_allocation_info(const void* ptr) { - auto res = allocations_.find(ptr); - if (res == allocations_.end()) { + auto& gpu = get_device(current_gpu_); + auto res = gpu.allocations_.find(ptr); + if (res == gpu.allocations_.end()) { // fallback find if it lies within a region // for(auto [alloc_ptr, alloc_info]: allocations_){ // if(alloc_ptr < ptr && ((const char*)alloc_ptr) + alloc_info.size > ptr){ @@ -269,15 +296,13 @@ class Runtime { } }; -cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); - } // namespace cusan::runtime using namespace cusan::runtime; namespace helper { #ifndef CUSAN_TYPEART -inline std::optional find_memory_alloc_size(const Runtime& runtime, const void* ptr) { +inline std::optional find_memory_alloc_size(Runtime& runtime, const void* ptr) { const auto& allocs = runtime.get_allocations(); // if there exists any allocation diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index 8dcaef2..7210bcc 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -8,19 +8,45 @@ #define LIB_RUNTIME_CUSAN_H_ #include +#ifdef __cplusplus + +extern "C" { +#endif + +typedef enum cusan_memcpy_kind_t : unsigned int { + cusan_MemcpyHostToHost = 0, + cusan_MemcpyHostToDevice = 1, + cusan_MemcpyDeviceToHost = 2, + cusan_MemcpyDeviceToDevice = 3, + cusan_MemcpyDefault = 4, +} cusan_MemcpyKind; + +typedef enum cusan_stream_create_flags_t : unsigned int { + cusan_StreamFlagsDefault = 0, + cusan_StreamFlagsNonBlocking = 1, +} cusan_StreamCreateFlags; +#ifdef __cplusplus +} +#endif + #ifdef __cplusplus namespace cusan::runtime { using TsanFiber = void*; using Event = const void*; using RawStream = const void*; +using DeviceID = int; +cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); +DeviceID get_current_device(); } // namespace cusan::runtime using cusan::runtime::Event; using cusan::runtime::RawStream; using cusan::runtime::TsanFiber; +using cusan::runtime::DeviceID; #else #define TsanFiber void* #define Event const void* #define RawStream const void* +#define DeviceID int #endif #ifdef __cplusplus @@ -28,19 +54,6 @@ using cusan::runtime::TsanFiber; extern "C" { #endif -typedef enum cusan_memcpy_kind_t : unsigned int { - cusan_MemcpyHostToHost = 0, - cusan_MemcpyHostToDevice = 1, - cusan_MemcpyDeviceToHost = 2, - cusan_MemcpyDeviceToDevice = 3, - cusan_MemcpyDefault = 4, -} cusan_MemcpyKind; - -typedef enum cusan_stream_create_flags_t : unsigned int { - cusan_StreamFlagsDefault = 0, - cusan_StreamFlagsNonBlocking = 1, -} cusan_StreamCreateFlags; - void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream); void _cusan_sync_device(); void _cusan_event_record(Event event, RawStream stream); diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp index 6958db3..8d8f3ec 100644 --- a/lib/runtime/CusanRuntime_cudaSpecific.cpp +++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp @@ -10,6 +10,15 @@ #include namespace cusan::runtime { + + + +DeviceID get_current_device(){ + DeviceID res; + cudaGetDevice(&res); + return res; +} + cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); From d60e1b41c35999e7098b42d1f7b7808bcc547dc8 Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Wed, 20 Nov 2024 14:32:10 +0100 Subject: [PATCH 02/15] Added suport for setDevice and chooseDevice --- lib/pass/AnalysisTransform.cpp | 23 +++++++++++++++ lib/pass/AnalysisTransform.h | 2 ++ lib/pass/CusanPass.cpp | 2 ++ lib/pass/FunctionDecl.cpp | 9 ++++++ lib/pass/FunctionDecl.h | 2 ++ lib/runtime/CusanRuntime.cpp | 53 ++++++++++++++++++++++------------ lib/runtime/CusanRuntime.h | 4 +++ 7 files changed, 77 insertions(+), 18 deletions(-) diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp index d539dae..808686d 100644 --- a/lib/pass/AnalysisTransform.cpp +++ b/lib/pass/AnalysisTransform.cpp @@ -589,6 +589,29 @@ llvm::SmallVector CudaMallocPitch::map_arguments(IRBuilder<>& irb, llvm: return {ptr, real_size}; } + +// CudaSetDevice + +CudaSetDevice::CudaSetDevice(callback::FunctionDecl* decls) { + setup("cudaSetDevice", &decls->cusan_set_device.f); +} +llvm::SmallVector CudaSetDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef args) { + //cudaSetDevice ( int device ) + assert(args.size() == 1); + return {args[0]}; +} + +// CudaCHooseDevice + +CudaChooseDevice::CudaChooseDevice(callback::FunctionDecl* decls) { + setup("cudaChooseDevice", &decls->cusan_choose_device.f); +} +llvm::SmallVector CudaChooseDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef args) { + //cudaChooseDevice ( int* device, const cudaDeviceProp* prop ) + assert(args.size() == 2); + return {args[0]}; +} + // CudaStreamQuery CudaStreamQuery::CudaStreamQuery(callback::FunctionDecl* decls) { diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h index 97c1bc1..76b49f1 100644 --- a/lib/pass/AnalysisTransform.h +++ b/lib/pass/AnalysisTransform.h @@ -212,6 +212,8 @@ BasicInstrumenterDecl(CudaMallocManaged); BasicInstrumenterDecl(CudaMalloc); BasicInstrumenterDecl(CudaFree); BasicInstrumenterDecl(CudaMallocPitch); +BasicInstrumenterDecl(CudaSetDevice); +BasicInstrumenterDecl(CudaChooseDevice); class CudaStreamQuery : public SimpleInstrumenter { public: diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp index 00bc33c..6af51e9 100644 --- a/lib/pass/CusanPass.cpp +++ b/lib/pass/CusanPass.cpp @@ -163,6 +163,8 @@ bool CusanPass::runOnFunc(llvm::Function& function) { modified |= transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function); modified |= transform::StreamCreateWithPriorityInstrumenter(&cusan_decls_).instrument(function); modified |= transform::CudaMallocPitch(&cusan_decls_).instrument(function); + modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function); + modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function); auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_); if (data_for_host) { diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp index f58c828..91e029d 100644 --- a/lib/pass/FunctionDecl.cpp +++ b/lib/pass/FunctionDecl.cpp @@ -1,4 +1,5 @@ #include "FunctionDecl.h" +#include namespace cusan::callback { @@ -124,6 +125,14 @@ void FunctionDecl::initialize(llvm::Module& module) { // void* devPtr, size_t pitch, size_t width, size_t height ArgTypes arg_types_2d_memset = {void_ptr, size_t_ty, size_t_ty, size_t_ty}; make_function(cusan_memset_2d, arg_types_2d_memset); + + // int device + ArgTypes arg_types_set_device = {Type::getInt32Ty(c)}; + make_function(cusan_set_device, arg_types_set_device); + + // void* device + ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()}; + make_function(cusan_choose_device, arg_types_choose_device); } } // namespace cusan::callback diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h index d40e80a..91eabe5 100644 --- a/lib/pass/FunctionDecl.h +++ b/lib/pass/FunctionDecl.h @@ -41,6 +41,8 @@ struct FunctionDecl { CusanFunction cusan_host_register{"_cusan_host_register"}; CusanFunction cusan_host_unregister{"_cusan_host_unregister"}; CusanFunction cusan_device_alloc{"_cusan_device_alloc"}; + CusanFunction cusan_set_device{"_cusan_set_device"}; + CusanFunction cusan_choose_device{"_cusan_choose_device"}; CusanFunction cusan_device_free{"_cusan_device_free"}; CusanFunction cusan_stream_query{"_cusan_stream_query"}; CusanFunction cusan_event_query{"_cusan_event_query"}; diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 54de591..3d0a00b 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -85,15 +85,14 @@ class Device { std::map streams_; std::map events_; - Device(){ - //every device has a default stream - + Device() { + // every device has a default stream } }; class Runtime { std::map devices_; - int32_t current_gpu_; + int32_t current_device_; TsanFiber cpu_fiber_; TsanFiber curr_fiber_; bool init_; @@ -105,9 +104,9 @@ class Runtime { static Runtime& get() { static Runtime run_t; if (!run_t.init_) { - run_t.current_gpu_ = get_current_device(); + run_t.current_device_ = get_current_device(); run_t.cpu_fiber_ = TsanGetCurrentFiber(); - run_t.devices_.insert({run_t.current_gpu_, {}}); + run_t.devices_.insert({run_t.current_device_, {}}); { run_t.register_stream(kDefaultStream); } run_t.init_ = true; } @@ -125,20 +124,27 @@ class Runtime { return devices_.at(id); } + void set_device(DeviceID device){ + if (devices_.find(device) == devices_.end()) { + devices_.insert({device, {}}); + } + current_device_ = device; + } + [[nodiscard]] const std::map& get_allocations() { - return get_device(current_gpu_).allocations_; + return get_device(current_device_).allocations_; } void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); TsanHappensBefore(curr_fiber_); stats_recorder.inc_TsanHappensBefore(); } void switch_to_cpu() { LOG_TRACE("[cusan] Switch to cpu") - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); auto search_result = gpu.streams_.find(Runtime::kDefaultStream); assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); @@ -160,7 +166,7 @@ class Runtime { } void register_stream(Stream stream) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); static uint32_t n_streams = 0; auto search_result = gpu.streams_.find(stream); @@ -175,7 +181,7 @@ class Runtime { void switch_to_stream(Stream stream) { LOG_TRACE("[cusan] Switching to stream: " << stream.handle) - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); auto search_result = gpu.streams_.find(stream); assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); TsanSwitchToFiber(search_result->second, 0); @@ -197,7 +203,7 @@ class Runtime { void happens_after_all_streams(bool onlyBlockingStreams = false) { LOG_TRACE("[cusan] happens_after_all_streams but only blocking ones: " << onlyBlockingStreams) - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); for (const auto& [stream, fiber] : gpu.streams_) { if (!onlyBlockingStreams || stream.isBlocking) { LOG_TRACE("[cusan] happens after " << stream.handle) @@ -208,7 +214,7 @@ class Runtime { } void happens_after_stream(Stream stream) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); auto search_result = gpu.streams_.find(stream); assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); @@ -218,13 +224,13 @@ class Runtime { void record_event(Event event, Stream stream) { LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); gpu.events_[event] = stream; } // Sync the event on the current stream void sync_event(Event event) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); auto search_result = gpu.events_.find(event); assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior"); @@ -233,13 +239,13 @@ class Runtime { } void insert_allocation(void* ptr, AllocationInfo info) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times"); gpu.allocations_[ptr] = info; } void free_allocation(void* ptr, bool must_exist = true) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end(); if (must_exist) { assert(found && "Tried to delete a non existent allocation"); @@ -250,7 +256,7 @@ class Runtime { } AllocationInfo* get_allocation_info(const void* ptr) { - auto& gpu = get_device(current_gpu_); + auto& gpu = get_device(current_device_); auto res = gpu.allocations_.find(ptr); if (res == gpu.allocations_.end()) { // fallback find if it lies within a region @@ -536,6 +542,17 @@ void _cusan_event_query(Event event, unsigned int err) { } } +void _cusan_set_device(DeviceID device) { + Runtime::get().set_device(device); +} + +void _cusan_choose_device(DeviceID* device) { + // does this function ever return a nullptr? + // and what would that mean + assert(device); + Runtime::get().set_device(*device); +} + void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { // The Async versions are always asynchronous with respect to the host. auto& runtime = Runtime::get(); diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index 7210bcc..c970f41 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -85,6 +85,10 @@ void _cusan_device_alloc(void** ptr, size_t size); void _cusan_device_free(void* ptr); void _cusan_stream_query(RawStream stream, unsigned int err); void _cusan_event_query(Event event, unsigned int err); + +void _cusan_set_device(DeviceID device); +void _cusan_choose_device(DeviceID* device); + #ifdef __cplusplus } #endif From f6d6d03e9cd0e2e3bd1c1a3872134272e880121f Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Fri, 13 Dec 2024 17:09:24 +0100 Subject: [PATCH 03/15] Initial multi gpu --- lib/runtime/CusanRuntime.cpp | 231 +++++++++--------- lib/runtime/CusanRuntime.h | 2 +- lib/runtime/CusanRuntime_cudaSpecific.cpp | 2 +- test/CMakeLists.txt | 4 +- test/multi_gpu/01_device_sync.c | 89 +++++++ test/multi_gpu/02_device_sync_event.c | 75 ++++++ test/multi_gpu/TSan_External.h | 276 ++++++++++++++++++++++ test/multi_gpu/suppressions.txt | 29 +++ 8 files changed, 588 insertions(+), 120 deletions(-) create mode 100644 test/multi_gpu/01_device_sync.c create mode 100644 test/multi_gpu/02_device_sync_event.c create mode 100644 test/multi_gpu/TSan_External.h create mode 100644 test/multi_gpu/suppressions.txt diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 3d0a00b..bca4839 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -79,78 +79,40 @@ struct PointerAccess { class Runtime; class Device { - friend Runtime; // NOTE: assumed to be a ordered map so we can iterate in ascending order std::map allocations_; std::map streams_; std::map events_; - - Device() { - // every device has a default stream - } -}; - -class Runtime { - std::map devices_; - int32_t current_device_; TsanFiber cpu_fiber_; TsanFiber curr_fiber_; - bool init_; public: static constexpr Stream kDefaultStream = Stream(); Recorder stats_recorder; - static Runtime& get() { - static Runtime run_t; - if (!run_t.init_) { - run_t.current_device_ = get_current_device(); - run_t.cpu_fiber_ = TsanGetCurrentFiber(); - run_t.devices_.insert({run_t.current_device_, {}}); - { run_t.register_stream(kDefaultStream); } - run_t.init_ = true; - } - return run_t; - } - - Runtime(const Runtime&) = delete; - - void operator=(const Runtime&) = delete; - - Device& get_device(DeviceID id) { - if (devices_.find(id) == devices_.end()) { - devices_.insert({id, {}}); - } - return devices_.at(id); - } - - void set_device(DeviceID device){ - if (devices_.find(device) == devices_.end()) { - devices_.insert({device, {}}); - } - current_device_ = device; + Device() { + // every device has a default stream + { register_stream(Device::kDefaultStream); } + cpu_fiber_ = TsanGetCurrentFiber(); } [[nodiscard]] const std::map& get_allocations() { - return get_device(current_device_).allocations_; + return allocations_; } void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") - auto& gpu = get_device(current_device_); TsanHappensBefore(curr_fiber_); stats_recorder.inc_TsanHappensBefore(); } void switch_to_cpu() { LOG_TRACE("[cusan] Switch to cpu") - auto& gpu = get_device(current_device_); - - auto search_result = gpu.streams_.find(Runtime::kDefaultStream); - assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); + auto search_result = streams_.find(Device::kDefaultStream); + assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); if (curr_fiber_ == search_result->second) { LOG_TRACE("[cusan] syncing all other blocking GPU streams to run after since its default stream") - for (const auto& [s, sync_var] : gpu.streams_) { + for (const auto& [s, sync_var] : streams_) { if (s.isBlocking && !s.isDefaultStream()) { LOG_TRACE("[cusan] happens before " << s.handle) TsanHappensBefore(sync_var); @@ -166,31 +128,28 @@ class Runtime { } void register_stream(Stream stream) { - auto& gpu = get_device(current_device_); - static uint32_t n_streams = 0; - auto search_result = gpu.streams_.find(stream); - assert(search_result == gpu.streams_.end() && "Registered stream twice"); + auto search_result = streams_.find(stream); + assert(search_result == streams_.end() && "Registered stream twice"); TsanFiber fiber = TsanCreateFiber(0); stats_recorder.inc_TsanCreateFiber(); char name[32]; snprintf(name, 32, "cuda_stream %u", n_streams++); TsanSetFiberName(fiber, name); - gpu.streams_.insert({stream, fiber}); + streams_.insert({stream, fiber}); } void switch_to_stream(Stream stream) { LOG_TRACE("[cusan] Switching to stream: " << stream.handle) - auto& gpu = get_device(current_device_); - auto search_result = gpu.streams_.find(stream); - assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); + auto search_result = streams_.find(stream); + assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); TsanSwitchToFiber(search_result->second, 0); stats_recorder.inc_TsanSwitchToFiber(); if (search_result->first.isDefaultStream()) { LOG_TRACE("[cusan] syncing all other blocking GPU streams since its default stream") // then we are on the default stream and as such want to synchronize behind all other streams // unless they are nonBlocking - for (auto& [s, sync_var] : gpu.streams_) { + for (auto& [s, sync_var] : streams_) { if (s.isBlocking && !s.isDefaultStream()) { LOG_TRACE("[cusan] happens after " << s.handle) TsanHappensAfter(sync_var); @@ -203,8 +162,7 @@ class Runtime { void happens_after_all_streams(bool onlyBlockingStreams = false) { LOG_TRACE("[cusan] happens_after_all_streams but only blocking ones: " << onlyBlockingStreams) - auto& gpu = get_device(current_device_); - for (const auto& [stream, fiber] : gpu.streams_) { + for (const auto& [stream, fiber] : streams_) { if (!onlyBlockingStreams || stream.isBlocking) { LOG_TRACE("[cusan] happens after " << stream.handle) TsanHappensAfter(fiber); @@ -214,51 +172,43 @@ class Runtime { } void happens_after_stream(Stream stream) { - auto& gpu = get_device(current_device_); - - auto search_result = gpu.streams_.find(stream); - assert(search_result != gpu.streams_.end() && "Tried using stream that wasn't created prior"); + auto search_result = streams_.find(stream); + assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); TsanHappensAfter(search_result->second); stats_recorder.inc_TsanHappensAfter(); } void record_event(Event event, Stream stream) { LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); - auto& gpu = get_device(current_device_); - gpu.events_[event] = stream; + events_[event] = stream; } // Sync the event on the current stream void sync_event(Event event) { - auto& gpu = get_device(current_device_); - - auto search_result = gpu.events_.find(event); - assert(search_result != gpu.events_.end() && "Tried using event that wasn't recorded to prior"); - LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << gpu.events_[event].handle) - happens_after_stream(gpu.events_[event]); + auto search_result = events_.find(event); + assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior"); + LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << events_[event].handle) + happens_after_stream(events_[event]); } void insert_allocation(void* ptr, AllocationInfo info) { - auto& gpu = get_device(current_device_); - assert(gpu.allocations_.find(ptr) == gpu.allocations_.end() && "Registered an allocation multiple times"); - gpu.allocations_[ptr] = info; + assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times"); + allocations_[ptr] = info; } void free_allocation(void* ptr, bool must_exist = true) { - auto& gpu = get_device(current_device_); - bool found = gpu.allocations_.find(ptr) != gpu.allocations_.end(); + bool found = allocations_.find(ptr) != allocations_.end(); if (must_exist) { assert(found && "Tried to delete a non existent allocation"); } if (found) { - gpu.allocations_.erase(ptr); + allocations_.erase(ptr); } } AllocationInfo* get_allocation_info(const void* ptr) { - auto& gpu = get_device(current_device_); - auto res = gpu.allocations_.find(ptr); - if (res == gpu.allocations_.end()) { + auto res = allocations_.find(ptr); + if (res == allocations_.end()) { // fallback find if it lies within a region // for(auto [alloc_ptr, alloc_info]: allocations_){ // if(alloc_ptr < ptr && ((const char*)alloc_ptr) + alloc_info.size > ptr){ @@ -270,10 +220,7 @@ class Runtime { return &res->second; } - private: - Runtime() = default; - - ~Runtime() { + void output_statistics() { #undef cusan_stat_handle #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name())); #if CUSAN_SOFTCOUNTER @@ -295,6 +242,56 @@ class Runtime { #endif #undef cusan_stat_handle #undef CUSAN_CUDA_EVENT_LIST + } +}; + +class Runtime { + std::map devices_; + int32_t current_device_; + bool init_; + + public: + static Runtime& get() { + static Runtime run_t; + if (!run_t.init_) { + run_t.current_device_ = get_current_device_id(); + run_t.devices_.insert({run_t.current_device_, {}}); + run_t.init_ = true; + } + return run_t; + } + + Runtime(const Runtime&) = delete; + + void operator=(const Runtime&) = delete; + + Device& get_current_device() { + return devices_.at(current_device_); + } + + Device& get_device(DeviceID id) { + if (devices_.find(id) == devices_.end()) { + devices_.insert({id, {}}); + } + return devices_.at(id); + } + + void set_device(DeviceID device) { + if (devices_.find(device) == devices_.end()) { + devices_.insert({device, {}}); + } + current_device_ = device; + } + + private: + Runtime() = default; + + ~Runtime() { +#if CUSAN_SOFTCOUNTER + for(auto& [_, device]: devices_){ + device.output_statistics(); + } +#endif #ifdef CUSAN_FIBERPOOL // TsanFiberPoolFini(); @@ -308,7 +305,7 @@ using namespace cusan::runtime; namespace helper { #ifndef CUSAN_TYPEART -inline std::optional find_memory_alloc_size(Runtime& runtime, const void* ptr) { +inline std::optional find_memory_alloc_size(Device& runtime, const void* ptr) { const auto& allocs = runtime.get_allocations(); // if there exists any allocation @@ -335,7 +332,7 @@ inline std::optional find_memory_alloc_size(Runtime& runtime, const void return {}; } #else -inline std::optional find_memory_alloc_size(const Runtime&, const void* ptr) { +inline std::optional find_memory_alloc_size(const Device&, const void* ptr) { size_t alloc_size{0}; int alloc_id{0}; auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size); @@ -354,7 +351,7 @@ inline std::optional find_memory_alloc_size(const Runtime&, const void* void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream) { LOG_TRACE("[cusan]Kernel Register with " << n << " Args and on stream:" << stream) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); llvm::SmallVector sizes; for (int i = 0; i < n; ++i) { @@ -403,21 +400,21 @@ void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream s void _cusan_sync_device() { LOG_TRACE("[cusan]Sync Device\n") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_sync_device_calls(); runtime.happens_after_all_streams(); } void _cusan_event_record(Event event, RawStream stream) { LOG_TRACE("[cusan]Event Record") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_event_record_calls(); runtime.record_event(event, Stream(stream)); } void _cusan_sync_stream(RawStream raw_stream) { LOG_TRACE("[cusan]Sync Stream" << raw_stream) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_sync_stream_calls(); const auto stream = Stream(raw_stream); if (stream.isDefaultStream()) { @@ -431,28 +428,28 @@ void _cusan_sync_stream(RawStream raw_stream) { void _cusan_sync_event(Event event) { LOG_TRACE("[cusan]Sync Event" << event) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_sync_event_calls(); runtime.sync_event(event); } void _cusan_create_event(Event*) { LOG_TRACE("[cusan]create event") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_create_event_calls(); } void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { LOG_TRACE("[cusan]create stream with flags: " << flags << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking)) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_create_stream_calls(); runtime.register_stream(Stream(*stream, !(bool)(flags & cusan_StreamFlagsNonBlocking))); } void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) { LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_stream_wait_event_calls(); runtime.switch_to_stream(Stream(stream)); runtime.sync_event(event); @@ -464,7 +461,7 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) { // at least based of this presentation and some comments in the cuda forums this syncs the whole device // https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf LOG_TRACE("[cusan]host alloc " << *ptr << " with size " << size) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_alloc_calls(); // runtime.happens_after_all_streams(); @@ -473,27 +470,27 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) { void _cusan_host_free(void* ptr) { LOG_TRACE("[cusan]host free") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_free_calls(); runtime.free_allocation(ptr); } void _cusan_host_register(void* ptr, size_t size, unsigned int) { LOG_TRACE("[cusan]host register " << ptr << " with size:" << size); - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_register_calls(); runtime.insert_allocation(ptr, AllocationInfo::Pinned(size)); } void _cusan_host_unregister(void* ptr) { LOG_TRACE("[cusan]host unregister " << ptr); - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_unregister_calls(); runtime.free_allocation(ptr); } void _cusan_managed_alloc(void** ptr, size_t size, unsigned int) { LOG_TRACE("[cusan]Managed host alloc " << *ptr << " with size " << size << " -> implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_managed_alloc_calls(); runtime.happens_after_all_streams(); runtime.insert_allocation(*ptr, AllocationInfo::Managed(size)); @@ -503,7 +500,7 @@ void _cusan_device_alloc(void** ptr, size_t size) { // implicit syncs device // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator LOG_TRACE("[cusan]Device alloc " << *ptr << " with size " << size << " -> implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_device_alloc_calls(); runtime.insert_allocation(*ptr, AllocationInfo::Device(size)); @@ -514,7 +511,7 @@ void _cusan_device_free(void* ptr) { // implicit syncs device // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator LOG_TRACE("[cusan]Device free " << ptr << " -> TODO maybe implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_device_free_calls(); runtime.happens_after_all_streams(); } @@ -522,7 +519,7 @@ void _cusan_device_free(void* ptr) { // TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_stream_query(RawStream stream, unsigned int err) { LOG_TRACE("[cusan] Stream query " << stream << " -> " << err) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_stream_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") @@ -534,7 +531,7 @@ void _cusan_stream_query(RawStream stream, unsigned int err) { // TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_event_query(Event event, unsigned int err) { LOG_TRACE("[cusan] Event query " << event << " -> " << err) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_event_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") @@ -555,7 +552,7 @@ void _cusan_choose_device(DeviceID* device) { void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { // The Async versions are always asynchronous with respect to the host. - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memset_async_calls(); runtime.switch_to_stream(Stream(stream)); TsanMemoryWritePC(target, count, __builtin_return_address(0)); @@ -566,9 +563,9 @@ void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { void _cusan_memset_impl(void* target, size_t count) { // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host // memory. - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memset_calls(); - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); LOG_TRACE("[cusan] " << "Write to " << target << " with size: " << count) TsanMemoryWritePC(target, count, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); @@ -579,7 +576,7 @@ void _cusan_memset_impl(void* target, size_t count) { // if we couldn't find alloc info we just assume the worst and don't sync if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) { LOG_TRACE("[cusan] " << "Memset is blocking") - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else { LOG_TRACE("[cusan] " << "Memset is not blocking") if (!alloc_info) { @@ -611,7 +608,7 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) { void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind, RawStream stream) { - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memcpy_async_calls(); if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) { // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the @@ -649,23 +646,23 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ kind = infer_memcpy_direction(target, from); } - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memcpy_calls(); if (CUSAN_SYNC_DETAIL_LEVEL == 0) { LOG_TRACE("[cusan] DefaultStream+Blocking") // In this mode: Memcpy always blocks, no detailed view w.r.t. memory direction - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyDeviceToDevice) { // 4. For transfers from device memory to device memory, no host-side synchronization is performed. LOG_TRACE("[cusan] DefaultStream") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); @@ -675,14 +672,14 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy // has completed. LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyHostToDevice) { // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is // initiated. @@ -690,7 +687,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ auto* alloc_info = runtime.get_allocation_info(from); // if we couldn't find alloc info we just assume the worst and don't sync if (alloc_info && !alloc_info->is_pinned) { - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); LOG_TRACE("[cusan] DefaultStream+Blocking") } else { LOG_TRACE("[cusan] DefaultStream") @@ -699,20 +696,20 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ // device memory TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyHostToHost) { // 5. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the // host. LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index c970f41..784f805 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -36,7 +36,7 @@ using Event = const void*; using RawStream = const void*; using DeviceID = int; cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); -DeviceID get_current_device(); +DeviceID get_current_device_id(); } // namespace cusan::runtime using cusan::runtime::Event; using cusan::runtime::RawStream; diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp index 8d8f3ec..fbe7d26 100644 --- a/lib/runtime/CusanRuntime_cudaSpecific.cpp +++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp @@ -13,7 +13,7 @@ namespace cusan::runtime { -DeviceID get_current_device(){ +DeviceID get_current_device_id(){ DeviceID res; cudaGetDevice(&res); return res; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 34234fe..047ed65 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -72,7 +72,7 @@ function(cusan_add_lit_target) set(TARGET_NAME check-cusan-${suite}) endif() - if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging") + if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging" AND NOT ${suite} STREQUAL "multi_gpu") message(WARNING "Could not find suitable lit test target at ${SUITE_PATH}") continue() endif() @@ -105,6 +105,7 @@ set(CUSAN_SUITES pass kernel_analysis staging + multi_gpu ) include(ProcessorCount) @@ -119,6 +120,7 @@ set(CUSAN_SUITES_WORKERS ${NUM_CPU} ${NUM_CPU} 1 + 1 ) cusan_add_lit_target(SUITES ${CUSAN_SUITES} WORKERS ${CUSAN_SUITES_WORKERS}) diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c new file mode 100644 index 0000000..319b6f4 --- /dev/null +++ b/test/multi_gpu/01_device_sync.c @@ -0,0 +1,89 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + int* managed_data2; + + int n_devices = 0; + cudaGetDeviceCount(&n_devices); + if (n_devices < 2) { + printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n"); + return 1; + } + + + cudaSetDevice(0); + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + + cudaSetDevice(1); + cudaMallocManaged(&managed_data2, size * sizeof(int)); + cudaMemset(managed_data2, 0, size * sizeof(int)); + + + + cudaSetDevice(0); + write_kernel_delay<<>>(managed_data, size, 1316134912); + + //if we only have the later synchronize we will only syncrhonize the second device +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + + cudaSetDevice(1); + write_kernel_delay<<>>(managed_data2, size, 1); + cudaDeviceSynchronize(); + + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync managed_data %i\n", managed_data[i]); + break; + } + } + for (int i = 0; i < size; i++) { + if (managed_data2[i] == 0) { + printf("[Error] sync managed_data2 %i\n", managed_data[i]); + break; + } + } + + + + cudaFree(managed_data); + cudaFree(managed_data2); + return 0; +} diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c new file mode 100644 index 0000000..2317e29 --- /dev/null +++ b/test/multi_gpu/02_device_sync_event.c @@ -0,0 +1,75 @@ +// clang-format off +// RUN: %wrapper-cc %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +// XFAIL: * + +#include +#include + +__global__ void kernel(int* data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + data[tid] = (tid + 1); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int n_devices = 0; + cudaGetDeviceCount(&n_devices); + if (n_devices < 2) { + printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n"); + return 1; + } + + cudaSetDevice(0); + + int* d_data; // Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + kernel<<>>(d_data); + cudaEventRecord(endEvent); + + cudaSetDevice(1); + +#ifdef CUSAN_SYNC + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h new file mode 100644 index 0000000..36849c9 --- /dev/null +++ b/test/multi_gpu/TSan_External.h @@ -0,0 +1,276 @@ +/* Part of the MUST Project, under BSD-3-Clause License + * See https://hpc.rwth-aachen.de/must/LICENSE for license information. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/** + * @file TSan_External.h + * Functions exported from ThreadSaniziter that can be used for dynamic annotations. + * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface.cpp + * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp + */ + +#ifndef TSAN_EXTERNAL_H +#define TSAN_EXTERNAL_H + +typedef unsigned long uptr; + +typedef enum { mo_relaxed, mo_consume, mo_acquire, mo_release, mo_acq_rel, mo_seq_cst } morder; + +typedef unsigned char a8; +typedef unsigned short a16; +typedef unsigned int a32; +typedef unsigned long long a64; + +#ifdef MUST_DEBUG +// Print an error message *once* if an annotation function is used that is not overwritten by the +// TSan runtime +#define FALLBACK_PRINT(func_name) \ + { \ + static bool once = false; \ + if (!once) { \ + printf( \ + "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \ + "signature\n", \ + func_name); \ + once = true; \ + } \ + } +#else +#define FALLBACK_PRINT(func_name) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// ThreadSanitizer defines the following functions that can be used in MUST for dynamic annotations. +void __attribute__((weak)) AnnotateHappensAfter(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateHappensBefore(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateNewMemory(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateMemoryRead(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateMemoryWrite(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateIgnoreReadsBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreReadsEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreWritesBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreSyncBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreSyncEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) +AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) +AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) __tsan_read_range(void* addr, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_write_range(void* addr, uptr size) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) __tsan_read1_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write1_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read2_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write2_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read4_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write4_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read8_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write8_pc(void* addr, void* pc) { +} + +#define annotateHelper(rw, s) \ + if (len >= s) { \ + printf("annotateHelper(%s, %i, %p, %li)\n", #rw, s, addr, (uptr)pc); \ + len -= s; \ + __tsan_##rw##s##_pc(addr, pc); \ + addr += s; \ + size -= s; \ + } + +void __attribute__((weak)) __tsan_read_range_pc(void* a, uptr size, void* pc) { + char* addr = (char*)a; + uptr len = ((uptr)addr) % 8; + if (size < len) + len = size; + annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) for (; size > 7; size -= 8) { + __tsan_read8_pc(addr, pc); + addr += 8; + } + len = size; + annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) +} + +void __attribute__((weak)) __tsan_write_range_pc(void* a, uptr size, void* pc) { + char* addr = (char*)a; + uptr len = ((uptr)addr) % 8; + // printf("__tsan_write_range_pc(%p, %li, %li), %li\n", addr, size, pc, len); + if (size < len) + len = size; + annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) for (; size > 7; size -= 8) { + // printf("__tsan_write8_pc(%p, %li)\n", addr, pc); + __tsan_write8_pc(addr, pc); + addr += 8; + } + len = size; + annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) +} + +void __attribute__((weak)) __tsan_func_entry(void* call_pc) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_func_exit() { + FALLBACK_PRINT(__func__); +} + +a8 __attribute__((weak)) __tsan_atomic8_load(const volatile a8* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a16 __attribute__((weak)) __tsan_atomic16_load(const volatile a16* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a32 __attribute__((weak)) __tsan_atomic32_load(const volatile a32* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a64 __attribute__((weak)) __tsan_atomic64_load(const volatile a64* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +void __attribute__((weak)) __tsan_atomic8_store(volatile a8* a, a8 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic16_store(volatile a16* a, a16 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic32_store(volatile a32* a, a32 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic64_store(volatile a64* a, a64 v, morder mo) { + FALLBACK_PRINT(__func__); +} + +// TLC extension +void __attribute__((weak)) AnnotateInitTLC(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); + AnnotateHappensBefore(file, line, cv); +} +void __attribute__((weak)) AnnotateStartTLC(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); + AnnotateHappensAfter(file, line, cv); +} + +// Fibers +void __attribute__((weak)) * __tsan_get_current_fiber() { + FALLBACK_PRINT(__func__); + return nullptr; +} +void __attribute__((weak)) * __tsan_create_fiber(unsigned flags) { + FALLBACK_PRINT(__func__); + return nullptr; +} +void __attribute__((weak)) __tsan_destroy_fiber(void* fiber) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_switch_to_fiber(void* fiber, unsigned flags) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_set_fiber_name(void* fiber, const char* name) { + FALLBACK_PRINT(__func__); +} + +#ifdef __cplusplus +} +#endif + +#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv) +#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv) + +#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__) +#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__) +#define TsanIgnoreReadsBegin() AnnotateIgnoreReadsBegin(__FILE__, __LINE__) +#define TsanIgnoreReadsEnd() AnnotateIgnoreReadsEnd(__FILE__, __LINE__) +#define TsanIgnoreSyncBegin() AnnotateIgnoreSyncBegin(__FILE__, __LINE__) +#define TsanIgnoreSyncEnd() AnnotateIgnoreSyncEnd(__FILE__, __LINE__) + +#define TsanInitTLC(cv) AnnotateInitTLC(__FILE__, __LINE__, cv) +#define TsanStartTLC(cv) AnnotateStartTLC(__FILE__, __LINE__, cv) + +#define TsanCreateFiber(flags) __tsan_create_fiber(flags) +#define TsanDestroyFiber(fiber) __tsan_destroy_fiber(fiber) +#define TsanSwitchToFiber(fiber, flags) __tsan_switch_to_fiber(fiber, flags) +#define TsanGetCurrentFiber() __tsan_get_current_fiber() +#define TsanSetFiberName(fiber, name) __tsan_set_fiber_name(fiber, name) + +#define TsanMemoryRead(addr, size) __tsan_read_range(addr, size) +#define TsanMemoryWrite(addr, size) __tsan_write_range(addr, size) +#define TsanMemoryReadPC(addr, size, pc) __tsan_read_range_pc(addr, size, pc) +#define TsanMemoryWritePC(addr, size, pc) __tsan_write_range_pc(addr, size, pc) + +#define TsanFuncEntry(pc) __tsan_func_entry(pc) +#define TsanFuncExit() __tsan_func_exit() + +#define TsanPCMemoryRead(pc, addr, size) AnnotatePCMemoryRead(pc, __FILE__, __LINE__, addr, size) +#define TsanPCMemoryWrite(pc, addr, size) AnnotatePCMemoryWrite(pc, __FILE__, __LINE__, addr, size) + +#define TsanNewMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#define TsanFreeMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size) + +#define TsanRWLockCreate(cv) AnnotateRWLockCreate(__FILE__, __LINE__, cv) +#define TsanRWLockDestroy(cv) AnnotateRWLockDestroy(__FILE__, __LINE__, cv) +#define TsanRWLockAcquired(cv, is_w) AnnotateRWLockAcquired(__FILE__, __LINE__, cv, is_w) +#define TsanRWLockReleased(cv, is_w) AnnotateRWLockReleased(__FILE__, __LINE__, cv, is_w) + +#define TsanAtomic8Load(a8, mo) __tsan_atomic8_load(a8, mo) +#define TsanAtomic8Store(a8, v, mo) __tsan_atomic8_store(a8, v, mo) +#define TsanAtomic16Load(a16, mo) __tsan_atomic16_load(a16, mo) +#define TsanAtomic16Store(a16, v, mo) __tsan_atomic16_store(a16, v, mo) +#define TsanAtomic32Load(a32, mo) __tsan_atomic32_load(a32, mo) +#define TsanAtomic32Store(a32, v, mo) __tsan_atomic32_store(a32, v, mo) +#define TsanAtomic64Load(a64, mo) __tsan_atomic64_load(a64, mo) +#define TsanAtomic64Store(a64, v, mo) __tsan_atomic64_store(a64, v, mo) + +#endif /*TSAN_EXTERNAL_H*/ \ No newline at end of file diff --git a/test/multi_gpu/suppressions.txt b/test/multi_gpu/suppressions.txt new file mode 100644 index 0000000..3292e71 --- /dev/null +++ b/test/multi_gpu/suppressions.txt @@ -0,0 +1,29 @@ +called_from_lib:libmpi_cxx* +called_from_lib:libmpi.so* +called_from_lib:libcuda.so* +called_from_lib:libtypeartRuntime.so* + +# Probably not required, from previous experiments +#called_from_lib:libucx* +called_from_lib:libucp* +called_from_lib:libucs* +#called_from_lib:libucm_cuda.so* +called_from_lib:libucm.so* +called_from_lib:libuct_ib.so.0 +called_from_lib:libuct.so.0 +called_from_lib:mca_pmix_pmix3x* +called_from_lib:libibverbs.so* +#called_from_lib:libevent_pthreads-2.1.so* +called_from_lib:libevent_core-2.1.so* +#called_from_lib:libhfi1verbs-rdmav34.so* +called_from_lib:libopen-pal.so.40* +#called_from_lib:mca_errmgr_default_app.so* +#called_from_lib:mca_gds_hash.so* +#called_from_lib:mca_ptl_tcp.so* +#called_from_lib:mca_bfrops_v20.so* +#called_from_lib:mca_psec_munge.so* +#called_from_lib:mca_gds_ds21.so* +#called_from_lib:libmca_common_dstore.so* +called_from_lib:libopen-rte.so.40* +called_from_lib:ld-linux-x86-64.so* +called_from_lib:libmlx5-rdma* From 7073b159c3ccc5488e49ad33966ac88b92dd9fef Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Fri, 27 Dec 2024 18:48:03 +0100 Subject: [PATCH 04/15] Added per device runtime statistics --- lib/runtime/CusanRuntime.cpp | 78 ++++++++++++++------------- test/multi_gpu/02_device_sync_event.c | 1 - 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index bca4839..5070131 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -82,7 +82,6 @@ class Device { // NOTE: assumed to be a ordered map so we can iterate in ascending order std::map allocations_; std::map streams_; - std::map events_; TsanFiber cpu_fiber_; TsanFiber curr_fiber_; @@ -90,7 +89,7 @@ class Device { static constexpr Stream kDefaultStream = Stream(); Recorder stats_recorder; - Device() { + Device() : stats_recorder() { // every device has a default stream { register_stream(Device::kDefaultStream); } cpu_fiber_ = TsanGetCurrentFiber(); @@ -100,6 +99,10 @@ class Device { return allocations_; } + [[nodiscard]] TsanFiber get_stream_fiber(Stream stream) { + return streams_[stream]; + } + void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") TsanHappensBefore(curr_fiber_); @@ -178,19 +181,6 @@ class Device { stats_recorder.inc_TsanHappensAfter(); } - void record_event(Event event, Stream stream) { - LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); - events_[event] = stream; - } - - // Sync the event on the current stream - void sync_event(Event event) { - auto search_result = events_.find(event); - assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior"); - LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << events_[event].handle) - happens_after_stream(events_[event]); - } - void insert_allocation(void* ptr, AllocationInfo info) { assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times"); allocations_[ptr] = info; @@ -224,7 +214,7 @@ class Device { #undef cusan_stat_handle #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name())); #if CUSAN_SOFTCOUNTER - Table table{"Cusan runtime statistics"}; + Table table{"Cusan device statistics"}; #ifdef CUSAN_FIBERPOOL table.put(Row::make("Fiberpool", 1)); #else @@ -247,6 +237,7 @@ class Device { class Runtime { std::map devices_; + std::map> events_; int32_t current_device_; bool init_; @@ -255,7 +246,7 @@ class Runtime { static Runtime run_t; if (!run_t.init_) { run_t.current_device_ = get_current_device_id(); - run_t.devices_.insert({run_t.current_device_, {}}); + run_t.devices_[run_t.current_device_]; run_t.init_ = true; } return run_t; @@ -271,26 +262,40 @@ class Runtime { Device& get_device(DeviceID id) { if (devices_.find(id) == devices_.end()) { - devices_.insert({id, {}}); + devices_[id]; } return devices_.at(id); } void set_device(DeviceID device) { if (devices_.find(device) == devices_.end()) { - devices_.insert({device, {}}); + devices_[device]; } current_device_ = device; } + void record_event(Event event, Stream stream) { + LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); + auto& current_device = get_current_device(); + auto search_result = current_device.get_stream_fiber(stream); + events_.insert({event, {search_result, ¤t_device}}); + } + + // Sync the event on the current stream + void sync_event(Event event) { + auto [stream_fiber, device] = events_[event]; + TsanHappensAfter(stream_fiber); + device->stats_recorder.inc_TsanHappensAfter(); + } + private: Runtime() = default; ~Runtime() { #if CUSAN_SOFTCOUNTER - for(auto& [_, device]: devices_){ - device.output_statistics(); - } + for (auto& [_, device] : devices_) { + device.output_statistics(); + } #endif #ifdef CUSAN_FIBERPOOL @@ -407,8 +412,9 @@ void _cusan_sync_device() { void _cusan_event_record(Event event, RawStream stream) { LOG_TRACE("[cusan]Event Record") - auto& runtime = Runtime::get().get_current_device(); - runtime.stats_recorder.inc_event_record_calls(); + auto& runtime = Runtime::get(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_event_record_calls(); runtime.record_event(event, Stream(stream)); } @@ -428,8 +434,9 @@ void _cusan_sync_stream(RawStream raw_stream) { void _cusan_sync_event(Event event) { LOG_TRACE("[cusan]Sync Event" << event) - auto& runtime = Runtime::get().get_current_device(); - runtime.stats_recorder.inc_sync_event_calls(); + auto& runtime = Runtime::get(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_sync_event_calls(); runtime.sync_event(event); } @@ -449,12 +456,13 @@ void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) { LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event) - auto& runtime = Runtime::get().get_current_device(); - runtime.stats_recorder.inc_stream_wait_event_calls(); - runtime.switch_to_stream(Stream(stream)); + auto& runtime = Runtime::get(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_stream_wait_event_calls(); + device.switch_to_stream(Stream(stream)); runtime.sync_event(event); - runtime.happens_before(); - runtime.switch_to_cpu(); + device.happens_before(); + device.switch_to_cpu(); } void _cusan_host_alloc(void** ptr, size_t size, unsigned int) { @@ -516,23 +524,21 @@ void _cusan_device_free(void* ptr) { runtime.happens_after_all_streams(); } -// TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_stream_query(RawStream stream, unsigned int err) { LOG_TRACE("[cusan] Stream query " << stream << " -> " << err) auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_stream_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") - runtime.happens_after_stream(Stream{stream}); } } -// TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_event_query(Event event, unsigned int err) { LOG_TRACE("[cusan] Event query " << event << " -> " << err) - auto& runtime = Runtime::get().get_current_device(); - runtime.stats_recorder.inc_event_query_calls(); + auto& runtime = Runtime::get(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_event_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") runtime.sync_event(event); diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c index 2317e29..000f05d 100644 --- a/test/multi_gpu/02_device_sync_event.c +++ b/test/multi_gpu/02_device_sync_event.c @@ -13,7 +13,6 @@ // CHECK-SYNC-NOT: data race // CHECK-SYNC-NOT: [Error] sync -// XFAIL: * #include #include From 8b8cea120a664354a47a70bff4aff4760ee955b6 Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Fri, 31 Jan 2025 16:47:48 +0100 Subject: [PATCH 05/15] Added additional statistic for device switches --- lib/runtime/CusanRuntime.cpp | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 5070131..60c4f14 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -95,6 +95,10 @@ class Device { cpu_fiber_ = TsanGetCurrentFiber(); } + bool operator==(const Device& other) const { + return curr_fiber_ == other.curr_fiber_; + } + [[nodiscard]] const std::map& get_allocations() { return allocations_; } @@ -240,7 +244,9 @@ class Runtime { std::map> events_; int32_t current_device_; bool init_; - +#if CUSAN_SOFTCOUNTER + softcounter::AtomicCounter device_switches = 0; +#endif public: static Runtime& get() { static Runtime run_t; @@ -256,6 +262,15 @@ class Runtime { void operator=(const Runtime&) = delete; +#if CUSAN_SOFTCOUNTER + inline void inc_device_switches() { + this->device_switches++; + } + inline softcounter::Counter get_device_switches() { + return this->device_switches; + } +#endif + Device& get_current_device() { return devices_.at(current_device_); } @@ -271,6 +286,11 @@ class Runtime { if (devices_.find(device) == devices_.end()) { devices_[device]; } +#if CUSAN_SOFTCOUNTER + if (current_device_ != device) { + inc_device_switches(); + } +#endif current_device_ = device; } @@ -296,10 +316,11 @@ class Runtime { for (auto& [_, device] : devices_) { device.output_statistics(); } -#endif -#ifdef CUSAN_FIBERPOOL - // TsanFiberPoolFini(); + Table table{"Cusan runtime statistics"}; + table.put(Row::make("Device Switches ", get_device_switches())); + table.print(std::cout); + #endif } }; @@ -546,14 +567,16 @@ void _cusan_event_query(Event event, unsigned int err) { } void _cusan_set_device(DeviceID device) { - Runtime::get().set_device(device); + auto& r = Runtime::get(); + r.set_device(device); } void _cusan_choose_device(DeviceID* device) { // does this function ever return a nullptr? // and what would that mean assert(device); - Runtime::get().set_device(*device); + auto& r = Runtime::get(); + r.set_device(*device); } void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { From 4c2def3d9c3ec59d24b4355f829ead34a88559e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=BCck?= Date: Mon, 10 Mar 2025 13:25:14 +0100 Subject: [PATCH 06/15] Adapt tests for wrapper --- test/multi_gpu/01_device_sync.c | 16 +++++----------- test/multi_gpu/02_device_sync_event.c | 5 ++--- test/runtime/11_struct_of_buff.c | 2 +- test/runtime/20_cuda_default_stream_sync.c | 7 ++++--- test/runtime/31_tsan_cuda_event.c | 2 +- test/runtime/32_tsan_async_copy.c | 16 +++++----------- test/runtime/33_tsan_wait_event.c | 2 +- 7 files changed, 19 insertions(+), 31 deletions(-) diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c index 319b6f4..cc0b7b8 100644 --- a/test/multi_gpu/01_device_sync.c +++ b/test/multi_gpu/01_device_sync.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC // clang-format on @@ -13,8 +13,8 @@ // CHECK-SYNC-NOT: data race // CHECK-SYNC-NOT: [Error] sync -#include #include +#include __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -44,22 +44,18 @@ int main() { return 1; } - cudaSetDevice(0); cudaMallocManaged(&managed_data, size * sizeof(int)); cudaMemset(managed_data, 0, size * sizeof(int)); - cudaSetDevice(1); cudaMallocManaged(&managed_data2, size * sizeof(int)); cudaMemset(managed_data2, 0, size * sizeof(int)); - - cudaSetDevice(0); write_kernel_delay<<>>(managed_data, size, 1316134912); - //if we only have the later synchronize we will only syncrhonize the second device + // if we only have the later synchronize we will only syncrhonize the second device #ifdef CUSAN_SYNC cudaDeviceSynchronize(); #endif @@ -74,15 +70,13 @@ int main() { break; } } - for (int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { if (managed_data2[i] == 0) { printf("[Error] sync managed_data2 %i\n", managed_data[i]); break; } } - - cudaFree(managed_data); cudaFree(managed_data2); return 0; diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c index 000f05d..e0e5c23 100644 --- a/test/multi_gpu/02_device_sync_event.c +++ b/test/multi_gpu/02_device_sync_event.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cc %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC // clang-format on @@ -13,7 +13,6 @@ // CHECK-SYNC-NOT: data race // CHECK-SYNC-NOT: [Error] sync - #include #include diff --git a/test/runtime/11_struct_of_buff.c b/test/runtime/11_struct_of_buff.c index eb594f1..121c3a7 100644 --- a/test/runtime/11_struct_of_buff.c +++ b/test/runtime/11_struct_of_buff.c @@ -10,7 +10,7 @@ // CHECK-SYNC-NOT: data race -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" struct BufferStorage { int* buff1; diff --git a/test/runtime/20_cuda_default_stream_sync.c b/test/runtime/20_cuda_default_stream_sync.c index 78aedfc..c232483 100644 --- a/test/runtime/20_cuda_default_stream_sync.c +++ b/test/runtime/20_cuda_default_stream_sync.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC // clang-format on @@ -13,8 +13,9 @@ // CHECK-SYNC-NOT: data race // CHECK-SYNC-NOT: [Error] sync -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" +#include #include __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { diff --git a/test/runtime/31_tsan_cuda_event.c b/test/runtime/31_tsan_cuda_event.c index beacaeb..aa213fa 100644 --- a/test/runtime/31_tsan_cuda_event.c +++ b/test/runtime/31_tsan_cuda_event.c @@ -11,7 +11,7 @@ // CHECK-SYNC-NOT: data race -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" #include diff --git a/test/runtime/32_tsan_async_copy.c b/test/runtime/32_tsan_async_copy.c index 5a6f7f7..14cfcd5 100644 --- a/test/runtime/32_tsan_async_copy.c +++ b/test/runtime/32_tsan_async_copy.c @@ -1,9 +1,9 @@ // clang-format off -// RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC // CHECK-DAG: data race // CHECK-DAG: [Error] sync @@ -13,8 +13,7 @@ // clang-format on -#include "../support/gpu_mpi.h" - +#include #include __global__ void kernel(int* arr, const int N) { @@ -32,11 +31,6 @@ __global__ void kernel(int* arr, const int N) { } int main(int argc, char* argv[]) { - if (!has_gpu_aware_mpi()) { - printf("This example is designed for CUDA-aware MPI. Exiting.\n"); - return 1; - } - cudaEvent_t first_finished_event; cudaEventCreate(&first_finished_event); cudaStream_t stream1; diff --git a/test/runtime/33_tsan_wait_event.c b/test/runtime/33_tsan_wait_event.c index c675758..af2a5a1 100644 --- a/test/runtime/33_tsan_wait_event.c +++ b/test/runtime/33_tsan_wait_event.c @@ -11,7 +11,7 @@ // clang-format on -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" #include From 335b4f1f2db593b69779f33a09b25294a981ce69 Mon Sep 17 00:00:00 2001 From: ahueck Date: Mon, 10 Mar 2025 14:26:09 +0100 Subject: [PATCH 07/15] Added callback for cuda synchronizations events (#5) Co-authored-by: Tim Ziegler --- .github/workflows/ci.yml | 32 +++++-------- CMakeLists.txt | 2 + lib/pass/AnalysisTransform.cpp | 36 +++++++++++++++ lib/pass/AnalysisTransform.h | 19 ++++++++ lib/pass/CusanPass.cpp | 7 ++- lib/pass/FunctionDecl.cpp | 5 ++ lib/pass/FunctionDecl.h | 2 +- lib/runtime/CusanRuntime.cpp | 25 +++++----- lib/runtime/CusanRuntime.h | 56 ++++++++++++++--------- lib/runtime/CusanRuntime_cudaSpecific.cpp | 6 +-- lib/support/Util.h | 2 +- test/lit.cfg | 4 +- test/multi_gpu/01_device_sync.c | 2 +- test/multi_gpu/TSan_External.h | 8 ++-- test/pass/TSan_External.h | 8 ++-- test/runtime/TSan_External.h | 8 ++-- 16 files changed, 146 insertions(+), 76 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d121748..014567f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,25 +10,6 @@ env: CC: clang jobs: - format-check: - runs-on: ubuntu-22.04 - - steps: - - uses: actions/checkout@v4 - - - name: Format source code - run: | - find lib test \ - -type f \ - -a \( -name "*.c" -o -name "*.cpp" -o -name "*.h" \) \ - -print0 \ - | xargs -0 clang-format-14 -i - - - name: Format check - run: | - git status --porcelain --untracked-files=no - git status --porcelain --untracked-files=no | xargs -o -I {} test -z \"{}\" - codespell: runs-on: ubuntu-22.04 @@ -44,19 +25,30 @@ jobs: - llvm-version: 14 os: ubuntu-22.04 preset: release + cuda: 11.8.0 + - llvm-version: 19 + os: ubuntu-24.04 + preset: release + cuda: 12.6.0 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 + - name: LLVM apt + if: ${{ matrix.llvm-version == 19 }} + run: | + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main" | sudo tee /etc/apt/sources.list.d/llvm-19.list + - name: Update apt run: sudo apt-get update - uses: Jimver/cuda-toolkit@v0.2.19 id: cuda-toolkit with: - cuda: '11.8.0' + cuda: '${{ matrix.cuda }}' method: network sub-packages: '["nvcc", "cudart", "cudart-dev"]' non-cuda-sub-packages: '["libcurand", "libcurand-dev"]' diff --git a/CMakeLists.txt b/CMakeLists.txt index 191bca8..58f070b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ project( ) set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) list( APPEND diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp index b6864e2..001e3f5 100644 --- a/lib/pass/AnalysisTransform.cpp +++ b/lib/pass/AnalysisTransform.cpp @@ -656,4 +656,40 @@ llvm::SmallVector CudaEventQuery::map_return_value(IRBuilder<>& irb, return {result}; } +CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) { + setup("cudaStreamSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* stream) + assert(args.size() == 1); + return {irb.getInt8(1)}; +} +llvm::SmallVector CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { + return {result}; +} + +CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) { + setup("cudaEventSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* event) + assert(args.size() == 1); + return {irb.getInt8(2)}; +} +llvm::SmallVector CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { + return {result}; +} + +CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) { + setup("cudaDeviceSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( ) + assert(args.size() == 0); + return {irb.getInt8(0)}; +} +llvm::SmallVector CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { + return {result}; +} + } // namespace cusan::transform diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h index 453f359..55e3f1b 100644 --- a/lib/pass/AnalysisTransform.h +++ b/lib/pass/AnalysisTransform.h @@ -229,6 +229,25 @@ class CudaEventQuery : public SimpleInstrumenter { static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); }; +class CudaStreamSyncCallback : public SimpleInstrumenter { + public: + CudaStreamSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; +class CudaEventSyncCallback : public SimpleInstrumenter { + public: + CudaEventSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; +class CudaDeviceSyncCallback : public SimpleInstrumenter { + public: + CudaDeviceSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; + } // namespace transform } // namespace cusan diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp index ee0f0d8..aebe65f 100644 --- a/lib/pass/CusanPass.cpp +++ b/lib/pass/CusanPass.cpp @@ -60,7 +60,7 @@ class LegacyCusanPass : public llvm::ModulePass { public: static char ID; // NOLINT - LegacyCusanPass() : ModulePass(ID){}; + LegacyCusanPass() : ModulePass(ID) {}; bool runOnModule(llvm::Module& module) override; @@ -183,6 +183,11 @@ bool CusanPass::runOnFunc(llvm::Function& function) { modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function); modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function); + // callbacks + modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function); + modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function); + modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function); + auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_); if (data_for_host) { LOG_FATAL("Found kernel data for " << util::try_demangle_fully(function) << ": " diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp index 91e029d..9e8477f 100644 --- a/lib/pass/FunctionDecl.cpp +++ b/lib/pass/FunctionDecl.cpp @@ -1,4 +1,5 @@ #include "FunctionDecl.h" + #include namespace cusan::callback { @@ -133,6 +134,10 @@ void FunctionDecl::initialize(llvm::Module& module) { // void* device ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()}; make_function(cusan_choose_device, arg_types_choose_device); + + // u8 evenType, u32 returnValue + ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), Type::getInt32Ty(c)}; + make_function(cusan_sync_callback, arg_types_sync_callback); } } // namespace cusan::callback diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h index 91eabe5..f6457c1 100644 --- a/lib/pass/FunctionDecl.h +++ b/lib/pass/FunctionDecl.h @@ -46,7 +46,7 @@ struct FunctionDecl { CusanFunction cusan_device_free{"_cusan_device_free"}; CusanFunction cusan_stream_query{"_cusan_stream_query"}; CusanFunction cusan_event_query{"_cusan_event_query"}; - + CusanFunction cusan_sync_callback{"cusan_sync_callback"}; void initialize(llvm::Module& m); }; diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 6fd6358..be31ff1 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -91,9 +91,7 @@ class Device { Device() : stats_recorder() { // every device has a default stream - { - register_stream(Device::kDefaultStream); - } + { register_stream(Device::kDefaultStream); } cpu_fiber_ = TsanGetCurrentFiber(); } @@ -469,7 +467,7 @@ void _cusan_create_event(Event*) { runtime.stats_recorder.inc_create_event_calls(); } -void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { +void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags) { LOG_TRACE("[cusan]create stream with flags: " << flags << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking)) auto& runtime = Runtime::get().get_current_device(); @@ -620,10 +618,10 @@ void _cusan_memset_impl(void* target, size_t count) { // r.happens_after_stream(Runtime::default_stream)); } -void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind) { +void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind) { _cusan_memset_impl(target, pitch * height); } -void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind, RawStream stream) { +void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind, RawStream stream) { _cusan_memset_async_impl(target, pitch * height, stream); } @@ -638,7 +636,7 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) { } void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size, - cusan_MemcpyKind kind, RawStream stream) { + cusan_memcpy_kind kind, RawStream stream) { auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memcpy_async_calls(); if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) { @@ -670,7 +668,7 @@ void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, } } -void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind) { +void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_memcpy_kind kind) { // TODO verify that the memcpy2d beheaviour is actually the same as normal memcpy if (kind == cusan_MemcpyDefault) { @@ -751,7 +749,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ } void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind kind, RawStream stream) { + cusan_memcpy_kind kind, RawStream stream) { LOG_TRACE("[cusan]MemcpyAsync" << width * height << " bytes to:" << target) size_t read_size = spitch * height; @@ -759,20 +757,23 @@ void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_ _cusan_memcpy_async_impl(target, write_size, from, read_size, kind, stream); } -void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) { +void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream) { LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target) _cusan_memcpy_async_impl(target, count, from, count, kind, stream); } void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind kind) { + cusan_memcpy_kind kind) { LOG_TRACE("[cusan]Memcpy2d " << width * height << " from:" << from << " to:" << target); size_t read_size = spitch * height; size_t write_size = dpitch * height; _cusan_memcpy_impl(target, write_size, from, read_size, kind); } -void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) { +void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind kind) { LOG_TRACE("[cusan]Memcpy " << count << " from:" << from << " to:" << target); _cusan_memcpy_impl(target, count, from, count, kind); } + +void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/) { +} diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index 784f805..0b5fb60 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -19,12 +19,12 @@ typedef enum cusan_memcpy_kind_t : unsigned int { cusan_MemcpyDeviceToHost = 2, cusan_MemcpyDeviceToDevice = 3, cusan_MemcpyDefault = 4, -} cusan_MemcpyKind; +} cusan_memcpy_kind; typedef enum cusan_stream_create_flags_t : unsigned int { cusan_StreamFlagsDefault = 0, cusan_StreamFlagsNonBlocking = 1, -} cusan_StreamCreateFlags; +} cusan_stream_create_flags; #ifdef __cplusplus } #endif @@ -34,14 +34,14 @@ namespace cusan::runtime { using TsanFiber = void*; using Event = const void*; using RawStream = const void*; -using DeviceID = int; -cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); +using DeviceID = int; +cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from); DeviceID get_current_device_id(); } // namespace cusan::runtime +using cusan::runtime::DeviceID; using cusan::runtime::Event; using cusan::runtime::RawStream; using cusan::runtime::TsanFiber; -using cusan::runtime::DeviceID; #else #define TsanFiber void* #define Event const void* @@ -55,26 +55,35 @@ extern "C" { #endif void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream); + void _cusan_sync_device(); -void _cusan_event_record(Event event, RawStream stream); +void _cusan_set_device(DeviceID device); +void _cusan_choose_device(DeviceID* device); + void _cusan_sync_stream(RawStream stream); +void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags); +void _cusan_stream_query(RawStream stream, unsigned int err); + void _cusan_sync_event(Event event); -void _cusan_stream_event(Event event); +void _cusan_event_record(Event event, RawStream stream); void _cusan_create_event(RawStream* event); -void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags); -void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream); -void _cusan_memset_async(void* target, size_t count, RawStream stream); -void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind); +void _cusan_event_query(Event event, unsigned int err); +void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); +void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); + +void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind); +void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream); void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind); + cusan_memcpy_kind); void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind, RawStream stream); -void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind); -void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind, - RawStream stream); + cusan_memcpy_kind, RawStream stream); + void _cusan_memset(void* target, size_t count); -void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); -void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); +void _cusan_memset_async(void* target, size_t count, RawStream stream); +void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind); +void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind, + RawStream stream); + void _cusan_host_alloc(void** ptr, size_t size, unsigned int flags); void _cusan_host_free(void* ptr); void _cusan_managed_alloc(void** ptr, size_t size, unsigned int flags); @@ -83,11 +92,14 @@ void _cusan_host_register(void* ptr, size_t size, unsigned int flags); void _cusan_host_unregister(void* ptr); void _cusan_device_alloc(void** ptr, size_t size); void _cusan_device_free(void* ptr); -void _cusan_stream_query(RawStream stream, unsigned int err); -void _cusan_event_query(Event event, unsigned int err); -void _cusan_set_device(DeviceID device); -void _cusan_choose_device(DeviceID* device); +typedef enum cusan_sync_type_t : unsigned char { + cusan_Device = 0, + cusan_Stream = 1, + cusan_Event = 2, +} cusan_sync_type; + +void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/); #ifdef __cplusplus } diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp index fbe7d26..970401d 100644 --- a/lib/runtime/CusanRuntime_cudaSpecific.cpp +++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp @@ -11,15 +11,13 @@ namespace cusan::runtime { - - -DeviceID get_current_device_id(){ +DeviceID get_current_device_id() { DeviceID res; cudaGetDevice(&res); return res; } -cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) { +cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); assert(prop.unifiedAddressing && "Can only use default direction for memcpy when Unified memory is supported."); diff --git a/lib/support/Util.h b/lib/support/Util.h index 54014a9..081c6fc 100644 --- a/lib/support/Util.h +++ b/lib/support/Util.h @@ -34,7 +34,7 @@ inline std::string demangle(String&& s) { #if LLVM_VERSION_MAJOR >= 15 auto demangle = llvm::itaniumDemangle(name.data(), false); #else - auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); + auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); #endif if (demangle && !std::string(demangle).empty()) { return {demangle}; diff --git a/test/lit.cfg b/test/lit.cfg index 9836a64..8146092 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -46,7 +46,7 @@ if config.cusan_typeart: config.substitutions.append(("%clang-cpp", clang_cpp)) config.substitutions.append(("%clang-cc", clang_cc)) config.substitutions.append(("%opt", opt)) -config.substitutions.append(("%filecheck", filecheck + " --dump-input fail")) +config.substitutions.append(("%filecheck", filecheck + " --dump-input fail --dump-input-context=40")) config.substitutions.append(("%lib_dir", cusan_lib_dir)) config.substitutions.append(("%pass_dir", cusan_pass_dir)) @@ -77,4 +77,4 @@ config.substitutions.append(('%clang_args', '-D__STRICT_ANSI__ -fPIC -O1 -g0')) config.substitutions.append(('%rm-file', 'rm -f ')) -config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -O1 -c')) +config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -std=c++17 -O1 -c')) diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c index cc0b7b8..d069a70 100644 --- a/test/multi_gpu/01_device_sync.c +++ b/test/multi_gpu/01_device_sync.c @@ -55,7 +55,7 @@ int main() { cudaSetDevice(0); write_kernel_delay<<>>(managed_data, size, 1316134912); - // if we only have the later synchronize we will only syncrhonize the second device + // if we only have the later synchronize we will only synchronize the second device #ifdef CUSAN_SYNC cudaDeviceSynchronize(); #endif diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h index 36849c9..c7b5796 100644 --- a/test/multi_gpu/TSan_External.h +++ b/test/multi_gpu/TSan_External.h @@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } diff --git a/test/pass/TSan_External.h b/test/pass/TSan_External.h index 574f0db..c543fdc 100644 --- a/test/pass/TSan_External.h +++ b/test/pass/TSan_External.h @@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } diff --git a/test/runtime/TSan_External.h b/test/runtime/TSan_External.h index 36849c9..c7b5796 100644 --- a/test/runtime/TSan_External.h +++ b/test/runtime/TSan_External.h @@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } From 2cd2b54299227d06badd087209dee30b97b4d00f Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Wed, 12 Mar 2025 10:10:26 +0100 Subject: [PATCH 08/15] Put mpiexec tests behind a requires --- test/runtime/03_cuda_to_mpi.c | 2 ++ test/runtime/04_mpi_to_cuda.c | 2 ++ test/runtime/05_cuda_to_mpi_stream.c | 2 ++ test/runtime/06_cuda_to_mpi_event.c | 2 ++ test/runtime/07_cuda_to_mpi_read.c | 2 ++ test/runtime/08_cudamemcpy_to_mpi.c | 2 ++ test/runtime/11_cuda_to_mpi_struct_of_buff.c | 2 ++ test/runtime/12_struct_ptr.c | 2 ++ test/runtime/18_cuda_to_mpi_event_query_busy_loop.c | 2 ++ ...da_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c | 2 ++ test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c | 3 +++ test/runtime/21_chunked_streams_example.c | 2 ++ test/runtime/22_cuda_to_mpi_partial_buff_write.c | 2 ++ test/runtime/26_malloc_pitch.c | 2 ++ test/runtime/29_tsan_cuda_to_mpi.c | 2 ++ test/runtime/30_tsan_annotate_cuda_to_mpi.c | 3 +++ test/runtime/lit.local.cfg | 12 ++++++++++++ 17 files changed, 46 insertions(+) diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c index bbb61e6..d7f11b6 100644 --- a/test/runtime/03_cuda_to_mpi.c +++ b/test/runtime/03_cuda_to_mpi.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c index 3b54714..370f48f 100644 --- a/test/runtime/04_mpi_to_cuda.c +++ b/test/runtime/04_mpi_to_cuda.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c index e07b35c..af827b9 100644 --- a/test/runtime/05_cuda_to_mpi_stream.c +++ b/test/runtime/05_cuda_to_mpi_stream.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c index b44e4a3..0585479 100644 --- a/test/runtime/06_cuda_to_mpi_event.c +++ b/test/runtime/06_cuda_to_mpi_event.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c index 8d36e6b..e367e2a 100644 --- a/test/runtime/07_cuda_to_mpi_read.c +++ b/test/runtime/07_cuda_to_mpi_read.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s +// REQUIRES: mpiexec + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c index 18c6671..b8fa5bf 100644 --- a/test/runtime/08_cudamemcpy_to_mpi.c +++ b/test/runtime/08_cudamemcpy_to_mpi.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty +// REQUIRES: mpiexec + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c index ea875dc..4a3141d 100644 --- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c +++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c index 3792d13..5a71312 100644 --- a/test/runtime/12_struct_ptr.c +++ b/test/runtime/12_struct_ptr.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // CHECK-DAG: data race // CHECK-SYNC-NOT: data race diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c index cf613ca..73674a7 100644 --- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index c5b221e..61eee40 100644 --- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c index 5d72ad7..820912b 100644 --- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -4,6 +4,9 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c index 90b21e0..fa4f5d0 100644 --- a/test/runtime/21_chunked_streams_example.c +++ b/test/runtime/21_chunked_streams_example.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s +// REQUIRES: mpiexec + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c index 1c8ffcd..8a29dd9 100644 --- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c +++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty +// REQUIRES: mpiexec + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c index b2b12da..3d7524f 100644 --- a/test/runtime/26_malloc_pitch.c +++ b/test/runtime/26_malloc_pitch.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c index a39b5a0..14ebfa1 100644 --- a/test/runtime/29_tsan_cuda_to_mpi.c +++ b/test/runtime/29_tsan_cuda_to_mpi.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpiexec + // clang-format on // CHECK: [Error] sync diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c index 13246ef..24e59ce 100644 --- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c +++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c @@ -4,6 +4,9 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: mpiexec + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/lit.local.cfg b/test/runtime/lit.local.cfg index 3b823ec..f681f81 100644 --- a/test/runtime/lit.local.cfg +++ b/test/runtime/lit.local.cfg @@ -1,4 +1,16 @@ +import subprocess + if config.cusan_use_workarounds: config.environment['NEOReadDebugKeys'] = '1' config.environment['DisableDeepBind'] = '1' config.environment['OMPI_MCA_memory'] = '^patcher' + +def has_mpirun(): + try: + subprocess.check_call(['mpirun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return True + except subprocess.CalledProcessError: + return False + + +config.available_features.add('openmpi' if has_mpirun() else '') \ No newline at end of file From c791956bb1ef7feaa45fb22cf93ebd3851f1d65e Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Wed, 12 Mar 2025 10:17:56 +0100 Subject: [PATCH 09/15] Put multi gpu tests behind a requires --- test/multi_gpu/01_device_sync.c | 2 ++ test/multi_gpu/02_device_sync_event.c | 2 ++ test/multi_gpu/lit.local.cfg | 16 ++++++++++++++++ 3 files changed, 20 insertions(+) create mode 100644 test/multi_gpu/lit.local.cfg diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c index d069a70..52747c3 100644 --- a/test/multi_gpu/01_device_sync.c +++ b/test/multi_gpu/01_device_sync.c @@ -5,6 +5,8 @@ // RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: multigpu + // clang-format on // CHECK-DAG: data race diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c index e0e5c23..a72a7fe 100644 --- a/test/multi_gpu/02_device_sync_event.c +++ b/test/multi_gpu/02_device_sync_event.c @@ -5,6 +5,8 @@ // RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: multigpu + // clang-format on // CHECK-DAG: data race diff --git a/test/multi_gpu/lit.local.cfg b/test/multi_gpu/lit.local.cfg new file mode 100644 index 0000000..0f360f0 --- /dev/null +++ b/test/multi_gpu/lit.local.cfg @@ -0,0 +1,16 @@ +import subprocess + +if config.cusan_use_workarounds: + config.environment['NEOReadDebugKeys'] = '1' + config.environment['DisableDeepBind'] = '1' + config.environment['OMPI_MCA_memory'] = '^patcher' + +def has_multigpu(): + try: + subprocess.check_call(['nvidia-smi', '-i=1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return True + except subprocess.CalledProcessError: + return False + + +config.available_features.add('multigpu' if has_multigpu() else '') \ No newline at end of file From 96b619974a29cf5557d39c5dfbe502028e2f4df8 Mon Sep 17 00:00:00 2001 From: Tim Ziegler Date: Thu, 13 Mar 2025 10:49:55 +0100 Subject: [PATCH 10/15] Added stream/event to sync callback --- lib/pass/AnalysisTransform.cpp | 6 +++--- lib/pass/FunctionDecl.cpp | 2 +- lib/runtime/CusanRuntime.cpp | 13 ++++++++++++- lib/runtime/CusanRuntime.h | 8 ++++---- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp index 001e3f5..e84c7eb 100644 --- a/lib/pass/AnalysisTransform.cpp +++ b/lib/pass/AnalysisTransform.cpp @@ -662,7 +662,7 @@ CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) { llvm::SmallVector CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { //( void* stream) assert(args.size() == 1); - return {irb.getInt8(1)}; + return {irb.getInt8(1), args[0]}; } llvm::SmallVector CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { return {result}; @@ -674,7 +674,7 @@ CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) { llvm::SmallVector CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { //( void* event) assert(args.size() == 1); - return {irb.getInt8(2)}; + return {irb.getInt8(2), args[0]}; } llvm::SmallVector CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { return {result}; @@ -686,7 +686,7 @@ CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) { llvm::SmallVector CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { //( ) assert(args.size() == 0); - return {irb.getInt8(0)}; + return {irb.getInt8(0), ConstantPointerNull::get(irb.getPtrTy())}; } llvm::SmallVector CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { return {result}; diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp index 9e8477f..cefd273 100644 --- a/lib/pass/FunctionDecl.cpp +++ b/lib/pass/FunctionDecl.cpp @@ -136,7 +136,7 @@ void FunctionDecl::initialize(llvm::Module& module) { make_function(cusan_choose_device, arg_types_choose_device); // u8 evenType, u32 returnValue - ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), Type::getInt32Ty(c)}; + ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), void_ptr, Type::getInt32Ty(c)}; make_function(cusan_sync_callback, arg_types_sync_callback); } diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index be31ff1..0ec5213 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -775,5 +775,16 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki _cusan_memcpy_impl(target, count, from, count, kind); } -void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/) { +void cusan_sync_callback(cusan_sync_type type, const void* event_or_stream, unsigned int return_value) { + //switch (type) { + // case cusan_Device: + // printf("Device sync return value %i\n", return_value); + // break; + // case cusan_Stream: + // printf("Stream %#x sync return value %i\n", event_or_stream, return_value); + // break; + // case cusan_Event: + // printf("Event %#x sync return value %i\n", event_or_stream, return_value); + // break; + //} } diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index 0b5fb60..048e566 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -94,12 +94,12 @@ void _cusan_device_alloc(void** ptr, size_t size); void _cusan_device_free(void* ptr); typedef enum cusan_sync_type_t : unsigned char { - cusan_Device = 0, - cusan_Stream = 1, - cusan_Event = 2, + cusan_Device = 0, //second argument is a nullptr + cusan_Stream = 1, //second argument is the stream pointer + cusan_Event = 2, //second argument is the event pointer } cusan_sync_type; -void cusan_sync_callback(cusan_sync_type /*type*/, unsigned int /*return_value*/); +void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event or stream*/, unsigned int /*return_value*/); #ifdef __cplusplus } From 7b517c3ea123a0022bea70fde5135ba6244a7db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=BCck?= Date: Sun, 20 Apr 2025 13:43:32 +0200 Subject: [PATCH 11/15] Fix LLVM-14 compile error, fix some warnings --- cmake/cusanToolchain.cmake | 2 ++ cmake/modules/cusan-format.cmake | 7 ++--- externals/CMakeLists.txt | 6 +++++ lib/pass/AnalysisTransform.cpp | 13 ++++++--- lib/pass/CMakeLists.txt | 1 + lib/pass/CusanPass.cpp | 4 ++- lib/runtime/CusanRuntime.cpp | 45 +++++++++++++++++--------------- lib/runtime/StatsCounter.h | 4 +-- lib/runtime/TSanInterface.h | 3 +++ 9 files changed, 54 insertions(+), 31 deletions(-) diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index 9e98131..449a0ff 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -41,6 +41,8 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF) option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON) +option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" ON) + option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON) mark_as_advanced(CUSAN_TEST_WORKAROUNDS) diff --git a/cmake/modules/cusan-format.cmake b/cmake/modules/cusan-format.cmake index bacdf5d..96c393d 100644 --- a/cmake/modules/cusan-format.cmake +++ b/cmake/modules/cusan-format.cmake @@ -20,11 +20,12 @@ function(cusan_target_format target comment) filter_dir(${exclude}) endforeach() - find_program(FORMAT_COMMAND + find_program(CLANG_FORMAT_COMMAND NAMES clang-format-${LLVM_VERSION_MAJOR} clang-format) - if(FORMAT_COMMAND) + mark_as_advanced(CLANG_FORMAT_COMMAND) + if(CLANG_FORMAT_COMMAND) add_custom_target(${target} - COMMAND ${FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS} + COMMAND ${CLANG_FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS} ${ALL_CXX_FILES} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMENT "${comment}" diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 863ccc1..4dc9ddf 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -42,3 +42,9 @@ if(CUSAN_TYPEART) ) FetchContent_MakeAvailable(typeart) endif() + +mark_as_advanced( + FETCHCONTENT_BASE_DIR + FETCHCONTENT_FULLY_DISCONNECTED + FETCHCONTENT_QUIET + FETCHCONTENT_UPDATES_DISCONNECTED) \ No newline at end of file diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp index e84c7eb..6da40e1 100644 --- a/lib/pass/AnalysisTransform.cpp +++ b/lib/pass/AnalysisTransform.cpp @@ -664,7 +664,7 @@ llvm::SmallVector CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb assert(args.size() == 1); return {irb.getInt8(1), args[0]}; } -llvm::SmallVector CudaStreamSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { +llvm::SmallVector CudaStreamSyncCallback::map_return_value(IRBuilder<>&, Value* result) { return {result}; } @@ -676,7 +676,7 @@ llvm::SmallVector CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, assert(args.size() == 1); return {irb.getInt8(2), args[0]}; } -llvm::SmallVector CudaEventSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { +llvm::SmallVector CudaEventSyncCallback::map_return_value(IRBuilder<>&, Value* result) { return {result}; } @@ -686,9 +686,14 @@ CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) { llvm::SmallVector CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { //( ) assert(args.size() == 0); - return {irb.getInt8(0), ConstantPointerNull::get(irb.getPtrTy())}; +#if LLVM_VERSION_MAJOR < 15 + auto* ptr_type = PointerType::getUnqual(irb.getContext()); +#else + auto* ptr_type = irb.getPtrTy(); +#endif + return {irb.getInt8(0), ConstantPointerNull::get(ptr_type)}; } -llvm::SmallVector CudaDeviceSyncCallback::map_return_value(IRBuilder<>& irb, Value* result) { +llvm::SmallVector CudaDeviceSyncCallback::map_return_value(IRBuilder<>&, Value* result) { return {result}; } diff --git a/lib/pass/CMakeLists.txt b/lib/pass/CMakeLists.txt index f52baa5..8fbefd9 100644 --- a/lib/pass/CMakeLists.txt +++ b/lib/pass/CMakeLists.txt @@ -14,6 +14,7 @@ mark_as_advanced(LLVM_CUSAN_TRANSFORMPASS_LINK_INTO_TOOLS) target_compile_definitions(cusan_TransformPass PRIVATE $<$:CUSAN_TYPEART=1> + $<$:CUSAN_DEVICE_SYNC_CALLBACKS=1> ) set_target_properties( diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp index aebe65f..48cbf50 100644 --- a/lib/pass/CusanPass.cpp +++ b/lib/pass/CusanPass.cpp @@ -60,7 +60,7 @@ class LegacyCusanPass : public llvm::ModulePass { public: static char ID; // NOLINT - LegacyCusanPass() : ModulePass(ID) {}; + LegacyCusanPass() : ModulePass(ID){}; bool runOnModule(llvm::Module& module) override; @@ -184,9 +184,11 @@ bool CusanPass::runOnFunc(llvm::Function& function) { modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function); // callbacks +#ifdef CUSAN_DEVICE_SYNC_CALLBACKS modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function); modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function); modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function); +#endif auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_); if (data_for_host) { diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 0ec5213..da2488c 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -12,7 +12,7 @@ #include "analysis/KernelModel.h" #include "support/Logger.h" #include "StatsCounter.h" -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER #include "support/Table.h" #endif #include "TSanInterface.h" @@ -217,7 +217,7 @@ class Device { void output_statistics() { #undef cusan_stat_handle #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name())); -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER Table table{"Cusan device statistics"}; #ifdef CUSAN_FIBERPOOL table.put(Row::make("Fiberpool", 1)); @@ -244,7 +244,7 @@ class Runtime { std::map> events_; int32_t current_device_; bool init_; -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER softcounter::AtomicCounter device_switches = 0; #endif public: @@ -262,7 +262,7 @@ class Runtime { void operator=(const Runtime&) = delete; -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER inline void inc_device_switches() { this->device_switches++; } @@ -286,7 +286,7 @@ class Runtime { if (devices_.find(device) == devices_.end()) { devices_[device]; } -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER if (current_device_ != device) { inc_device_switches(); } @@ -312,7 +312,7 @@ class Runtime { Runtime() = default; ~Runtime() { -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER for (auto& [_, device] : devices_) { device.output_statistics(); } @@ -595,7 +595,8 @@ void _cusan_memset_impl(void* target, size_t count) { auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memset_calls(); runtime.switch_to_stream(Device::kDefaultStream); - LOG_TRACE("[cusan] " << "Write to " << target << " with size: " << count) + LOG_TRACE("[cusan] " + << "Write to " << target << " with size: " << count) TsanMemoryWritePC(target, count, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); @@ -604,10 +605,12 @@ void _cusan_memset_impl(void* target, size_t count) { auto* alloc_info = runtime.get_allocation_info(target); // if we couldn't find alloc info we just assume the worst and don't sync if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) { - LOG_TRACE("[cusan] " << "Memset is blocking") + LOG_TRACE("[cusan] " + << "Memset is blocking") runtime.happens_after_stream(Device::kDefaultStream); } else { - LOG_TRACE("[cusan] " << "Memset is not blocking") + LOG_TRACE("[cusan] " + << "Memset is not blocking") if (!alloc_info) { LOG_DEBUG("[cusan] Failed to get alloc info " << target); } else if (!alloc_info->is_pinned && !alloc_info->is_managed) { @@ -775,16 +778,16 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki _cusan_memcpy_impl(target, count, from, count, kind); } -void cusan_sync_callback(cusan_sync_type type, const void* event_or_stream, unsigned int return_value) { - //switch (type) { - // case cusan_Device: - // printf("Device sync return value %i\n", return_value); - // break; - // case cusan_Stream: - // printf("Stream %#x sync return value %i\n", event_or_stream, return_value); - // break; - // case cusan_Event: - // printf("Event %#x sync return value %i\n", event_or_stream, return_value); - // break; - //} +void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) { + // switch (type) { + // case cusan_Device: + // printf("Device sync return value %i\n", return_value); + // break; + // case cusan_Stream: + // printf("Stream %#x sync return value %i\n", event_or_stream, return_value); + // break; + // case cusan_Event: + // printf("Event %#x sync return value %i\n", event_or_stream, return_value); + // break; + // } } diff --git a/lib/runtime/StatsCounter.h b/lib/runtime/StatsCounter.h index 2b00377..5e66b6e 100644 --- a/lib/runtime/StatsCounter.h +++ b/lib/runtime/StatsCounter.h @@ -107,9 +107,9 @@ class NoneRecorder { Statistics stats_r; CUSAN_CUDA_EVENT_LIST #include "TsanEvents.inc" - void inc_TsanMemoryReadCount(unsigned count) { + void inc_TsanMemoryReadCount(unsigned) { } - void inc_TsanMemoryWriteCount(unsigned count) { + void inc_TsanMemoryWriteCount(unsigned) { } }; diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h index 88d0f48..324b688 100644 --- a/lib/runtime/TSanInterface.h +++ b/lib/runtime/TSanInterface.h @@ -1,6 +1,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat" #pragma GCC diagnostic ignored "-Wmacro-redefined" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wformat-pedantic" +#pragma GCC diagnostic ignored "-Wcast-qual" #include "TSan_External.h" From 3a838df11e1c815471419517166150fa4544852d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=BCck?= Date: Sun, 20 Apr 2025 15:10:58 +0200 Subject: [PATCH 12/15] Fix MPI check for tests --- test/lit.cfg | 3 +++ test/runtime/lit.local.cfg | 12 ------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/test/lit.cfg b/test/lit.cfg index 8146092..cf956ce 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -43,6 +43,9 @@ else: if config.cusan_typeart: config.available_features.add('typeart') +if config.mpiexec: + config.available_features.add('mpiexec') + config.substitutions.append(("%clang-cpp", clang_cpp)) config.substitutions.append(("%clang-cc", clang_cc)) config.substitutions.append(("%opt", opt)) diff --git a/test/runtime/lit.local.cfg b/test/runtime/lit.local.cfg index f681f81..3b823ec 100644 --- a/test/runtime/lit.local.cfg +++ b/test/runtime/lit.local.cfg @@ -1,16 +1,4 @@ -import subprocess - if config.cusan_use_workarounds: config.environment['NEOReadDebugKeys'] = '1' config.environment['DisableDeepBind'] = '1' config.environment['OMPI_MCA_memory'] = '^patcher' - -def has_mpirun(): - try: - subprocess.check_call(['mpirun', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - return True - except subprocess.CalledProcessError: - return False - - -config.available_features.add('openmpi' if has_mpirun() else '') \ No newline at end of file From 616f8619970df1c282193a1f1c1f6f253236c217 Mon Sep 17 00:00:00 2001 From: ahueck Date: Sun, 20 Apr 2025 18:14:05 +0200 Subject: [PATCH 13/15] MPI dependency optional (#7) --- CMakeLists.txt | 10 +- cmake/cusanToolchain.cmake | 2 +- lib/runtime/CMakeLists.txt | 126 +++++++++--------- lib/runtime/TSanInterface.h | 6 +- scripts/CMakeLists.txt | 1 - test/CMakeLists.txt | 23 +++- test/kernel_analysis/03_struct_write.c | 2 + test/kernel_analysis/08_big_struct_write.c | 2 + test/lit.cfg | 2 + test/lit.site.cfg.in | 1 + test/pass/03_cuda_to_mpi.c | 4 +- test/pass/04_mpi_to_cuda.c | 2 + test/pass/05_cuda_to_mpi_stream.c | 2 + test/pass/06_cuda_to_mpi_event.c | 2 + test/pass/07_cuda_to_mpi_read.c | 2 + test/pass/08_cudamemcpy_to_mpi.c | 2 + test/pass/11_cuda_to_mpi_struct_of_buff.c | 2 + .../18_cuda_to_mpi_event_query_busy_loop.c | 1 + ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c | 1 + test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c | 2 + test/pass/21_chunked_streams_example.c | 2 + test/pass/22_cuda_to_mpi_partial_buff_write.c | 2 + test/pass/26_malloc_pitch.c | 2 + test/pass/29_tsan_cuda_to_mpi.c | 1 + test/pass/30_tsan_annotate_cuda_to_mpi.c | 2 + test/pass/31_tsan_cuda_event.c | 9 +- test/pass/32_tsan_async_copy.c | 2 +- test/runtime/03_cuda_to_mpi.c | 2 +- test/runtime/04_mpi_to_cuda.c | 2 +- test/runtime/05_cuda_to_mpi_stream.c | 2 +- test/runtime/06_cuda_to_mpi_event.c | 2 +- test/runtime/07_cuda_to_mpi_read.c | 2 +- test/runtime/08_cudamemcpy_to_mpi.c | 2 +- test/runtime/11_cuda_to_mpi_struct_of_buff.c | 2 +- test/runtime/12_struct_ptr.c | 2 +- .../18_cuda_to_mpi_event_query_busy_loop.c | 2 +- ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c | 2 +- .../runtime/20_cuda_to_mpi_send_ds_sync_w_r.c | 2 +- test/runtime/21_chunked_streams_example.c | 2 +- .../22_cuda_to_mpi_partial_buff_write.c | 2 +- test/runtime/26_malloc_pitch.c | 2 +- test/runtime/29_tsan_cuda_to_mpi.c | 2 +- test/runtime/30_tsan_annotate_cuda_to_mpi.c | 2 +- 43 files changed, 146 insertions(+), 101 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58f070b..69051f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,11 +30,6 @@ add_subdirectory(externals) add_subdirectory(lib) add_subdirectory(scripts) -if(PROJECT_IS_TOP_LEVEL) - enable_testing() - add_subdirectory(test) -endif() - write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/cusanConfigVersion.cmake VERSION ${PROJECT_VERSION} @@ -68,3 +63,8 @@ if(PROJECT_IS_TOP_LEVEL) QUIET_ON_EMPTY ) endif() + +if(PROJECT_IS_TOP_LEVEL) + enable_testing() + add_subdirectory(test) +endif() diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index 449a0ff..cf2a55a 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" ) find_package(CUDAToolkit REQUIRED) -find_package(MPI REQUIRED) +find_package(MPI) option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON) mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE) diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt index 741bbc2..5397584 100644 --- a/lib/runtime/CMakeLists.txt +++ b/lib/runtime/CMakeLists.txt @@ -1,67 +1,69 @@ -find_package(MPI) -add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp) -target_link_libraries(cusan_mpi_interceptor - PRIVATE MPI::MPI_CXX -) -set_target_properties( - cusan_mpi_interceptor - PROPERTIES - OUTPUT_NAME "CusanMPIInterceptor" - EXPORT_NAME "MPIInterceptor" -) -add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor) -target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17) -target_compile_definitions( - cusan_mpi_interceptor - PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} - LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} - $<$:CUSAN_FIBERPOOL=1> - $<$:CUSAN_TYPEART=1> - $<$:CUSAN_SOFTCOUNTER> -) - -target_include_directories(cusan_mpi_interceptor ${warning_guard} - PUBLIC $ - PRIVATE $ -) - -target_include_directories(cusan_mpi_interceptor - SYSTEM - PRIVATE - ${LLVM_INCLUDE_DIRS} -) - -target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport) - -if(CUSAN_FIBERPOOL) - target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool) - set_target_properties(cusan_mpi_interceptor PROPERTIES - BUILD_WITH_INSTALL_RPATH FALSE - LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") +if (MPI_FOUND) + add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp) + target_link_libraries(cusan_mpi_interceptor + PRIVATE MPI::MPI_CXX + ) + set_target_properties( + cusan_mpi_interceptor + PROPERTIES + OUTPUT_NAME "CusanMPIInterceptor" + EXPORT_NAME "MPIInterceptor" + ) + add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor) + + target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17) + target_compile_definitions( + cusan_mpi_interceptor + PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} + LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} + $<$:CUSAN_FIBERPOOL=1> + $<$:CUSAN_TYPEART=1> + $<$:CUSAN_SOFTCOUNTER> + ) + + target_include_directories(cusan_mpi_interceptor ${warning_guard} + PUBLIC $ + PRIVATE $ + ) + + target_include_directories(cusan_mpi_interceptor + SYSTEM + PRIVATE + ${LLVM_INCLUDE_DIRS} + ) + + target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport) + + if(CUSAN_FIBERPOOL) + target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool) + set_target_properties(cusan_mpi_interceptor PROPERTIES + BUILD_WITH_INSTALL_RPATH FALSE + LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + endif() + + set(CONFIG_NAME cusanMPIInterceptor) + set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets) + + install( + TARGETS cusan_mpi_interceptor + EXPORT ${TARGETS_EXPORT_NAME} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + install( + EXPORT ${TARGETS_EXPORT_NAME} + NAMESPACE cusan:: + DESTINATION ${CUSAN_INSTALL_CONFIGDIR} + ) + + export( + EXPORT ${TARGETS_EXPORT_NAME} + FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake + NAMESPACE cusan:: + ) endif() -set(CONFIG_NAME cusanMPIInterceptor) -set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets) - -install( - TARGETS cusan_mpi_interceptor - EXPORT ${TARGETS_EXPORT_NAME} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -) - -install( - EXPORT ${TARGETS_EXPORT_NAME} - NAMESPACE cusan:: - DESTINATION ${CUSAN_INSTALL_CONFIGDIR} -) - -export( - EXPORT ${TARGETS_EXPORT_NAME} - FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake - NAMESPACE cusan:: -) - add_library(cusan_Runtime SHARED CusanRuntime.cpp CusanRuntime_cudaSpecific.cpp) set_target_properties( cusan_Runtime diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h index 324b688..3bfe367 100644 --- a/lib/runtime/TSanInterface.h +++ b/lib/runtime/TSanInterface.h @@ -1,9 +1,11 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat" -#pragma GCC diagnostic ignored "-Wmacro-redefined" #pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wformat-pedantic" #pragma GCC diagnostic ignored "-Wcast-qual" +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wmacro-redefined" +#pragma clang diagnostic ignored "-Wformat-pedantic" +#endif #include "TSan_External.h" diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 74d956c..b27f7fb 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -175,7 +175,6 @@ if(NOT CUSAN_TYPEART) ) endif() -find_package(MPI) if(MPI_C_FOUND) configure_cusan_script( ${CUSAN_WRAPPER} cusan-mpicc${CMAKE_DEBUG_POSTFIX} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 64266d6..d13fe37 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,6 +36,7 @@ function(configure_cusan_lit_site_cfg input output) pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND) pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART) + pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI) cusan_target_generate_file(${input} ${output}) endfunction() @@ -90,16 +91,32 @@ function(cusan_add_lit_target) endforeach() endfunction() + +if(NOT MPI_FOUND) + # mock target to make test generation work + add_library(cusan_mpi_interceptor_mock SHARED IMPORTED) + set_target_properties( + cusan_mpi_interceptor_mock + PROPERTIES + OUTPUT_NAME "CusanMPIInterceptorPlaceholder" + IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/lib/runtime" + ) + set(_cusan_interceptor_target cusan_mpi_interceptor_mock) + add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor_mock) +endif() + set(CUSAN_TEST_DEPENDS cusan::TransformPass - cusan::MPI_Interceptor cusan::Runtime cusan::Analysis - ) +if(FOUND_MPI) + list(APPEND CUSAN_TEST_DEPENDS cusan::MPI_Interceptor) +endif() + if(CUSAN_TYPEART) -list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass) + list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass) endif() set(CUSAN_SUITES diff --git a/test/kernel_analysis/03_struct_write.c b/test/kernel_analysis/03_struct_write.c index c001e03..a6edeef 100644 --- a/test/kernel_analysis/03_struct_write.c +++ b/test/kernel_analysis/03_struct_write.c @@ -22,6 +22,8 @@ // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite // CHECK-NOT: Handling Arg: +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/kernel_analysis/08_big_struct_write.c b/test/kernel_analysis/08_big_struct_write.c index 5d34cd5..3df5664 100644 --- a/test/kernel_analysis/08_big_struct_write.c +++ b/test/kernel_analysis/08_big_struct_write.c @@ -22,6 +22,8 @@ // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite // CHECK-NOT: Handling Arg: +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/lit.cfg b/test/lit.cfg index cf956ce..909e384 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -45,6 +45,8 @@ if config.cusan_typeart: if config.mpiexec: config.available_features.add('mpiexec') +if config.cusan_mpi: + config.available_features.add('mpi') config.substitutions.append(("%clang-cpp", clang_cpp)) config.substitutions.append(("%clang-cc", clang_cc)) diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 70030c3..d6cde3f 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -31,6 +31,7 @@ config.cusan_pass = "$" config.cusan_mpi_interceptor = "$" config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@ config.cusan_typeart = @CUSAN_HAS_TYPEART@ +config.cusan_mpi = @CUSAN_HAS_MPI@ # Let the main config do the real work. config.loaded_site_config = True diff --git a/test/pass/03_cuda_to_mpi.c b/test/pass/03_cuda_to_mpi.c index 82d6a98..5186403 100644 --- a/test/pass/03_cuda_to_mpi.c +++ b/test/pass/03_cuda_to_mpi.c @@ -2,11 +2,11 @@ // RUN: %rm-file %t.yaml // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR - - // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/04_mpi_to_cuda.c b/test/pass/04_mpi_to_cuda.c index e27f41a..6196dcc 100644 --- a/test/pass/04_mpi_to_cuda.c +++ b/test/pass/04_mpi_to_cuda.c @@ -8,6 +8,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/05_cuda_to_mpi_stream.c b/test/pass/05_cuda_to_mpi_stream.c index bdb304f..6a22f3e 100644 --- a/test/pass/05_cuda_to_mpi_stream.c +++ b/test/pass/05_cuda_to_mpi_stream.c @@ -10,6 +10,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/06_cuda_to_mpi_event.c b/test/pass/06_cuda_to_mpi_event.c index 2253965..50bdd34 100644 --- a/test/pass/06_cuda_to_mpi_event.c +++ b/test/pass/06_cuda_to_mpi_event.c @@ -10,6 +10,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/07_cuda_to_mpi_read.c b/test/pass/07_cuda_to_mpi_read.c index c1ce739..f379fb7 100644 --- a/test/pass/07_cuda_to_mpi_read.c +++ b/test/pass/07_cuda_to_mpi_read.c @@ -11,6 +11,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/08_cudamemcpy_to_mpi.c b/test/pass/08_cudamemcpy_to_mpi.c index 3d1139c..1d553a2 100644 --- a/test/pass/08_cudamemcpy_to_mpi.c +++ b/test/pass/08_cudamemcpy_to_mpi.c @@ -15,6 +15,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// REQUIRES: mpi + // clang-format on // Tsan sometimes crashes with this test it seems diff --git a/test/pass/11_cuda_to_mpi_struct_of_buff.c b/test/pass/11_cuda_to_mpi_struct_of_buff.c index 2dcea96..3a2001d 100644 --- a/test/pass/11_cuda_to_mpi_struct_of_buff.c +++ b/test/pass/11_cuda_to_mpi_struct_of_buff.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate diff --git a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c index cd00aae..8c590cc 100644 --- a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream diff --git a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index 1c5d299..248c3a6 100644 --- a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream diff --git a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c index f86f3a0..871b3ab 100644 --- a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset diff --git a/test/pass/21_chunked_streams_example.c b/test/pass/21_chunked_streams_example.c index 13562f7..03774ce 100644 --- a/test/pass/21_chunked_streams_example.c +++ b/test/pass/21_chunked_streams_example.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate diff --git a/test/pass/22_cuda_to_mpi_partial_buff_write.c b/test/pass/22_cuda_to_mpi_partial_buff_write.c index c037c8a..2f23ae8 100644 --- a/test/pass/22_cuda_to_mpi_partial_buff_write.c +++ b/test/pass/22_cuda_to_mpi_partial_buff_write.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize diff --git a/test/pass/26_malloc_pitch.c b/test/pass/26_malloc_pitch.c index b71af1a..7a0a97c 100644 --- a/test/pass/26_malloc_pitch.c +++ b/test/pass/26_malloc_pitch.c @@ -6,6 +6,8 @@ // {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], // {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/29_tsan_cuda_to_mpi.c b/test/pass/29_tsan_cuda_to_mpi.c index 7608db6..9ab6fdc 100644 --- a/test/pass/29_tsan_cuda_to_mpi.c +++ b/test/pass/29_tsan_cuda_to_mpi.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy diff --git a/test/pass/30_tsan_annotate_cuda_to_mpi.c b/test/pass/30_tsan_annotate_cuda_to_mpi.c index 506eff1..335e372 100644 --- a/test/pass/30_tsan_annotate_cuda_to_mpi.c +++ b/test/pass/30_tsan_annotate_cuda_to_mpi.c @@ -8,6 +8,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/31_tsan_cuda_event.c b/test/pass/31_tsan_cuda_event.c index 918ba99..eb819ea 100644 --- a/test/pass/31_tsan_cuda_event.c +++ b/test/pass/31_tsan_cuda_event.c @@ -1,8 +1,7 @@ // clang-format off // RUN: %rm-file %t.yaml -// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR - +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event @@ -15,8 +14,6 @@ // clang-format on -#include "../support/gpu_mpi.h" - #include __global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] @@ -27,10 +24,6 @@ __global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@L } int main(int argc, char* argv[]) { - if (!has_gpu_aware_mpi()) { - printf("This example is designed for CUDA-aware MPI. Exiting.\n"); - return 1; - } cudaEvent_t first_finished_event; cudaEventCreate(&first_finished_event); cudaStream_t stream1; diff --git a/test/pass/32_tsan_async_copy.c b/test/pass/32_tsan_async_copy.c index cc7c12f..d30220d 100644 --- a/test/pass/32_tsan_async_copy.c +++ b/test/pass/32_tsan_async_copy.c @@ -1,7 +1,7 @@ // clang-format off // RUN: %rm-file %t.yaml -// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c index d7f11b6..e2dcd9d 100644 --- a/test/runtime/03_cuda_to_mpi.c +++ b/test/runtime/03_cuda_to_mpi.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c index 370f48f..77f8915 100644 --- a/test/runtime/04_mpi_to_cuda.c +++ b/test/runtime/04_mpi_to_cuda.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c index af827b9..4ff4400 100644 --- a/test/runtime/05_cuda_to_mpi_stream.c +++ b/test/runtime/05_cuda_to_mpi_stream.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c index 0585479..65ef15e 100644 --- a/test/runtime/06_cuda_to_mpi_event.c +++ b/test/runtime/06_cuda_to_mpi_event.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c index e367e2a..d1ee1fc 100644 --- a/test/runtime/07_cuda_to_mpi_read.c +++ b/test/runtime/07_cuda_to_mpi_read.c @@ -2,7 +2,7 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c index b8fa5bf..4aedbcc 100644 --- a/test/runtime/08_cudamemcpy_to_mpi.c +++ b/test/runtime/08_cudamemcpy_to_mpi.c @@ -2,7 +2,7 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c index 4a3141d..8455f74 100644 --- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c +++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c index 5a71312..41ab21d 100644 --- a/test/runtime/12_struct_ptr.c +++ b/test/runtime/12_struct_ptr.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // CHECK-DAG: data race diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c index 73674a7..0b82829 100644 --- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index 61eee40..23d9a9c 100644 --- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c index 820912b..7f88a11 100644 --- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c index fa4f5d0..6b7dee5 100644 --- a/test/runtime/21_chunked_streams_example.c +++ b/test/runtime/21_chunked_streams_example.c @@ -2,7 +2,7 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c index 8a29dd9..2639ce8 100644 --- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c +++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c @@ -2,7 +2,7 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c index 3d7524f..0dac955 100644 --- a/test/runtime/26_malloc_pitch.c +++ b/test/runtime/26_malloc_pitch.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c index 14ebfa1..2ac700f 100644 --- a/test/runtime/29_tsan_cuda_to_mpi.c +++ b/test/runtime/29_tsan_cuda_to_mpi.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c index 24e59ce..19c6d97 100644 --- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c +++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c @@ -5,7 +5,7 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC -// REQUIRES: mpiexec +// REQUIRES: mpi // clang-format on From 53986fd5baadcc878bbdb461fcbc962944a32198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20H=C3=BCck?= Date: Mon, 21 Apr 2025 12:09:19 +0200 Subject: [PATCH 14/15] Find MPI quiet --- cmake/cusanToolchain.cmake | 2 +- scripts/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index cf2a55a..63cf466 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" ) find_package(CUDAToolkit REQUIRED) -find_package(MPI) +find_package(MPI QUIET) option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON) mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE) diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index b27f7fb..6dffaf8 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -92,7 +92,7 @@ function(configure_cusan_script input output) endfunction() function(cusan_find_mpi_vendor_helper symbol ret_value) - find_package(MPI) + find_package(MPI QUIET) if(NOT MPI_FOUND) set(${ret_value} false) From 3d4ff1b02396f8c1072d73da9db5c413e8ae5605 Mon Sep 17 00:00:00 2001 From: ahueck Date: Mon, 21 Apr 2025 12:50:20 +0200 Subject: [PATCH 15/15] Sync callback default OFF (#9) --- .github/workflows/ci.yml | 7 ++++++- cmake/cusanToolchain.cmake | 2 +- lib/runtime/CusanRuntime.cpp | 1 + test/CMakeLists.txt | 1 + test/lit.cfg | 3 +++ test/lit.site.cfg.in | 1 + test/pass/34_test_sync_callback.c | 14 ++++++++++++++ test/pass/35_test_sync_callback_stream.c | 15 +++++++++++++++ test/pass/36_test_sync_callback_event.c | 14 ++++++++++++++ 9 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 test/pass/34_test_sync_callback.c create mode 100644 test/pass/35_test_sync_callback_stream.c create mode 100644 test/pass/36_test_sync_callback_event.c diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 014567f..6ae9856 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,11 @@ jobs: os: ubuntu-24.04 preset: release cuda: 12.6.0 + - llvm-version: 19 + os: ubuntu-24.04 + preset: release + cuda: 12.6.0 + cusan-option: -DCUSAN_DEVICE_SYNC_CALLBACKS=ON runs-on: ${{ matrix.os }} @@ -70,7 +75,7 @@ jobs: echo "EXTERNAL_LIT=/usr/lib/llvm-${{ matrix.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV - name: Configure CuSan - run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} + run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} ${{ matrix.cusan-option }} - name: Build CuSan run: cmake --build build --parallel 2 diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index 63cf466..4a8d132 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -41,7 +41,7 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF) option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON) -option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" ON) +option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" OFF) option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON) mark_as_advanced(CUSAN_TEST_WORKAROUNDS) diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index da2488c..9c00736 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -779,6 +779,7 @@ void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_ki } void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) { + LOG_TRACE("[cusan]Callback"); // switch (type) { // case cusan_Device: // printf("Device sync return value %i\n", return_value); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d13fe37..42e693a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -37,6 +37,7 @@ function(configure_cusan_lit_site_cfg input output) pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND) pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART) pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI) + pythonize_bool(${CUSAN_DEVICE_SYNC_CALLBACKS} CUSAN_HAS_SYNC_CALLBACKS) cusan_target_generate_file(${input} ${output}) endfunction() diff --git a/test/lit.cfg b/test/lit.cfg index 909e384..bfea736 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -48,6 +48,9 @@ if config.mpiexec: if config.cusan_mpi: config.available_features.add('mpi') +if config.cusan_sync_callback: + config.available_features.add('sync_callback') + config.substitutions.append(("%clang-cpp", clang_cpp)) config.substitutions.append(("%clang-cc", clang_cc)) config.substitutions.append(("%opt", opt)) diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index d6cde3f..14e1304 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -32,6 +32,7 @@ config.cusan_mpi_interceptor = "$" config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@ config.cusan_typeart = @CUSAN_HAS_TYPEART@ config.cusan_mpi = @CUSAN_HAS_MPI@ +config.cusan_sync_callback = @CUSAN_HAS_SYNC_CALLBACKS@ # Let the main config do the real work. config.loaded_site_config = True diff --git a/test/pass/34_test_sync_callback.c b/test/pass/34_test_sync_callback.c new file mode 100644 index 0000000..2015bfe --- /dev/null +++ b/test/pass/34_test_sync_callback.c @@ -0,0 +1,14 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaDeviceSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaDeviceSynchronize(); + + return 0; +} diff --git a/test/pass/35_test_sync_callback_stream.c b/test/pass/35_test_sync_callback_stream.c new file mode 100644 index 0000000..d8c5b68 --- /dev/null +++ b/test/pass/35_test_sync_callback_stream.c @@ -0,0 +1,15 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaStreamSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaStream_t stream = NULL; + cudaStreamSynchronize(stream); + + return 0; +} diff --git a/test/pass/36_test_sync_callback_event.c b/test/pass/36_test_sync_callback_event.c new file mode 100644 index 0000000..6ee4679 --- /dev/null +++ b/test/pass/36_test_sync_callback_event.c @@ -0,0 +1,14 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaEventSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaEvent_t event; + cudaEventSynchronize(event); + return 0; +}