diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d121748..6ae9856 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,25 +10,6 @@ env: CC: clang jobs: - format-check: - runs-on: ubuntu-22.04 - - steps: - - uses: actions/checkout@v4 - - - name: Format source code - run: | - find lib test \ - -type f \ - -a \( -name "*.c" -o -name "*.cpp" -o -name "*.h" \) \ - -print0 \ - | xargs -0 clang-format-14 -i - - - name: Format check - run: | - git status --porcelain --untracked-files=no - git status --porcelain --untracked-files=no | xargs -o -I {} test -z \"{}\" - codespell: runs-on: ubuntu-22.04 @@ -44,19 +25,35 @@ jobs: - llvm-version: 14 os: ubuntu-22.04 preset: release + cuda: 11.8.0 + - llvm-version: 19 + os: ubuntu-24.04 + preset: release + cuda: 12.6.0 + - llvm-version: 19 + os: ubuntu-24.04 + preset: release + cuda: 12.6.0 + cusan-option: -DCUSAN_DEVICE_SYNC_CALLBACKS=ON runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 + - name: LLVM apt + if: ${{ matrix.llvm-version == 19 }} + run: | + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-19 main" | sudo tee /etc/apt/sources.list.d/llvm-19.list + - name: Update apt run: sudo apt-get update - uses: Jimver/cuda-toolkit@v0.2.19 id: cuda-toolkit with: - cuda: '11.8.0' + cuda: '${{ matrix.cuda }}' method: network sub-packages: '["nvcc", "cudart", "cudart-dev"]' non-cuda-sub-packages: '["libcurand", "libcurand-dev"]' @@ -78,7 +75,7 @@ jobs: echo "EXTERNAL_LIT=/usr/lib/llvm-${{ matrix.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV - name: Configure CuSan - run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} + run: cmake -B build --preset ${{ matrix.preset }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} ${{ matrix.cusan-option }} - name: Build CuSan run: cmake --build build --parallel 2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 191bca8..69051f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ project( ) set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) list( APPEND @@ -28,11 +30,6 @@ add_subdirectory(externals) add_subdirectory(lib) add_subdirectory(scripts) -if(PROJECT_IS_TOP_LEVEL) - enable_testing() - add_subdirectory(test) -endif() - write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/cusanConfigVersion.cmake VERSION ${PROJECT_VERSION} @@ -66,3 +63,8 @@ if(PROJECT_IS_TOP_LEVEL) QUIET_ON_EMPTY ) endif() + +if(PROJECT_IS_TOP_LEVEL) + enable_testing() + add_subdirectory(test) +endif() diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index 9e98131..4a8d132 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -26,7 +26,7 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" ) find_package(CUDAToolkit REQUIRED) -find_package(MPI REQUIRED) +find_package(MPI QUIET) option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON) mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE) @@ -41,6 +41,8 @@ option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF) option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON) +option(CUSAN_DEVICE_SYNC_CALLBACKS "Enable runtime callbacks after sync calls" OFF) + option(CUSAN_TEST_WORKAROUNDS "Enable workarounds for MPI + TSan regarding runtime tests" ON) mark_as_advanced(CUSAN_TEST_WORKAROUNDS) diff --git a/cmake/modules/cusan-format.cmake b/cmake/modules/cusan-format.cmake index bacdf5d..96c393d 100644 --- a/cmake/modules/cusan-format.cmake +++ b/cmake/modules/cusan-format.cmake @@ -20,11 +20,12 @@ function(cusan_target_format target comment) filter_dir(${exclude}) endforeach() - find_program(FORMAT_COMMAND + find_program(CLANG_FORMAT_COMMAND NAMES clang-format-${LLVM_VERSION_MAJOR} clang-format) - if(FORMAT_COMMAND) + mark_as_advanced(CLANG_FORMAT_COMMAND) + if(CLANG_FORMAT_COMMAND) add_custom_target(${target} - COMMAND ${FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS} + COMMAND ${CLANG_FORMAT_COMMAND} -i -style=file ${ARG_OTHER} ${ARG_UNPARSED_ARGUMENTS} ${ALL_CXX_FILES} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} COMMENT "${comment}" diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 863ccc1..4dc9ddf 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -42,3 +42,9 @@ if(CUSAN_TYPEART) ) FetchContent_MakeAvailable(typeart) endif() + +mark_as_advanced( + FETCHCONTENT_BASE_DIR + FETCHCONTENT_FULLY_DISCONNECTED + FETCHCONTENT_QUIET + FETCHCONTENT_UPDATES_DISCONNECTED) \ No newline at end of file diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp index f2a9983..6da40e1 100644 --- a/lib/pass/AnalysisTransform.cpp +++ b/lib/pass/AnalysisTransform.cpp @@ -582,6 +582,48 @@ llvm::SmallVector CudaFree::map_arguments(IRBuilder<>& irb, llvm::ArrayR return {ptr}; } +// CudaMallocPitch + +CudaMallocPitch::CudaMallocPitch(callback::FunctionDecl* decls) { + setup("cudaMallocPitch", &decls->cusan_device_alloc.f); +} +llvm::SmallVector CudaMallocPitch::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //(void** devPtr, size_t* pitch, size_t width, size_t height ) + assert(args.size() == 4); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + + //"The function may pad the allocation" + //"*pitch by cudaMallocPitch() is the width in bytes of the allocation" + auto* pitch = irb.CreateLoad(irb.getIntPtrTy(irb.GetInsertBlock()->getModule()->getDataLayout()), args[1]); + // auto* width = args[2]; + auto* height = args[3]; + + auto* real_size = irb.CreateMul(pitch, height); + return {ptr, real_size}; +} + +// CudaSetDevice + +CudaSetDevice::CudaSetDevice(callback::FunctionDecl* decls) { + setup("cudaSetDevice", &decls->cusan_set_device.f); +} +llvm::SmallVector CudaSetDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef args) { + // cudaSetDevice ( int device ) + assert(args.size() == 1); + return {args[0]}; +} + +// CudaCHooseDevice + +CudaChooseDevice::CudaChooseDevice(callback::FunctionDecl* decls) { + setup("cudaChooseDevice", &decls->cusan_choose_device.f); +} +llvm::SmallVector CudaChooseDevice::map_arguments(IRBuilder<>&, llvm::ArrayRef args) { + // cudaChooseDevice ( int* device, const cudaDeviceProp* prop ) + assert(args.size() == 2); + return {args[0]}; +} + // CudaStreamQuery CudaStreamQuery::CudaStreamQuery(callback::FunctionDecl* decls) { @@ -614,4 +656,45 @@ llvm::SmallVector CudaEventQuery::map_return_value(IRBuilder<>& irb, return {result}; } +CudaStreamSyncCallback::CudaStreamSyncCallback(callback::FunctionDecl* decls) { + setup("cudaStreamSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaStreamSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* stream) + assert(args.size() == 1); + return {irb.getInt8(1), args[0]}; +} +llvm::SmallVector CudaStreamSyncCallback::map_return_value(IRBuilder<>&, Value* result) { + return {result}; +} + +CudaEventSyncCallback::CudaEventSyncCallback(callback::FunctionDecl* decls) { + setup("cudaEventSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaEventSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* event) + assert(args.size() == 1); + return {irb.getInt8(2), args[0]}; +} +llvm::SmallVector CudaEventSyncCallback::map_return_value(IRBuilder<>&, Value* result) { + return {result}; +} + +CudaDeviceSyncCallback::CudaDeviceSyncCallback(callback::FunctionDecl* decls) { + setup("cudaDeviceSynchronize", &decls->cusan_sync_callback.f); +} +llvm::SmallVector CudaDeviceSyncCallback::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( ) + assert(args.size() == 0); +#if LLVM_VERSION_MAJOR < 15 + auto* ptr_type = PointerType::getUnqual(irb.getContext()); +#else + auto* ptr_type = irb.getPtrTy(); +#endif + return {irb.getInt8(0), ConstantPointerNull::get(ptr_type)}; +} +llvm::SmallVector CudaDeviceSyncCallback::map_return_value(IRBuilder<>&, Value* result) { + return {result}; +} + } // namespace cusan::transform diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h index fbb5cef..55e3f1b 100644 --- a/lib/pass/AnalysisTransform.h +++ b/lib/pass/AnalysisTransform.h @@ -211,27 +211,9 @@ BasicInstrumenterDecl(CudaHostFree); BasicInstrumenterDecl(CudaMallocManaged); BasicInstrumenterDecl(CudaMalloc); BasicInstrumenterDecl(CudaFree); - -class CudaMallocPitch : public SimpleInstrumenter { - public: - CudaMallocPitch(callback::FunctionDecl* decls) { - setup("cudaMallocPitch", &decls->cusan_device_alloc.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //(void** devPtr, size_t* pitch, size_t width, size_t height ) - assert(args.size() == 4); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8Ty()->getPointerTo()); - - //"The function may pad the allocation" - //"*pitch by cudaMallocPitch() is the width in bytes of the allocation" - auto* pitch = irb.CreateLoad(irb.getIntPtrTy(irb.GetInsertBlock()->getModule()->getDataLayout()), args[1]); - // auto* width = args[2]; - auto* height = args[3]; - - auto* real_size = irb.CreateMul(pitch, height); - return {ptr, real_size}; - } -}; +BasicInstrumenterDecl(CudaMallocPitch); +BasicInstrumenterDecl(CudaSetDevice); +BasicInstrumenterDecl(CudaChooseDevice); class CudaStreamQuery : public SimpleInstrumenter { public: @@ -247,6 +229,25 @@ class CudaEventQuery : public SimpleInstrumenter { static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); }; +class CudaStreamSyncCallback : public SimpleInstrumenter { + public: + CudaStreamSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; +class CudaEventSyncCallback : public SimpleInstrumenter { + public: + CudaEventSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; +class CudaDeviceSyncCallback : public SimpleInstrumenter { + public: + CudaDeviceSyncCallback(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); +}; + } // namespace transform } // namespace cusan diff --git a/lib/pass/CMakeLists.txt b/lib/pass/CMakeLists.txt index f52baa5..8fbefd9 100644 --- a/lib/pass/CMakeLists.txt +++ b/lib/pass/CMakeLists.txt @@ -14,6 +14,7 @@ mark_as_advanced(LLVM_CUSAN_TRANSFORMPASS_LINK_INTO_TOOLS) target_compile_definitions(cusan_TransformPass PRIVATE $<$:CUSAN_TYPEART=1> + $<$:CUSAN_DEVICE_SYNC_CALLBACKS=1> ) set_target_properties( diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp index d205ddd..48cbf50 100644 --- a/lib/pass/CusanPass.cpp +++ b/lib/pass/CusanPass.cpp @@ -180,6 +180,15 @@ bool CusanPass::runOnFunc(llvm::Function& function) { modified |= transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function); modified |= transform::StreamCreateWithPriorityInstrumenter(&cusan_decls_).instrument(function); modified |= transform::CudaMallocPitch(&cusan_decls_).instrument(function); + modified |= transform::CudaChooseDevice(&cusan_decls_).instrument(function); + modified |= transform::CudaSetDevice(&cusan_decls_).instrument(function); + + // callbacks +#ifdef CUSAN_DEVICE_SYNC_CALLBACKS + modified |= transform::CudaDeviceSyncCallback(&cusan_decls_).instrument(function); + modified |= transform::CudaEventSyncCallback(&cusan_decls_).instrument(function); + modified |= transform::CudaStreamSyncCallback(&cusan_decls_).instrument(function); +#endif auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_); if (data_for_host) { diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp index f58c828..cefd273 100644 --- a/lib/pass/FunctionDecl.cpp +++ b/lib/pass/FunctionDecl.cpp @@ -1,5 +1,7 @@ #include "FunctionDecl.h" +#include + namespace cusan::callback { void FunctionDecl::initialize(llvm::Module& module) { @@ -124,6 +126,18 @@ void FunctionDecl::initialize(llvm::Module& module) { // void* devPtr, size_t pitch, size_t width, size_t height ArgTypes arg_types_2d_memset = {void_ptr, size_t_ty, size_t_ty, size_t_ty}; make_function(cusan_memset_2d, arg_types_2d_memset); + + // int device + ArgTypes arg_types_set_device = {Type::getInt32Ty(c)}; + make_function(cusan_set_device, arg_types_set_device); + + // void* device + ArgTypes arg_types_choose_device = {Type::getInt32Ty(c)->getPointerTo()}; + make_function(cusan_choose_device, arg_types_choose_device); + + // u8 evenType, u32 returnValue + ArgTypes arg_types_sync_callback = {Type::getInt8Ty(c), void_ptr, Type::getInt32Ty(c)}; + make_function(cusan_sync_callback, arg_types_sync_callback); } } // namespace cusan::callback diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h index d40e80a..f6457c1 100644 --- a/lib/pass/FunctionDecl.h +++ b/lib/pass/FunctionDecl.h @@ -41,10 +41,12 @@ struct FunctionDecl { CusanFunction cusan_host_register{"_cusan_host_register"}; CusanFunction cusan_host_unregister{"_cusan_host_unregister"}; CusanFunction cusan_device_alloc{"_cusan_device_alloc"}; + CusanFunction cusan_set_device{"_cusan_set_device"}; + CusanFunction cusan_choose_device{"_cusan_choose_device"}; CusanFunction cusan_device_free{"_cusan_device_free"}; CusanFunction cusan_stream_query{"_cusan_stream_query"}; CusanFunction cusan_event_query{"_cusan_event_query"}; - + CusanFunction cusan_sync_callback{"cusan_sync_callback"}; void initialize(llvm::Module& m); }; diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt index 741bbc2..5397584 100644 --- a/lib/runtime/CMakeLists.txt +++ b/lib/runtime/CMakeLists.txt @@ -1,67 +1,69 @@ -find_package(MPI) -add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp) -target_link_libraries(cusan_mpi_interceptor - PRIVATE MPI::MPI_CXX -) -set_target_properties( - cusan_mpi_interceptor - PROPERTIES - OUTPUT_NAME "CusanMPIInterceptor" - EXPORT_NAME "MPIInterceptor" -) -add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor) -target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17) -target_compile_definitions( - cusan_mpi_interceptor - PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} - LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} - $<$:CUSAN_FIBERPOOL=1> - $<$:CUSAN_TYPEART=1> - $<$:CUSAN_SOFTCOUNTER> -) - -target_include_directories(cusan_mpi_interceptor ${warning_guard} - PUBLIC $ - PRIVATE $ -) - -target_include_directories(cusan_mpi_interceptor - SYSTEM - PRIVATE - ${LLVM_INCLUDE_DIRS} -) - -target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport) - -if(CUSAN_FIBERPOOL) - target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool) - set_target_properties(cusan_mpi_interceptor PROPERTIES - BUILD_WITH_INSTALL_RPATH FALSE - LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") +if (MPI_FOUND) + add_library(cusan_mpi_interceptor SHARED MPIInterception.cpp) + target_link_libraries(cusan_mpi_interceptor + PRIVATE MPI::MPI_CXX + ) + set_target_properties( + cusan_mpi_interceptor + PROPERTIES + OUTPUT_NAME "CusanMPIInterceptor" + EXPORT_NAME "MPIInterceptor" + ) + add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor) + + target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17) + target_compile_definitions( + cusan_mpi_interceptor + PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} + LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} + $<$:CUSAN_FIBERPOOL=1> + $<$:CUSAN_TYPEART=1> + $<$:CUSAN_SOFTCOUNTER> + ) + + target_include_directories(cusan_mpi_interceptor ${warning_guard} + PUBLIC $ + PRIVATE $ + ) + + target_include_directories(cusan_mpi_interceptor + SYSTEM + PRIVATE + ${LLVM_INCLUDE_DIRS} + ) + + target_link_libraries(cusan_mpi_interceptor PRIVATE LLVMSupport) + + if(CUSAN_FIBERPOOL) + target_link_libraries(cusan_mpi_interceptor PUBLIC cusan::fiberpool) + set_target_properties(cusan_mpi_interceptor PROPERTIES + BUILD_WITH_INSTALL_RPATH FALSE + LINK_FLAGS "-Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + endif() + + set(CONFIG_NAME cusanMPIInterceptor) + set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets) + + install( + TARGETS cusan_mpi_interceptor + EXPORT ${TARGETS_EXPORT_NAME} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + + install( + EXPORT ${TARGETS_EXPORT_NAME} + NAMESPACE cusan:: + DESTINATION ${CUSAN_INSTALL_CONFIGDIR} + ) + + export( + EXPORT ${TARGETS_EXPORT_NAME} + FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake + NAMESPACE cusan:: + ) endif() -set(CONFIG_NAME cusanMPIInterceptor) -set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets) - -install( - TARGETS cusan_mpi_interceptor - EXPORT ${TARGETS_EXPORT_NAME} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -) - -install( - EXPORT ${TARGETS_EXPORT_NAME} - NAMESPACE cusan:: - DESTINATION ${CUSAN_INSTALL_CONFIGDIR} -) - -export( - EXPORT ${TARGETS_EXPORT_NAME} - FILE ${CMAKE_BINARY_DIR}/${TARGETS_EXPORT_NAME}.cmake - NAMESPACE cusan:: -) - add_library(cusan_Runtime SHARED CusanRuntime.cpp CusanRuntime_cudaSpecific.cpp) set_target_properties( cusan_Runtime diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 25ed8d0..9c00736 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -12,7 +12,7 @@ #include "analysis/KernelModel.h" #include "support/Logger.h" #include "StatsCounter.h" -#if CUSAN_SOFTCOUNTER +#ifdef CUSAN_SOFTCOUNTER #include "support/Table.h" #endif #include "TSanInterface.h" @@ -76,44 +76,37 @@ struct PointerAccess { AccessState mode{AccessState::kRW}; }; -class Runtime { +class Runtime; + +class Device { // NOTE: assumed to be a ordered map so we can iterate in ascending order std::map allocations_; std::map streams_; - std::map events_; TsanFiber cpu_fiber_; TsanFiber curr_fiber_; - bool init_ = false; public: static constexpr Stream kDefaultStream = Stream(); Recorder stats_recorder; - static Runtime& get() { - static Runtime run_t; - if (!run_t.init_) { -#ifdef CUSAN_FIBERPOOL - TsanFiberPoolInit(); -#endif - run_t.cpu_fiber_ = TsanGetCurrentFiber(); - run_t.curr_fiber_ = run_t.cpu_fiber_; - - // default '0' cuda stream - { run_t.register_stream(kDefaultStream); } - - run_t.init_ = true; - } - return run_t; + Device() : stats_recorder() { + // every device has a default stream + { register_stream(Device::kDefaultStream); } + cpu_fiber_ = TsanGetCurrentFiber(); } - Runtime(const Runtime&) = delete; - - void operator=(const Runtime&) = delete; + bool operator==(const Device& other) const { + return curr_fiber_ == other.curr_fiber_; + } - [[nodiscard]] const std::map& get_allocations() const { + [[nodiscard]] const std::map& get_allocations() { return allocations_; } + [[nodiscard]] TsanFiber get_stream_fiber(Stream stream) { + return streams_[stream]; + } + void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") TsanHappensBefore(curr_fiber_); @@ -122,10 +115,7 @@ class Runtime { void switch_to_cpu() { LOG_TRACE("[cusan] Switch to cpu") - // if we where one a default stream we should also post sync - // meaning that all work submitted after from the cpu should also be run after the default kernels are done - // TODO: double check with blocking - auto search_result = streams_.find(Runtime::kDefaultStream); + auto search_result = streams_.find(Device::kDefaultStream); assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); if (curr_fiber_ == search_result->second) { LOG_TRACE("[cusan] syncing all other blocking GPU streams to run after since its default stream") @@ -195,19 +185,6 @@ class Runtime { stats_recorder.inc_TsanHappensAfter(); } - void record_event(Event event, Stream stream) { - LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); - events_[event] = stream; - } - - // Sync the event on the current stream - void sync_event(Event event) { - auto search_result = events_.find(event); - assert(search_result != events_.end() && "Tried using event that wasn't recorded to prior"); - LOG_TRACE("[cusan] Sync event: " << event << " recorded on stream:" << events_[event].handle) - happens_after_stream(events_[event]); - } - void insert_allocation(void* ptr, AllocationInfo info) { assert(allocations_.find(ptr) == allocations_.end() && "Registered an allocation multiple times"); allocations_[ptr] = info; @@ -237,14 +214,11 @@ class Runtime { return &res->second; } - private: - Runtime() = default; - - ~Runtime() { + void output_statistics() { #undef cusan_stat_handle #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name())); -#if CUSAN_SOFTCOUNTER - Table table{"Cusan runtime statistics"}; +#ifdef CUSAN_SOFTCOUNTER + Table table{"Cusan device statistics"}; #ifdef CUSAN_FIBERPOOL table.put(Row::make("Fiberpool", 1)); #else @@ -262,14 +236,94 @@ class Runtime { #endif #undef cusan_stat_handle #undef CUSAN_CUDA_EVENT_LIST + } +}; -#ifdef CUSAN_FIBERPOOL - // TsanFiberPoolFini(); +class Runtime { + std::map devices_; + std::map> events_; + int32_t current_device_; + bool init_; +#ifdef CUSAN_SOFTCOUNTER + softcounter::AtomicCounter device_switches = 0; #endif + public: + static Runtime& get() { + static Runtime run_t; + if (!run_t.init_) { + run_t.current_device_ = get_current_device_id(); + run_t.devices_[run_t.current_device_]; + run_t.init_ = true; + } + return run_t; + } + + Runtime(const Runtime&) = delete; + + void operator=(const Runtime&) = delete; + +#ifdef CUSAN_SOFTCOUNTER + inline void inc_device_switches() { + this->device_switches++; + } + inline softcounter::Counter get_device_switches() { + return this->device_switches; + } +#endif + + Device& get_current_device() { + return devices_.at(current_device_); + } + + Device& get_device(DeviceID id) { + if (devices_.find(id) == devices_.end()) { + devices_[id]; + } + return devices_.at(id); + } + + void set_device(DeviceID device) { + if (devices_.find(device) == devices_.end()) { + devices_[device]; + } +#ifdef CUSAN_SOFTCOUNTER + if (current_device_ != device) { + inc_device_switches(); + } +#endif + current_device_ = device; } -}; -cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); + void record_event(Event event, Stream stream) { + LOG_TRACE("[cusan] Record event: " << event << " stream:" << stream.handle); + auto& current_device = get_current_device(); + auto search_result = current_device.get_stream_fiber(stream); + events_.insert({event, {search_result, ¤t_device}}); + } + + // Sync the event on the current stream + void sync_event(Event event) { + auto [stream_fiber, device] = events_[event]; + TsanHappensAfter(stream_fiber); + device->stats_recorder.inc_TsanHappensAfter(); + } + + private: + Runtime() = default; + + ~Runtime() { +#ifdef CUSAN_SOFTCOUNTER + for (auto& [_, device] : devices_) { + device.output_statistics(); + } + + Table table{"Cusan runtime statistics"}; + table.put(Row::make("Device Switches ", get_device_switches())); + table.print(std::cout); + +#endif + } +}; } // namespace cusan::runtime @@ -277,7 +331,7 @@ using namespace cusan::runtime; namespace helper { #ifndef CUSAN_TYPEART -inline std::optional find_memory_alloc_size(const Runtime& runtime, const void* ptr) { +inline std::optional find_memory_alloc_size(Device& runtime, const void* ptr) { const auto& allocs = runtime.get_allocations(); // if there exists any allocation @@ -304,7 +358,7 @@ inline std::optional find_memory_alloc_size(const Runtime& runtime, cons return {}; } #else -inline std::optional find_memory_alloc_size(const Runtime&, const void* ptr) { +inline std::optional find_memory_alloc_size(const Device&, const void* ptr) { size_t alloc_size{0}; int alloc_id{0}; auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size); @@ -323,7 +377,7 @@ inline std::optional find_memory_alloc_size(const Runtime&, const void* void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream) { LOG_TRACE("[cusan]Kernel Register with " << n << " Args and on stream:" << stream) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); llvm::SmallVector sizes; for (int i = 0; i < n; ++i) { @@ -372,7 +426,7 @@ void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream s void _cusan_sync_device() { LOG_TRACE("[cusan]Sync Device\n") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_sync_device_calls(); runtime.happens_after_all_streams(); } @@ -380,13 +434,14 @@ void _cusan_sync_device() { void _cusan_event_record(Event event, RawStream stream) { LOG_TRACE("[cusan]Event Record") auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_event_record_calls(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_event_record_calls(); runtime.record_event(event, Stream(stream)); } void _cusan_sync_stream(RawStream raw_stream) { LOG_TRACE("[cusan]Sync Stream" << raw_stream) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_sync_stream_calls(); const auto stream = Stream(raw_stream); if (stream.isDefaultStream()) { @@ -401,20 +456,21 @@ void _cusan_sync_stream(RawStream raw_stream) { void _cusan_sync_event(Event event) { LOG_TRACE("[cusan]Sync Event" << event) auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_sync_event_calls(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_sync_event_calls(); runtime.sync_event(event); } void _cusan_create_event(Event*) { LOG_TRACE("[cusan]create event") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_create_event_calls(); } -void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { +void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags) { LOG_TRACE("[cusan]create stream with flags: " << flags << " isNonBlocking: " << (bool)(flags & cusan_StreamFlagsNonBlocking)) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_create_stream_calls(); runtime.register_stream(Stream(*stream, !(bool)(flags & cusan_StreamFlagsNonBlocking))); } @@ -422,18 +478,19 @@ void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) { LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event) auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_stream_wait_event_calls(); - runtime.switch_to_stream(Stream(stream)); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_stream_wait_event_calls(); + device.switch_to_stream(Stream(stream)); runtime.sync_event(event); - runtime.happens_before(); - runtime.switch_to_cpu(); + device.happens_before(); + device.switch_to_cpu(); } void _cusan_host_alloc(void** ptr, size_t size, unsigned int) { // at least based of this presentation and some comments in the cuda forums this syncs the whole device // https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf LOG_TRACE("[cusan]host alloc " << *ptr << " with size " << size) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_alloc_calls(); // runtime.happens_after_all_streams(); @@ -442,27 +499,27 @@ void _cusan_host_alloc(void** ptr, size_t size, unsigned int) { void _cusan_host_free(void* ptr) { LOG_TRACE("[cusan]host free") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_free_calls(); runtime.free_allocation(ptr); } void _cusan_host_register(void* ptr, size_t size, unsigned int) { LOG_TRACE("[cusan]host register " << ptr << " with size:" << size); - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_register_calls(); runtime.insert_allocation(ptr, AllocationInfo::Pinned(size)); } void _cusan_host_unregister(void* ptr) { LOG_TRACE("[cusan]host unregister " << ptr); - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_host_unregister_calls(); runtime.free_allocation(ptr); } void _cusan_managed_alloc(void** ptr, size_t size, unsigned int) { LOG_TRACE("[cusan]Managed host alloc " << *ptr << " with size " << size << " -> implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_managed_alloc_calls(); runtime.happens_after_all_streams(); runtime.insert_allocation(*ptr, AllocationInfo::Managed(size)); @@ -472,7 +529,7 @@ void _cusan_device_alloc(void** ptr, size_t size) { // implicit syncs device // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator LOG_TRACE("[cusan]Device alloc " << *ptr << " with size " << size << " -> implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_device_alloc_calls(); runtime.insert_allocation(*ptr, AllocationInfo::Device(size)); @@ -483,37 +540,48 @@ void _cusan_device_free(void* ptr) { // implicit syncs device // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-ordered-memory-allocator LOG_TRACE("[cusan]Device free " << ptr << " -> TODO maybe implicit device sync") - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_device_free_calls(); runtime.happens_after_all_streams(); } -// TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_stream_query(RawStream stream, unsigned int err) { LOG_TRACE("[cusan] Stream query " << stream << " -> " << err) - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_stream_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") - runtime.happens_after_stream(Stream{stream}); } } -// TODO: get rid of cudaSpecifc check for cudaSuccess 0 void _cusan_event_query(Event event, unsigned int err) { LOG_TRACE("[cusan] Event query " << event << " -> " << err) auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_event_query_calls(); + auto& device = runtime.get_current_device(); + device.stats_recorder.inc_event_query_calls(); if (err == 0) { LOG_TRACE("[cusan] syncing") runtime.sync_event(event); } } +void _cusan_set_device(DeviceID device) { + auto& r = Runtime::get(); + r.set_device(device); +} + +void _cusan_choose_device(DeviceID* device) { + // does this function ever return a nullptr? + // and what would that mean + assert(device); + auto& r = Runtime::get(); + r.set_device(*device); +} + void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { // The Async versions are always asynchronous with respect to the host. - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memset_async_calls(); runtime.switch_to_stream(Stream(stream)); TsanMemoryWritePC(target, count, __builtin_return_address(0)); @@ -524,9 +592,9 @@ void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { void _cusan_memset_impl(void* target, size_t count) { // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host // memory. - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memset_calls(); - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); LOG_TRACE("[cusan] " << "Write to " << target << " with size: " << count) TsanMemoryWritePC(target, count, __builtin_return_address(0)); @@ -539,7 +607,7 @@ void _cusan_memset_impl(void* target, size_t count) { if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) { LOG_TRACE("[cusan] " << "Memset is blocking") - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else { LOG_TRACE("[cusan] " << "Memset is not blocking") @@ -553,10 +621,10 @@ void _cusan_memset_impl(void* target, size_t count) { // r.happens_after_stream(Runtime::default_stream)); } -void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind) { +void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind) { _cusan_memset_impl(target, pitch * height); } -void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind, RawStream stream) { +void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_memcpy_kind, RawStream stream) { _cusan_memset_async_impl(target, pitch * height, stream); } @@ -571,8 +639,8 @@ void _cusan_memset_async(void* target, size_t count, RawStream stream) { } void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size, - cusan_MemcpyKind kind, RawStream stream) { - auto& runtime = Runtime::get(); + cusan_memcpy_kind kind, RawStream stream) { + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memcpy_async_calls(); if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) { // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the @@ -603,30 +671,30 @@ void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, } } -void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind) { +void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_memcpy_kind kind) { // TODO verify that the memcpy2d beheaviour is actually the same as normal memcpy if (kind == cusan_MemcpyDefault) { kind = infer_memcpy_direction(target, from); } - auto& runtime = Runtime::get(); + auto& runtime = Runtime::get().get_current_device(); runtime.stats_recorder.inc_memcpy_calls(); if (CUSAN_SYNC_DETAIL_LEVEL == 0) { LOG_TRACE("[cusan] DefaultStream+Blocking") // In this mode: Memcpy always blocks, no detailed view w.r.t. memory direction - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyDeviceToDevice) { // 4. For transfers from device memory to device memory, no host-side synchronization is performed. LOG_TRACE("[cusan] DefaultStream") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); @@ -636,14 +704,14 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy // has completed. LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyHostToDevice) { // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is // initiated. @@ -651,7 +719,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ auto* alloc_info = runtime.get_allocation_info(from); // if we couldn't find alloc info we just assume the worst and don't sync if (alloc_info && !alloc_info->is_pinned) { - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); LOG_TRACE("[cusan] DefaultStream+Blocking") } else { LOG_TRACE("[cusan] DefaultStream") @@ -660,20 +728,20 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ // device memory TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryWrite(); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); } else if (kind == cusan_MemcpyHostToHost) { // 5. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the // host. LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.switch_to_stream(Device::kDefaultStream); runtime.happens_before(); runtime.switch_to_cpu(); - runtime.happens_after_stream(Runtime::kDefaultStream); + runtime.happens_after_stream(Device::kDefaultStream); TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); runtime.stats_recorder.inc_TsanMemoryRead(); TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); @@ -684,7 +752,7 @@ void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_ } void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind kind, RawStream stream) { + cusan_memcpy_kind kind, RawStream stream) { LOG_TRACE("[cusan]MemcpyAsync" << width * height << " bytes to:" << target) size_t read_size = spitch * height; @@ -692,20 +760,35 @@ void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_ _cusan_memcpy_async_impl(target, write_size, from, read_size, kind, stream); } -void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) { +void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream) { LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target) _cusan_memcpy_async_impl(target, count, from, count, kind, stream); } void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind kind) { + cusan_memcpy_kind kind) { LOG_TRACE("[cusan]Memcpy2d " << width * height << " from:" << from << " to:" << target); size_t read_size = spitch * height; size_t write_size = dpitch * height; _cusan_memcpy_impl(target, write_size, from, read_size, kind); } -void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) { +void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind kind) { LOG_TRACE("[cusan]Memcpy " << count << " from:" << from << " to:" << target); _cusan_memcpy_impl(target, count, from, count, kind); } + +void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event_or_stream*/, unsigned int /*return_value*/) { + LOG_TRACE("[cusan]Callback"); + // switch (type) { + // case cusan_Device: + // printf("Device sync return value %i\n", return_value); + // break; + // case cusan_Stream: + // printf("Stream %#x sync return value %i\n", event_or_stream, return_value); + // break; + // case cusan_Event: + // printf("Event %#x sync return value %i\n", event_or_stream, return_value); + // break; + // } +} diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index 8dcaef2..048e566 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -8,12 +8,37 @@ #define LIB_RUNTIME_CUSAN_H_ #include +#ifdef __cplusplus + +extern "C" { +#endif + +typedef enum cusan_memcpy_kind_t : unsigned int { + cusan_MemcpyHostToHost = 0, + cusan_MemcpyHostToDevice = 1, + cusan_MemcpyDeviceToHost = 2, + cusan_MemcpyDeviceToDevice = 3, + cusan_MemcpyDefault = 4, +} cusan_memcpy_kind; + +typedef enum cusan_stream_create_flags_t : unsigned int { + cusan_StreamFlagsDefault = 0, + cusan_StreamFlagsNonBlocking = 1, +} cusan_stream_create_flags; +#ifdef __cplusplus +} +#endif + #ifdef __cplusplus namespace cusan::runtime { using TsanFiber = void*; using Event = const void*; using RawStream = const void*; +using DeviceID = int; +cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from); +DeviceID get_current_device_id(); } // namespace cusan::runtime +using cusan::runtime::DeviceID; using cusan::runtime::Event; using cusan::runtime::RawStream; using cusan::runtime::TsanFiber; @@ -21,6 +46,7 @@ using cusan::runtime::TsanFiber; #define TsanFiber void* #define Event const void* #define RawStream const void* +#define DeviceID int #endif #ifdef __cplusplus @@ -28,40 +54,36 @@ using cusan::runtime::TsanFiber; extern "C" { #endif -typedef enum cusan_memcpy_kind_t : unsigned int { - cusan_MemcpyHostToHost = 0, - cusan_MemcpyHostToDevice = 1, - cusan_MemcpyDeviceToHost = 2, - cusan_MemcpyDeviceToDevice = 3, - cusan_MemcpyDefault = 4, -} cusan_MemcpyKind; - -typedef enum cusan_stream_create_flags_t : unsigned int { - cusan_StreamFlagsDefault = 0, - cusan_StreamFlagsNonBlocking = 1, -} cusan_StreamCreateFlags; - void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream); + void _cusan_sync_device(); -void _cusan_event_record(Event event, RawStream stream); +void _cusan_set_device(DeviceID device); +void _cusan_choose_device(DeviceID* device); + void _cusan_sync_stream(RawStream stream); +void _cusan_create_stream(RawStream* stream, cusan_stream_create_flags flags); +void _cusan_stream_query(RawStream stream, unsigned int err); + void _cusan_sync_event(Event event); -void _cusan_stream_event(Event event); +void _cusan_event_record(Event event, RawStream stream); void _cusan_create_event(RawStream* event); -void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags); -void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream); -void _cusan_memset_async(void* target, size_t count, RawStream stream); -void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind); +void _cusan_event_query(Event event, unsigned int err); +void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); +void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); + +void _cusan_memcpy(void* target, const void* from, size_t count, cusan_memcpy_kind); +void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_memcpy_kind kind, RawStream stream); void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind); + cusan_memcpy_kind); void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, - cusan_MemcpyKind, RawStream stream); -void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind); -void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind, - RawStream stream); + cusan_memcpy_kind, RawStream stream); + void _cusan_memset(void* target, size_t count); -void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); -void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); +void _cusan_memset_async(void* target, size_t count, RawStream stream); +void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind); +void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_memcpy_kind, + RawStream stream); + void _cusan_host_alloc(void** ptr, size_t size, unsigned int flags); void _cusan_host_free(void* ptr); void _cusan_managed_alloc(void** ptr, size_t size, unsigned int flags); @@ -70,8 +92,15 @@ void _cusan_host_register(void* ptr, size_t size, unsigned int flags); void _cusan_host_unregister(void* ptr); void _cusan_device_alloc(void** ptr, size_t size); void _cusan_device_free(void* ptr); -void _cusan_stream_query(RawStream stream, unsigned int err); -void _cusan_event_query(Event event, unsigned int err); + +typedef enum cusan_sync_type_t : unsigned char { + cusan_Device = 0, //second argument is a nullptr + cusan_Stream = 1, //second argument is the stream pointer + cusan_Event = 2, //second argument is the event pointer +} cusan_sync_type; + +void cusan_sync_callback(cusan_sync_type /*type*/, const void* /*event or stream*/, unsigned int /*return_value*/); + #ifdef __cplusplus } #endif diff --git a/lib/runtime/CusanRuntime_cudaSpecific.cpp b/lib/runtime/CusanRuntime_cudaSpecific.cpp index 6958db3..970401d 100644 --- a/lib/runtime/CusanRuntime_cudaSpecific.cpp +++ b/lib/runtime/CusanRuntime_cudaSpecific.cpp @@ -10,7 +10,14 @@ #include namespace cusan::runtime { -cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from) { + +DeviceID get_current_device_id() { + DeviceID res; + cudaGetDevice(&res); + return res; +} + +cusan_memcpy_kind infer_memcpy_direction(const void* target, const void* from) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); assert(prop.unifiedAddressing && "Can only use default direction for memcpy when Unified memory is supported."); diff --git a/lib/runtime/StatsCounter.h b/lib/runtime/StatsCounter.h index 2b00377..5e66b6e 100644 --- a/lib/runtime/StatsCounter.h +++ b/lib/runtime/StatsCounter.h @@ -107,9 +107,9 @@ class NoneRecorder { Statistics stats_r; CUSAN_CUDA_EVENT_LIST #include "TsanEvents.inc" - void inc_TsanMemoryReadCount(unsigned count) { + void inc_TsanMemoryReadCount(unsigned) { } - void inc_TsanMemoryWriteCount(unsigned count) { + void inc_TsanMemoryWriteCount(unsigned) { } }; diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h index 88d0f48..3bfe367 100644 --- a/lib/runtime/TSanInterface.h +++ b/lib/runtime/TSanInterface.h @@ -1,6 +1,11 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wformat" -#pragma GCC diagnostic ignored "-Wmacro-redefined" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wcast-qual" +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wmacro-redefined" +#pragma clang diagnostic ignored "-Wformat-pedantic" +#endif #include "TSan_External.h" diff --git a/lib/support/Util.h b/lib/support/Util.h index 54014a9..081c6fc 100644 --- a/lib/support/Util.h +++ b/lib/support/Util.h @@ -34,7 +34,7 @@ inline std::string demangle(String&& s) { #if LLVM_VERSION_MAJOR >= 15 auto demangle = llvm::itaniumDemangle(name.data(), false); #else - auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); + auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); #endif if (demangle && !std::string(demangle).empty()) { return {demangle}; diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 74d956c..6dffaf8 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -92,7 +92,7 @@ function(configure_cusan_script input output) endfunction() function(cusan_find_mpi_vendor_helper symbol ret_value) - find_package(MPI) + find_package(MPI QUIET) if(NOT MPI_FOUND) set(${ret_value} false) @@ -175,7 +175,6 @@ if(NOT CUSAN_TYPEART) ) endif() -find_package(MPI) if(MPI_C_FOUND) configure_cusan_script( ${CUSAN_WRAPPER} cusan-mpicc${CMAKE_DEBUG_POSTFIX} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index db0f1df..42e693a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -36,6 +36,8 @@ function(configure_cusan_lit_site_cfg input output) pythonize_bool(${CUSAN_TEST_WORKAROUNDS} CUSAN_TEST_WORKAROUND) pythonize_bool(${CUSAN_TYPEART} CUSAN_HAS_TYPEART) + pythonize_bool(${MPI_FOUND} CUSAN_HAS_MPI) + pythonize_bool(${CUSAN_DEVICE_SYNC_CALLBACKS} CUSAN_HAS_SYNC_CALLBACKS) cusan_target_generate_file(${input} ${output}) endfunction() @@ -75,7 +77,7 @@ function(cusan_add_lit_target) set(TARGET_NAME check-cusan-${suite}) endif() - if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging") + if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging" AND NOT ${suite} STREQUAL "multi_gpu") message(WARNING "Could not find suitable lit test target at ${SUITE_PATH}") continue() endif() @@ -90,16 +92,32 @@ function(cusan_add_lit_target) endforeach() endfunction() + +if(NOT MPI_FOUND) + # mock target to make test generation work + add_library(cusan_mpi_interceptor_mock SHARED IMPORTED) + set_target_properties( + cusan_mpi_interceptor_mock + PROPERTIES + OUTPUT_NAME "CusanMPIInterceptorPlaceholder" + IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/lib/runtime" + ) + set(_cusan_interceptor_target cusan_mpi_interceptor_mock) + add_library(cusan::MPI_Interceptor ALIAS cusan_mpi_interceptor_mock) +endif() + set(CUSAN_TEST_DEPENDS cusan::TransformPass - cusan::MPI_Interceptor cusan::Runtime cusan::Analysis - ) +if(FOUND_MPI) + list(APPEND CUSAN_TEST_DEPENDS cusan::MPI_Interceptor) +endif() + if(CUSAN_TYPEART) -list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass) + list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass) endif() set(CUSAN_SUITES @@ -108,6 +126,7 @@ set(CUSAN_SUITES pass kernel_analysis staging + multi_gpu ) include(ProcessorCount) @@ -124,6 +143,7 @@ set(CUSAN_SUITES_WORKERS ${NUM_CPU} ${NUM_CPU} 1 + 1 ) cusan_add_lit_target(SUITES ${CUSAN_SUITES} WORKERS ${CUSAN_SUITES_WORKERS}) diff --git a/test/kernel_analysis/03_struct_write.c b/test/kernel_analysis/03_struct_write.c index c001e03..a6edeef 100644 --- a/test/kernel_analysis/03_struct_write.c +++ b/test/kernel_analysis/03_struct_write.c @@ -22,6 +22,8 @@ // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite // CHECK-NOT: Handling Arg: +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/kernel_analysis/08_big_struct_write.c b/test/kernel_analysis/08_big_struct_write.c index 5d34cd5..3df5664 100644 --- a/test/kernel_analysis/08_big_struct_write.c +++ b/test/kernel_analysis/08_big_struct_write.c @@ -22,6 +22,8 @@ // CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite // CHECK-NOT: Handling Arg: +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/lit.cfg b/test/lit.cfg index 9836a64..bfea736 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -43,10 +43,18 @@ else: if config.cusan_typeart: config.available_features.add('typeart') +if config.mpiexec: + config.available_features.add('mpiexec') +if config.cusan_mpi: + config.available_features.add('mpi') + +if config.cusan_sync_callback: + config.available_features.add('sync_callback') + config.substitutions.append(("%clang-cpp", clang_cpp)) config.substitutions.append(("%clang-cc", clang_cc)) config.substitutions.append(("%opt", opt)) -config.substitutions.append(("%filecheck", filecheck + " --dump-input fail")) +config.substitutions.append(("%filecheck", filecheck + " --dump-input fail --dump-input-context=40")) config.substitutions.append(("%lib_dir", cusan_lib_dir)) config.substitutions.append(("%pass_dir", cusan_pass_dir)) @@ -77,4 +85,4 @@ config.substitutions.append(('%clang_args', '-D__STRICT_ANSI__ -fPIC -O1 -g0')) config.substitutions.append(('%rm-file', 'rm -f ')) -config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -O1 -c')) +config.substitutions.append(('%clang-pass-only-args', '-D__STRICT_ANSI__ -std=c++17 -O1 -c')) diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in index 70030c3..14e1304 100644 --- a/test/lit.site.cfg.in +++ b/test/lit.site.cfg.in @@ -31,6 +31,8 @@ config.cusan_pass = "$" config.cusan_mpi_interceptor = "$" config.cusan_use_workarounds = @CUSAN_TEST_WORKAROUND@ config.cusan_typeart = @CUSAN_HAS_TYPEART@ +config.cusan_mpi = @CUSAN_HAS_MPI@ +config.cusan_sync_callback = @CUSAN_HAS_SYNC_CALLBACKS@ # Let the main config do the real work. config.loaded_site_config = True diff --git a/test/multi_gpu/01_device_sync.c b/test/multi_gpu/01_device_sync.c new file mode 100644 index 0000000..52747c3 --- /dev/null +++ b/test/multi_gpu/01_device_sync.c @@ -0,0 +1,85 @@ +// clang-format off +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: multigpu + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + int* managed_data2; + + int n_devices = 0; + cudaGetDeviceCount(&n_devices); + if (n_devices < 2) { + printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n"); + return 1; + } + + cudaSetDevice(0); + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + cudaSetDevice(1); + cudaMallocManaged(&managed_data2, size * sizeof(int)); + cudaMemset(managed_data2, 0, size * sizeof(int)); + + cudaSetDevice(0); + write_kernel_delay<<>>(managed_data, size, 1316134912); + + // if we only have the later synchronize we will only synchronize the second device +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + + cudaSetDevice(1); + write_kernel_delay<<>>(managed_data2, size, 1); + cudaDeviceSynchronize(); + + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync managed_data %i\n", managed_data[i]); + break; + } + } + for (int i = 0; i < size; i++) { + if (managed_data2[i] == 0) { + printf("[Error] sync managed_data2 %i\n", managed_data[i]); + break; + } + } + + cudaFree(managed_data); + cudaFree(managed_data2); + return 0; +} diff --git a/test/multi_gpu/02_device_sync_event.c b/test/multi_gpu/02_device_sync_event.c new file mode 100644 index 0000000..a72a7fe --- /dev/null +++ b/test/multi_gpu/02_device_sync_event.c @@ -0,0 +1,75 @@ +// clang-format off +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: multigpu + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void kernel(int* data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + data[tid] = (tid + 1); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int n_devices = 0; + cudaGetDeviceCount(&n_devices); + if (n_devices < 2) { + printf("This test is designed for CUDA on multiple devices but there is only one or none here. Exiting.\n"); + return 1; + } + + cudaSetDevice(0); + + int* d_data; // Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + kernel<<>>(d_data); + cudaEventRecord(endEvent); + + cudaSetDevice(1); + +#ifdef CUSAN_SYNC + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/multi_gpu/TSan_External.h b/test/multi_gpu/TSan_External.h new file mode 100644 index 0000000..c7b5796 --- /dev/null +++ b/test/multi_gpu/TSan_External.h @@ -0,0 +1,276 @@ +/* Part of the MUST Project, under BSD-3-Clause License + * See https://hpc.rwth-aachen.de/must/LICENSE for license information. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/** + * @file TSan_External.h + * Functions exported from ThreadSaniziter that can be used for dynamic annotations. + * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface.cpp + * https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_interface_ann.cpp + */ + +#ifndef TSAN_EXTERNAL_H +#define TSAN_EXTERNAL_H + +typedef unsigned long uptr; + +typedef enum { mo_relaxed, mo_consume, mo_acquire, mo_release, mo_acq_rel, mo_seq_cst } morder; + +typedef unsigned char a8; +typedef unsigned short a16; +typedef unsigned int a32; +typedef unsigned long long a64; + +#ifdef MUST_DEBUG +// Print an error message *once* if an annotation function is used that is not overwritten by the +// TSan runtime +#define FALLBACK_PRINT(func_name) \ + { \ + static bool once = false; \ + if (!once) { \ + printf( \ + "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \ + "signature\n", \ + func_name); \ + once = true; \ + } \ + } +#else +#define FALLBACK_PRINT(func_name) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +// ThreadSanitizer defines the following functions that can be used in MUST for dynamic annotations. +void __attribute__((weak)) AnnotateHappensAfter(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateHappensBefore(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateNewMemory(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateMemoryRead(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateMemoryWrite(const char* file, int line, const volatile void* cv, uptr size) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateIgnoreReadsBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreReadsEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreWritesBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreSyncBegin(const char* file, int line) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateIgnoreSyncEnd(const char* file, int line) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) __tsan_read_range(void* addr, uptr size) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_write_range(void* addr, uptr size) { + FALLBACK_PRINT(__func__); +} + +void __attribute__((weak)) __tsan_read1_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write1_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read2_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write2_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read4_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write4_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_read8_pc(void* addr, void* pc) { +} +void __attribute__((weak)) __tsan_write8_pc(void* addr, void* pc) { +} + +#define annotateHelper(rw, s) \ + if (len >= s) { \ + printf("annotateHelper(%s, %i, %p, %li)\n", #rw, s, addr, (uptr)pc); \ + len -= s; \ + __tsan_##rw##s##_pc(addr, pc); \ + addr += s; \ + size -= s; \ + } + +void __attribute__((weak)) __tsan_read_range_pc(void* a, uptr size, void* pc) { + char* addr = (char*)a; + uptr len = ((uptr)addr) % 8; + if (size < len) + len = size; + annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) for (; size > 7; size -= 8) { + __tsan_read8_pc(addr, pc); + addr += 8; + } + len = size; + annotateHelper(read, 4) annotateHelper(read, 2) annotateHelper(read, 1) +} + +void __attribute__((weak)) __tsan_write_range_pc(void* a, uptr size, void* pc) { + char* addr = (char*)a; + uptr len = ((uptr)addr) % 8; + // printf("__tsan_write_range_pc(%p, %li, %li), %li\n", addr, size, pc, len); + if (size < len) + len = size; + annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) for (; size > 7; size -= 8) { + // printf("__tsan_write8_pc(%p, %li)\n", addr, pc); + __tsan_write8_pc(addr, pc); + addr += 8; + } + len = size; + annotateHelper(write, 4) annotateHelper(write, 2) annotateHelper(write, 1) +} + +void __attribute__((weak)) __tsan_func_entry(void* call_pc) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_func_exit() { + FALLBACK_PRINT(__func__); +} + +a8 __attribute__((weak)) __tsan_atomic8_load(const volatile a8* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a16 __attribute__((weak)) __tsan_atomic16_load(const volatile a16* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a32 __attribute__((weak)) __tsan_atomic32_load(const volatile a32* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +a64 __attribute__((weak)) __tsan_atomic64_load(const volatile a64* a, morder mo) { + FALLBACK_PRINT(__func__); + return 0; +} +void __attribute__((weak)) __tsan_atomic8_store(volatile a8* a, a8 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic16_store(volatile a16* a, a16 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic32_store(volatile a32* a, a32 v, morder mo) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_atomic64_store(volatile a64* a, a64 v, morder mo) { + FALLBACK_PRINT(__func__); +} + +// TLC extension +void __attribute__((weak)) AnnotateInitTLC(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); + AnnotateHappensBefore(file, line, cv); +} +void __attribute__((weak)) AnnotateStartTLC(const char* file, int line, const volatile void* cv) { + FALLBACK_PRINT(__func__); + AnnotateHappensAfter(file, line, cv); +} + +// Fibers +void __attribute__((weak)) * __tsan_get_current_fiber() { + FALLBACK_PRINT(__func__); + return nullptr; +} +void __attribute__((weak)) * __tsan_create_fiber(unsigned flags) { + FALLBACK_PRINT(__func__); + return nullptr; +} +void __attribute__((weak)) __tsan_destroy_fiber(void* fiber) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_switch_to_fiber(void* fiber, unsigned flags) { + FALLBACK_PRINT(__func__); +} +void __attribute__((weak)) __tsan_set_fiber_name(void* fiber, const char* name) { + FALLBACK_PRINT(__func__); +} + +#ifdef __cplusplus +} +#endif + +#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv) +#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv) + +#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__) +#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__) +#define TsanIgnoreReadsBegin() AnnotateIgnoreReadsBegin(__FILE__, __LINE__) +#define TsanIgnoreReadsEnd() AnnotateIgnoreReadsEnd(__FILE__, __LINE__) +#define TsanIgnoreSyncBegin() AnnotateIgnoreSyncBegin(__FILE__, __LINE__) +#define TsanIgnoreSyncEnd() AnnotateIgnoreSyncEnd(__FILE__, __LINE__) + +#define TsanInitTLC(cv) AnnotateInitTLC(__FILE__, __LINE__, cv) +#define TsanStartTLC(cv) AnnotateStartTLC(__FILE__, __LINE__, cv) + +#define TsanCreateFiber(flags) __tsan_create_fiber(flags) +#define TsanDestroyFiber(fiber) __tsan_destroy_fiber(fiber) +#define TsanSwitchToFiber(fiber, flags) __tsan_switch_to_fiber(fiber, flags) +#define TsanGetCurrentFiber() __tsan_get_current_fiber() +#define TsanSetFiberName(fiber, name) __tsan_set_fiber_name(fiber, name) + +#define TsanMemoryRead(addr, size) __tsan_read_range(addr, size) +#define TsanMemoryWrite(addr, size) __tsan_write_range(addr, size) +#define TsanMemoryReadPC(addr, size, pc) __tsan_read_range_pc(addr, size, pc) +#define TsanMemoryWritePC(addr, size, pc) __tsan_write_range_pc(addr, size, pc) + +#define TsanFuncEntry(pc) __tsan_func_entry(pc) +#define TsanFuncExit() __tsan_func_exit() + +#define TsanPCMemoryRead(pc, addr, size) AnnotatePCMemoryRead(pc, __FILE__, __LINE__, addr, size) +#define TsanPCMemoryWrite(pc, addr, size) AnnotatePCMemoryWrite(pc, __FILE__, __LINE__, addr, size) + +#define TsanNewMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size) +#define TsanFreeMemory(addr, size) AnnotateNewMemory(__FILE__, __LINE__, addr, size) + +#define TsanRWLockCreate(cv) AnnotateRWLockCreate(__FILE__, __LINE__, cv) +#define TsanRWLockDestroy(cv) AnnotateRWLockDestroy(__FILE__, __LINE__, cv) +#define TsanRWLockAcquired(cv, is_w) AnnotateRWLockAcquired(__FILE__, __LINE__, cv, is_w) +#define TsanRWLockReleased(cv, is_w) AnnotateRWLockReleased(__FILE__, __LINE__, cv, is_w) + +#define TsanAtomic8Load(a8, mo) __tsan_atomic8_load(a8, mo) +#define TsanAtomic8Store(a8, v, mo) __tsan_atomic8_store(a8, v, mo) +#define TsanAtomic16Load(a16, mo) __tsan_atomic16_load(a16, mo) +#define TsanAtomic16Store(a16, v, mo) __tsan_atomic16_store(a16, v, mo) +#define TsanAtomic32Load(a32, mo) __tsan_atomic32_load(a32, mo) +#define TsanAtomic32Store(a32, v, mo) __tsan_atomic32_store(a32, v, mo) +#define TsanAtomic64Load(a64, mo) __tsan_atomic64_load(a64, mo) +#define TsanAtomic64Store(a64, v, mo) __tsan_atomic64_store(a64, v, mo) + +#endif /*TSAN_EXTERNAL_H*/ \ No newline at end of file diff --git a/test/multi_gpu/lit.local.cfg b/test/multi_gpu/lit.local.cfg new file mode 100644 index 0000000..0f360f0 --- /dev/null +++ b/test/multi_gpu/lit.local.cfg @@ -0,0 +1,16 @@ +import subprocess + +if config.cusan_use_workarounds: + config.environment['NEOReadDebugKeys'] = '1' + config.environment['DisableDeepBind'] = '1' + config.environment['OMPI_MCA_memory'] = '^patcher' + +def has_multigpu(): + try: + subprocess.check_call(['nvidia-smi', '-i=1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return True + except subprocess.CalledProcessError: + return False + + +config.available_features.add('multigpu' if has_multigpu() else '') \ No newline at end of file diff --git a/test/multi_gpu/suppressions.txt b/test/multi_gpu/suppressions.txt new file mode 100644 index 0000000..3292e71 --- /dev/null +++ b/test/multi_gpu/suppressions.txt @@ -0,0 +1,29 @@ +called_from_lib:libmpi_cxx* +called_from_lib:libmpi.so* +called_from_lib:libcuda.so* +called_from_lib:libtypeartRuntime.so* + +# Probably not required, from previous experiments +#called_from_lib:libucx* +called_from_lib:libucp* +called_from_lib:libucs* +#called_from_lib:libucm_cuda.so* +called_from_lib:libucm.so* +called_from_lib:libuct_ib.so.0 +called_from_lib:libuct.so.0 +called_from_lib:mca_pmix_pmix3x* +called_from_lib:libibverbs.so* +#called_from_lib:libevent_pthreads-2.1.so* +called_from_lib:libevent_core-2.1.so* +#called_from_lib:libhfi1verbs-rdmav34.so* +called_from_lib:libopen-pal.so.40* +#called_from_lib:mca_errmgr_default_app.so* +#called_from_lib:mca_gds_hash.so* +#called_from_lib:mca_ptl_tcp.so* +#called_from_lib:mca_bfrops_v20.so* +#called_from_lib:mca_psec_munge.so* +#called_from_lib:mca_gds_ds21.so* +#called_from_lib:libmca_common_dstore.so* +called_from_lib:libopen-rte.so.40* +called_from_lib:ld-linux-x86-64.so* +called_from_lib:libmlx5-rdma* diff --git a/test/pass/03_cuda_to_mpi.c b/test/pass/03_cuda_to_mpi.c index 82d6a98..5186403 100644 --- a/test/pass/03_cuda_to_mpi.c +++ b/test/pass/03_cuda_to_mpi.c @@ -2,11 +2,11 @@ // RUN: %rm-file %t.yaml // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR - - // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/04_mpi_to_cuda.c b/test/pass/04_mpi_to_cuda.c index e27f41a..6196dcc 100644 --- a/test/pass/04_mpi_to_cuda.c +++ b/test/pass/04_mpi_to_cuda.c @@ -8,6 +8,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/05_cuda_to_mpi_stream.c b/test/pass/05_cuda_to_mpi_stream.c index bdb304f..6a22f3e 100644 --- a/test/pass/05_cuda_to_mpi_stream.c +++ b/test/pass/05_cuda_to_mpi_stream.c @@ -10,6 +10,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/06_cuda_to_mpi_event.c b/test/pass/06_cuda_to_mpi_event.c index 2253965..50bdd34 100644 --- a/test/pass/06_cuda_to_mpi_event.c +++ b/test/pass/06_cuda_to_mpi_event.c @@ -10,6 +10,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/07_cuda_to_mpi_read.c b/test/pass/07_cuda_to_mpi_read.c index c1ce739..f379fb7 100644 --- a/test/pass/07_cuda_to_mpi_read.c +++ b/test/pass/07_cuda_to_mpi_read.c @@ -11,6 +11,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[from:%[0-9a-z]+]], // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/08_cudamemcpy_to_mpi.c b/test/pass/08_cudamemcpy_to_mpi.c index 3d1139c..1d553a2 100644 --- a/test/pass/08_cudamemcpy_to_mpi.c +++ b/test/pass/08_cudamemcpy_to_mpi.c @@ -15,6 +15,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// REQUIRES: mpi + // clang-format on // Tsan sometimes crashes with this test it seems diff --git a/test/pass/11_cuda_to_mpi_struct_of_buff.c b/test/pass/11_cuda_to_mpi_struct_of_buff.c index 2dcea96..3a2001d 100644 --- a/test/pass/11_cuda_to_mpi_struct_of_buff.c +++ b/test/pass/11_cuda_to_mpi_struct_of_buff.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate diff --git a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c index cd00aae..8c590cc 100644 --- a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream diff --git a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index 1c5d299..248c3a6 100644 --- a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream diff --git a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c index f86f3a0..871b3ab 100644 --- a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset diff --git a/test/pass/21_chunked_streams_example.c b/test/pass/21_chunked_streams_example.c index 13562f7..03774ce 100644 --- a/test/pass/21_chunked_streams_example.c +++ b/test/pass/21_chunked_streams_example.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate diff --git a/test/pass/22_cuda_to_mpi_partial_buff_write.c b/test/pass/22_cuda_to_mpi_partial_buff_write.c index c037c8a..2f23ae8 100644 --- a/test/pass/22_cuda_to_mpi_partial_buff_write.c +++ b/test/pass/22_cuda_to_mpi_partial_buff_write.c @@ -3,6 +3,8 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi + // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize diff --git a/test/pass/26_malloc_pitch.c b/test/pass/26_malloc_pitch.c index b71af1a..7a0a97c 100644 --- a/test/pass/26_malloc_pitch.c +++ b/test/pass/26_malloc_pitch.c @@ -6,6 +6,8 @@ // {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], // {{i8\*|ptr}} {{.*}}[[from]], +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/29_tsan_cuda_to_mpi.c b/test/pass/29_tsan_cuda_to_mpi.c index 7608db6..9ab6fdc 100644 --- a/test/pass/29_tsan_cuda_to_mpi.c +++ b/test/pass/29_tsan_cuda_to_mpi.c @@ -3,6 +3,7 @@ // RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// REQUIRES: mpi // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy diff --git a/test/pass/30_tsan_annotate_cuda_to_mpi.c b/test/pass/30_tsan_annotate_cuda_to_mpi.c index 506eff1..335e372 100644 --- a/test/pass/30_tsan_annotate_cuda_to_mpi.c +++ b/test/pass/30_tsan_annotate_cuda_to_mpi.c @@ -8,6 +8,8 @@ // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree // CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// REQUIRES: mpi + // clang-format on #include "../support/gpu_mpi.h" diff --git a/test/pass/31_tsan_cuda_event.c b/test/pass/31_tsan_cuda_event.c index 918ba99..eb819ea 100644 --- a/test/pass/31_tsan_cuda_event.c +++ b/test/pass/31_tsan_cuda_event.c @@ -1,8 +1,7 @@ // clang-format off // RUN: %rm-file %t.yaml -// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR - +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event @@ -15,8 +14,6 @@ // clang-format on -#include "../support/gpu_mpi.h" - #include __global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] @@ -27,10 +24,6 @@ __global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@L } int main(int argc, char* argv[]) { - if (!has_gpu_aware_mpi()) { - printf("This example is designed for CUDA-aware MPI. Exiting.\n"); - return 1; - } cudaEvent_t first_finished_event; cudaEventCreate(&first_finished_event); cudaStream_t stream1; diff --git a/test/pass/32_tsan_async_copy.c b/test/pass/32_tsan_async_copy.c index cc7c12f..d30220d 100644 --- a/test/pass/32_tsan_async_copy.c +++ b/test/pass/32_tsan_async_copy.c @@ -1,7 +1,7 @@ // clang-format off // RUN: %rm-file %t.yaml -// RUN: %wrapper-mpicc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate diff --git a/test/pass/34_test_sync_callback.c b/test/pass/34_test_sync_callback.c new file mode 100644 index 0000000..2015bfe --- /dev/null +++ b/test/pass/34_test_sync_callback.c @@ -0,0 +1,14 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaDeviceSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaDeviceSynchronize(); + + return 0; +} diff --git a/test/pass/35_test_sync_callback_stream.c b/test/pass/35_test_sync_callback_stream.c new file mode 100644 index 0000000..d8c5b68 --- /dev/null +++ b/test/pass/35_test_sync_callback_stream.c @@ -0,0 +1,15 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaStreamSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaStream_t stream = NULL; + cudaStreamSynchronize(stream); + + return 0; +} diff --git a/test/pass/36_test_sync_callback_event.c b/test/pass/36_test_sync_callback_event.c new file mode 100644 index 0000000..6ee4679 --- /dev/null +++ b/test/pass/36_test_sync_callback_event.c @@ -0,0 +1,14 @@ +// clang-format off +// RUN: %rm-file %t.yaml +// RUN: %wrapper-cc %clang-pass-only-args --cusan-kernel-data=%t.yaml -x cuda --cuda-gpu-arch=sm_72 %s 2>&1 | %filecheck %s +// REQUIRES: sync_callback +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaEventSynchronize +// CHECK: {{(invoke|call)}} void @cusan_sync_callback + +int main(int argc, char* argv[]) { + cudaEvent_t event; + cudaEventSynchronize(event); + return 0; +} diff --git a/test/pass/TSan_External.h b/test/pass/TSan_External.h index 574f0db..c543fdc 100644 --- a/test/pass/TSan_External.h +++ b/test/pass/TSan_External.h @@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c index bbb61e6..e2dcd9d 100644 --- a/test/runtime/03_cuda_to_mpi.c +++ b/test/runtime/03_cuda_to_mpi.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c index 3b54714..77f8915 100644 --- a/test/runtime/04_mpi_to_cuda.c +++ b/test/runtime/04_mpi_to_cuda.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c index e07b35c..4ff4400 100644 --- a/test/runtime/05_cuda_to_mpi_stream.c +++ b/test/runtime/05_cuda_to_mpi_stream.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c index b44e4a3..65ef15e 100644 --- a/test/runtime/06_cuda_to_mpi_event.c +++ b/test/runtime/06_cuda_to_mpi_event.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c index 8d36e6b..d1ee1fc 100644 --- a/test/runtime/07_cuda_to_mpi_read.c +++ b/test/runtime/07_cuda_to_mpi_read.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s +// REQUIRES: mpi + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c index 18c6671..4aedbcc 100644 --- a/test/runtime/08_cudamemcpy_to_mpi.c +++ b/test/runtime/08_cudamemcpy_to_mpi.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty +// REQUIRES: mpi + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c index ea875dc..8455f74 100644 --- a/test/runtime/11_cuda_to_mpi_struct_of_buff.c +++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/11_struct_of_buff.c b/test/runtime/11_struct_of_buff.c index eb594f1..121c3a7 100644 --- a/test/runtime/11_struct_of_buff.c +++ b/test/runtime/11_struct_of_buff.c @@ -10,7 +10,7 @@ // CHECK-SYNC-NOT: data race -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" struct BufferStorage { int* buff1; diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c index 3792d13..41ab21d 100644 --- a/test/runtime/12_struct_ptr.c +++ b/test/runtime/12_struct_ptr.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // CHECK-DAG: data race // CHECK-SYNC-NOT: data race diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c index cf613ca..0b82829 100644 --- a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index c5b221e..23d9a9c 100644 --- a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/20_cuda_default_stream_sync.c b/test/runtime/20_cuda_default_stream_sync.c index 78aedfc..c232483 100644 --- a/test/runtime/20_cuda_default_stream_sync.c +++ b/test/runtime/20_cuda_default_stream_sync.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC // clang-format on @@ -13,8 +13,9 @@ // CHECK-SYNC-NOT: data race // CHECK-SYNC-NOT: [Error] sync -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" +#include #include __global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c index 5d72ad7..7f88a11 100644 --- a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -4,6 +4,9 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c index 90b21e0..6b7dee5 100644 --- a/test/runtime/21_chunked_streams_example.c +++ b/test/runtime/21_chunked_streams_example.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s +// REQUIRES: mpi + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c index 1c8ffcd..2639ce8 100644 --- a/test/runtime/22_cuda_to_mpi_partial_buff_write.c +++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c @@ -2,6 +2,8 @@ // RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty +// REQUIRES: mpi + // clang-format on // CHECK-NOT: data race diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c index b2b12da..0dac955 100644 --- a/test/runtime/26_malloc_pitch.c +++ b/test/runtime/26_malloc_pitch.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/29_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c index a39b5a0..2ac700f 100644 --- a/test/runtime/29_tsan_cuda_to_mpi.c +++ b/test/runtime/29_tsan_cuda_to_mpi.c @@ -5,6 +5,8 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// REQUIRES: mpi + // clang-format on // CHECK: [Error] sync diff --git a/test/runtime/30_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c index 13246ef..19c6d97 100644 --- a/test/runtime/30_tsan_annotate_cuda_to_mpi.c +++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c @@ -4,6 +4,9 @@ // RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda -g %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC + +// REQUIRES: mpi + // clang-format on // CHECK-DAG: data race diff --git a/test/runtime/31_tsan_cuda_event.c b/test/runtime/31_tsan_cuda_event.c index beacaeb..aa213fa 100644 --- a/test/runtime/31_tsan_cuda_event.c +++ b/test/runtime/31_tsan_cuda_event.c @@ -11,7 +11,7 @@ // CHECK-SYNC-NOT: data race -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" #include diff --git a/test/runtime/32_tsan_async_copy.c b/test/runtime/32_tsan_async_copy.c index 5a6f7f7..14cfcd5 100644 --- a/test/runtime/32_tsan_async_copy.c +++ b/test/runtime/32_tsan_async_copy.c @@ -1,9 +1,9 @@ // clang-format off -// RUN: %wrapper-mpicxx %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s +// RUN: %wrapper-cc %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args -x cuda %s -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args -x cuda -gencode arch=compute_70,code=sm_70 %s -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC // CHECK-DAG: data race // CHECK-DAG: [Error] sync @@ -13,8 +13,7 @@ // clang-format on -#include "../support/gpu_mpi.h" - +#include #include __global__ void kernel(int* arr, const int N) { @@ -32,11 +31,6 @@ __global__ void kernel(int* arr, const int N) { } int main(int argc, char* argv[]) { - if (!has_gpu_aware_mpi()) { - printf("This example is designed for CUDA-aware MPI. Exiting.\n"); - return 1; - } - cudaEvent_t first_finished_event; cudaEventCreate(&first_finished_event); cudaStream_t stream1; diff --git a/test/runtime/33_tsan_wait_event.c b/test/runtime/33_tsan_wait_event.c index c675758..af2a5a1 100644 --- a/test/runtime/33_tsan_wait_event.c +++ b/test/runtime/33_tsan_wait_event.c @@ -11,7 +11,7 @@ // clang-format on -#include "../support/gpu_mpi.h" +// #include "../support/gpu_mpi.h" #include diff --git a/test/runtime/TSan_External.h b/test/runtime/TSan_External.h index 36849c9..c7b5796 100644 --- a/test/runtime/TSan_External.h +++ b/test/runtime/TSan_External.h @@ -87,12 +87,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); }