From aa86e001e05467e4016edfe120fd61a78cdcfb4f Mon Sep 17 00:00:00 2001 From: LiuKaixuan Date: Wed, 6 Mar 2024 11:24:38 +0800 Subject: [PATCH] naive-gpu-sw --- CMakeLists.txt | 27 ++- gpu-sw/CMakeLists.txt | 11 ++ gpu-sw/core.hh | 165 ++++++++++++++++++ gpu-sw/cuda-utils.cc | 29 ++++ gpu-sw/cuda-utils.hh | 258 ++++++++++++++++++++++++++++ gpu-sw/mats.cc | 61 +++++++ gpu-sw/seqs.cc | 286 +++++++++++++++++++++++++++++++ gpu-sw/seqs.hh | 307 +++++++++++++++++++++++++++++++++ gpu-sw/sw-lib.cc | 292 +++++++++++++++++++++++++++++++ gpu-sw/sw-lib.hh | 146 ++++++++++++++++ gpu-sw/sw.cu | 361 +++++++++++++++++++++++++++++++++++++++ gpu-sw/sw.hh | 224 ++++++++++++++++++++++++ include/smith.h | 4 +- include/util.h | 3 + src/banded_smith_cpu.cpp | 260 +++++++++++++++++++++++++++- src/blastp.cu | 97 ++++++++++- src/createDB.cpp | 2 +- src/main.cpp | 2 + src/util.cpp | 99 ++++++++++- 19 files changed, 2618 insertions(+), 16 deletions(-) create mode 100644 gpu-sw/CMakeLists.txt create mode 100644 gpu-sw/core.hh create mode 100644 gpu-sw/cuda-utils.cc create mode 100644 gpu-sw/cuda-utils.hh create mode 100644 gpu-sw/mats.cc create mode 100644 gpu-sw/seqs.cc create mode 100644 gpu-sw/seqs.hh create mode 100644 gpu-sw/sw-lib.cc create mode 100644 gpu-sw/sw-lib.hh create mode 100644 gpu-sw/sw.cu create mode 100644 gpu-sw/sw.hh diff --git a/CMakeLists.txt b/CMakeLists.txt index a521314..808cd07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,9 +3,22 @@ cmake_minimum_required(VERSION 3.5) set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda) project(BLASTP LANGUAGES CXX C CUDA) -SET(CMAKE_BUILD_TYPE "Release") +# SET(CMAKE_BUILD_TYPE "Release") +SET(CMAKE_BUILD_TYPE DEBUG) + +option(GLF_GPU_SW "Use GLF-GPU Smith-waterman" ON) +if(GLF_GPU_SW) + add_definitions(-DGLF_GPU_SW) +endif() + +option(USE_GPU_SW "Use GPU Smith-waterman" OFF) +if(USE_GPU_SW) + add_definitions(-DUSE_GPU_SW) +endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17 -Xcompiler -pthread") + if (NOT ("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")) message(SEND_ERROR "require 64 bit system") endif() @@ -50,14 +63,24 @@ target_compile_options (util ${OpenMP_CXX_FLAGS} ) +# gpu_sw +add_subdirectory(./gpu-sw) + add_executable(query src/main.cpp ${SOURCES_SEARCH}) -target_link_libraries(query util) + +# find_library(GPU_SW_LIBRARY +# PATHS ${CMAKE_BINARY_DIR}/gpu-sw +# ) +# target_link_libraries(query PRIVATE util ${GPU_SW_LIBRARY}) +target_link_libraries(query PRIVATE util sw-lib) +# target_link_libraries(query util) add_executable(createDB src/createDB.cpp) target_include_directories(query PRIVATE ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/gpu-sw ) target_include_directories(createDB diff --git a/gpu-sw/CMakeLists.txt b/gpu-sw/CMakeLists.txt new file mode 100644 index 0000000..48848ac --- /dev/null +++ b/gpu-sw/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library(sw-lib STATIC) +set_property(TARGET sw-lib PROPERTY CUDA_ARCHITECTURES "${GPU_ARCHS}") +target_sources(sw-lib PRIVATE + "sw.cu" + "cuda-utils.cc" + "mats.cc" + "seqs.cc" + "sw-lib.cc" + ) +target_link_libraries(sw-lib PUBLIC "${CUDA_LIBRARIES}") +target_include_directories(sw-lib PUBLIC "${CUDA_INCLUDE_DIRS}" ".") \ No newline at end of file diff --git a/gpu-sw/core.hh b/gpu-sw/core.hh new file mode 100644 index 0000000..f1f1779 --- /dev/null +++ b/gpu-sw/core.hh @@ -0,0 +1,165 @@ +#ifndef __ECCL_CORE_HH__ +#define __ECCL_CORE_HH__ + +#include +#include +#include +#include +#include +#include + +/*! core types/definitions/functions, cheap to include */ + + +#ifdef __CUDACC__ +#define CUDA_HOST __host__ +#define CUDA_DEVICE __device__ +#else +#define CUDA_HOST +#define CUDA_DEVICE +#endif + +/*! 10 necleotides per unsigned int (instead of 8) */ +#define ECCL_COMPACT_CODE + +namespace eccl { + +enum class seq_type { + xna, /*! DNA or RNA */ + //dna, + //rna, + prot, +}; + +template +struct code { + unsigned char value{0}; + constexpr static unsigned int width=(Type==seq_type::xna?3:5); +#ifndef ECCL_COMPACT_CODE + constexpr static unsigned int n_per_word=(Type==seq_type::xna?8:4); +#else + constexpr static unsigned int n_per_word=(Type==seq_type::xna?10:6); +#endif + /*! complement neucleotide */ + code operator~() const noexcept { + static_assert(Type==seq_type::xna); + return code{static_cast((~value)&0b111)}; + } +}; +using nucleotide=code; +using amino_acid=code; + +template +struct pair { + unsigned int value{0}; + CUDA_HOST CUDA_DEVICE constexpr pair(code a, code b) noexcept: + value{(static_cast(a.value)< +inline unsigned int padded_len(unsigned int len) noexcept { +#ifndef ECCL_COMPACT_CODE + return Type==seq_type::xna?(len+7)/8*8:(len+3)/4*4; +#else + //return (len+9)/10*10; + return Type==seq_type::xna?(len+39)/40*40:(len+23)/24*24; +#endif +} + +template +inline CUDA_DEVICE code get_code(const unsigned int* buf, unsigned long idx); +template<> +inline CUDA_DEVICE code get_code(const unsigned int* buf, unsigned long idx) { +#ifndef ECCL_COMPACT_CODE + auto v=buf[idx/8]; + return {static_cast((v>>((7-idx%8)*4))&0x0f)}; +#else + auto v=buf[idx/10]; + return {static_cast((v>>((9-idx%10)*3))&0b0111)}; +#endif +}; +template<> +inline CUDA_DEVICE code get_code(const unsigned int* buf, unsigned long idx) { +#ifndef ECCL_COMPACT_CODE + auto v=buf[idx/4]; + return {static_cast((v>>((3-idx%4)*8))&0x1f)}; +#else + auto v=buf[idx/6]; + return {static_cast((v>>((5-idx%6)*5))&0b011111)}; +#endif +}; + +template +class chunk { +public: + constexpr chunk() noexcept: _p{nullptr}, _s{0} { } + explicit chunk(std::size_t size): _p{std::make_unique(size)}, _s{size<<1} { } + + explicit operator bool() const noexcept { return _p.get(); } + std::size_t size() const noexcept { return _s>>1; } + T& operator[](std::size_t i) noexcept { return _p[i]; } + const T& operator[](std::size_t i) const noexcept { return _p[i]; } + bool eof() const noexcept { return _s&1; } + + void shrink(std::size_t size, bool eof=false) noexcept { + assert(size<=(_s>>1)); + _s=(size<<1)|(eof?1:0); + } + +private: + std::unique_ptr _p; + /*! use last bit for eof */ + std::size_t _s; +}; + +template +class source { +public: + chunk get() { + auto chk=_fut.get(); + _fut=prepare(); + return chk; + } + +protected: + constexpr source() noexcept { } + void post_ctor() { + _fut=prepare(); + } + virtual ~source() { } + + virtual std::future> prepare() =0; +private: + std::future> _fut; +}; + + +} + +#if __cplusplus < 201703L +namespace std { +class string_view { +public: + constexpr string_view(const char* p, std::size_t s) noexcept: + _p{p}, _s{s} { } + string_view(const char* p): + _p{p}, _s{std::strlen(p)} { } + std::size_t size() const noexcept { return _s; } + const char& operator[](std::size_t i) const noexcept { return _p[i]; } +private: + const char* _p; + std::size_t _s; +}; +inline bool operator!=(const std::string& a, std::string_view b) noexcept { + return a.compare(0, a.size(), &b[0], b.size())!=0; +} +inline bool operator==(const std::string& a, std::string_view b) noexcept { + return !(a!=b); +} +inline std::ostream& operator<<(std::ostream& oss, std::string_view b) { + return oss.write(&b[0], b.size()); +} +} +#endif + +#endif diff --git a/gpu-sw/cuda-utils.cc b/gpu-sw/cuda-utils.cc new file mode 100644 index 0000000..64dc558 --- /dev/null +++ b/gpu-sw/cuda-utils.cc @@ -0,0 +1,29 @@ +#include "cuda-utils.hh" + +#include +#include + +void eccl::dump_device_info(int device) { + struct cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device); + printf("%s\n", prop.name); + printf("Major revision number: %d\n", prop.major); + printf("Minor revision number: %d\n", prop.minor); + printf("Total global memory: %zu", prop.totalGlobalMem); + printf(" bytes\n"); + printf("Number of multiprocessors: %d\n", prop.multiProcessorCount); + printf("Total amount of shared memory per block: %zu\n",prop.sharedMemPerBlock); + printf("Total registers per block: %d\n", prop.regsPerBlock); + printf("Warp size: %d\n", prop.warpSize); + printf("Maximum memory pitch: %zu\n", prop.memPitch); + printf("Total amount of constant memory: %zu\n", prop.totalConstMem); +} + +namespace eccl { +void operator,(cudaError_t error, eccl::check_cuda checker) { + if(!error) + return; + std::cerr<<"error: "< +#include + +#include + +namespace eccl { + +enum class device { + cpu, + cuda, +}; + +/*! eg: cudaBlah(...), eccl::check_cuda{"error tag"}; */ +class check_cuda { +public: + explicit constexpr check_cuda(const char* msg) noexcept: + _msg{msg} { } +private: + const char* _msg; + friend void operator,(cudaError_t error, check_cuda checker); +}; + +void dump_device_info(int device); + +//pointer/buffer types: +// +//host_only +// + +/*! device raw pointer, no deference in CPU, no memory management */ +template +class dev_ptr { +public: + constexpr dev_ptr() noexcept: _p{nullptr} { } + explicit constexpr dev_ptr(T* p) noexcept: _p{p} { } + ~dev_ptr() { } + dev_ptr(const dev_ptr&) noexcept =default; + dev_ptr& operator=(const dev_ptr&) noexcept =default; + + operator T*() const noexcept { return static_cast(_p); } + +private: + void* _p; +}; + +template +using host_ptr=T*; + +namespace detail { + +template +struct cuda_host_allocator { + using value_type=T; + using size_type=std::size_t; + constexpr cuda_host_allocator() noexcept { } + ~cuda_host_allocator() { } + + T* allocate(size_type n) { + T* r; + cudaMallocHost(&r, n*sizeof(T)), check_cuda{"malloc host"}; + //fprintf(stderr, "malloc host: %p %zd\n", r, n*sizeof(T)); + return r; + } + void deallocate(T* p, size_type n) { + //fprintf(stderr, "free.. host: %p %zd\n", p, n*sizeof(T)); + cudaFreeHost(p), check_cuda{"free host"}; + } +}; + +template +struct cuda_dev_allocator { + using value_type=T; + using size_type=std::size_t; + constexpr cuda_dev_allocator() noexcept { } + ~cuda_dev_allocator() { } + + T* allocate(size_type n) { + T* r; + cudaMalloc(&r, n*sizeof(T)), check_cuda{"malloc dev"}; + //fprintf(stderr, "malloc dev: %p %zd\n", r, n*sizeof(T)); + //fprintf(stderr, "malloc dev: %p %zd\n", r, n*sizeof(T)); + return r; + } + void deallocate(T* p, size_type n) { + //fprintf(stderr, "free.. dev: %p %zd\n", p, n*sizeof(T)); + cudaFree(p), check_cuda{"free dev"}; + } +}; + +} + +/*! host buffer, accessible to device */ +template +class host_vector: public std::vector> { +public: + using std::vector>::vector; + operator dev_ptr() noexcept { + return dev_ptr(&(*this)[0]); + } +}; +//using host_vector= + +/*! device buffer, std::vector-like API */ +template +class dev_vector { +public: + constexpr dev_vector() noexcept: _p{nullptr}, _s{0} { } + ~dev_vector() { + if(_p) { + detail::cuda_dev_allocator{}.deallocate(_p, _s); + } + } + dev_vector(const dev_vector&) =delete; + dev_vector& operator=(const dev_vector&) =delete; + dev_vector(dev_vector&& r) noexcept: _p{r._p}, _s{r._s} { + r._p=nullptr; + r._s=0; + } + dev_vector& operator=(dev_vector&& r) noexcept { + std::swap(_p, r._p); + std::swap(_s, r._s); + return *this; + } + + operator T*() const noexcept { return static_cast(_p); } + operator eccl::dev_ptr() const noexcept { + return eccl::dev_ptr{static_cast(_p)}; + } + std::size_t size() const noexcept { return _s; } + + void try_malloc(std::size_t s) { + if(_p) { + if(_s(1024, 16*(s-_s)); + malloc(ss); + } + } else { + malloc(s); + } + } + void malloc(std::size_t s) { + assert(_p==nullptr); + _p=detail::cuda_dev_allocator{}.allocate(s); + _s=s; + } + + void free() { + assert(_p); + detail::cuda_dev_allocator{}.deallocate(_p, _s); + _p=nullptr; + } + + template + void try_malloc_h2d(const Vec& vec, cudaStream_t stream) { + try_malloc(vec.size()); + h2d(vec, stream); + } + void malloc_h2d(const host_vector& vec, cudaStream_t stream) { + malloc(vec.size()); + h2d(vec, stream); + } + template + void h2d(const std::vector& vec, cudaStream_t stream) { + h2d(&vec[0], vec.size(), stream); + } + void h2d(const T* p, std::size_t s, cudaStream_t stream) { + assert(s<=_s); + //fprintf(stderr, "H %p -> D %p %zd\n", &_q[0], _p, _q.size()*sizeof(T)); + cudaMemcpyAsync(_p, p, s*sizeof(T), cudaMemcpyHostToDevice, stream), check_cuda{"memcpy async to device"}; + } + void h2d(const T* p, std::size_t s) { + assert(s<=_s); + cudaMemcpy(_p, p, s*sizeof(T), cudaMemcpyHostToDevice), check_cuda{"memcpy to device"}; + } + void d2h(T* p, std::size_t s) { + assert(s<=_s); + //fprintf(stderr, "D2h %p %p %zd\n", _p, p, s*sizeof(T)); + cudaMemcpy(p, _p, s*sizeof(T), cudaMemcpyDeviceToHost), check_cuda{"memcpy to host"}; + } + void d2h(T* p, std::size_t s, cudaStream_t stream) { + assert(s<=_s); + //fprintf(stderr, "D2h async %p %p %zd\n", _p, p, s*sizeof(T)); + cudaMemcpyAsync(p, _p, s*sizeof(T), cudaMemcpyDeviceToHost, stream), check_cuda{"memcpy async to host"}; + } + template + void d2h(std::vector& vec, cudaStream_t stream) { + d2h(&vec[0], vec.size(), stream); + } + +private: + T* _p; + std::size_t _s{0}; +}; + +///////////////////////////////////////// + +class event { +public: + constexpr event() noexcept: _e{nullptr} { } + ~event() { + if(_e) + cudaEventDestroy(_e), check_cuda{"create event"}; + } + event(const event&) =delete; + event& operator=(const event&) =delete; + //event(event&& r) =delete; + //event& operator=(event&& r) =delete; + + void create() { + cudaEventCreate(&_e), check_cuda{"create event"}; + } + void record(cudaStream_t stream) { + cudaEventRecord(_e, stream), check_cuda{"record event"}; + } + + operator cudaEvent_t() const noexcept { return _e; } + +private: + cudaEvent_t _e; +}; + + +class stream { +public: + constexpr stream() noexcept: _s{nullptr} { } + ~stream() { + if(_s) + cudaStreamDestroy(_s), check_cuda{"destroy stream"}; + } + stream(const stream&) =delete; + stream& operator=(const stream&) =delete; + //stream(stream&& r) =delete; + //stream& operator=(stream&& r) =delete; + + void create() { + cudaStreamCreate(&_s), check_cuda{"create stream"}; + } + + operator cudaStream_t() const noexcept { return _s; } + + void wait(cudaEvent_t event) { + cudaStreamWaitEvent(_s, event, cudaEventWaitDefault), check_cuda{"wait event"}; + } + void sync() { + cudaStreamSynchronize(_s), check_cuda{"sync stream"}; + } + +private: + cudaStream_t _s; +}; + +} + +#endif diff --git a/gpu-sw/mats.cc b/gpu-sw/mats.cc new file mode 100644 index 0000000..cc40161 --- /dev/null +++ b/gpu-sw/mats.cc @@ -0,0 +1,61 @@ +#include +#include + +#include "core.hh" +#include "seqs.hh" + +namespace { + +template +struct mat_helper { + std::string_view hdr; + eccl::seq_encoder b2b; + std::array& mat; + explicit mat_helper(const char* hdr, std::array& mat): hdr{hdr}, b2b{}, mat{mat} { } + void set(eccl::code a, unsigned int i) { + } + template + void set(eccl::code a, unsigned int i, Arg val, Args... vals) { + assert(i{a, b}.value]=val; + set(a, i+1, vals...); + } + template + void operator()(char c, Args... vals) { + auto a=b2b(c); + set(a, 0, vals...); + } +}; + +} + +void eccl::mat_blosum62(std::array& mat) { + // ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM62 + mat_helper h{"ARNDCQEGHILKMFPSTWYVBZX*", mat}; + h('A', 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4); + h('R',-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4); + h('N',-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4); + h('D',-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4); + h('C', 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4); + h('Q',-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4); + h('E',-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4); + h('G', 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4); + h('H',-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4); + h('I',-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4); + h('L',-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4); + h('K',-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4); + h('M',-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4); + h('F',-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4); + h('P',-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4); + h('S', 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4); + h('T', 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4); + h('W',-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4); + h('Y',-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4); + h('V', 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4); + h('B',-2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4); + h('Z',-1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4); + h('X', 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4); + h('*',-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1); +} + diff --git a/gpu-sw/seqs.cc b/gpu-sw/seqs.cc new file mode 100644 index 0000000..be1ab44 --- /dev/null +++ b/gpu-sw/seqs.cc @@ -0,0 +1,286 @@ +#include "seqs.hh" + +//#include +#include +#include + + +// XXX alternative: 32-bit, 10-bp, 2-bit unused per word +// avoid 4-bit padded to 3-bit conversion. +// but, is 10 harder to divide by than 8??? +template +class eccl::detail::BitStream { +public: + explicit constexpr BitStream(Cont& cont) noexcept: + cont{cont} { } + std::size_t size() const noexcept { return tmp_cnt; } + void add(unsigned int v) { +#ifndef ECCL_COMPACT_CODE + tmp_bits=(tmp_bits<=bits_thr) { + cont.push_back(tmp_bits); + tmp_bits=1; + } +#endif + } + void flush() { +#ifndef ECCL_COMPACT_CODE + auto rem=tmp_cnt%padded_count; + if(rem!=0) { + tmp_cnt+=padded_count-rem; + cont.push_back(tmp_bits<<(padded_count-rem)*padded_width); + } +#else + auto rem=tmp_cnt%nopad_count; + if(rem!=0) { + tmp_cnt+=nopad_count-rem; + cont.push_back(tmp_bits<<(nopad_count-rem)*width); + tmp_bits=1; + } +#endif + }; +private: + Cont& cont; + std::size_t tmp_cnt=0; + unsigned int tmp_bits{1}; + static constexpr unsigned int bits_thr=(Type==eccl::seq_type::xna?0x4000'0000:0x4000'0000); + static constexpr unsigned int width=(Type==eccl::seq_type::xna?3:5); + static constexpr unsigned int padded_width=(Type==eccl::seq_type::xna?4:8); + static constexpr unsigned int padded_count=(Type==eccl::seq_type::xna?8:4); + static constexpr unsigned int nopad_count=(Type==eccl::seq_type::xna?10:6); +}; + + +std::shared_ptr eccl::fasta_reader::load(eccl::source& str, std::size_t buf_size, unsigned int num_seq) { + std::shared_ptr chk{}; + if(st==St::end) + return chk; + std::size_t cur_idx{0}; + std::size_t non_seq{0}; + using Coords=fasta_chunk::Coords; + std::size_t n; + do { + while(cur_idx') { + cur.coords.emplace_back(Coords{cur_idx+1, SIZE_MAX, SIZE_MAX, SIZE_MAX}); + st=St::in_hdr; + break; + } + break; + case St::in_hdr: + if((n=cur.buf.find('\n', cur_idx))!=std::string::npos) { + cur_idx=n; + auto& coords=cur.coords.back(); + coords.name_e=cur_idx; + cur.names_total+=coords.name_e-coords.name_s; + coords.seq_s=cur_idx+1; + st=St::expect_seq; + non_seq=0; + break; + } + cur_idx=cur.buf.size()-1; + break; + case St::expect_seq: + if(c=='>') { + auto& coords=cur.coords.back(); + coords.seq_e=cur_idx; + cur.seqs_total+=coords.seq_e-coords.seq_s-non_seq; + st=St::in_hdr; + if((buf_size>0 && cur.seqs_total>=buf_size) || (num_seq>0 && cur.coords.size()>=num_seq)) { + Chunk next{}; + next.buf.assign(&cur.buf[cur_idx], &cur.buf[cur.buf.size()]); + next.coords.emplace_back(Coords{1, SIZE_MAX, SIZE_MAX, SIZE_MAX}); + chk=std::make_shared(std::move(cur)); + cur=std::move(next); + return chk; + } + cur.coords.emplace_back(Coords{cur_idx+1, SIZE_MAX, SIZE_MAX, SIZE_MAX}); + break; + } + if(c=='\n') { + ++non_seq; + break; + } + st=St::in_seq; + break; + case St::in_seq: + if((n=cur.buf.find('\n', cur_idx))!=std::string::npos) { + cur_idx=n; + ++non_seq; + st=St::expect_seq; + break; + } + cur_idx=cur.buf.size()-1; + break; + case St::end: + assert(0); + } + ++cur_idx; + } + if(hiteof) + break; + auto blk=str.get(); + if(blk.eof()) + hiteof=true; + cur.buf.resize(cur_idx+blk.size()); + std::copy_n(&blk[0], blk.size(), &cur.buf[cur_idx]); + } while(true); + switch(st) { + case St::expect_hdr: + return {}; + case St::expect_seq: + break; + default: + fprintf(stderr, "wrong st: %d\n", (int)st); + assert(0); + throw st; + } + st=St::end; + auto& coords=cur.coords.back(); + coords.seq_e=cur_idx; + cur.seqs_total+=coords.seq_e-coords.seq_s-non_seq; + chk=std::make_shared(std::move(cur)); + return chk; +} + +template +eccl::seqs::seqs(const eccl::fasta_chunk& chunk) { + _b.seq_buf.reserve(chunk.seqs_total/2/sizeof(unsigned int)+chunk.coords.size()); + _b.name_buf.reserve(chunk.names_total); + _b.coords.reserve(chunk.coords.size()); + + detail::BitStream> bits{_b.seq_buf}; + eccl::seq_encoder b2b; + _b.max_seq_l=0; + for(auto coord: chunk.coords) { + detail::seqs_base::Coords cc; + while(coord.name_s::seqs(const eccl::fasta_chunk& chunk); +template eccl::seqs::seqs(const eccl::fasta_chunk& chunk); + +eccl::seq_encoder_tables::seq_encoder_tables() noexcept: +_table_xna{0, }, _table_prot{0, } +{ + _rev_xna[0b000]='\x00'; + _rev_xna[0b001]='U'; + _rev_xna[0b010]='C'; + _rev_xna[0b011]='T'; + _rev_xna[0b100]='A'; + _rev_xna[0b101]='G'; + _rev_xna[0b110]='X'; + _rev_xna[0b111]='N'; + + _rev_prot[0b00000]='\x00'; + _rev_prot[0b11111]='X'; + _rev_prot[0b11110]='*'; + _rev_prot[0b11101]='-'; + _rev_prot[0b00001]='A'; + _rev_prot[0b00010]='B'; + _rev_prot[0b00011]='C'; + _rev_prot[0b00100]='D'; + _rev_prot[0b00101]='E'; + _rev_prot[0b00110]='F'; + _rev_prot[0b00111]='G'; + _rev_prot[0b01000]='H'; + _rev_prot[0b01001]='I'; + _rev_prot[0b01010]='K'; + _rev_prot[0b01011]='L'; + _rev_prot[0b01100]='M'; + _rev_prot[0b01101]='N'; + _rev_prot[0b01110]='P'; + _rev_prot[0b01111]='Q'; + _rev_prot[0b10000]='R'; + _rev_prot[0b10001]='S'; + _rev_prot[0b10010]='T'; + _rev_prot[0b10011]='U'; + _rev_prot[0b10100]='V'; + _rev_prot[0b10101]='W'; + _rev_prot[0b10110]='Y'; + _rev_prot[0b10111]='Z'; + + for(unsigned int i=0; i<_rev_xna.size(); ++i) { + auto c=_rev_xna[i]; + if(c) { + _table_xna[std::tolower(c)]=_table_xna[c]=i; + } + } + for(unsigned int i=0; i<_rev_prot.size(); ++i) { + auto c=_rev_prot[i]; + if(c) { + _table_prot[std::tolower(c)]=_table_prot[c]=i; + } + } +} + +template +eccl::seqs_builder::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs): + _b{}, + _str{std::make_shared>>(_b.seq_buf)}, + _encoder{} +{ +} +template eccl::seqs_builder::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs); +template eccl::seqs_builder::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs); + +template +inline static void copy_seq(T& b, E& e, std::string_view seq, int*) { + for(std::size_t i=0; i +inline static void copy_seq(T& b, E& e, std::string_view seq, char*) { + static_assert(Type==eccl::seq_type::xna); + for(std::size_t i=seq.size(); i>0; --i) { + auto comp=~e(seq[i-1]); + b.add(comp.value); + } +} +template +template +void eccl::seqs_builder::push_back(std::string_view seq) { + detail::seqs_base::Coords s; + s.name_e=_b.name_buf.size(); + s.seq_s=_b.seq_buf.size(); + copy_seq(*_str, _encoder, seq, std::conditional_t{}); + _str->flush(); + s.seq_l=seq.size(); + if(_b.max_seq_l::push_back(std::string_view seq); +template void eccl::seqs_builder::push_back(std::string_view seq); +template void eccl::seqs_builder::push_back(std::string_view seq); + +#if 0 +#endif + +eccl::seq_encoder_tables eccl::seq_encoder_tables::_t; + diff --git a/gpu-sw/seqs.hh b/gpu-sw/seqs.hh new file mode 100644 index 0000000..db8725f --- /dev/null +++ b/gpu-sw/seqs.hh @@ -0,0 +1,307 @@ +#ifndef __ECCL_SEQS_HH__ +#define __ECCL_SEQS_HH__ + +#include +#include +#include +#include +#include +#include +#include + +#include "core.hh" + +namespace eccl { + +/*! API for a batch of sequences + * + * optimized for medium-sized batches of reads; + * may be used for a single sequence or a whole genome. + * + * limits: + * - maximum length of one sequence: 4Gi; + */ + + +template +struct seq_view { + /*! if reverse/complement sequence is not relevent, + * use bits directly. + * otherwise, + * bits&1 means the reverse sequence, + * bits&2 means the complement sequence, + * and use bits without the 2 LSB. + */ + const unsigned int* bits; + /*! sub-seq range: [begin, end). begin<=end. */ + uint32_t begin, end; + uint32_t len() const noexcept { return end-begin; } + std::size_t buf_size() const noexcept { +#ifndef ECCL_COMPACT_CODE + return Type==seq_type::xna?(len()+7)/8:(len()+3)/4; +#else + return Type==seq_type::xna?(len()+9)/10:(len()+5)/6; +#endif + } +}; + +namespace detail { +struct seqs_base { + struct Coords { + uint64_t seq_s; + uint32_t seq_l; + uint32_t name_e; + }; + uint32_t max_seq_l{0}; + std::vector coords; + std::vector seq_buf; + std::vector name_buf; +}; +template +class BitStream; +} + +class fasta_chunk; +template +class seqs_builder; + +template +class seqs { +public: + explicit seqs(const fasta_chunk& chunk); + seqs(seqs_builder&& builder) noexcept; + ~seqs() =default; + seqs(const seqs&) =delete; + seqs& operator=(const seqs&) =delete; + seqs(seqs&& r) noexcept: _b{std::move(r._b)} { } + seqs& operator=(seqs&& r) noexcept { + _b=std::move(r._b); + return *this; + } +#if 0 + std::pair start_len(std::size_t i) const noexcept { + auto& c=_coords[i]; + return {c.seq_s, c.seq_l}; + } +#endif + std::size_t size() const noexcept { return _b.coords.size(); } + seq_view seq(std::size_t idx) const noexcept { + auto& c=_b.coords[idx]; + return {&_b.seq_buf[c.seq_s], 0, c.seq_l}; + } + seq_view operator[](std::size_t idx) const noexcept { + return seq(idx); + } + std::string_view name(std::size_t i) const noexcept { + auto a=i>0?_b.coords[i-1].name_e:0; + auto b=_b.coords[i].name_e-a; + return {&_b.name_buf[a], b}; + } + uint64_t begin(std::size_t idx) const noexcept { + auto& c=_b.coords[idx]; + return c.seq_s*code::n_per_word; + } + + const unsigned int* buf() const noexcept { return &_b.seq_buf[0]; } + std::size_t buf_size() const noexcept { return _b.seq_buf.size(); } + uint32_t max_seq_len() const noexcept { return _b.max_seq_l; } + +private: + detail::seqs_base _b; + friend std::ostream& operator<<(std::ostream& str, const seqs& seqs) { + return str<<&seqs._b.seq_buf[0]<<'@'<; +using prot_seqs=seqs; + +/*! use 3 bits for each DNA/RNA base, but 4-bit aligned in a sequence. + * use 5 bits for each amino acid. + */ + +class seq_encoder_tables; + +template +class seq_encoder { +public: + explicit seq_encoder() noexcept =default; + ~seq_encoder() =default; + + code operator()(char c) const noexcept; + char rev(code i) const noexcept; +}; + +template +class seqs_builder { +public: + seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs); + ~seqs_builder() =default; + seqs_builder(const seqs_builder&) =delete; + seqs_builder& operator=(const seqs_builder&) =delete; + + template + void push_back(std::string_view seq); + +private: + detail::seqs_base _b; + std::shared_ptr>> _str; + seq_encoder _encoder; + friend class seqs; +}; + +template +seqs::seqs(seqs_builder&& builder) noexcept: +_b{std::move(builder._b)} { } + + +class seq_encoder_tables { +public: + explicit seq_encoder_tables() noexcept; +private: + std::array _table_xna, _table_prot; + std::array _rev_xna; + std::array _rev_prot; + template + const unsigned char* table() const noexcept { + return Type==seq_type::xna?&_table_xna[0]:&_table_prot[0]; + } + template + const char* rev() const noexcept { + return Type==seq_type::xna?&_rev_xna[0]:&_rev_prot[0]; + } + static seq_encoder_tables _t; + template friend class seq_encoder; +}; + +template +inline code seq_encoder::operator()(char c) const noexcept { + return {seq_encoder_tables::_t.table()[static_cast(c)]}; +} +template +inline char seq_encoder::rev(code i) const noexcept { + return seq_encoder_tables::_t.rev()[i.value]; +} + +class fasta_chunk { +public: + fasta_chunk(fasta_chunk&&) =default; + ~fasta_chunk() =default; + fasta_chunk(const fasta_chunk&) =delete; + fasta_chunk& operator=(const fasta_chunk&) =delete; + +private: + explicit fasta_chunk() =default; + fasta_chunk& operator=(fasta_chunk&&) =default; + + struct Coords { + std::size_t name_s, name_e; + std::size_t seq_s, seq_e; + }; + std::size_t names_total{0}; + std::size_t seqs_total{0}; + std::vector coords; + std::string buf; + friend class fasta_reader; + template + friend class seqs; +}; + +class fasta_reader { +public: + explicit fasta_reader() =default; + ~fasta_reader() =default; + fasta_reader(const fasta_reader&) =delete; + fasta_reader& operator=(const fasta_reader&) =delete; + + std::shared_ptr load(eccl::source& str, std::size_t buf_size, unsigned int num_seq); + +private: + using Chunk=fasta_chunk; + Chunk cur{}; + enum class St { + expect_hdr, + in_hdr, + expect_seq, + in_seq, + end, + } st; + bool hiteof{false}; +}; + +// XXX +class seqs_ref { +public: +private: +}; + +template +void fill_score_table_xna(std::vector& _table, const ScoreOpts& scores) { + _table.reserve(8*8+2); + _table.resize(8*8, -100); + auto set=[&_table](auto a, auto b, Int v) { + _table[pair{a, b}.value]=v; + _table[pair{b, a}.value]=v; + }; + const Int m=scores.match ? *scores.match : 1; + const Int mm=scores.mismatch ? *scores.mismatch : -4; + const Int go=scores.gap_open ? *scores.gap_open : -6; + const Int ge=scores.gap_extend ? *scores.gap_extend : -1; + _table.push_back(go); + _table.push_back(ge); + + eccl::seq_encoder b2b{}; + const auto A=b2b('A'); + const auto T=b2b('T'); + const auto C=b2b('C'); + const auto G=b2b('G'); + const auto U=b2b('U'); + set(A, A, m); + set(T, T, m); + set(T, U, m); + set(U, U, m); + set(C, C, m); + set(G, G, m); + + set(A, T, mm); + set(A, U, mm); + set(A, C, mm); + set(A, G, mm); + + set(T, C, mm); + set(T, G, mm); + set(U, C, mm); + set(U, G, mm); + + set(C, G, mm); +} +void mat_blosum62(std::array& mat); + +template> +inline void fill_score_table_impl(std::vector& _table, const ScoreOpts& scores) { + fill_score_table_xna(_table, scores); +} +template> +inline void fill_score_table_impl(std::vector& _table) { + _table.reserve(32*32+2); + std::array mat{-100, }; + mat_blosum62(mat); + for(auto v: mat) + _table.push_back(v); + const Int gap_open=-11; + const Int gap_ext=-1; + _table.push_back(gap_open); + _table.push_back(gap_ext); +} +template> +inline void fill_score_table(std::vector& _table, const ScoreOpts& scores) { + return fill_score_table_impl(_table, scores); +} +template> +inline void fill_score_table(std::vector& _table) { + return fill_score_table_impl(_table); +} + +} + +#endif diff --git a/gpu-sw/sw-lib.cc b/gpu-sw/sw-lib.cc new file mode 100644 index 0000000..80dbcdd --- /dev/null +++ b/gpu-sw/sw-lib.cc @@ -0,0 +1,292 @@ + +#include "sw.hh" +#include "sw-lib.hh" + +#include +#include +#include +using namespace glf; +// struct glf::sw_handle { + // eccl::sw_buffers buffs; +// }; +template +struct glf::sw_handle { + eccl::sw_buffers buffs; +}; + + +template +sw_handle* glf::sw_create(const sw_opts& opts) { + auto res=std::make_unique>(); + res->buffs.init(0, opts.scores); + return res.release(); +} + +template +sw_handle* glf::sw_create() { + auto res=std::make_unique>(); + res->buffs.init(0); + return res.release(); +} + +template +void glf::sw_destroy(sw_handle* hdl) { + delete hdl; +} + + +template +std::vector glf::sw_batch(sw_handle* hdl, + const std::vector& query, + const std::vector& target, + const sw_batch_opts& opts) { + auto gen_seq_enc=[](auto& raw) { + eccl::seqs_builder builder{0, 0, 0}; + for(auto s: raw) + builder.template push_back(s); + return eccl::seqs{std::move(builder)}; + }; + auto query_enc=gen_seq_enc(query); + auto target_enc=gen_seq_enc(target); + auto& buffs=hdl->buffs; + buffs.run(query_enc, target_enc); + + std::vector results; + results.reserve(query.size()); + for(unsigned int i=0; i0) { + --nn; + unsigned int v=(cigar[nn/4]>>((nn%4)*8))&0xff; + auto op="\x00\x03\x02\x00"[v&0b11]; + mmcnt+="\x01\x01\x01\x00"[v&0b11]; + v=(v>>2)+1; + if(op==prev_op) { + prev_cnt+=v; + } else { + if(prev_cnt>0) { + put_cigar(prev_cnt, prev_op); + } + prev_cnt=v; + prev_op=op; + } + } + if(prev_cnt>0) { + put_cigar(prev_cnt, prev_op); + } + if(res.cigar_len>MAX_NUM_CIGAR) + res.sw_score=0; + res.mismatch=mmcnt; + if(false) { + std::array buf; + std::cerr<<"seq1 "<& q_res, std::vector& r_res) { +// q_res.clear(); +// r_res.clear(); +// int qp = alignment.query_begin; +// int rp = alignment.ref_begin; +// for (int c=0;c +static void inspect(const eccl::dev_vector& obj, sw_stats& stats) { + stats.dev_mem_usage+=obj.size()*sizeof(T); +} +template +static void inspect(const eccl::host_vector& obj, sw_stats& stats) { + stats.host_mem_usage+=obj.size()*sizeof(T); +} +template +static void inspect(const eccl::sw_buffers& obj, sw_stats& stats) { + inspect(obj.subst_mat, stats); + inspect(obj.query_buf, stats); + inspect(obj.query_se, stats); + inspect(obj.target_buf, stats); + inspect(obj.target_se, stats); + inspect(obj.tmp_scores, stats); + inspect(obj.tmp_traceback_s, stats); + inspect(obj.tmp_traceback_buf, stats); + inspect(obj.results, stats); + inspect(obj.cigar_s, stats); + inspect(obj.cigar_buf, stats); + inspect(obj.results_host, stats); + inspect(obj.cigar_buf_host, stats); + inspect(obj.cigar_s_host, stats); +} + +template +static void inspect(const sw_handle& obj, sw_stats& stats) { + inspect(obj.buffs, stats); +} + +template +sw_stats sw_inspect(sw_handle* hdl) { + assert(hdl); + sw_stats stats; + stats.dev_mem_usage=0; + stats.host_mem_usage=0; + inspect(*hdl, stats); + return stats; +} + +inline static char* format_impl(char* p, char* q, const char* n) { + while(*n) { + *p=*n; + ++p; + ++n; + } + return p; +} +inline static char* format_impl(char* p, char* q, char n) { + *p=n; + ++p; + return p; +} +inline static char* format_impl(char* p, char* q, std::string_view n) { + for(unsigned int i=0; i::value>> +inline static char* format_impl(char* p, char* q, T n) { + auto res=std::to_chars(p, q, n); + if(res.ec!=std::errc{}) + throw; + p=res.ptr; + return p; +} + +std::string_view eccl::format_sw_result(std::array& buf, + const eccl::sw_result& align, + std::string_view name_query, + std::string_view name_target, + const unsigned int* cigar, + unsigned int cigar_len) { + auto put=[&buf](char* p, auto n) ->char* { + return format_impl(p, &buf[4096], n); + }; + auto p=&buf[0]; + p=put(p, "query_name="); + p=put(p, name_query); + p=put(p, "\ttarget_name="); + p=put(p, name_target); + p=put(p, "\tscore="); + p=put(p, align.score); + p=put(p, "\tquery_batch_start="); + p=put(p, align.query_se.x); + p=put(p, "\ttarget_batch_start="); + p=put(p, align.target_se.x); + p=put(p, "\tquery_batch_end="); + p=put(p, align.query_se.y); + p=put(p, "\ttarget_batch_end="); + p=put(p, align.target_se.y); + p=put(p, "\tCIGAR="); + // cigar is slow + //continue; + + //fprintf(stderr, "alignment: %zd %u\n", oo, nn); + std::size_t prev_cnt{0}; + char prev_op{'\x00'}; + auto nn=cigar_len; + while(nn>0) { + --nn; + unsigned int v=(cigar[nn/4]>>((nn%4)*8))&0xff; + auto op="XIDM"[v&0b11]; + v=(v>>2)+1; + if(op==prev_op) { + prev_cnt+=v; + } else { + if(prev_cnt>0) { + p=put(p, prev_cnt); + p=put(p, prev_op); + } + prev_cnt=v; + prev_op=op; + } + } + if(prev_cnt>0) { + p=put(p, prev_cnt); + p=put(p, prev_op); + } + p=put(p, '\n'); + std::size_t l=p-&buf[0]; + return {&buf[0], l}; +} diff --git a/gpu-sw/sw-lib.hh b/gpu-sw/sw-lib.hh new file mode 100644 index 0000000..dd55dd7 --- /dev/null +++ b/gpu-sw/sw-lib.hh @@ -0,0 +1,146 @@ +#ifndef GLF_SW_LIB_HH +#define GLF_SW_LIB_HH + +#include +#include +#include + +//refactor!!! +#include +#include +#include +#include + +#include "core.hh" + +#define MAPSTR "MXDIE" +#ifndef CIGAR_SHIFT +#define CIGAR_SHIFT 2u +#endif + +#define MAX_NUM_CIGAR 16 + +typedef struct { + uint16_t sw_score; + uint16_t mismatch; + int32_t ref_begin; + int32_t ref_end; + int32_t query_begin; + int32_t query_end; + uint32_t cigar[MAX_NUM_CIGAR]; + int32_t cigar_len; +} sw_align; + +extern const uint8_t encoded_ops[]; + +static inline uint32_t to_cigar_int(uint32_t length, char op) { + return (length << CIGAR_SHIFT) | (encoded_ops[(int)op]); +} + +static inline char cigar_int_to_op(uint32_t cigar_int) { + return MAPSTR[cigar_int & 0x3U]; +} + +static inline uint32_t cigar_int_to_len(uint32_t cigar_int) { + return cigar_int >> CIGAR_SHIFT; +} + + +// static void cigar_to_string(const sw_align& alignment, std::string_view& query, std::string_view& ref, std::vector& q_res, std::vector& r_res); +static void cigar_to_string(const sw_align& alignment, std::string_view& query, std::string_view& ref, std::vector& q_res, std::vector& r_res) { + q_res.clear(); + r_res.clear(); + int qp = alignment.query_begin; + int rp = alignment.ref_begin; + for (int c=0;c match; + std::optional mismatch; + std::optional gap_open; + std::optional gap_extend; + } scores; +}; + +struct sw_batch_opts { + //minscore +}; + +struct sw_stats { + std::size_t dev_mem_usage; + std::size_t host_mem_usage; +}; + +template +struct sw_handle; + +template +sw_handle* sw_create(const sw_opts& opts); + +template +sw_handle* sw_create(); + +template +void sw_destroy(sw_handle* hdl); + +template +std::vector sw_batch(sw_handle* hdl, + const std::vector& query, + const std::vector& target, + const sw_batch_opts& opts); + +template +sw_stats sw_inspect(sw_handle* hdl); + +} + +#endif diff --git a/gpu-sw/sw.cu b/gpu-sw/sw.cu new file mode 100644 index 0000000..a26424e --- /dev/null +++ b/gpu-sw/sw.cu @@ -0,0 +1,361 @@ +#include +#include +#include +#include + +#include + +#include "cuda-utils.hh" +#include "sw.hh" +#include "core.hh" + + + +#include + +using check_cuda=eccl::check_cuda; + +template +__global__ void align_kernel( + /* Req/Reuse/Share/Combine Input Output index */ + /* *!!! * ! */const unsigned int work_size, + /* *!!! * ! */const unsigned int max_query_len, + const unsigned int band_size, + /* ***! * ! */const signed short* subst_mat, + /* *..! * ! */const unsigned int* target_buf, + /* *..! * ! work */const ulong2* target_se, + /* *!!! * ! */const unsigned int* query_buf, + /* *!!! * ! work */const ulong2* query_se, + /* *!!! ! ! */int2* tmp_scores, + /* *!!! ! ! */unsigned int* tmp_traceback_buf, + /* *!!! * ! thr */const ulong* tmp_traceback_s, + /* *!!! ! * */eccl::sw_result* results, + /* *!!! ! * */unsigned int* cigar_buf, + /* *!!! * ! work */const unsigned long* cigar_s, + int) { + + const unsigned int thr_id=threadIdx.x+blockIdx.x*blockDim.x; + const unsigned int debug=0; + //const unsigned int thr_num=blockDim.x*gridDim.x; + constexpr uint2 tile_size={8, 8*2}; + const int gap_ext=subst_mat[(Type==eccl::seq_type::xna?8*8:32*32)+1]; + const int gap_open=subst_mat[(Type==eccl::seq_type::xna?8*8:32*32)+0]-gap_ext; + + if(thr_id>=work_size) + return; + const auto work_id=thr_id; + + auto max_prev_row=&tmp_scores[thr_id*max_query_len]; + auto t_offset=tmp_traceback_s[thr_id]; + [[maybe_unused]] unsigned int t_size=tmp_traceback_s[thr_id+1]-t_offset; + assert(t_offset%4==0); + auto mat2=&tmp_traceback_buf[t_offset]; + + struct { + unsigned int ii; + unsigned int jj; + int score; + unsigned int i0, j0; + } _top; + auto a_offset=query_se[work_id].x; + assert(a_offsetuint2 { + unsigned int lb=0, ub=a_size; + if(a_sizehalf_band_width+j1) + ub=half_band_width+j1; + if(a_size+j0+1>b_size+half_band_width) + lb=a_size+j0+1-b_size-half_band_width; + } else { + // similarly + if(j0+1>half_band_width) + lb=j0-half_band_width+1; + if(half_band_width+j1unsigned int { + /* each c* can be {0, 1, 2}; thus 27 possible cases; but only 8 casses can occur. + encode the 8 cases with 4 bits to save space. + */ + auto c=(c1*3u+c2)*3u+c3; + auto ec=(c*3)%32; + assert(ec<16); + assert(decode_conds(ec)==c); + return ec; + }; + + for(unsigned int i=0; i b_cache[tile_size.y]; + for(unsigned int jj=0; jj(target_buf, + b_offset+j+jj):eccl::code{}; + max_prev_col[jj]=make_int2(0, gap_open); + } + auto qrange=get_query_range(j, j+tile_size.y-1); + qrange.x=qrange.x/tile_size.x*tile_size.x; + if(debug) + printf("@%u i range: %u %u %u\n", thr_id, j, qrange.x, qrange.y); + for(unsigned int i=qrange.x; i a_cache[tile_size.x]; + for(unsigned int ii=0; ii(query_buf, + a_offset+i+ii):eccl::code{}; + } +// per-tile +for(unsigned int ii=0; ii bp{a_cache[ii], b_cache[jj]}; + auto bp_score=subst_mat[bp.value]; + score.x=max(max_prev_row_i_ii.x+bp_score, 0); + score.y=max_prev_row_i_ii.y+gap_ext; + score.z=max_prev_col[jj].y+gap_ext; + + max_prev_row_i_ii.x=max_prev_col[jj].x; + /* comparison conditions for backtrack */ + unsigned char cond1=0, cond2=0, cond3=0; + max_prev_col[jj].x=score.x; + if(max_prev_col[jj].x_top.score) { + _top={i+ii, j+jj, score.x}; + } + auto& tb_tmp=tb_batch[jj/8+(tile_size.y/8)*ii]; + tb_tmp=(tb_tmp<<4)|encode_conds(cond1, cond2, cond3); + } + max_prev_row[i+ii]=max_prev_row_i_ii; +} + auto ptr=&mat2[j*max_query_len/8+i*(tile_size.y/8)]; + for(unsigned int jj=0; jj(tb_batch)[jj]; + } + } + } + + auto tb_offset=cigar_s[work_id]; + unsigned int tb_size=cigar_s[work_id+1]-tb_offset; + //assert(tb_size>=a_size/4+1); + //assert(tb_size>=48+1); + auto tb_ops=&cigar_buf[tb_offset]; + + unsigned int tb_i=0; + unsigned int i=_top.ii; + unsigned int j=_top.jj; + +#if 1 + enum { + Mat, + Ins, + Del, + End, + } st=Mat; + int s=1; + int sss=_top.score; + unsigned int tmp_cnt{0}; + unsigned char tmp_op{0b100000}; + unsigned int tmp_cache{0}; + auto add_op=[&tmp_cnt,&tmp_op,&tb_i,&tmp_cache,tb_ops,tb_size](unsigned char op) { + if(op==tmp_op) { + ++tmp_cnt; + } else { + constexpr unsigned int per_slot=(0xffu>>2)+1; + while(tmp_cnt>0) { + unsigned int n=min(tmp_cnt, per_slot); + if(tb_i/4+1unsigned int { + auto jj=j%tile_size.y; + j-=jj; + return (mat2[(j*max_query_len+i*tile_size.y+jj)/8]>>((7-jj%8)*4))&0xf; + }; + do { + if(debug) + printf("@%u %u %u %u %u %d\n", thr_id, i, j, eccl::get_code(query_buf, i).value, eccl::get_code(target_buf, j).value, sss); + unsigned int cond; + switch(st) { + case Mat: + { + auto a=eccl::get_code(query_buf, + a_offset+i); + auto b=eccl::get_code(target_buf, + b_offset+j); + s=subst_mat[eccl::pair{a, b}.value]; + } + if(s>0) { + add_op(0b11); + //assert(ss>0); + } else { + add_op(0b00); + //assert(ss<=0); + } + sss-=s; + if(sss==0) { + st=End; + break; + } + assert(i>0); + assert(j>0); + cond=decode_conds(elem2(i-1, j-1))/9; + switch(cond) { + case 0: + break; + case 1: + st=Del; + break; + case 2: + st=Ins; + break; + default: + assert(0); + } + --i; + --j; + break; + case Del: + add_op(0b10); + sss-=gap_ext; + cond=j>0?decode_conds(elem2(i, j-1))/3%3:0; + switch(cond) { + case 0: + sss-=gap_open; + st=Mat; + break; + case 1: + break; + case 2: + sss-=gap_open; + st=Ins; + break; + default: + assert(0); + } + --j; + break; + case Ins: + add_op(0b01); + sss-=gap_ext; + cond=i>0?decode_conds(elem2(i-1, j))%3:0; + switch(cond) { + case 0: + sss-=gap_open; + st=Mat; + break; + case 1: + sss-=gap_open; + st=Del; + break; + case 2: + break; + default: + assert(0); + } + --i; + break; + case End: + assert(0); + } + } while(st!=End); + add_op(0b100000); + if((tb_i%4)>0) + tb_ops[tb_i/4+1]=tmp_cache; + assert((tb_i+3)/4 +void eccl::sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args) { + + align_kernel<<>>( + args.work_size, args.max_query_len, + opts.band_size, + opts.subst_mat, + args.target_buf, args.target_se, + args.query_buf, args.query_se, + args.tmp_scores, + args.tmp_traceback_buf, args.tmp_traceback_s, + args.results, + args.cigar_buf, args.cigar_s, + 0); + +} + +template void eccl::sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args); +template void eccl::sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args); + diff --git a/gpu-sw/sw.hh b/gpu-sw/sw.hh new file mode 100644 index 0000000..e46c98c --- /dev/null +++ b/gpu-sw/sw.hh @@ -0,0 +1,224 @@ +#include +// +#include "core.hh" +#include "seqs.hh" +#include "cuda-utils.hh" +#include + +/*! sw alg. + * + * usage: + * + * 1. use eccl::seqs_builder to build a seq batch (in host memory). + * call .push_back(seq) to add a sequence. + * construct eccl::seqs after sequences are added. + * 2. use eccl::sw_buffers to manage device buffers. + * call .init(band_size) once to init. + * call [res, cigar_buf, cigar_s]=.run(query, target) to run. + * call eccl::format_sw_result() to format output to a buffer. + * eccl::sw_buffers can be reused. + * + * see check_xna() in sw-test.cc for simple examples. + * + * NOTES + * - correct compared to gasal2. + * - just use the alternative API (eccl::sw_buffers). + * not removing existing async stuff. + * - why no batch? + * input is not a single seq, not all the seqs, then it *is* a batch. + * - pass nullptr if no stream is needed. + * - the sw part may need further work, i'll do code clean up after that. + * print and time related calls are silenced. + * - non-relevant (and relevant) codes are in a separate dir. + * just leave them there and use the API. + * + * + * some rough benchmarks: + * + * gasal2 no output: total 0m4.430s, kernel 1.03s + * gasal2: total 0m12.239s, kernel 8.77s + * eccl: total 0m4.289s, kernel 2.4s + * eccl banded(16): total 0m3.632s, kernel 1.8s + * + * eccl tests are run asynchronously. + * there are noticeable overheads in data copying, i'll improve that later. + * + */ + +namespace eccl { + +struct sw_result { + int score; + uint2 query_se; + uint2 target_se; +}; + +/*! global/fixed stuff */ +struct sw_gpu_opts { + unsigned int num_blocks; + unsigned int num_threads; + unsigned int band_size; + dev_ptr subst_mat; +}; + +/*! per-batch stuff */ +struct sw_gpu_args { + cudaStream_t stream; + unsigned int work_size; + unsigned int max_query_len; + dev_ptr target_buf; + dev_ptr target_se; + dev_ptr query_buf; + dev_ptr query_se; + dev_ptr tmp_scores; + dev_ptr tmp_traceback_buf; + dev_ptr tmp_traceback_s; + dev_ptr results; + dev_ptr cigar_buf; + dev_ptr cigar_s; +}; + +inline unsigned int estimate_cigar_size(unsigned int query_len, unsigned int target_len) { + // XXX upper bound??? + //return query_len/4; + //return 48; + //return 20; + return 48; + // (450)/16 == 28 + // no_cigar(flat_traceback): 1.31s + // 8: 1.48s + // 16: 1.45s + // 32: 1.50s + // 64: 1.78s + // 128: 3.0s +} + +template +void sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args); + +template +struct sw_buffers { + unsigned int band_size; + eccl::dev_vector subst_mat; + eccl::dev_vector query_buf; + eccl::dev_vector query_se; + eccl::dev_vector target_buf; + eccl::dev_vector target_se; + eccl::dev_vector tmp_scores; + eccl::dev_vector tmp_traceback_s; + eccl::dev_vector tmp_traceback_buf; + eccl::dev_vector results; + eccl::dev_vector cigar_s; + eccl::dev_vector cigar_buf; + + /*! results. use after run */ + eccl::host_vector results_host; + eccl::host_vector cigar_buf_host; + eccl::host_vector cigar_s_host; + + template + void init(unsigned int band_size, const ScoreOpts& scores) { + this->band_size=band_size; + eccl::host_vector subst_mat_host; + eccl::fill_score_table(subst_mat_host, scores); + subst_mat.malloc_h2d(subst_mat_host, nullptr); + } + void init(unsigned int band_size) { + this->band_size=band_size; + eccl::host_vector subst_mat_host; + eccl::fill_score_table(subst_mat_host); + subst_mat.malloc_h2d(subst_mat_host, nullptr); + } + inline void run(const seqs& query, const seqs& target); +}; + +template +inline void sw_buffers::run(const seqs& query, const seqs& target) { + sw_gpu_opts opts; + opts.num_blocks=40; + opts.num_threads=128; + opts.band_size=this->band_size; + opts.subst_mat=this->subst_mat; + + sw_gpu_args args; + args.stream=nullptr; + assert(query.size()==target.size()); + args.work_size=query.size(); + args.max_query_len=eccl::padded_len(query.max_seq_len()); + + eccl::host_vector query_buf_host; + query_buf_host.insert(query_buf_host.end(), query.buf(), query.buf()+query.buf_size()); + query_buf.try_malloc_h2d(query_buf_host, nullptr); + eccl::host_vector query_se_host; + for(unsigned int i=0; i target_buf_host; + target_buf_host.insert(target_buf_host.end(), target.buf(), target.buf()+target.buf_size()); + target_buf.try_malloc_h2d(target_buf_host, nullptr); + eccl::host_vector target_se_host; + for(unsigned int i=0; i tmp_traceback_s_host; + tmp_traceback_s_host.push_back(0); + for(unsigned int i=0; i(n, s); + } + n=args.max_query_len*((n+31)/32)*4; + tmp_traceback_s_host.push_back(tmp_traceback_s_host.back()+n); + } + tmp_traceback_s.try_malloc_h2d(tmp_traceback_s_host, nullptr); + tmp_traceback_buf.try_malloc(tmp_traceback_s_host.back()); + + results.try_malloc(query.size()); + + cigar_s_host.clear(); + cigar_s_host.push_back(0); + for(unsigned int i=0; i(opts, std::move(args)); + cudaGetLastError(), eccl::check_cuda{"align"}; + cigar_buf_host.resize(cigar_buf.size()); + cigar_buf.d2h(cigar_buf_host, nullptr); + results_host.resize(results.size()); + results.d2h(results_host, nullptr); + cudaStreamSynchronize(nullptr), check_cuda{"sync stream"}; +}; + +std::string_view format_sw_result(std::array& buf, + const sw_result& align, + std::string_view name_query, + std::string_view name_target, + const unsigned int* cigar, + unsigned int cigar_len); + +} diff --git a/include/smith.h b/include/smith.h index 32a55de..3643020 100644 --- a/include/smith.h +++ b/include/smith.h @@ -9,5 +9,7 @@ using namespace std; // void banded_smith_waterman(const char *q, const char *c, vector& q_idxs, vector& q_lens, vector& diags, size_t c_len, size_t num_task, vector &res, ThreadPool* pool, vector>& rs); void smith_waterman_kernel(const int idx, SWResult *res, SWTasks* sw_task); +void gpu_SW( vector &res,SWTasks* sw_task); -void gasal_run(SWTasks tasks, vector res[][NUM_STREAM],const char* q_dev, const char* s_dev, int num_g, int span); \ No newline at end of file +// void gasal_run(SWTasks tasks, vector res[][NUM_STREAM],const char* q_dev, int num_g, int span); +void total_gpu_SW(SWTasks tasks, vector res[][NUM_STREAM],const char* query,const char* target,int num_g, int span); \ No newline at end of file diff --git a/include/util.h b/include/util.h index 1204b3b..8349a10 100644 --- a/include/util.h +++ b/include/util.h @@ -31,6 +31,7 @@ #include "omp.h" // #include "boost/asio.hpp" +#define MASK5(v) (v & 0b11111) extern ThreadPool* pool; // extern StripedSmithWaterman::Aligner aligner; // extern StripedSmithWaterman::Filter filter; @@ -196,6 +197,8 @@ int check_db(const char *db, size_t &max_size, size_t& total_size); void load_seq(string db, int num, char *&str, size_t &len); void proceed_result(vector *res_d, vector &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t* s_offsets, const size_t* sn_offsets, const size_t s_num ,size_t total_db_size); +void proceed_result_glf(vector *res_d, vector &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t* s_offsets, const size_t* sn_offsets, const size_t s_num ,size_t total_db_size); + char get_char(const char *s, size_t offset); void get_arg(const char* name, int& v, int default_v); diff --git a/src/banded_smith_cpu.cpp b/src/banded_smith_cpu.cpp index 5ff236b..8fbe471 100644 --- a/src/banded_smith_cpu.cpp +++ b/src/banded_smith_cpu.cpp @@ -1,10 +1,12 @@ #include "smith.h" +#include "../gpu-sw/sw-lib.cc" +#include "../gpu-sw/core.hh" // #include // #include #define max2(m, n) ((m) > (n) ? (m) : (n)) #define max3(m, n, p) ((m) > (n) ? ((m) > (p) ? (m) : (p)) : ((n) > (p) ? (n) : (p))) -#define MASK5(v) (v & 0b11111) +// #define MASK5(v) (v & 0b11111) #define END 0 #define TOP 1 #define LEFT 2 @@ -27,6 +29,28 @@ inline char get_char(const char *s, size_t offset) return MASK5((unsigned)((*((uint16_t *)&(s[n_bit >> 3]))) >> (n_bit & 7))); } +string get_substr(const char* s, int begin_p, int end_p) { + + char tmp[end_p - begin_p] = {0}; + for (size_t i = begin_p; i < end_p; i++) + { + tmp[i - begin_p] = char(s[i] + 65); + } + string ss(tmp, end_p - begin_p); + return ss; +} + +string get_substr_T(const char* s, int begin_p, int end_p) { + + string ss; + for (size_t i = begin_p; i < end_p; i++) + { + if(get_char(s,i) == END_SIGNAL) continue; + ss += get_char(s,i) + 65; + } + return ss; +} + void smith_waterman_kernel(const int idx, SWResult *res, SWTasks *sw_task) { const char *q = sw_task->q; @@ -290,7 +314,241 @@ void smith_waterman_kernel(const int idx, SWResult *res, SWTasks *sw_task) res->s_ori = s_ori; res->match = match; } +} + +char* str_get_substr(const char* str, int begin, int end) { + int len = end - begin ; + char* substr = new char[len + 1]; + for(int i = 0; i < len; ++i) substr[i] = str[begin+i]; + // strncpy(substr, str + begin, len); + substr[len] = '\0'; + return substr; +} + +void gpu_SW(vector &res,SWTasks *sw_task) +{ + // 两个序列 + const char *query = sw_task->q; + const char *target = sw_task->c; + size_t c_len = sw_task->c_len; + + + struct glf::sw_handle* hdl; //没有指定dna,或蛋白,创造handle的选项可能需要指定,ctype 模板参数.可能可以 + // 有些选项,DNA 蛋白的打分矩阵, + // 蛋白的话,cigar的值也需要调整,eg. score的正负和match和mismatch + // 需求不一样 + struct glf::sw_batch_opts batch_opts; + hdl = glf::sw_create(); + + std::vector q; + std::vector c; + std::vector qs; + std::vector cs; + + // std::vector cur; + + + for(int idx = 0; idx < sw_task->num_task; ++idx){ + + size_t q_idx = sw_task->q_idxs[idx]; + size_t n = sw_task->q_lens[idx]; + size_t diag = sw_task->diags[idx]; // pos of c at end of q + + int64_t c_begin = (int64_t)diag - band_width - n; + size_t c_end = diag + band_width; + c_begin = c_begin >= 0 ? c_begin : 0; + c_end = c_end <= c_len ? c_end : c_len; + if (has_must_include) + { + if (!check_include(target, c_begin, c_end)) + { + res[idx].report = false; + return; + } + } + size_t width = (n + band_width) << 1; + size_t height = band_width + 2; + // cout << "\nQUERY\n"; + // for(int it = 0; it< n; ++it)cout << char(query[q_idx+it]+65); + string _q = get_substr(query,q_idx,q_idx+n); + string _c = get_substr_T(target,c_begin,c_end); + qs.push_back(_q); + cs.push_back(_c); + } + for(int i = 0; i < qs.size(); ++i){ + q.push_back(qs[i]); + c.push_back(cs[i]); + } + std::vector cur = glf::sw_batch(hdl,q,c, batch_opts); + + for(int k = 0; k < cur.size(); k++){ + cigar_to_string(cur[k],q[k], c[k], res[k].q_res, res[k].s_res); + // cout << "\nq_res\n"; + // for (int i=0;iq_idxs[k]; + size_t n = sw_task->q_lens[k]; + size_t diag = sw_task->diags[k]; // pos of c at end of q + + int64_t c_begin = (int64_t)diag - band_width - n; + size_t c_end = diag + band_width; + c_begin = c_begin >= 0 ? c_begin : 0; + c_end = c_end <= c_len ? c_end : c_len; + + res[k].begin_q = cur[k].query_begin + q_idx; + res[k].begin_s = cur[k].ref_begin + c_begin; + res[k].end_q = cur[k].query_end + q_idx; + res[k].end_s = cur[k].ref_end + c_begin; + + size_t len = res[k].s_res.size(); + res[k].align_length = len; + res[k].score = cur[k].sw_score; + res[k].bitscore = (E_lambda * res[k].score - log(E_k)) / (0.69314718055995); + char s[len + 1] = {0}; + char q_seq[len + 1] = {0}; + char s_ori[len + 1] = {0}; + char match[len + 1] = {0}; + int s_ori_len = 0; + res[k].gap_open = 0; + res[k].gaps = 0; + bool ga = false; + for (int i = 0; i < len; i++) + { + if (res[k].s_res[i] != (size_t)(-1)) + { + ga = false; + s[i] = c[k][res[k].s_res[i]]; + // s[i] = get_char(c, res[k].s_res[i]) + 65; + if (s[i] == 95) + s[i] = '*'; + s_ori[s_ori_len++] = s[i]; + } + else + { + s[i] = '-'; + res[k].gaps++; + if (!ga) + { + ga = true; + res[k].gap_open++; + } + } + } + + if (has_must_include) + { + string s_ori_str(s_ori, len); + if (!check_include(s_ori_str)) + { + res[k].report = false; + return; + } + } + ga = false; + for (int i = 0; i < len; i++) + { + // cout< 0 && q_seq[i]!='-' && s[i]!='-') + { + res[k].positive++; + match[i]='+'; + } + if (q_seq[i] != s[i]) + { + res[k].mismatch++; + } + else + { + match[i]=q_seq[i]; + } + + } + res[k].n_identity = res[k].align_length - res[k].mismatch; + res[k].p_identity = (1 - (double)res[k].mismatch / res[k].align_length) * 100; + // todo + res[k].s_res[0] += c_begin; + if (detailed_alignment) + { + res[k].q = q_seq; + res[k].s = s; + res[k].s_ori = s_ori; + res[k].match = match; + } + } +} + +void total_gpu_SW(SWTasks tasks, vector res[][NUM_STREAM],\ + const char* query,const char* target,int num_g, int span) +{ + + size_t n = tasks.num_task; + // assert(n == num_g * NUM_STREAM); + + // get 每一个query与 target + struct glf::sw_handle* hdl; + struct glf::sw_batch_opts batch_opts; + hdl = glf::sw_create(); + std::vector q; + std::vector t; + std::vector qs; + std::vector cs; + + for(int i = 0; i < n; ++i){ + // TODO check split task + size_t q_begin = tasks.q_idxs[i]; + size_t q_len = tasks.q_lens[i]; + int64_t t_begin = (int64_t)tasks.diags[i] - band_width - tasks.q_lens[i]; + size_t t_end = tasks.diags[i] + band_width; + + // cout << "Q\t" << q_begin << "\t" << q_len + q_begin << "\n"; + // cout << "T\t" << t_begin << "\t" << t_end << "\n"; + + string _q = get_substr(query,q_begin,q_begin+q_len); + string _c = get_substr_T(target,t_begin,t_end); + qs.push_back(_q); + cs.push_back(_c); + } + for(int i = 0; i < qs.size(); ++i){ + q.push_back(qs[i]); + t.push_back(cs[i]); + } + std::vector cur = glf::sw_batch(hdl,q,t, batch_opts); + } // void banded_smith_waterman(const char *q, const char *c, vector &q_idxs, vector &q_lens, vector &diags, size_t c_len, size_t num_task, vector &res, ThreadPool *pool, vector> &rs) diff --git a/src/blastp.cu b/src/blastp.cu index 48112ea..4a4232b 100644 --- a/src/blastp.cu +++ b/src/blastp.cu @@ -142,7 +142,9 @@ __global__ void filter_kernel(KeyValue *ht, Task *tasks, uint32_t *num_task, uin } #ifdef USE_GPU_SW -void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task, QueryGroup &q_group, size_t s_length, int stream_id, vector &res, SWTasks &sw_task) +void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task,\ + QueryGroup &q_group, size_t s_length, int stream_id, \ + vector &res, SWTasks &sw_task) { cudaEventSynchronize(stream); mu2.lock(); @@ -187,8 +189,44 @@ void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task, Qu mu2.unlock(); } +#elif GLF_GPU_SW +void handle_results(cudaEvent_t &stream, const char *query, const char *subj, \ + Task *task_host, uint32_t *num_task, QueryGroup &q_group, \ + size_t s_length, int stream_id, vector &res,\ + SWTasks &sw_task, ThreadPool *pool, vector> &rs) +{ + cudaEventSynchronize(stream); + cout << "="; + res.clear(); + res.resize(*num_task); + sw_task.q = query; + sw_task.c = subj; + sw_task.c_len = s_length; + sw_task.q_idxs.resize(*num_task); + sw_task.q_lens.resize(*num_task); + sw_task.diags.resize(*num_task); + Task *t_begin = task_host; + sw_task.num_task = *num_task; +#pragma omp parallel for + for (int i = 0; i < *num_task; i++) + { + Task &kv = *(t_begin + i); + sw_task.q_idxs[i]=q_group.offset[kv.q_id]; + sw_task.q_lens[i]=q_group.length[kv.q_id]; + sw_task.diags[i]=kv.key; + res[i].num_q = kv.q_id; + } + + gpu_SW(res,&sw_task); + +} + + #else -void handle_results(cudaEvent_t &stream, const char *query, const char *subj, Task *task_host, uint32_t *num_task, QueryGroup &q_group, size_t s_length, int stream_id, vector &res, SWTasks &sw_task, ThreadPool *pool, vector> &rs) +void handle_results(cudaEvent_t &stream, const char *query, const char *subj, \ + Task *task_host, uint32_t *num_task, QueryGroup &q_group, \ + size_t s_length, int stream_id, vector &res,\ + SWTasks &sw_task, ThreadPool *pool, vector> &rs) { cudaEventSynchronize(stream); cout << "="; @@ -223,7 +261,10 @@ void handle_results(cudaEvent_t &stream, const char *query, const char *subj, Ta } #endif -void search_db_batch(const char *query, char *subj[], vector &q_groups, size_t s_length[], Task *task_host[][NUM_STREAM], uint32_t *task_num_host[][NUM_STREAM], size_t max_hashtable_capacity, uint32_t max_n_query, uint32_t total_len_query, string db_name, uint32_t db_num, vector *res, size_t total_db_size, TimeProfile &time_prof) +void search_db_batch(const char *query, char *subj[], vector &q_groups,\ + size_t s_length[], Task *task_host[][NUM_STREAM], uint32_t *task_num_host[][NUM_STREAM], \ + size_t max_hashtable_capacity, uint32_t max_n_query, uint32_t total_len_query, \ + string db_name, uint32_t db_num, vector *res, size_t total_db_size, TimeProfile &time_prof) { struct timeval t_start, t_end, tt_start; @@ -380,9 +421,23 @@ void search_db_batch(const char *query, char *subj[], vector &q_grou CUDA_CALL(cudaMemcpyAsync(task_num_host[g_idx][s], task_num_dev[s], sizeof(uint32_t), cudaMemcpyDeviceToHost, streams[s])); CUDA_CALL(cudaEventRecord(seeding_finished[g_idx][s])); #ifdef USE_GPU_SW - result_threads[g_idx][s] = thread(handle_results, ref(seeding_finished[g_idx][s]), task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), s_length[s], s, ref(res_s[g][s]), ref(sw_tasks_total)); + result_threads[g_idx][s] = thread(handle_results, \ + ref(seeding_finished[g_idx][s]),\ + task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \ + s_length[s], s, ref(res_s[g][s]), \ + ref(sw_tasks_total)); +#elif GLF_GPU_SW + result_threads[g_idx][s] = thread(handle_results, \ + ref(seeding_finished[g_idx][s]), query, subj[s], \ + task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \ + s_length[s], s, ref(res_s[g][s]),\ + ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s])); #else - result_threads[g_idx][s] = thread(handle_results, ref(seeding_finished[g_idx][s]), query, subj[s], task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), s_length[s], s, ref(res_s[g][s]), ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s])); + result_threads[g_idx][s] = thread(handle_results, \ + ref(seeding_finished[g_idx][s]), query, subj[s], \ + task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \ + s_length[s], s, ref(res_s[g][s]),\ + ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s])); #endif } s_begin += s_length_stream_byte; @@ -441,10 +496,21 @@ void search_db_batch(const char *query, char *subj[], vector &q_grou CUDA_CALL(cudaEventDestroy(seeding_finished[g_idx][s])); hsp_count += res_s[g][s].size(); cout << "="; -#ifndef USE_GPU_SW +#ifdef GLF_GPU_SW for (auto &r : rs[g_idx][s]) r.get(); - proceed_result(res, res_s[g][s], query, subj[s], q_groups[g], s_name[s], s_offsets[s], sn_offsets[s], s_num[s], total_db_size); + proceed_result_glf(res, res_s[g][s], query, subj[s], \ + q_groups[g], s_name[s], s_offsets[s], \ + sn_offsets[s], s_num[s], total_db_size); + cout << "="; +#elif defined(USE_GPU_SW) + +#else + for (auto &r : rs[g_idx][s]) + r.get(); + proceed_result(res, res_s[g][s], query, subj[s], \ + q_groups[g], s_name[s], s_offsets[s], \ + sn_offsets[s], s_num[s], total_db_size); cout << "="; #endif } @@ -498,7 +564,22 @@ void search_db_batch(const char *query, char *subj[], vector &q_grou sw_tasks_total.c_all[s] = subj[s]; sw_tasks_total.c_offs[s] = s==0? 0: sw_tasks_total.c_offs[s-1] +s_length[s-1]; } - gasal_run(sw_tasks_total, res_s, query_dev, subj_dev, q_groups.size(), band_width); + + // 计数单位byte + size_t total_len_target = 0; + for(int i = 0; i < NUM_STREAM; ++i){ + total_len_target += s_length[i]*5/8; + } + + char* target = (char*)malloc(sizeof(char) * total_len_target); + CUDA_CALL(cudaMemcpy(target, subj_dev, total_len_target, cudaMemcpyDeviceToHost)); + for(int i = 0; i < total_len_target; ++i) { + char ch_tar = get_char(target,i) ; + target[i] = static_cast(ch_tar+ 65); + } + total_gpu_SW(sw_tasks_total,res_s,query,target,q_groups.size(),band_width); + // gasal_run(sw_tasks_total, res_s, query_dev, subj_dev, q_groups.size(), band_width); + cout << "Done.\t["; gettimeofday(&t_end, NULL); diff --git a/src/createDB.cpp b/src/createDB.cpp index 5e0d909..c1cce9a 100644 --- a/src/createDB.cpp +++ b/src/createDB.cpp @@ -1,7 +1,7 @@ #include "params.h" #include "util.h" -#define MASK5(v) (v & 0b11111) + #define MAX_QUERY_LENGTH 10000 diff --git a/src/main.cpp b/src/main.cpp index a7b7f4a..bdbfbb4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -91,6 +91,8 @@ void show_args() cout << "Alignment Type" << "\t"; #ifdef USE_GPU_SW cout << "GPU SW" << endl; +#elif defined(GLF_GPU_SW) + cout << "GLF SW" << endl; #else cout << "CPU BANDED SW" << endl; #endif diff --git a/src/util.cpp b/src/util.cpp index 2c38a33..6497746 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -1,6 +1,6 @@ #include "util.h" -#define MASK5(v) (v & 0b11111) +// #define MASK5(v) (v & 0b11111) ArgumentParser arg_parser; int seed_length; @@ -240,8 +240,10 @@ void load_seq(string db, int num, char *&str, size_t &len) str = str1; // cout << "Load db " << db << num << " successful. Total length = " << len << endl; } - -void proceed_result(vector *res_d, vector &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t *s_offsets, const size_t *sn_offsets, const size_t s_num, size_t total_db_size) +// TODO +void proceed_result(vector *res_d, vector &res_t, const char *query, const char *subj, \ + QueryGroup &q_group, const char *s_name, const size_t *s_offsets,\ + const size_t *sn_offsets, const size_t s_num, size_t total_db_size) { set modified_resq; @@ -395,6 +397,97 @@ void proceed_result(vector *res_d, vector &res_t, const char // cout< *res_d, vector &res_t, \ + const char *query, const char *subj, QueryGroup &q_group,\ + const char *s_name, const size_t *s_offsets, const size_t *sn_offsets, const size_t s_num, \ + size_t total_db_size) +{ + set modified_resq; + size_t n_subj = s_num / sizeof(size_t); +// get res_t score,bitscore->evalue +// #pragma omp parallel for + for (int t = 0; t < res_t.size(); t++) + { + if (!res_t[t].report) + continue; + double score = res_t[t].score; + if (score >= min_score) + { + int num_s = get_pos(res_t[t].s_res[0], s_offsets, n_subj); + int num_q = res_t[t].num_q; + res_t[t].num_q = q_group.id[num_q]; + uint32_t s_len = s_offsets[num_s + 1] - s_offsets[num_s] - 1; + + double e_value = total_db_size * (q_group.length[num_q]) * pow(2, -res_t[t].bitscore); + // cout << "\nEvalue " << e_value << endl; + if (e_value <= max_evalue) + { + res_t[t].e_value = e_value; + res_t[t].sn_offset = sn_offsets[num_s]; + res_t[t].sn_len = sn_offsets[num_s + 1] - sn_offsets[num_s] - 1; + string sn(s_name + res_t[t].sn_offset, res_t[t].sn_len); + res_t[t].s_name = sn; + // size_t len = res_t[t].align_length; + res_t[t].s_len = s_len; + } + else{ + res_t[t].report=false; + res_t[t].q.clear(); + res_t[t].s.clear(); + res_t[t].s_ori.clear(); + res_t[t].match.clear(); + } + } + else + { + res_t[t].report=false; + res_t[t].q.clear(); + res_t[t].s.clear(); + res_t[t].s_ori.clear(); + res_t[t].match.clear(); + } + res_t[t].s_res.clear(); + res_t[t].q_res.clear(); + } + for (int t = 0; t < res_t.size(); t++) + { + if (!res_t[t].report) + continue; + // SWResult& res = res_t[t]; + res_d[res_t[t].num_q].emplace_back(res_t[t]); + push_heap(res_d[res_t[t].num_q].begin(), res_d[res_t[t].num_q].end(), [&](const SWResult &sw1, const SWResult &sw2) + { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); }); + // res_tmp.emplace_back(res); + modified_resq.insert(res_t[t].num_q); + } + + vector modified_resq_list; + modified_resq_list.assign(modified_resq.begin(), modified_resq.end()); +#pragma omp parallel for + for (int i=0;i &res = res_d[mq]; + + // sort(res.begin(), res.end(), [&](const DisplayResult &sw1, const DisplayResult &sw2) + // { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); }); + if (max_display_align > 0 && res.size() > max_display_align) + { + int pop_size = res.size() - max_display_align; + for (int i = 0; i < pop_size; i++) + { + pop_heap(res.begin(), res.end() - i, [&](const SWResult &sw1, const SWResult &sw2) + { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); }); + // res.pop_back(); + } + // res.erase(res.begin() + max_display_align, res.end()); + res.resize(max_display_align); + assert(res.size() == max_display_align); + } + } + // cout<