From aa86e001e05467e4016edfe120fd61a78cdcfb4f Mon Sep 17 00:00:00 2001
From: LiuKaixuan <liaukx@gmail.com>
Date: Wed, 6 Mar 2024 11:24:38 +0800
Subject: [PATCH] naive-gpu-sw

---
 CMakeLists.txt           |  27 ++-
 gpu-sw/CMakeLists.txt    |  11 ++
 gpu-sw/core.hh           | 165 ++++++++++++++++++
 gpu-sw/cuda-utils.cc     |  29 ++++
 gpu-sw/cuda-utils.hh     | 258 ++++++++++++++++++++++++++++
 gpu-sw/mats.cc           |  61 +++++++
 gpu-sw/seqs.cc           | 286 +++++++++++++++++++++++++++++++
 gpu-sw/seqs.hh           | 307 +++++++++++++++++++++++++++++++++
 gpu-sw/sw-lib.cc         | 292 +++++++++++++++++++++++++++++++
 gpu-sw/sw-lib.hh         | 146 ++++++++++++++++
 gpu-sw/sw.cu             | 361 +++++++++++++++++++++++++++++++++++++++
 gpu-sw/sw.hh             | 224 ++++++++++++++++++++++++
 include/smith.h          |   4 +-
 include/util.h           |   3 +
 src/banded_smith_cpu.cpp | 260 +++++++++++++++++++++++++++-
 src/blastp.cu            |  97 ++++++++++-
 src/createDB.cpp         |   2 +-
 src/main.cpp             |   2 +
 src/util.cpp             |  99 ++++++++++-
 19 files changed, 2618 insertions(+), 16 deletions(-)
 create mode 100644 gpu-sw/CMakeLists.txt
 create mode 100644 gpu-sw/core.hh
 create mode 100644 gpu-sw/cuda-utils.cc
 create mode 100644 gpu-sw/cuda-utils.hh
 create mode 100644 gpu-sw/mats.cc
 create mode 100644 gpu-sw/seqs.cc
 create mode 100644 gpu-sw/seqs.hh
 create mode 100644 gpu-sw/sw-lib.cc
 create mode 100644 gpu-sw/sw-lib.hh
 create mode 100644 gpu-sw/sw.cu
 create mode 100644 gpu-sw/sw.hh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a521314..808cd07 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,9 +3,22 @@ cmake_minimum_required(VERSION 3.5)
 set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda)
 project(BLASTP LANGUAGES CXX C CUDA)
 
-SET(CMAKE_BUILD_TYPE "Release")
+# SET(CMAKE_BUILD_TYPE "Release")
+SET(CMAKE_BUILD_TYPE DEBUG)
+
+option(GLF_GPU_SW "Use GLF-GPU Smith-waterman" ON)
+if(GLF_GPU_SW)
+    add_definitions(-DGLF_GPU_SW)
+endif()
+
+option(USE_GPU_SW "Use GPU Smith-waterman" OFF)
+if(USE_GPU_SW)
+    add_definitions(-DUSE_GPU_SW)
+endif()
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17 -Xcompiler -pthread")
+
 if (NOT ("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8"))
     message(SEND_ERROR "require 64 bit system")
 endif()
@@ -50,14 +63,24 @@ target_compile_options (util
 		${OpenMP_CXX_FLAGS}
 )
 
+# gpu_sw
+add_subdirectory(./gpu-sw)
+
 add_executable(query src/main.cpp ${SOURCES_SEARCH})
-target_link_libraries(query util)
+
+# find_library(GPU_SW_LIBRARY
+#     PATHS ${CMAKE_BINARY_DIR}/gpu-sw
+# )
+# target_link_libraries(query PRIVATE util ${GPU_SW_LIBRARY})
+target_link_libraries(query PRIVATE util sw-lib)
+# target_link_libraries(query util)
 
 add_executable(createDB src/createDB.cpp)
 
 target_include_directories(query
     PRIVATE 
         ${PROJECT_SOURCE_DIR}/include
+        ${PROJECT_SOURCE_DIR}/gpu-sw
 )
 
 target_include_directories(createDB
diff --git a/gpu-sw/CMakeLists.txt b/gpu-sw/CMakeLists.txt
new file mode 100644
index 0000000..48848ac
--- /dev/null
+++ b/gpu-sw/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(sw-lib STATIC)
+set_property(TARGET sw-lib PROPERTY CUDA_ARCHITECTURES "${GPU_ARCHS}")
+target_sources(sw-lib PRIVATE
+	"sw.cu"
+	"cuda-utils.cc"
+	"mats.cc"
+	"seqs.cc"
+	"sw-lib.cc"
+	)
+target_link_libraries(sw-lib PUBLIC "${CUDA_LIBRARIES}")
+target_include_directories(sw-lib PUBLIC "${CUDA_INCLUDE_DIRS}" ".")
\ No newline at end of file
diff --git a/gpu-sw/core.hh b/gpu-sw/core.hh
new file mode 100644
index 0000000..f1f1779
--- /dev/null
+++ b/gpu-sw/core.hh
@@ -0,0 +1,165 @@
+#ifndef __ECCL_CORE_HH__
+#define __ECCL_CORE_HH__
+
+#include <memory>
+#include <string>
+#include <future>
+#include <cstring>
+#include <ostream>
+#include <cassert>
+
+/*! core types/definitions/functions, cheap to include */
+
+
+#ifdef __CUDACC__
+#define CUDA_HOST __host__
+#define CUDA_DEVICE __device__
+#else
+#define CUDA_HOST
+#define CUDA_DEVICE
+#endif
+
+/*! 10 necleotides per unsigned int (instead of 8) */
+#define ECCL_COMPACT_CODE
+
+namespace eccl {
+
+enum class seq_type {
+	xna, /*! DNA or RNA */
+	//dna,
+	//rna,
+	prot,
+};
+
+template<seq_type Type>
+struct code {
+	unsigned char value{0};
+	constexpr static unsigned int width=(Type==seq_type::xna?3:5);
+#ifndef ECCL_COMPACT_CODE
+	constexpr static unsigned int n_per_word=(Type==seq_type::xna?8:4);
+#else
+	constexpr static unsigned int n_per_word=(Type==seq_type::xna?10:6);
+#endif
+	/*! complement neucleotide */
+	code<Type> operator~() const noexcept {
+		static_assert(Type==seq_type::xna);
+		return code<Type>{static_cast<unsigned char>((~value)&0b111)};
+	}
+};
+using nucleotide=code<seq_type::xna>;
+using amino_acid=code<seq_type::prot>;
+
+template<seq_type Type>
+struct pair {
+	unsigned int value{0};
+	CUDA_HOST CUDA_DEVICE constexpr pair(code<Type> a, code<Type> b) noexcept:
+		value{(static_cast<unsigned int>(a.value)<<a.width)+b.value} { }
+};
+
+template<seq_type Type>
+inline unsigned int padded_len(unsigned int len) noexcept {
+#ifndef ECCL_COMPACT_CODE
+	return Type==seq_type::xna?(len+7)/8*8:(len+3)/4*4;
+#else
+	//return (len+9)/10*10;
+	return Type==seq_type::xna?(len+39)/40*40:(len+23)/24*24;
+#endif
+}
+
+template<eccl::seq_type Type>
+inline CUDA_DEVICE code<Type> get_code(const unsigned int* buf, unsigned long idx);
+template<>
+inline CUDA_DEVICE code<eccl::seq_type::xna> get_code(const unsigned int* buf, unsigned long idx) {
+#ifndef ECCL_COMPACT_CODE
+	auto v=buf[idx/8];
+	return {static_cast<unsigned char>((v>>((7-idx%8)*4))&0x0f)};
+#else
+	auto v=buf[idx/10];
+	return {static_cast<unsigned char>((v>>((9-idx%10)*3))&0b0111)};
+#endif
+};
+template<>
+inline CUDA_DEVICE code<eccl::seq_type::prot> get_code(const unsigned int* buf, unsigned long idx) {
+#ifndef ECCL_COMPACT_CODE
+	auto v=buf[idx/4];
+	return {static_cast<unsigned char>((v>>((3-idx%4)*8))&0x1f)};
+#else
+	auto v=buf[idx/6];
+	return {static_cast<unsigned char>((v>>((5-idx%6)*5))&0b011111)};
+#endif
+};
+
+template<typename T>
+class chunk {
+public:
+	constexpr chunk() noexcept: _p{nullptr}, _s{0} { }
+	explicit chunk(std::size_t size): _p{std::make_unique<T[]>(size)}, _s{size<<1} { }
+
+	explicit operator bool() const noexcept { return _p.get(); }
+	std::size_t size() const noexcept { return _s>>1; }
+	T& operator[](std::size_t i) noexcept { return _p[i]; }
+	const T& operator[](std::size_t i) const noexcept { return _p[i]; }
+	bool eof() const noexcept { return _s&1; }
+
+	void shrink(std::size_t size, bool eof=false) noexcept {
+		assert(size<=(_s>>1));
+		_s=(size<<1)|(eof?1:0);
+	}
+
+private:
+	std::unique_ptr<T[]> _p;
+	/*! use last bit for eof */
+	std::size_t _s;
+};
+
+template<typename T>
+class source {
+public:
+	chunk<T> get() {
+		auto chk=_fut.get();
+		_fut=prepare();
+		return chk;
+	}
+
+protected:
+	constexpr source() noexcept { }
+	void post_ctor() {
+		_fut=prepare();
+	}
+	virtual ~source() { }
+
+	virtual std::future<chunk<T>> prepare() =0;
+private:
+	std::future<chunk<T>> _fut;
+};
+
+
+}
+
+#if __cplusplus < 201703L
+namespace std {
+class string_view {
+public:
+	constexpr string_view(const char* p, std::size_t s) noexcept:
+		_p{p}, _s{s} { }
+	string_view(const char* p):
+		_p{p}, _s{std::strlen(p)} { }
+	std::size_t size() const noexcept { return _s; }
+	const char& operator[](std::size_t i) const noexcept { return _p[i]; }
+private:
+	const char* _p;
+	std::size_t _s;
+};
+inline bool operator!=(const std::string& a, std::string_view b) noexcept {
+	return a.compare(0, a.size(), &b[0], b.size())!=0;
+}
+inline bool operator==(const std::string& a, std::string_view b) noexcept {
+	return !(a!=b);
+}
+inline std::ostream& operator<<(std::ostream& oss, std::string_view b) {
+	return oss.write(&b[0], b.size());
+}
+}
+#endif
+
+#endif
diff --git a/gpu-sw/cuda-utils.cc b/gpu-sw/cuda-utils.cc
new file mode 100644
index 0000000..64dc558
--- /dev/null
+++ b/gpu-sw/cuda-utils.cc
@@ -0,0 +1,29 @@
+#include "cuda-utils.hh"
+
+#include <cstdio>
+#include <iostream>
+
+void eccl::dump_device_info(int device) {
+	struct cudaDeviceProp prop;
+	cudaGetDeviceProperties(&prop, device);
+	printf("%s\n", prop.name);
+	printf("Major revision number:         %d\n", prop.major);
+	printf("Minor revision number:         %d\n", prop.minor);
+	printf("Total global memory:           %zu", prop.totalGlobalMem);
+	printf(" bytes\n");
+	printf("Number of multiprocessors:     %d\n", prop.multiProcessorCount);
+	printf("Total amount of shared memory per block: %zu\n",prop.sharedMemPerBlock);
+	printf("Total registers per block:     %d\n", prop.regsPerBlock);
+	printf("Warp size:                     %d\n", prop.warpSize);
+	printf("Maximum memory pitch:          %zu\n", prop.memPitch);
+	printf("Total amount of constant memory:         %zu\n",   prop.totalConstMem);
+}
+
+namespace eccl {
+void operator,(cudaError_t error, eccl::check_cuda checker) {
+	if(!error)
+		return;
+	std::cerr<<"error: "<<checker._msg<<": "<<cudaGetErrorName(error)<<": "<<cudaGetErrorString(error)<<"\n";
+	std::exit(EXIT_FAILURE);
+}
+}
diff --git a/gpu-sw/cuda-utils.hh b/gpu-sw/cuda-utils.hh
new file mode 100644
index 0000000..31f96bd
--- /dev/null
+++ b/gpu-sw/cuda-utils.hh
@@ -0,0 +1,258 @@
+#ifndef __ECCL_CUDA_UTILS_HH__
+#define __ECCL_CUDA_UTILS_HH__
+
+#include <vector>
+#include <cassert>
+
+#include <cuda_runtime.h>
+
+namespace eccl {
+
+enum class device {
+	cpu,
+	cuda,
+};
+
+/*! eg: cudaBlah(...), eccl::check_cuda{"error tag"}; */
+class check_cuda {
+public:
+	explicit constexpr check_cuda(const char* msg) noexcept:
+		_msg{msg} { }
+private:
+	const char* _msg;
+	friend void operator,(cudaError_t error, check_cuda checker);
+};
+
+void dump_device_info(int device);
+
+//pointer/buffer types:
+//
+//host_only<t>
+//
+
+/*! device raw pointer, no deference in CPU, no memory management */
+template<typename T>
+class dev_ptr {
+public:
+	constexpr dev_ptr() noexcept: _p{nullptr} { }
+	explicit constexpr dev_ptr(T* p) noexcept: _p{p} { }
+	~dev_ptr() { }
+	dev_ptr(const dev_ptr&) noexcept =default;
+	dev_ptr& operator=(const dev_ptr&) noexcept =default;
+
+	operator T*() const noexcept { return static_cast<T*>(_p); }
+
+private:
+	void* _p;
+};
+
+template<typename T>
+using host_ptr=T*;
+
+namespace detail {
+
+template<typename T>
+struct cuda_host_allocator {
+	using value_type=T;
+	using size_type=std::size_t;
+	constexpr cuda_host_allocator() noexcept { }
+	~cuda_host_allocator() { }
+
+	T* allocate(size_type n) {
+		T* r;
+		cudaMallocHost(&r, n*sizeof(T)), check_cuda{"malloc host"};
+		//fprintf(stderr, "malloc host: %p %zd\n", r, n*sizeof(T));
+		return r;
+	}
+	void deallocate(T* p, size_type n) {
+		//fprintf(stderr, "free.. host: %p %zd\n", p, n*sizeof(T));
+		cudaFreeHost(p), check_cuda{"free host"};
+	}
+};
+
+template<typename T>
+struct cuda_dev_allocator {
+	using value_type=T;
+	using size_type=std::size_t;
+	constexpr cuda_dev_allocator() noexcept { }
+	~cuda_dev_allocator() { }
+
+	T* allocate(size_type n) {
+		T* r;
+		cudaMalloc(&r, n*sizeof(T)), check_cuda{"malloc dev"};
+		//fprintf(stderr, "malloc dev: %p %zd\n", r, n*sizeof(T));
+		//fprintf(stderr, "malloc dev: %p %zd\n", r, n*sizeof(T));
+		return r;
+	}
+	void deallocate(T* p, size_type n) {
+		//fprintf(stderr, "free.. dev: %p %zd\n", p, n*sizeof(T));
+		cudaFree(p), check_cuda{"free dev"};
+	}
+};
+
+}
+
+/*! host buffer, accessible to device */
+template<typename T>
+class host_vector: public std::vector<T, detail::cuda_host_allocator<T>> {
+public:
+	using std::vector<T, detail::cuda_host_allocator<T>>::vector;
+	operator dev_ptr<T>() noexcept {
+		return dev_ptr<T>(&(*this)[0]);
+	}
+};
+//using host_vector=
+
+/*! device buffer, std::vector-like API */
+template<typename T>
+class dev_vector {
+public:
+	constexpr dev_vector() noexcept: _p{nullptr}, _s{0} { }
+	~dev_vector() {
+		if(_p) {
+			detail::cuda_dev_allocator<T>{}.deallocate(_p, _s);
+		}
+	}
+	dev_vector(const dev_vector&) =delete;
+	dev_vector& operator=(const dev_vector&) =delete;
+	dev_vector(dev_vector&& r) noexcept: _p{r._p}, _s{r._s} {
+		r._p=nullptr;
+		r._s=0;
+	}
+	dev_vector& operator=(dev_vector&& r) noexcept {
+		std::swap(_p, r._p);
+		std::swap(_s, r._s);
+		return *this;
+	}
+
+	operator T*() const noexcept { return static_cast<T*>(_p); }
+	operator eccl::dev_ptr<T>() const noexcept {
+		return eccl::dev_ptr<T>{static_cast<T*>(_p)};
+	}
+	std::size_t size() const noexcept { return _s; }
+
+	void try_malloc(std::size_t s) {
+		if(_p) {
+			if(_s<s) {
+				free();
+				auto ss=_s+std::max<std::size_t>(1024, 16*(s-_s));
+				malloc(ss);
+			}
+		} else {
+			malloc(s);
+		}
+	}
+	void malloc(std::size_t s) {
+		assert(_p==nullptr);
+		_p=detail::cuda_dev_allocator<T>{}.allocate(s);
+		_s=s;
+	}
+
+	void free() {
+		assert(_p);
+		detail::cuda_dev_allocator<T>{}.deallocate(_p, _s);
+		_p=nullptr;
+	}
+
+	template<typename Vec>
+			void try_malloc_h2d(const Vec& vec, cudaStream_t stream) {
+				try_malloc(vec.size());
+				h2d(vec, stream);
+			}
+	void malloc_h2d(const host_vector<T>& vec, cudaStream_t stream) {
+		malloc(vec.size());
+		h2d(vec, stream);
+	}
+	template<typename Alloc>
+	void h2d(const std::vector<T, Alloc>& vec, cudaStream_t stream) {
+		h2d(&vec[0], vec.size(), stream);
+	}
+	void h2d(const T* p, std::size_t s, cudaStream_t stream) {
+		assert(s<=_s);
+		//fprintf(stderr, "H %p -> D %p %zd\n", &_q[0], _p, _q.size()*sizeof(T));
+		cudaMemcpyAsync(_p, p, s*sizeof(T), cudaMemcpyHostToDevice, stream), check_cuda{"memcpy async to device"};
+	}
+	void h2d(const T* p, std::size_t s) {
+		assert(s<=_s);
+		cudaMemcpy(_p, p, s*sizeof(T), cudaMemcpyHostToDevice), check_cuda{"memcpy to device"};
+	}
+	void d2h(T* p, std::size_t s) {
+		assert(s<=_s);
+		//fprintf(stderr, "D2h %p %p %zd\n", _p, p, s*sizeof(T));
+		cudaMemcpy(p, _p, s*sizeof(T), cudaMemcpyDeviceToHost), check_cuda{"memcpy to host"};
+	}
+	void d2h(T* p, std::size_t s, cudaStream_t stream) {
+		assert(s<=_s);
+		//fprintf(stderr, "D2h async %p %p %zd\n", _p, p, s*sizeof(T));
+		cudaMemcpyAsync(p, _p, s*sizeof(T), cudaMemcpyDeviceToHost, stream), check_cuda{"memcpy async to host"};
+	}
+	template<typename Alloc>
+	void d2h(std::vector<T, Alloc>& vec, cudaStream_t stream) {
+		d2h(&vec[0], vec.size(), stream);
+	}
+
+private:
+	T* _p;
+	std::size_t _s{0};
+};
+
+/////////////////////////////////////////
+
+class event {
+public:
+	constexpr event() noexcept: _e{nullptr} { }
+	~event() {
+		if(_e)
+			cudaEventDestroy(_e), check_cuda{"create event"};
+	}
+	event(const event&) =delete;
+	event& operator=(const event&) =delete;
+	//event(event&& r) =delete;
+	//event& operator=(event&& r) =delete;
+
+	void create() {
+		cudaEventCreate(&_e), check_cuda{"create event"};
+	}
+	void record(cudaStream_t stream) {
+		cudaEventRecord(_e, stream), check_cuda{"record event"};
+	}
+
+	operator cudaEvent_t() const noexcept { return _e; }
+
+private:
+	cudaEvent_t _e;
+};
+
+
+class stream {
+public:
+	constexpr stream() noexcept: _s{nullptr} { }
+	~stream() {
+		if(_s)
+			cudaStreamDestroy(_s), check_cuda{"destroy stream"};
+	}
+	stream(const stream&) =delete;
+	stream& operator=(const stream&) =delete;
+	//stream(stream&& r) =delete;
+	//stream& operator=(stream&& r) =delete;
+
+	void create() {
+		cudaStreamCreate(&_s), check_cuda{"create stream"};
+	}
+
+	operator cudaStream_t() const noexcept { return _s; }
+
+	void wait(cudaEvent_t event) {
+		cudaStreamWaitEvent(_s, event, cudaEventWaitDefault), check_cuda{"wait event"};
+	}
+	void sync() {
+		cudaStreamSynchronize(_s), check_cuda{"sync stream"};
+	}
+
+private:
+	cudaStream_t _s;
+};
+
+}
+
+#endif
diff --git a/gpu-sw/mats.cc b/gpu-sw/mats.cc
new file mode 100644
index 0000000..cc40161
--- /dev/null
+++ b/gpu-sw/mats.cc
@@ -0,0 +1,61 @@
+#include <string_view>
+#include <array>
+
+#include "core.hh"
+#include "seqs.hh"
+
+namespace {
+
+template<eccl::seq_type Type>
+struct mat_helper {
+	std::string_view hdr;
+	eccl::seq_encoder<Type> b2b;
+	std::array<int, 32*32>& mat;
+	explicit mat_helper(const char* hdr, std::array<int, 32*32>& mat): hdr{hdr}, b2b{}, mat{mat} { }
+	void set(eccl::code<Type> a, unsigned int i) {
+	}
+	template<typename Arg, typename... Args>
+	void set(eccl::code<Type> a, unsigned int i, Arg val, Args... vals) {
+		assert(i<hdr.size());
+		auto b=b2b(hdr[i]);
+		mat[eccl::pair<Type>{a, b}.value]=val;
+		set(a, i+1, vals...);
+	}
+	template<typename... Args>
+	void operator()(char c, Args... vals) {
+		auto a=b2b(c);
+		set(a, 0, vals...);
+	}
+};
+
+}
+
+void eccl::mat_blosum62(std::array<int, 32*32>& mat) {
+	// ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM62
+	mat_helper<eccl::seq_type::prot> h{"ARNDCQEGHILKMFPSTWYVBZX*", mat};
+	h('A', 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4);
+	h('R',-1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4);
+	h('N',-2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4);
+	h('D',-2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4);
+	h('C', 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4);
+	h('Q',-1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4);
+	h('E',-1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4);
+	h('G', 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4);
+	h('H',-2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4);
+	h('I',-1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4);
+	h('L',-1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4);
+	h('K',-1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4);
+	h('M',-1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4);
+	h('F',-2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4);
+	h('P',-1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4);
+	h('S', 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4);
+	h('T', 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4);
+	h('W',-3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4);
+	h('Y',-2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4);
+	h('V', 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4);
+	h('B',-2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4);
+	h('Z',-1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4);
+	h('X', 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4);
+	h('*',-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1);
+}
+
diff --git a/gpu-sw/seqs.cc b/gpu-sw/seqs.cc
new file mode 100644
index 0000000..be1ab44
--- /dev/null
+++ b/gpu-sw/seqs.cc
@@ -0,0 +1,286 @@
+#include "seqs.hh"
+
+//#include <iostream>
+#include <algorithm>
+#include <type_traits>
+
+
+// XXX alternative: 32-bit, 10-bp, 2-bit unused per word
+// avoid 4-bit padded to 3-bit conversion.
+// but, is 10 harder to divide by than 8???
+template<eccl::seq_type Type, typename Cont>
+class eccl::detail::BitStream {
+public:
+	explicit constexpr BitStream(Cont& cont) noexcept:
+		cont{cont} { }
+	std::size_t size() const noexcept { return tmp_cnt; }
+	void add(unsigned int v) {
+#ifndef ECCL_COMPACT_CODE
+		tmp_bits=(tmp_bits<<padded_width)+v;
+		if((++tmp_cnt)%(padded_count)==0)
+			cont.push_back(tmp_bits);
+#else
+		++tmp_cnt;
+		tmp_bits=(tmp_bits<<width)+v;
+		if(tmp_bits>=bits_thr) {
+			cont.push_back(tmp_bits);
+			tmp_bits=1;
+		}
+#endif
+	}
+	void flush() {
+#ifndef ECCL_COMPACT_CODE
+		auto rem=tmp_cnt%padded_count;
+		if(rem!=0) {
+			tmp_cnt+=padded_count-rem;
+			cont.push_back(tmp_bits<<(padded_count-rem)*padded_width);
+		}
+#else
+		auto rem=tmp_cnt%nopad_count;
+		if(rem!=0) {
+			tmp_cnt+=nopad_count-rem;
+			cont.push_back(tmp_bits<<(nopad_count-rem)*width);
+			tmp_bits=1;
+		}
+#endif
+	};
+private:
+	Cont& cont;
+	std::size_t tmp_cnt=0;
+	unsigned int tmp_bits{1};
+	static constexpr unsigned int bits_thr=(Type==eccl::seq_type::xna?0x4000'0000:0x4000'0000);
+	static constexpr unsigned int width=(Type==eccl::seq_type::xna?3:5);
+	static constexpr unsigned int padded_width=(Type==eccl::seq_type::xna?4:8);
+	static constexpr unsigned int padded_count=(Type==eccl::seq_type::xna?8:4);
+	static constexpr unsigned int nopad_count=(Type==eccl::seq_type::xna?10:6);
+};
+
+
+std::shared_ptr<eccl::fasta_chunk> eccl::fasta_reader::load(eccl::source<char>& str, std::size_t buf_size, unsigned int num_seq) {
+	std::shared_ptr<eccl::fasta_chunk> chk{};
+	if(st==St::end)
+		return chk;
+	std::size_t cur_idx{0};
+	std::size_t non_seq{0};
+	using Coords=fasta_chunk::Coords;
+	std::size_t n;
+	do {
+		while(cur_idx<cur.buf.size()) {
+			auto c=cur.buf[cur_idx];
+			switch(st) {
+			case St::expect_hdr:
+				if(c=='>') {
+					cur.coords.emplace_back(Coords{cur_idx+1, SIZE_MAX, SIZE_MAX, SIZE_MAX});
+					st=St::in_hdr;
+					break;
+				}
+				break;
+			case St::in_hdr:
+				if((n=cur.buf.find('\n', cur_idx))!=std::string::npos) {
+					cur_idx=n;
+					auto& coords=cur.coords.back();
+					coords.name_e=cur_idx;
+					cur.names_total+=coords.name_e-coords.name_s;
+					coords.seq_s=cur_idx+1;
+					st=St::expect_seq;
+					non_seq=0;
+					break;
+				}
+				cur_idx=cur.buf.size()-1;
+				break;
+			case St::expect_seq:
+				if(c=='>') {
+					auto& coords=cur.coords.back();
+					coords.seq_e=cur_idx;
+					cur.seqs_total+=coords.seq_e-coords.seq_s-non_seq;
+					st=St::in_hdr;
+					if((buf_size>0 && cur.seqs_total>=buf_size) || (num_seq>0 && cur.coords.size()>=num_seq)) {
+						Chunk next{};
+						next.buf.assign(&cur.buf[cur_idx], &cur.buf[cur.buf.size()]);
+						next.coords.emplace_back(Coords{1, SIZE_MAX, SIZE_MAX, SIZE_MAX});
+						chk=std::make_shared<Chunk>(std::move(cur));
+						cur=std::move(next);
+						return chk;
+					}
+					cur.coords.emplace_back(Coords{cur_idx+1, SIZE_MAX, SIZE_MAX, SIZE_MAX});
+					break;
+				}
+				if(c=='\n') {
+					++non_seq;
+					break;
+				}
+				st=St::in_seq;
+				break;
+			case St::in_seq:
+				if((n=cur.buf.find('\n', cur_idx))!=std::string::npos) {
+					cur_idx=n;
+					++non_seq;
+					st=St::expect_seq;
+					break;
+				}
+				cur_idx=cur.buf.size()-1;
+				break;
+			case St::end:
+				assert(0);
+			}
+			++cur_idx;
+		}
+		if(hiteof)
+			break;
+		auto blk=str.get();
+		if(blk.eof())
+			hiteof=true;
+		cur.buf.resize(cur_idx+blk.size());
+		std::copy_n(&blk[0], blk.size(), &cur.buf[cur_idx]);
+	} while(true);
+	switch(st) {
+	case St::expect_hdr:
+		return {};
+	case St::expect_seq:
+		break;
+	default:
+		fprintf(stderr, "wrong st: %d\n", (int)st);
+		assert(0);
+		throw st;
+	}
+	st=St::end;
+	auto& coords=cur.coords.back();
+	coords.seq_e=cur_idx;
+	cur.seqs_total+=coords.seq_e-coords.seq_s-non_seq;
+	chk=std::make_shared<Chunk>(std::move(cur));
+	return chk;
+}
+
+template<eccl::seq_type Type>
+eccl::seqs<Type>::seqs(const eccl::fasta_chunk& chunk) {
+	_b.seq_buf.reserve(chunk.seqs_total/2/sizeof(unsigned int)+chunk.coords.size());
+	_b.name_buf.reserve(chunk.names_total);
+	_b.coords.reserve(chunk.coords.size());
+
+	detail::BitStream<Type, std::vector<unsigned int>> bits{_b.seq_buf};
+	eccl::seq_encoder<Type> b2b;
+	_b.max_seq_l=0;
+	for(auto coord: chunk.coords) {
+		detail::seqs_base::Coords cc;
+		while(coord.name_s<coord.name_e)
+			_b.name_buf.push_back(chunk.buf[coord.name_s++]);
+		cc.name_e=_b.name_buf.size();
+		cc.seq_s=_b.seq_buf.size();
+		auto sl=coord.seq_e-coord.seq_s;
+		while(coord.seq_s<coord.seq_e) {
+			auto c=chunk.buf[coord.seq_s++];
+			if(c=='\n') {
+				--sl;
+			} else {
+				bits.add(b2b(c).value);
+			}
+		}
+		bits.flush();
+		cc.seq_l=sl;
+		if(_b.max_seq_l<sl)
+			_b.max_seq_l=sl;
+		_b.coords.push_back(cc);
+	}
+}
+template eccl::seqs<eccl::seq_type::xna>::seqs(const eccl::fasta_chunk& chunk);
+template eccl::seqs<eccl::seq_type::prot>::seqs(const eccl::fasta_chunk& chunk);
+
+eccl::seq_encoder_tables::seq_encoder_tables() noexcept:
+_table_xna{0, }, _table_prot{0, }
+{
+	_rev_xna[0b000]='\x00';
+	_rev_xna[0b001]='U';
+	_rev_xna[0b010]='C';
+	_rev_xna[0b011]='T';
+	_rev_xna[0b100]='A';
+	_rev_xna[0b101]='G';
+	_rev_xna[0b110]='X';
+	_rev_xna[0b111]='N';
+
+	_rev_prot[0b00000]='\x00';
+	_rev_prot[0b11111]='X';
+	_rev_prot[0b11110]='*';
+	_rev_prot[0b11101]='-';
+	_rev_prot[0b00001]='A';
+	_rev_prot[0b00010]='B';
+	_rev_prot[0b00011]='C';
+	_rev_prot[0b00100]='D';
+	_rev_prot[0b00101]='E';
+	_rev_prot[0b00110]='F';
+	_rev_prot[0b00111]='G';
+	_rev_prot[0b01000]='H';
+	_rev_prot[0b01001]='I';
+	_rev_prot[0b01010]='K';
+	_rev_prot[0b01011]='L';
+	_rev_prot[0b01100]='M';
+	_rev_prot[0b01101]='N';
+	_rev_prot[0b01110]='P';
+	_rev_prot[0b01111]='Q';
+	_rev_prot[0b10000]='R';
+	_rev_prot[0b10001]='S';
+	_rev_prot[0b10010]='T';
+	_rev_prot[0b10011]='U';
+	_rev_prot[0b10100]='V';
+	_rev_prot[0b10101]='W';
+	_rev_prot[0b10110]='Y';
+	_rev_prot[0b10111]='Z';
+
+	for(unsigned int i=0; i<_rev_xna.size(); ++i) {
+		auto c=_rev_xna[i];
+		if(c) {
+			_table_xna[std::tolower(c)]=_table_xna[c]=i;
+		}
+	}
+	for(unsigned int i=0; i<_rev_prot.size(); ++i) {
+		auto c=_rev_prot[i];
+		if(c) {
+			_table_prot[std::tolower(c)]=_table_prot[c]=i;
+		}
+	}
+}
+
+template<eccl::seq_type Type>
+eccl::seqs_builder<Type>::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs):
+	_b{},
+	_str{std::make_shared<detail::BitStream<Type, std::vector<unsigned int>>>(_b.seq_buf)},
+	_encoder{}
+{
+}
+template eccl::seqs_builder<eccl::seq_type::xna>::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs);
+template eccl::seqs_builder<eccl::seq_type::prot>::seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs);
+
+template<eccl::seq_type Type, typename T, typename E>
+inline static void copy_seq(T& b, E& e, std::string_view seq, int*) {
+	for(std::size_t i=0; i<seq.size(); ++i)
+		b.add(e(seq[i]).value);
+}
+template<eccl::seq_type Type, typename T, typename E>
+inline static void copy_seq(T& b, E& e, std::string_view seq, char*) {
+	static_assert(Type==eccl::seq_type::xna);
+	for(std::size_t i=seq.size(); i>0; --i) {
+		auto comp=~e(seq[i-1]);
+		b.add(comp.value);
+	}
+}
+template<eccl::seq_type Type>
+template<bool RC>
+void eccl::seqs_builder<Type>::push_back(std::string_view seq) {
+	detail::seqs_base::Coords s;
+	s.name_e=_b.name_buf.size();
+	s.seq_s=_b.seq_buf.size();
+	copy_seq<Type>(*_str, _encoder, seq, std::conditional_t<RC, char*, int*>{});
+	_str->flush();
+	s.seq_l=seq.size();
+	if(_b.max_seq_l<seq.size())
+		_b.max_seq_l=seq.size();
+	_b.coords.push_back(s);
+}
+template void eccl::seqs_builder<eccl::seq_type::xna>::push_back<true>(std::string_view seq);
+template void eccl::seqs_builder<eccl::seq_type::xna>::push_back<false>(std::string_view seq);
+template void eccl::seqs_builder<eccl::seq_type::prot>::push_back<false>(std::string_view seq);
+
+#if 0
+#endif
+
+eccl::seq_encoder_tables eccl::seq_encoder_tables::_t;
+
diff --git a/gpu-sw/seqs.hh b/gpu-sw/seqs.hh
new file mode 100644
index 0000000..db8725f
--- /dev/null
+++ b/gpu-sw/seqs.hh
@@ -0,0 +1,307 @@
+#ifndef __ECCL_SEQS_HH__
+#define __ECCL_SEQS_HH__
+
+#include <array>
+#include <cstdint>
+#include <istream>
+#include <cassert>
+#include <vector>
+#include <memory>
+#include <string_view>
+
+#include "core.hh"
+
+namespace eccl {
+
+/*! API for a batch of sequences
+ *
+ * optimized for medium-sized batches of reads;
+ * may be used for a single sequence or a whole genome.
+ *
+ * limits:
+ *   - maximum length of one sequence: 4Gi;
+ */
+
+
+template<seq_type Type>
+struct seq_view {
+	/*! if reverse/complement sequence is not relevent,
+	 *   use bits directly.
+	 * otherwise,
+	 *   bits&1 means the reverse sequence,
+	 *   bits&2 means the complement sequence,
+	 *   and use bits without the 2 LSB.
+	 */
+	const unsigned int* bits;
+	/*! sub-seq range: [begin, end).  begin<=end. */
+	uint32_t begin, end;
+	uint32_t len() const noexcept { return end-begin; }
+	std::size_t buf_size() const noexcept {
+#ifndef ECCL_COMPACT_CODE
+		return Type==seq_type::xna?(len()+7)/8:(len()+3)/4;
+#else
+		return Type==seq_type::xna?(len()+9)/10:(len()+5)/6;
+#endif
+	}
+};
+
+namespace detail {
+struct seqs_base {
+	struct Coords {
+		uint64_t seq_s;
+		uint32_t seq_l;
+		uint32_t name_e;
+	};
+	uint32_t max_seq_l{0};
+	std::vector<Coords> coords;
+	std::vector<unsigned int> seq_buf;
+	std::vector<char> name_buf;
+};
+template<eccl::seq_type Type, typename Cont>
+class BitStream;
+}
+
+class fasta_chunk;
+template<seq_type Type>
+class seqs_builder;
+
+template<seq_type Type>
+class seqs {
+public:
+	explicit seqs(const fasta_chunk& chunk);
+	seqs(seqs_builder<Type>&& builder) noexcept;
+	~seqs() =default;
+	seqs(const seqs&) =delete;
+	seqs& operator=(const seqs&) =delete;
+	seqs(seqs&& r) noexcept: _b{std::move(r._b)} { }
+	seqs& operator=(seqs&& r) noexcept {
+		_b=std::move(r._b);
+		return *this;
+	}
+#if 0
+	std::pair<uint64_t, uint32_t> start_len(std::size_t i) const noexcept {
+		auto& c=_coords[i];
+		return {c.seq_s, c.seq_l};
+	}
+#endif
+	std::size_t size() const noexcept { return _b.coords.size(); }
+	seq_view<Type> seq(std::size_t idx) const noexcept {
+		auto& c=_b.coords[idx];
+		return {&_b.seq_buf[c.seq_s], 0, c.seq_l};
+	}
+	seq_view<Type> operator[](std::size_t idx) const noexcept {
+		return seq(idx);
+	}
+	std::string_view name(std::size_t i) const noexcept {
+		auto a=i>0?_b.coords[i-1].name_e:0;
+		auto b=_b.coords[i].name_e-a;
+		return {&_b.name_buf[a], b};
+	}
+	uint64_t begin(std::size_t idx) const noexcept {
+		auto& c=_b.coords[idx];
+		return c.seq_s*code<Type>::n_per_word;
+	}
+
+	const unsigned int* buf() const noexcept { return &_b.seq_buf[0]; }
+	std::size_t buf_size() const noexcept { return _b.seq_buf.size(); }
+	uint32_t max_seq_len() const noexcept { return _b.max_seq_l; }
+
+private:
+	detail::seqs_base _b;
+	friend std::ostream& operator<<(std::ostream& str, const seqs& seqs) {
+		return str<<&seqs._b.seq_buf[0]<<'@'<<seqs._b.coords.size();
+	}
+};
+
+using xna_seqs=seqs<seq_type::xna>;
+using prot_seqs=seqs<seq_type::prot>;
+
+/*! use 3 bits for each DNA/RNA base, but 4-bit aligned in a sequence.
+ * use 5 bits for each amino acid.
+ */
+
+class seq_encoder_tables;
+
+template<seq_type Type>
+class seq_encoder {
+public:
+	explicit seq_encoder() noexcept =default;
+	~seq_encoder() =default;
+
+	code<Type> operator()(char c) const noexcept;
+	char rev(code<Type> i) const noexcept;
+};
+
+template<seq_type Type>
+class seqs_builder {
+public:
+	seqs_builder(std::size_t num_bases, std::size_t names_size, std::size_t num_seqs);
+	~seqs_builder() =default;
+	seqs_builder(const seqs_builder&) =delete;
+	seqs_builder& operator=(const seqs_builder&) =delete;
+
+	template<bool RC=false>
+	void push_back(std::string_view seq);
+
+private:
+	detail::seqs_base _b;
+	std::shared_ptr<detail::BitStream<Type, std::vector<unsigned int>>> _str;
+	seq_encoder<Type> _encoder;
+	friend class seqs<Type>;
+};
+
+template<seq_type Type>
+seqs<Type>::seqs(seqs_builder<Type>&& builder) noexcept:
+_b{std::move(builder._b)} { }
+
+
+class seq_encoder_tables {
+public:
+	explicit seq_encoder_tables() noexcept;
+private:
+	std::array<unsigned char, 256> _table_xna, _table_prot;
+	std::array<char, 8> _rev_xna;
+	std::array<char, 32> _rev_prot;
+	template<seq_type Type>
+	const unsigned char* table() const noexcept {
+		return Type==seq_type::xna?&_table_xna[0]:&_table_prot[0];
+	}
+	template<seq_type Type>
+	const char* rev() const noexcept {
+		return Type==seq_type::xna?&_rev_xna[0]:&_rev_prot[0];
+	}
+	static seq_encoder_tables _t;
+	template<seq_type> friend class seq_encoder;
+};
+
+template<seq_type Type>
+inline code<Type> seq_encoder<Type>::operator()(char c) const noexcept {
+	return {seq_encoder_tables::_t.table<Type>()[static_cast<unsigned char>(c)]};
+}
+template<seq_type Type>
+inline char seq_encoder<Type>::rev(code<Type> i) const noexcept {
+	return seq_encoder_tables::_t.rev<Type>()[i.value];
+}
+
+class fasta_chunk {
+public:
+	fasta_chunk(fasta_chunk&&) =default;
+	~fasta_chunk() =default;
+	fasta_chunk(const fasta_chunk&) =delete;
+	fasta_chunk& operator=(const fasta_chunk&) =delete;
+
+private:
+	explicit fasta_chunk() =default;
+	fasta_chunk& operator=(fasta_chunk&&) =default;
+
+	struct Coords {
+		std::size_t name_s, name_e;
+		std::size_t seq_s, seq_e;
+	};
+	std::size_t names_total{0};
+	std::size_t seqs_total{0};
+	std::vector<Coords> coords;
+	std::string buf;
+	friend class fasta_reader;
+	template<seq_type>
+	friend class seqs;
+};
+
+class fasta_reader {
+public:
+	explicit fasta_reader() =default;
+	~fasta_reader() =default;
+	fasta_reader(const fasta_reader&) =delete;
+	fasta_reader& operator=(const fasta_reader&) =delete;
+
+	std::shared_ptr<fasta_chunk> load(eccl::source<char>& str, std::size_t buf_size, unsigned int num_seq);
+
+private:
+	using Chunk=fasta_chunk;
+	Chunk cur{};
+	enum class St {
+		expect_hdr,
+		in_hdr,
+		expect_seq,
+		in_seq,
+		end,
+	} st;
+	bool hiteof{false};
+};
+
+// XXX
+class seqs_ref {
+public:
+private:
+};
+
+template<typename Int, typename Alloc, typename ScoreOpts>
+void fill_score_table_xna(std::vector<Int, Alloc>& _table, const ScoreOpts& scores) {
+	_table.reserve(8*8+2);
+	_table.resize(8*8, -100);
+	auto set=[&_table](auto a, auto b, Int v) {
+		_table[pair<seq_type::xna>{a, b}.value]=v;
+		_table[pair<seq_type::xna>{b, a}.value]=v;
+	};
+	const Int m=scores.match ? *scores.match : 1;
+	const Int mm=scores.mismatch ? *scores.mismatch : -4;
+	const Int go=scores.gap_open ? *scores.gap_open : -6;
+	const Int ge=scores.gap_extend ? *scores.gap_extend : -1;
+	_table.push_back(go);
+	_table.push_back(ge);
+
+	eccl::seq_encoder<seq_type::xna> b2b{};
+	const auto A=b2b('A');
+	const auto T=b2b('T');
+	const auto C=b2b('C');
+	const auto G=b2b('G');
+	const auto U=b2b('U');
+	set(A, A, m);
+	set(T, T, m);
+	set(T, U, m);
+	set(U, U, m);
+	set(C, C, m);
+	set(G, G, m);
+
+	set(A, T, mm);
+	set(A, U, mm);
+	set(A, C, mm);
+	set(A, G, mm);
+
+	set(T, C, mm);
+	set(T, G, mm);
+	set(U, C, mm);
+	set(U, G, mm);
+
+	set(C, G, mm);
+}
+void mat_blosum62(std::array<int, 32*32>& mat);
+
+template<seq_type Type, typename Int, typename Alloc, typename ScoreOpts, typename=std::enable_if_t<Type==seq_type::xna>>
+inline void fill_score_table_impl(std::vector<Int, Alloc>& _table, const ScoreOpts& scores) {
+	fill_score_table_xna(_table, scores);
+}
+template<seq_type Type, typename Int, typename Alloc, typename=std::enable_if_t<Type==seq_type::prot>>
+inline void fill_score_table_impl(std::vector<Int, Alloc>& _table) {
+	_table.reserve(32*32+2);
+	std::array<int, 32*32> mat{-100, };
+	mat_blosum62(mat);
+	for(auto v: mat)
+		_table.push_back(v);
+	const Int gap_open=-11;
+	const Int gap_ext=-1;
+	_table.push_back(gap_open);
+	_table.push_back(gap_ext);
+}
+template<seq_type Type, typename Int, typename Alloc, typename ScoreOpts, typename=std::enable_if_t<Type==seq_type::xna>>
+inline void fill_score_table(std::vector<Int, Alloc>& _table, const ScoreOpts& scores) {
+	return fill_score_table_impl<Type, Int, Alloc>(_table, scores);
+}
+template<seq_type Type, typename Int, typename Alloc,  typename=std::enable_if_t<Type==seq_type::prot>>
+inline void fill_score_table(std::vector<Int, Alloc>& _table) {
+	return fill_score_table_impl<Type, Int, Alloc>(_table);
+}
+
+}
+
+#endif
diff --git a/gpu-sw/sw-lib.cc b/gpu-sw/sw-lib.cc
new file mode 100644
index 0000000..80dbcdd
--- /dev/null
+++ b/gpu-sw/sw-lib.cc
@@ -0,0 +1,292 @@
+
+#include "sw.hh"
+#include "sw-lib.hh"
+
+#include <iostream>
+#include <charconv>
+#include <typeinfo>
+using namespace glf;
+// struct glf::sw_handle {
+	// eccl::sw_buffers<eccl::seq_type::xna> buffs;
+// };
+template<eccl::seq_type T>
+struct glf::sw_handle {
+	eccl::sw_buffers<T> buffs;
+};
+
+
+template<eccl::seq_type T>
+sw_handle<T>* glf::sw_create(const sw_opts& opts) {
+	auto res=std::make_unique<sw_handle<T>>();
+	res->buffs.init(0, opts.scores);
+	return res.release();
+}
+
+template<eccl::seq_type T>
+sw_handle<T>* glf::sw_create() {
+	auto res=std::make_unique<sw_handle<T>>();
+	res->buffs.init(0);
+	return res.release();
+}
+
+template<eccl::seq_type T>
+void glf::sw_destroy(sw_handle<T>* hdl) {
+	delete hdl;
+}
+
+
+template<eccl::seq_type T>
+std::vector<sw_align> glf::sw_batch(sw_handle<T>* hdl,
+		const std::vector<std::string_view>& query,
+		const std::vector<std::string_view>& target,
+		const sw_batch_opts& opts) {
+	auto gen_seq_enc=[](auto& raw) {
+		eccl::seqs_builder<T> builder{0, 0, 0};
+		for(auto s: raw)
+			builder.template push_back<false>(s);
+		return eccl::seqs<T>{std::move(builder)};
+	};
+	auto query_enc=gen_seq_enc(query);
+	auto target_enc=gen_seq_enc(target);
+	auto& buffs=hdl->buffs;
+	buffs.run(query_enc, target_enc);
+	
+	std::vector<sw_align> results;
+	results.reserve(query.size());
+	for(unsigned int i=0; i<query.size(); ++i) {
+		auto& res=results.emplace_back();
+		auto& res0=buffs.results_host[i];
+		auto off=buffs.cigar_s_host[i];
+		res.sw_score=res0.score;
+		res.ref_begin=res0.target_se.x;
+		res.ref_end=res0.target_se.y+1;
+		res.query_begin=res0.query_se.x;
+		res.query_end=res0.query_se.y+1;
+		auto cigar=&buffs.cigar_buf_host[off+1];
+		auto cigar_len=buffs.cigar_buf_host[off];
+
+		res.cigar_len=0;
+		auto put_cigar=[&res](unsigned int cnt, unsigned int op) {
+			if(res.cigar_len<MAX_NUM_CIGAR) {
+				res.cigar[res.cigar_len]=(cnt<<2)|op;
+			}
+			res.cigar_len+=1;
+		};
+
+		unsigned int prev_cnt{0};
+		unsigned char prev_op{'\x00'};
+		auto nn=cigar_len;
+		unsigned int mmcnt=0;
+		while(nn>0) {
+			--nn;
+			unsigned int v=(cigar[nn/4]>>((nn%4)*8))&0xff;
+			auto op="\x00\x03\x02\x00"[v&0b11];
+			mmcnt+="\x01\x01\x01\x00"[v&0b11];
+			v=(v>>2)+1;
+			if(op==prev_op) {
+				prev_cnt+=v;
+			} else {
+				if(prev_cnt>0) {
+					put_cigar(prev_cnt, prev_op);
+				}
+				prev_cnt=v;
+				prev_op=op;
+			}
+		}
+		if(prev_cnt>0) {
+			put_cigar(prev_cnt, prev_op);
+		}
+		if(res.cigar_len>MAX_NUM_CIGAR)
+			res.sw_score=0;
+		res.mismatch=mmcnt;
+		if(false) {
+			std::array<char, 4096> buf;
+			std::cerr<<"seq1 "<<query[i]<<" "<<query[i].size();
+			std::cerr<<"\nseq2 "<<target[i]<<" "<<target[i].size();
+			auto r=format_sw_result(buf, res0, "q", "t", cigar, cigar_len);
+			std::cerr<<"\n"<<r<<"\n";
+		}
+	}
+	return results;
+}
+
+
+
+// static void cigar_to_string(const sw_align& alignment, std::string_view& query, std::string_view& ref, std::vector<int>& q_res, std::vector<size_t>& r_res) {
+//   q_res.clear();
+//   r_res.clear();
+//   int qp = alignment.query_begin;
+//   int rp = alignment.ref_begin;
+//   for (int c=0;c<alignment.cigar_len;c++)
+//   {
+//     int cigar_int = cigar_int_to_len(alignment.cigar[c]);
+//     char cigar_op = cigar_int_to_op(alignment.cigar[c]);
+//     // cout << cigar_int << cigar_op<<endl;
+//     if (cigar_op=='M'||cigar_op=='X'){
+//       // q_res.append(query+qp, query+qp+cigar_int);
+//       for (int i=0;i<cigar_int;i++)
+//       {
+//         q_res.push_back(qp+i);
+//         r_res.push_back(rp+i);
+//       }
+//       // r_res.append(ref+rp, ref+rp+cigar_int);
+//       qp+=cigar_int;
+//       rp+=cigar_int;
+//     }
+//     else if (cigar_op=='I')
+//     {
+//       // q_res.append(query+qp, query+qp+cigar_int);
+//       for (int i=0;i<cigar_int;i++)
+//       {
+//         q_res.push_back(qp+i);
+//         r_res.push_back(-1);
+//       }
+//       qp+=cigar_int;
+//     }
+//     else if (cigar_op=='D')
+//     {
+//       // r_res.append(ref+rp, ref+rp+cigar_int);
+//       for (int i=0;i<cigar_int;i++)
+//       {
+//         r_res.push_back(rp+i);
+//         q_res.push_back(-1);
+//       }
+//       rp+=cigar_int;
+//     }
+//     else if (cigar_op=='S')
+//     {
+//       // qp+=cigar_int;
+//     }
+//     else{
+//       std::cout <<"error cigar:"<<cigar_op<<std::endl;
+//       exit(-1);
+//     }
+//   }
+// }
+
+template<typename T>
+static void inspect(const eccl::dev_vector<T>& obj, sw_stats& stats) {
+	stats.dev_mem_usage+=obj.size()*sizeof(T);
+}
+template<typename T>
+static void inspect(const eccl::host_vector<T>& obj, sw_stats& stats) {
+	stats.host_mem_usage+=obj.size()*sizeof(T);
+}
+template<eccl::seq_type Type>
+static void inspect(const eccl::sw_buffers<Type>& obj, sw_stats& stats) {
+	inspect(obj.subst_mat, stats);
+	inspect(obj.query_buf, stats);
+	inspect(obj.query_se, stats);
+	inspect(obj.target_buf, stats);
+	inspect(obj.target_se, stats);
+	inspect(obj.tmp_scores, stats);
+	inspect(obj.tmp_traceback_s, stats);
+	inspect(obj.tmp_traceback_buf, stats);
+	inspect(obj.results, stats);
+	inspect(obj.cigar_s, stats);
+	inspect(obj.cigar_buf, stats);
+	inspect(obj.results_host, stats);
+	inspect(obj.cigar_buf_host, stats);
+	inspect(obj.cigar_s_host, stats);
+}
+
+template<eccl::seq_type T>
+static void inspect(const sw_handle<T>& obj, sw_stats& stats) {
+	inspect(obj.buffs, stats);
+}
+
+template<eccl::seq_type T>
+sw_stats sw_inspect(sw_handle<T>* hdl) {
+	assert(hdl);
+	sw_stats stats;
+	stats.dev_mem_usage=0;
+	stats.host_mem_usage=0;
+	inspect(*hdl, stats);
+	return stats;
+}
+
+inline static char* format_impl(char* p, char* q, const char* n) {
+	while(*n) {
+		*p=*n;
+		++p;
+		++n;
+	}
+	return p;
+}
+inline static char* format_impl(char* p, char* q, char n) {
+	*p=n;
+	++p;
+	return p;
+}
+inline static char* format_impl(char* p, char* q, std::string_view n) {
+	for(unsigned int i=0; i<n.size(); ++i) {
+		*p=n[i];
+		++p;
+	}
+	return p;
+}
+template<typename T, typename=std::enable_if_t<std::is_integral<T>::value>>
+inline static char* format_impl(char* p, char* q, T n) {
+	auto res=std::to_chars(p, q, n);
+	if(res.ec!=std::errc{})
+		throw;
+	p=res.ptr;
+	return p;
+}
+
+std::string_view eccl::format_sw_result(std::array<char, 4096>& buf,
+		const eccl::sw_result& align,
+		std::string_view name_query,
+		std::string_view name_target,
+		const unsigned int* cigar,
+		unsigned int cigar_len) {
+	auto put=[&buf](char* p, auto n) ->char* {
+		return format_impl(p, &buf[4096], n);
+	};
+	auto p=&buf[0];
+	p=put(p, "query_name=");
+	p=put(p, name_query);
+	p=put(p, "\ttarget_name=");
+	p=put(p, name_target);
+	p=put(p, "\tscore=");
+	p=put(p, align.score);
+	p=put(p, "\tquery_batch_start=");
+	p=put(p, align.query_se.x);
+	p=put(p, "\ttarget_batch_start=");
+	p=put(p, align.target_se.x);
+	p=put(p, "\tquery_batch_end=");
+	p=put(p, align.query_se.y);
+	p=put(p, "\ttarget_batch_end=");
+	p=put(p, align.target_se.y);
+	p=put(p, "\tCIGAR=");
+	// cigar is slow
+	//continue;
+
+	//fprintf(stderr, "alignment: %zd %u\n", oo, nn);
+	std::size_t prev_cnt{0};
+	char prev_op{'\x00'};
+	auto nn=cigar_len;
+	while(nn>0) {
+		--nn;
+		unsigned int v=(cigar[nn/4]>>((nn%4)*8))&0xff;
+		auto op="XIDM"[v&0b11];
+		v=(v>>2)+1;
+		if(op==prev_op) {
+			prev_cnt+=v;
+		} else {
+			if(prev_cnt>0) {
+				p=put(p, prev_cnt);
+				p=put(p, prev_op);
+			}
+			prev_cnt=v;
+			prev_op=op;
+		}
+	}
+	if(prev_cnt>0) {
+		p=put(p, prev_cnt);
+		p=put(p, prev_op);
+	}
+	p=put(p, '\n');
+	std::size_t l=p-&buf[0];
+	return {&buf[0], l};
+}
diff --git a/gpu-sw/sw-lib.hh b/gpu-sw/sw-lib.hh
new file mode 100644
index 0000000..dd55dd7
--- /dev/null
+++ b/gpu-sw/sw-lib.hh
@@ -0,0 +1,146 @@
+#ifndef GLF_SW_LIB_HH
+#define GLF_SW_LIB_HH
+
+#include <vector>
+#include <array>
+#include <optional>
+
+//refactor!!!
+#include <cstdint>
+#include <vector>
+#include <string>
+#include <string_view>
+
+#include "core.hh"
+
+#define MAPSTR "MXDIE"
+#ifndef CIGAR_SHIFT
+#define CIGAR_SHIFT 2u
+#endif
+
+#define MAX_NUM_CIGAR 16
+
+typedef struct {
+    uint16_t sw_score;
+    uint16_t mismatch;
+    int32_t ref_begin;
+    int32_t ref_end;
+    int32_t query_begin;
+    int32_t query_end;
+    uint32_t cigar[MAX_NUM_CIGAR];
+    int32_t cigar_len;
+} sw_align;
+
+extern const uint8_t encoded_ops[];
+
+static inline uint32_t to_cigar_int(uint32_t length, char op) {
+    return (length << CIGAR_SHIFT) | (encoded_ops[(int)op]);
+}
+
+static inline char cigar_int_to_op(uint32_t cigar_int) {
+    return MAPSTR[cigar_int & 0x3U];
+}
+
+static inline uint32_t cigar_int_to_len(uint32_t cigar_int) {
+    return cigar_int >> CIGAR_SHIFT;
+}
+
+
+// static void cigar_to_string(const sw_align& alignment, std::string_view& query, std::string_view& ref, std::vector<int>& q_res, std::vector<size_t>& r_res);
+static void cigar_to_string(const sw_align& alignment, std::string_view& query, std::string_view& ref, std::vector<int>& q_res, std::vector<size_t>& r_res) {
+  q_res.clear();
+  r_res.clear();
+  int qp = alignment.query_begin;
+  int rp = alignment.ref_begin;
+  for (int c=0;c<alignment.cigar_len;c++)
+  {
+    int cigar_int = cigar_int_to_len(alignment.cigar[c]);
+    char cigar_op = cigar_int_to_op(alignment.cigar[c]);
+    // cout << cigar_int << cigar_op<<endl;
+    if (cigar_op=='M'||cigar_op=='X'){
+      // q_res.append(query+qp, query+qp+cigar_int);
+      for (int i=0;i<cigar_int;i++)
+      {
+        q_res.push_back(qp+i);
+        r_res.push_back(rp+i);
+      }
+      // r_res.append(ref+rp, ref+rp+cigar_int);
+      qp+=cigar_int;
+      rp+=cigar_int;
+    }
+    else if (cigar_op=='I')
+    {
+      // q_res.append(query+qp, query+qp+cigar_int);
+      for (int i=0;i<cigar_int;i++)
+      {
+        q_res.push_back(qp+i);
+        r_res.push_back(-1);
+      }
+      qp+=cigar_int;
+    }
+    else if (cigar_op=='D')
+    {
+      // r_res.append(ref+rp, ref+rp+cigar_int);
+      for (int i=0;i<cigar_int;i++)
+      {
+        r_res.push_back(rp+i);
+        q_res.push_back(-1);
+      }
+      rp+=cigar_int;
+    }
+    else if (cigar_op=='S')
+    {
+      // qp+=cigar_int;
+    }
+    else{
+      // std::cout <<"error cigar:"<<cigar_op<<std::endl;
+      exit(-1);
+    }
+  }
+}
+//!!!
+
+namespace glf {
+
+struct sw_opts {
+	struct {
+		std::optional<int> match;
+		std::optional<int> mismatch;
+		std::optional<int> gap_open;
+		std::optional<int> gap_extend;
+	} scores;
+};
+
+struct sw_batch_opts {
+	//minscore
+};
+
+struct sw_stats {
+	std::size_t dev_mem_usage;
+	std::size_t host_mem_usage;
+};
+
+template<eccl::seq_type T>
+struct sw_handle;
+
+template<eccl::seq_type T>
+sw_handle<T>* sw_create(const sw_opts& opts);
+
+template<eccl::seq_type T>
+sw_handle<T>* sw_create();
+
+template<eccl::seq_type T>
+void sw_destroy(sw_handle<T>* hdl);
+
+template<eccl::seq_type T>
+std::vector<sw_align> sw_batch(sw_handle<T>* hdl,
+		const std::vector<std::string_view>& query,
+		const std::vector<std::string_view>& target,
+		const sw_batch_opts& opts);
+
+template<eccl::seq_type T>
+sw_stats sw_inspect(sw_handle<T>* hdl);
+
+}
+
+#endif
diff --git a/gpu-sw/sw.cu b/gpu-sw/sw.cu
new file mode 100644
index 0000000..a26424e
--- /dev/null
+++ b/gpu-sw/sw.cu
@@ -0,0 +1,361 @@
+#include <cassert>
+#include <chrono>
+#include <vector>
+#include <charconv>
+
+#include <cuda_runtime.h>
+
+#include "cuda-utils.hh"
+#include "sw.hh"
+#include "core.hh"
+
+
+
+#include <iostream>
+
+using check_cuda=eccl::check_cuda;
+
+template<eccl::seq_type Type>
+__global__ void align_kernel(
+		/* Req/Reuse/Share/Combine Input Output index */
+		/* *!!! * ! */const unsigned int work_size,
+		/* *!!! * ! */const unsigned int max_query_len,
+		const unsigned int band_size,
+		/* ***! * ! */const signed short* subst_mat,
+		/* *..! * ! */const unsigned int* target_buf,
+		/* *..! * ! work */const ulong2* target_se,
+		/* *!!! * ! */const unsigned int* query_buf,
+		/* *!!! * ! work */const ulong2* query_se,
+		/* *!!! ! ! */int2* tmp_scores,
+		/* *!!! ! ! */unsigned int* tmp_traceback_buf,
+		/* *!!! * ! thr */const ulong* tmp_traceback_s,
+		/* *!!! ! * */eccl::sw_result* results,
+		/* *!!! ! * */unsigned int* cigar_buf,
+		/* *!!! * ! work */const unsigned long* cigar_s,
+		int) {
+
+	const unsigned int thr_id=threadIdx.x+blockIdx.x*blockDim.x;
+	const unsigned int debug=0;
+	//const unsigned int thr_num=blockDim.x*gridDim.x;
+	constexpr uint2 tile_size={8, 8*2};
+	const int gap_ext=subst_mat[(Type==eccl::seq_type::xna?8*8:32*32)+1];
+	const int gap_open=subst_mat[(Type==eccl::seq_type::xna?8*8:32*32)+0]-gap_ext;
+
+	if(thr_id>=work_size)
+		return;
+	const auto work_id=thr_id;
+
+	auto max_prev_row=&tmp_scores[thr_id*max_query_len];
+	auto t_offset=tmp_traceback_s[thr_id];
+	[[maybe_unused]] unsigned int t_size=tmp_traceback_s[thr_id+1]-t_offset;
+	assert(t_offset%4==0);
+	auto mat2=&tmp_traceback_buf[t_offset];
+
+	struct {
+		unsigned int ii;
+		unsigned int jj;
+		int score;
+		unsigned int i0, j0;
+	} _top;
+	auto a_offset=query_se[work_id].x;
+	assert(a_offset<query_se[work_id].y);
+	unsigned int a_size=query_se[work_id].y-a_offset;
+	auto b_offset=target_se[work_id].x;
+	assert(b_offset<target_se[work_id].y);
+	unsigned int b_size=target_se[work_id].y-b_offset;
+	assert(max_query_len*b_size/8<=t_size);
+	if(debug)
+	printf("@%u sizes: %u %u\n", thr_id, a_size, b_size);
+
+	_top.score=-9999;
+
+	const unsigned int half_band_width=band_size?band_size:1024*1024*1024;
+	auto get_query_range=[a_size,b_size,half_band_width](unsigned int j0, unsigned int j1) ->uint2 {
+		unsigned int lb=0, ub=a_size;
+		if(a_size<b_size) {
+			// 0: [0 hbw)
+			// 1: [0 hbw+1)
+			// ...
+			// b_size-2: [a_size-hbw-1 a_size)
+			// b_size-1: [a_size-hbw a_size)
+			if(ub>half_band_width+j1)
+				ub=half_band_width+j1;
+			if(a_size+j0+1>b_size+half_band_width)
+				lb=a_size+j0+1-b_size-half_band_width;
+		} else {
+			// similarly
+			if(j0+1>half_band_width)
+				lb=j0-half_band_width+1;
+			if(half_band_width+j1<b_size)
+				ub=a_size-b_size+half_band_width+j1;
+		}
+		return {lb, ub};
+	};
+	auto decode_conds=[](unsigned int ec) {
+		return (ec*11)%32;
+	};
+	auto encode_conds=[decode_conds](unsigned char c1, unsigned char c2, unsigned char c3) ->unsigned int {
+		/* each c* can be {0, 1, 2};  thus 27 possible cases;  but only 8 casses can occur.
+		   encode the 8 cases with 4 bits to save space.
+		 */
+		auto c=(c1*3u+c2)*3u+c3;
+		auto ec=(c*3)%32;
+		assert(ec<16);
+		assert(decode_conds(ec)==c);
+		return ec;
+	};
+
+	for(unsigned int i=0; i<max_query_len; ++i) {
+		max_prev_row[i]=make_int2(0, gap_open);
+	}
+	for(unsigned int j=0; j<b_size; j+=tile_size.y) {
+		int2 max_prev_col[tile_size.y];
+		eccl::code<Type> b_cache[tile_size.y];
+		for(unsigned int jj=0; jj<tile_size.y; ++jj) {
+			b_cache[jj]=(j+jj<b_size)?eccl::get_code<Type>(target_buf,
+					b_offset+j+jj):eccl::code<Type>{};
+			max_prev_col[jj]=make_int2(0, gap_open);
+		}
+		auto qrange=get_query_range(j, j+tile_size.y-1);
+		qrange.x=qrange.x/tile_size.x*tile_size.x;
+		if(debug)
+		printf("@%u i range: %u %u %u\n", thr_id, j, qrange.x, qrange.y);
+		for(unsigned int i=qrange.x; i<qrange.y; i+=tile_size.x) {
+			unsigned int tb_batch[tile_size.y/8*tile_size.x];
+			eccl::code<Type> a_cache[tile_size.x];
+			for(unsigned int ii=0; ii<tile_size.x; ++ii) {
+				a_cache[ii]=(i+ii<a_size)?eccl::get_code<Type>(query_buf,
+						a_offset+i+ii):eccl::code<Type>{};
+			}
+// per-tile
+for(unsigned int ii=0; ii<tile_size.x; ++ii) {
+	auto max_prev_row_i_ii=max_prev_row[i+ii];
+	for(unsigned int jj=0; jj<tile_size.y; ++jj) {
+		int3 score;
+		eccl::pair<Type> bp{a_cache[ii], b_cache[jj]};
+		auto bp_score=subst_mat[bp.value];
+		score.x=max(max_prev_row_i_ii.x+bp_score, 0);
+		score.y=max_prev_row_i_ii.y+gap_ext;
+		score.z=max_prev_col[jj].y+gap_ext;
+
+		max_prev_row_i_ii.x=max_prev_col[jj].x;
+		/* comparison conditions for backtrack */
+		unsigned char cond1=0, cond2=0, cond3=0;
+		max_prev_col[jj].x=score.x;
+		if(max_prev_col[jj].x<score.y) {
+			max_prev_col[jj].x=score.y;
+			cond1=1;
+		}
+		if(max_prev_col[jj].x<score.z) {
+			max_prev_col[jj].x=score.z;
+			cond1=2;
+		}
+
+		max_prev_col[jj].y=max_prev_row_i_ii.y=score.x+gap_open;
+		if(max_prev_row_i_ii.y<score.y) {
+			max_prev_row_i_ii.y=score.y;
+			cond2=1;
+		}
+		auto gap_open_y=score.y+gap_open;
+		if(max_prev_col[jj].y<gap_open_y) {
+			max_prev_col[jj].y=gap_open_y;
+			cond3=1;
+		}
+		auto gap_open_z=score.z+gap_open;
+		if(max_prev_row_i_ii.y<gap_open_z) {
+			max_prev_row_i_ii.y=gap_open_z;
+			cond2=2;
+		}
+		if(max_prev_col[jj].y<score.z) {
+			max_prev_col[jj].y=score.z;
+			cond3=2;
+		}
+
+		if(debug)
+		printf("@%u score: %u %u (%u %u %d): %d %d %d\n", thr_id, i+ii, j+jj, a_cache[ii].value, b_cache[jj].value, bp_score, score.x);
+		if(/*i+ii<a_size && */score.x>_top.score) {
+			_top={i+ii, j+jj, score.x};
+		}
+		auto& tb_tmp=tb_batch[jj/8+(tile_size.y/8)*ii];
+		tb_tmp=(tb_tmp<<4)|encode_conds(cond1, cond2, cond3);
+	}
+	max_prev_row[i+ii]=max_prev_row_i_ii;
+}
+			auto ptr=&mat2[j*max_query_len/8+i*(tile_size.y/8)];
+			for(unsigned int jj=0; jj<tile_size.y/8*tile_size.x; ++jj) {
+				ptr[jj]=reinterpret_cast<unsigned int*>(tb_batch)[jj];
+			}
+		}
+	}
+
+	auto tb_offset=cigar_s[work_id];
+	unsigned int tb_size=cigar_s[work_id+1]-tb_offset;
+	//assert(tb_size>=a_size/4+1);
+	//assert(tb_size>=48+1);
+	auto tb_ops=&cigar_buf[tb_offset];
+
+	unsigned int tb_i=0;
+	unsigned int i=_top.ii;
+	unsigned int j=_top.jj;
+
+#if 1
+	enum {
+		Mat,
+		Ins,
+		Del,
+		End,
+	} st=Mat;
+	int s=1;
+	int sss=_top.score;
+	unsigned int tmp_cnt{0};
+	unsigned char tmp_op{0b100000};
+	unsigned int tmp_cache{0};
+	auto add_op=[&tmp_cnt,&tmp_op,&tb_i,&tmp_cache,tb_ops,tb_size](unsigned char op) {
+		if(op==tmp_op) {
+			++tmp_cnt;
+		} else {
+			constexpr unsigned int per_slot=(0xffu>>2)+1;
+			while(tmp_cnt>0) {
+				unsigned int n=min(tmp_cnt, per_slot);
+				if(tb_i/4+1<tb_size) {
+					auto rem=tb_i%4;
+					tmp_cache|=(((n-1)<<2)|tmp_op)<<(rem*8);
+					if(rem==3) {
+						tb_ops[tb_i/4+1]=tmp_cache;
+						tmp_cache=0;
+					}
+					++tb_i;
+				}
+				tmp_cnt-=n;
+			}
+			tmp_cnt=1;
+			tmp_op=op;
+		}
+	};
+	auto elem2=[mat2,max_query_len](unsigned int i, unsigned int j) ->unsigned int {
+		auto jj=j%tile_size.y;
+		j-=jj;
+		return (mat2[(j*max_query_len+i*tile_size.y+jj)/8]>>((7-jj%8)*4))&0xf;
+	};
+	do {
+		if(debug)
+		printf("@%u %u %u %u %u %d\n", thr_id, i, j, eccl::get_code<Type>(query_buf, i).value, eccl::get_code<Type>(target_buf, j).value, sss);
+		unsigned int cond;
+		switch(st) {
+			case Mat:
+				{
+					auto a=eccl::get_code<Type>(query_buf,
+							a_offset+i);
+					auto b=eccl::get_code<Type>(target_buf,
+							b_offset+j);
+					s=subst_mat[eccl::pair<Type>{a, b}.value];
+				}
+				if(s>0) {
+					add_op(0b11);
+					//assert(ss>0);
+				} else {
+					add_op(0b00);
+					//assert(ss<=0);
+				}
+				sss-=s;
+				if(sss==0) {
+					st=End;
+					break;
+				}
+				assert(i>0);
+				assert(j>0);
+				cond=decode_conds(elem2(i-1, j-1))/9;
+				switch(cond) {
+					case 0:
+						break;
+					case 1:
+						st=Del;
+						break;
+					case 2:
+						st=Ins;
+						break;
+					default:
+						assert(0);
+				}
+				--i;
+				--j;
+				break;
+			case Del:
+				add_op(0b10);
+				sss-=gap_ext;
+				cond=j>0?decode_conds(elem2(i, j-1))/3%3:0;
+				switch(cond) {
+					case 0:
+						sss-=gap_open;
+						st=Mat;
+						break;
+					case 1:
+						break;
+					case 2:
+						sss-=gap_open;
+						st=Ins;
+						break;
+					default:
+						assert(0);
+				}
+				--j;
+				break;
+			case Ins:
+				add_op(0b01);
+				sss-=gap_ext;
+				cond=i>0?decode_conds(elem2(i-1, j))%3:0;
+				switch(cond) {
+					case 0:
+						sss-=gap_open;
+						st=Mat;
+						break;
+					case 1:
+						sss-=gap_open;
+						st=Del;
+						break;
+					case 2:
+						break;
+					default:
+						assert(0);
+				}
+				--i;
+				break;
+			case End:
+				assert(0);
+		}
+	} while(st!=End);
+	add_op(0b100000);
+	if((tb_i%4)>0)
+		tb_ops[tb_i/4+1]=tmp_cache;
+	assert((tb_i+3)/4<tb_size);
+#endif
+	_top.i0=i;
+	_top.j0=j;
+	tb_ops[0]=tb_i;
+
+	auto& output=results[work_id];
+	output.score=_top.score;
+	output.query_se={_top.i0, _top.ii};
+	output.target_se={_top.j0, _top.jj};
+}
+
+template<eccl::seq_type Type>
+void eccl::sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args) {
+
+	align_kernel<Type><<<opts.num_blocks, opts.num_threads, 0, args.stream>>>(
+			args.work_size, args.max_query_len,
+			opts.band_size,
+			opts.subst_mat,
+			args.target_buf, args.target_se,
+			args.query_buf, args.query_se,
+			args.tmp_scores,
+			args.tmp_traceback_buf, args.tmp_traceback_s,
+			args.results,
+			args.cigar_buf, args.cigar_s,
+			0);
+
+}
+
+template void eccl::sw_gpu<eccl::seq_type::xna>(const sw_gpu_opts& opts, sw_gpu_args&& args);
+template void eccl::sw_gpu<eccl::seq_type::prot>(const sw_gpu_opts& opts, sw_gpu_args&& args);
+
diff --git a/gpu-sw/sw.hh b/gpu-sw/sw.hh
new file mode 100644
index 0000000..e46c98c
--- /dev/null
+++ b/gpu-sw/sw.hh
@@ -0,0 +1,224 @@
+#include <cuda_runtime.h>
+//
+#include "core.hh"
+#include "seqs.hh"
+#include "cuda-utils.hh"
+#include<iostream>
+
+/*! sw alg.
+ *
+ * usage:
+ *
+ * 1.  use eccl::seqs_builder to build a seq batch (in host memory).
+ *     call .push_back(seq) to add a sequence.
+ *     construct eccl::seqs after sequences are added.
+ * 2.  use eccl::sw_buffers to manage device buffers.
+ *     call .init(band_size) once to init.
+ *     call [res, cigar_buf, cigar_s]=.run(query, target) to run.
+ *     call eccl::format_sw_result() to format output to a buffer.
+ *     eccl::sw_buffers can be reused.
+ *
+ * see check_xna() in sw-test.cc for simple examples.
+ *
+ * NOTES
+ * - correct compared to gasal2.
+ * - just use the alternative API (eccl::sw_buffers).
+ *   not removing existing async stuff.
+ * - why no batch?
+ *   input is not a single seq, not all the seqs, then it *is* a batch.
+ * - pass nullptr if no stream is needed.
+ * - the sw part may need further work, i'll do code clean up after that.
+ *   print and time related calls are silenced.
+ * - non-relevant (and relevant) codes are in a separate dir.
+ *   just leave them there and use the API.
+ *
+ *
+ * some rough benchmarks:
+ *
+ * gasal2 no output: total 0m4.430s,  kernel 1.03s
+ * gasal2:           total 0m12.239s, kernel 8.77s
+ * eccl:             total 0m4.289s,  kernel 2.4s
+ * eccl banded(16):  total 0m3.632s,  kernel 1.8s
+ *
+ * eccl tests are run asynchronously.
+ * there are noticeable overheads in data copying, i'll improve that later.
+ *
+ */
+
+namespace eccl {
+
+struct sw_result {
+	int score;
+	uint2 query_se;
+	uint2 target_se;
+};
+
+/*! global/fixed stuff */
+struct sw_gpu_opts {
+	unsigned int num_blocks;
+	unsigned int num_threads;
+	unsigned int band_size;
+	dev_ptr<signed short> subst_mat;
+};
+
+/*! per-batch stuff */
+struct sw_gpu_args {
+	cudaStream_t stream;
+	unsigned int work_size;
+	unsigned int max_query_len;
+	dev_ptr<unsigned int> target_buf;
+	dev_ptr<ulong2> target_se;
+	dev_ptr<unsigned int> query_buf;
+	dev_ptr<ulong2> query_se;
+	dev_ptr<int2> tmp_scores;
+	dev_ptr<unsigned int> tmp_traceback_buf;
+	dev_ptr<unsigned long> tmp_traceback_s;
+	dev_ptr<sw_result> results;
+	dev_ptr<unsigned int> cigar_buf;
+	dev_ptr<unsigned long> cigar_s;
+};
+
+inline unsigned int estimate_cigar_size(unsigned int query_len, unsigned int target_len) {
+	// XXX upper bound???
+	//return query_len/4;
+	//return 48;
+	//return 20;
+	return 48;
+	// (450)/16 == 28
+	// no_cigar(flat_traceback): 1.31s
+	// 8: 1.48s
+	// 16: 1.45s
+	// 32: 1.50s
+	// 64: 1.78s
+	// 128: 3.0s
+}
+
+template<seq_type Type>
+void sw_gpu(const sw_gpu_opts& opts, sw_gpu_args&& args);
+
+template<seq_type Type>
+struct sw_buffers {
+	unsigned int band_size;
+	eccl::dev_vector<signed short> subst_mat;
+	eccl::dev_vector<unsigned int> query_buf;
+	eccl::dev_vector<ulong2> query_se;
+	eccl::dev_vector<unsigned int> target_buf;
+	eccl::dev_vector<ulong2> target_se;
+	eccl::dev_vector<int2> tmp_scores;
+	eccl::dev_vector<ulong> tmp_traceback_s;
+	eccl::dev_vector<unsigned int> tmp_traceback_buf;
+	eccl::dev_vector<eccl::sw_result> results;
+	eccl::dev_vector<ulong> cigar_s;
+	eccl::dev_vector<unsigned int> cigar_buf;
+
+	/*! results.  use after run */
+	eccl::host_vector<eccl::sw_result> results_host;
+	eccl::host_vector<unsigned int> cigar_buf_host;
+	eccl::host_vector<ulong> cigar_s_host;
+
+	template<typename ScoreOpts>
+	void init(unsigned int band_size, const ScoreOpts& scores) {
+		this->band_size=band_size;
+		eccl::host_vector<signed short> subst_mat_host;
+		eccl::fill_score_table<Type>(subst_mat_host, scores);
+		subst_mat.malloc_h2d(subst_mat_host, nullptr);
+	}
+	void init(unsigned int band_size) {
+		this->band_size=band_size;
+		eccl::host_vector<signed short> subst_mat_host;
+		eccl::fill_score_table<Type>(subst_mat_host);
+		subst_mat.malloc_h2d(subst_mat_host, nullptr);
+	}
+	inline void run(const seqs<Type>& query, const seqs<Type>& target);
+};
+
+template<seq_type Type>
+inline void sw_buffers<Type>::run(const seqs<Type>& query, const seqs<Type>& target) {
+	sw_gpu_opts opts;
+	opts.num_blocks=40;
+	opts.num_threads=128;
+	opts.band_size=this->band_size;
+	opts.subst_mat=this->subst_mat;
+
+	sw_gpu_args args;
+	args.stream=nullptr;
+	assert(query.size()==target.size());
+	args.work_size=query.size();
+	args.max_query_len=eccl::padded_len<Type>(query.max_seq_len());
+
+	eccl::host_vector<unsigned int> query_buf_host;
+	query_buf_host.insert(query_buf_host.end(), query.buf(), query.buf()+query.buf_size());
+	query_buf.try_malloc_h2d(query_buf_host, nullptr);
+	eccl::host_vector<ulong2> query_se_host;
+	for(unsigned int i=0; i<query.size(); ++i) {
+		auto begin=query.begin(i);
+		auto seq=query[i];
+		query_se_host.push_back({begin+seq.begin, begin+seq.end});
+	}
+	query_se.try_malloc_h2d(query_se_host, nullptr);
+
+	eccl::host_vector<unsigned int> target_buf_host;
+	target_buf_host.insert(target_buf_host.end(), target.buf(), target.buf()+target.buf_size());
+	target_buf.try_malloc_h2d(target_buf_host, nullptr);
+	eccl::host_vector<ulong2> target_se_host;
+	for(unsigned int i=0; i<query.size(); ++i) {
+		auto begin=target.begin(i);
+		auto seq=target[i];
+		target_se_host.push_back({begin+seq.begin, begin+seq.end});
+	}
+	target_se.try_malloc_h2d(target_se_host, nullptr);
+
+	tmp_scores.try_malloc(opts.num_threads*opts.num_blocks*args.max_query_len);
+	eccl::host_vector<ulong> tmp_traceback_s_host;
+	tmp_traceback_s_host.push_back(0);
+	for(unsigned int i=0; i<opts.num_threads*opts.num_blocks; ++i) {
+		std::size_t n{0};
+		for(unsigned int j=i; j<query.size(); j+=opts.num_threads*opts.num_blocks) {
+			auto s=target[j].len();
+			n=std::max<std::size_t>(n, s);
+		}
+		n=args.max_query_len*((n+31)/32)*4;
+		tmp_traceback_s_host.push_back(tmp_traceback_s_host.back()+n);
+	}
+	tmp_traceback_s.try_malloc_h2d(tmp_traceback_s_host, nullptr);
+	tmp_traceback_buf.try_malloc(tmp_traceback_s_host.back());
+
+	results.try_malloc(query.size());
+
+	cigar_s_host.clear();
+	cigar_s_host.push_back(0);
+	for(unsigned int i=0; i<query.size(); ++i) {
+		auto len=eccl::estimate_cigar_size(query[i].len(),
+				target[i].len());
+		cigar_s_host.push_back(cigar_s_host.back()+(len+3)/4+1);
+	}
+	cigar_s.try_malloc_h2d(cigar_s_host, nullptr);
+	cigar_buf.try_malloc(cigar_s_host.back());
+
+	args.target_buf=target_buf;
+	args.target_se=target_se;
+	args.query_buf=query_buf;
+	args.query_se=query_se;
+	args.tmp_scores=tmp_scores;
+	args.tmp_traceback_buf=tmp_traceback_buf;
+	args.tmp_traceback_s=tmp_traceback_s;
+	args.results=results;
+	args.cigar_buf=cigar_buf;
+	args.cigar_s=cigar_s;
+	sw_gpu<Type>(opts, std::move(args));
+	cudaGetLastError(), eccl::check_cuda{"align"};
+	cigar_buf_host.resize(cigar_buf.size());
+	cigar_buf.d2h(cigar_buf_host, nullptr);
+	results_host.resize(results.size());
+	results.d2h(results_host, nullptr);
+	cudaStreamSynchronize(nullptr), check_cuda{"sync stream"};
+};
+
+std::string_view format_sw_result(std::array<char, 4096>& buf,
+		const sw_result& align,
+		std::string_view name_query,
+		std::string_view name_target,
+		const unsigned int* cigar,
+		unsigned int cigar_len);
+
+}
diff --git a/include/smith.h b/include/smith.h
index 32a55de..3643020 100644
--- a/include/smith.h
+++ b/include/smith.h
@@ -9,5 +9,7 @@ using namespace std;
 // void banded_smith_waterman(const char *q, const char *c, vector<uint32_t>& q_idxs, vector<uint32_t>& q_lens, vector<size_t>& diags, size_t c_len, size_t num_task, vector<SWResult> &res, ThreadPool* pool, vector<future<int>>& rs);
 
 void smith_waterman_kernel(const int idx, SWResult *res, SWTasks* sw_task);
+void gpu_SW( vector<SWResult> &res,SWTasks* sw_task);
 
-void gasal_run(SWTasks tasks, vector<SWResult> res[][NUM_STREAM],const char* q_dev, const char* s_dev, int num_g, int span);
\ No newline at end of file
+// void gasal_run(SWTasks tasks, vector<SWResult> res[][NUM_STREAM],const char* q_dev,  int num_g, int span);
+void total_gpu_SW(SWTasks tasks, vector<SWResult> res[][NUM_STREAM],const char* query,const char* target,int num_g, int span);
\ No newline at end of file
diff --git a/include/util.h b/include/util.h
index 1204b3b..8349a10 100644
--- a/include/util.h
+++ b/include/util.h
@@ -31,6 +31,7 @@
 #include "omp.h"
 // #include "boost/asio.hpp"
 
+#define MASK5(v) (v & 0b11111)
 extern ThreadPool* pool;
 // extern StripedSmithWaterman::Aligner aligner;
 // extern StripedSmithWaterman::Filter filter;
@@ -196,6 +197,8 @@ int check_db(const char *db, size_t &max_size, size_t& total_size);
 void load_seq(string db, int num, char *&str, size_t &len);
 void proceed_result(vector<SWResult> *res_d, vector<SWResult> &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t* s_offsets, const size_t* sn_offsets, const size_t s_num ,size_t total_db_size);
 
+void proceed_result_glf(vector<SWResult> *res_d, vector<SWResult> &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t* s_offsets, const size_t* sn_offsets, const size_t s_num ,size_t total_db_size);
+
 char get_char(const char *s, size_t offset);
 
 void get_arg(const char* name, int& v, int default_v);
diff --git a/src/banded_smith_cpu.cpp b/src/banded_smith_cpu.cpp
index 5ff236b..8fbe471 100644
--- a/src/banded_smith_cpu.cpp
+++ b/src/banded_smith_cpu.cpp
@@ -1,10 +1,12 @@
 #include "smith.h"
+#include "../gpu-sw/sw-lib.cc"
+#include "../gpu-sw/core.hh"
 // #include <libunwind.h>
 // #include <gperftools/tcmalloc.h>
 
 #define max2(m, n) ((m) > (n) ? (m) : (n))
 #define max3(m, n, p) ((m) > (n) ? ((m) > (p) ? (m) : (p)) : ((n) > (p) ? (n) : (p)))
-#define MASK5(v) (v & 0b11111)
+// #define MASK5(v) (v & 0b11111)
 #define END 0
 #define TOP 1
 #define LEFT 2
@@ -27,6 +29,28 @@ inline char get_char(const char *s, size_t offset)
     return MASK5((unsigned)((*((uint16_t *)&(s[n_bit >> 3]))) >> (n_bit & 7)));
 }
 
+string get_substr(const char* s, int begin_p, int end_p) {
+
+    char tmp[end_p - begin_p] = {0};
+    for (size_t i = begin_p; i < end_p; i++)
+    {
+        tmp[i - begin_p] = char(s[i] + 65);
+    }
+    string ss(tmp, end_p - begin_p);
+    return ss;
+}
+
+string get_substr_T(const char* s, int begin_p, int end_p) {
+
+    string ss;
+    for (size_t i = begin_p; i < end_p; i++)
+    {
+        if(get_char(s,i) == END_SIGNAL) continue;
+        ss += get_char(s,i) + 65;
+    }
+    return ss;
+}
+
 void smith_waterman_kernel(const int idx, SWResult *res, SWTasks *sw_task)
 {
     const char *q = sw_task->q;
@@ -290,7 +314,241 @@ void smith_waterman_kernel(const int idx, SWResult *res, SWTasks *sw_task)
         res->s_ori = s_ori;
         res->match = match;
     }
+}
+
+char* str_get_substr(const char* str, int begin, int end) {
+    int len = end - begin ;
+    char* substr = new char[len + 1];
+    for(int i = 0; i < len; ++i) substr[i] = str[begin+i];
+    // strncpy(substr, str + begin, len);
+    substr[len] = '\0';
+    return substr;
+}
+
+void gpu_SW(vector<SWResult> &res,SWTasks *sw_task)
+{
+    // 两个序列
+    const char *query = sw_task->q;
+    const char *target = sw_task->c;
+    size_t c_len = sw_task->c_len;
+
+
+    struct glf::sw_handle<eccl::seq_type::prot>* hdl;     //没有指定dna,或蛋白,创造handle的选项可能需要指定,ctype 模板参数.可能可以
+    // 有些选项,DNA 蛋白的打分矩阵,
+    // 蛋白的话,cigar的值也需要调整,eg. score的正负和match和mismatch
+    // 需求不一样
+    struct glf::sw_batch_opts  batch_opts;
+    hdl = glf::sw_create<eccl::seq_type::prot>();
+    
+    std::vector<std::string_view> q;
+    std::vector<std::string_view> c;
+    std::vector<string> qs;
+    std::vector<string> cs;
+    
+    // std::vector<sw_align> cur;
+
+
+    for(int idx = 0; idx < sw_task->num_task; ++idx){
+
+        size_t q_idx = sw_task->q_idxs[idx];
+        size_t n = sw_task->q_lens[idx];
+        size_t diag = sw_task->diags[idx]; //  pos of c at end of q
+
+        int64_t c_begin = (int64_t)diag - band_width - n;
+        size_t c_end = diag + band_width;
+        c_begin = c_begin >= 0 ? c_begin : 0;
+        c_end = c_end <= c_len ? c_end : c_len;
+        if (has_must_include)
+        {
+            if (!check_include(target, c_begin, c_end))
+            {
+                res[idx].report = false;
+                return;
+            }
+        }
+        size_t width = (n + band_width) << 1;
+        size_t height = band_width + 2;
+        // cout << "\nQUERY\n";
+        // for(int it = 0; it< n; ++it)cout << char(query[q_idx+it]+65);
+        string _q = get_substr(query,q_idx,q_idx+n);
+        string _c = get_substr_T(target,c_begin,c_end);
+        qs.push_back(_q);
+        cs.push_back(_c);
+    }
+    for(int i = 0; i < qs.size(); ++i){
+        q.push_back(qs[i]);
+        c.push_back(cs[i]);
+    }
+    std::vector<sw_align> cur = glf::sw_batch(hdl,q,c, batch_opts);
     
+
+    for(int k = 0; k < cur.size(); k++){
+        cigar_to_string(cur[k],q[k], c[k], res[k].q_res, res[k].s_res);
+        // cout << "\nq_res\n";
+        // for (int i=0;i<res[k].q_res.size();i++)
+        // {
+        //     if (res[k].q_res[i]==-1) cout<<"-";
+        //     else cout<<(char)(q[k][res[k].q_res[i]]);
+        // }
+        // cout<<endl;
+        // cout << "s_res\n";
+        // for (int i=0;i<res[k].s_res.size();i++)
+        // {
+        //     if (res[k].s_res[i]==-1) cout<<"-";
+        //     else cout<<(char)(c[k][res[k].s_res[i]]);
+        // }
+        // cout<<endl;
+
+        size_t q_idx = sw_task->q_idxs[k];
+        size_t n = sw_task->q_lens[k];
+        size_t diag = sw_task->diags[k]; //  pos of c at end of q
+
+        int64_t c_begin = (int64_t)diag - band_width - n;
+        size_t c_end = diag + band_width;
+        c_begin = c_begin >= 0 ? c_begin : 0;
+        c_end = c_end <= c_len ? c_end : c_len;
+
+        res[k].begin_q = cur[k].query_begin + q_idx;
+        res[k].begin_s = cur[k].ref_begin + c_begin;
+        res[k].end_q = cur[k].query_end + q_idx;
+        res[k].end_s = cur[k].ref_end + c_begin;
+
+        size_t len = res[k].s_res.size();
+        res[k].align_length = len;
+        res[k].score = cur[k].sw_score;
+        res[k].bitscore = (E_lambda * res[k].score - log(E_k)) / (0.69314718055995);
+        char s[len + 1] = {0};
+        char q_seq[len + 1] = {0};
+        char s_ori[len + 1] = {0};
+        char match[len + 1] = {0};
+        int s_ori_len = 0;
+        res[k].gap_open = 0;
+        res[k].gaps = 0;
+        bool ga = false;
+        for (int i = 0; i < len; i++)
+        {
+            if (res[k].s_res[i] != (size_t)(-1))
+            {
+                ga = false;
+                s[i] = c[k][res[k].s_res[i]];
+                // s[i] = get_char(c, res[k].s_res[i]) + 65;
+                if (s[i] == 95)
+                    s[i] = '*';
+                s_ori[s_ori_len++] = s[i];
+            }
+            else
+            {
+                s[i] = '-';
+                res[k].gaps++;
+                if (!ga)
+                {
+                    ga = true;
+                    res[k].gap_open++;
+                }
+            }
+        }
+
+        if (has_must_include)
+        {
+            string s_ori_str(s_ori, len);
+            if (!check_include(s_ori_str))
+            {
+                res[k].report = false;
+                return;
+            }
+        }
+        ga = false;
+        for (int i = 0; i < len; i++)
+        {
+            // cout<<q_res[t][i]<<" ";
+            if (res[k].q_res[i] != (size_t)(-1))
+            {
+                ga = false;
+                q_seq[i] = q[k][res[k].q_res[i]];
+            }
+            else
+            {
+                q_seq[i] = '-';
+                res[k].gaps++;
+                if (!ga)
+                {
+                    ga = true;
+                    res[k].gap_open++;
+                }
+            }
+        }
+
+        res[k].mismatch = 0;
+        res[k].positive = 0;
+        for (int i = 0; i < len; i++)
+        {
+            match[i]=' ';
+            if (BLOSUM62[(q_seq[i] - 65) * 26 + (s[i] - 65)] > 0 && q_seq[i]!='-' && s[i]!='-')
+            {
+                res[k].positive++;
+                match[i]='+';
+            }
+            if (q_seq[i] != s[i])
+            {
+                res[k].mismatch++;
+            }
+            else
+            {
+                match[i]=q_seq[i];
+            }
+                
+        }
+        res[k].n_identity = res[k].align_length - res[k].mismatch;
+        res[k].p_identity = (1 - (double)res[k].mismatch / res[k].align_length) * 100;
+        // todo
+        res[k].s_res[0] += c_begin;
+        if (detailed_alignment)
+        {
+            res[k].q = q_seq;
+            res[k].s = s;
+            res[k].s_ori = s_ori;
+            res[k].match = match;
+        }
+    }
+}
+
+void total_gpu_SW(SWTasks tasks, vector<SWResult> res[][NUM_STREAM],\
+                  const char* query,const char* target,int num_g, int span)
+{
+    
+    size_t n = tasks.num_task;
+    // assert(n == num_g * NUM_STREAM);
+
+    // get 每一个query与 target
+    struct glf::sw_handle<eccl::seq_type::prot>* hdl;  
+    struct glf::sw_batch_opts  batch_opts;
+    hdl = glf::sw_create<eccl::seq_type::prot>();
+    std::vector<std::string_view> q;
+    std::vector<std::string_view> t;
+    std::vector<string> qs;
+    std::vector<string> cs;
+
+    for(int i = 0; i < n; ++i){
+        // TODO check split task
+        size_t q_begin = tasks.q_idxs[i];
+        size_t q_len = tasks.q_lens[i];
+        int64_t t_begin = (int64_t)tasks.diags[i] - band_width - tasks.q_lens[i];
+        size_t t_end = tasks.diags[i] + band_width;
+        
+        // cout << "Q\t" << q_begin << "\t" << q_len + q_begin << "\n";
+        // cout << "T\t" << t_begin << "\t" << t_end << "\n";
+        
+        string _q = get_substr(query,q_begin,q_begin+q_len);
+        string _c = get_substr_T(target,t_begin,t_end);
+        qs.push_back(_q);
+        cs.push_back(_c);
+    }
+    for(int i = 0; i < qs.size(); ++i){
+        q.push_back(qs[i]);
+        t.push_back(cs[i]);
+    }
+    std::vector<sw_align> cur = glf::sw_batch(hdl,q,t, batch_opts);
+   
 }
 
 // void banded_smith_waterman(const char *q, const char *c, vector<uint32_t> &q_idxs, vector<uint32_t> &q_lens, vector<size_t> &diags, size_t c_len, size_t num_task, vector<SWResult> &res, ThreadPool *pool, vector<future<int>> &rs)
diff --git a/src/blastp.cu b/src/blastp.cu
index 48112ea..4a4232b 100644
--- a/src/blastp.cu
+++ b/src/blastp.cu
@@ -142,7 +142,9 @@ __global__ void filter_kernel(KeyValue *ht, Task *tasks, uint32_t *num_task, uin
 }
 
 #ifdef USE_GPU_SW
-void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task, QueryGroup &q_group, size_t s_length, int stream_id, vector<SWResult> &res, SWTasks &sw_task)
+void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task,\
+                     QueryGroup &q_group, size_t s_length, int stream_id, \
+                     vector<SWResult> &res, SWTasks &sw_task)
 {
     cudaEventSynchronize(stream);
     mu2.lock();
@@ -187,8 +189,44 @@ void handle_results(cudaEvent_t &stream, Task *task_host, uint32_t *num_task, Qu
     mu2.unlock();
 
 }
+#elif GLF_GPU_SW
+void handle_results(cudaEvent_t &stream, const char *query, const char *subj, \
+                    Task *task_host, uint32_t *num_task, QueryGroup &q_group, \
+                    size_t s_length, int stream_id, vector<SWResult> &res,\
+                     SWTasks &sw_task, ThreadPool *pool, vector<future<int>> &rs)
+{
+    cudaEventSynchronize(stream);
+    cout << "=";
+    res.clear();
+    res.resize(*num_task);
+    sw_task.q = query;
+    sw_task.c = subj;
+    sw_task.c_len = s_length;
+    sw_task.q_idxs.resize(*num_task);
+    sw_task.q_lens.resize(*num_task);
+    sw_task.diags.resize(*num_task);
+    Task *t_begin = task_host;
+    sw_task.num_task = *num_task;
+#pragma omp parallel for
+    for (int i = 0; i < *num_task; i++)
+    {
+        Task &kv = *(t_begin + i);
+        sw_task.q_idxs[i]=q_group.offset[kv.q_id];
+        sw_task.q_lens[i]=q_group.length[kv.q_id];
+        sw_task.diags[i]=kv.key;
+        res[i].num_q = kv.q_id;
+    }
+    
+    gpu_SW(res,&sw_task);
+
+}
+
+
 #else
-void handle_results(cudaEvent_t &stream, const char *query, const char *subj, Task *task_host, uint32_t *num_task, QueryGroup &q_group, size_t s_length, int stream_id, vector<SWResult> &res, SWTasks &sw_task, ThreadPool *pool, vector<future<int>> &rs)
+void handle_results(cudaEvent_t &stream, const char *query, const char *subj, \
+                    Task *task_host, uint32_t *num_task, QueryGroup &q_group, \
+                    size_t s_length, int stream_id, vector<SWResult> &res,\
+                     SWTasks &sw_task, ThreadPool *pool, vector<future<int>> &rs)
 {
     cudaEventSynchronize(stream);
     cout << "=";
@@ -223,7 +261,10 @@ void handle_results(cudaEvent_t &stream, const char *query, const char *subj, Ta
 }
 #endif
 
-void search_db_batch(const char *query, char *subj[], vector<QueryGroup> &q_groups, size_t s_length[], Task *task_host[][NUM_STREAM], uint32_t *task_num_host[][NUM_STREAM], size_t max_hashtable_capacity, uint32_t max_n_query, uint32_t total_len_query, string db_name, uint32_t db_num, vector<SWResult> *res, size_t total_db_size, TimeProfile &time_prof)
+void search_db_batch(const char *query, char *subj[], vector<QueryGroup> &q_groups,\
+                     size_t s_length[], Task *task_host[][NUM_STREAM], uint32_t *task_num_host[][NUM_STREAM], \
+                     size_t max_hashtable_capacity, uint32_t max_n_query, uint32_t total_len_query, \
+                     string db_name, uint32_t db_num, vector<SWResult> *res, size_t total_db_size, TimeProfile &time_prof)
 {
     struct timeval t_start, t_end, tt_start;
 
@@ -380,9 +421,23 @@ void search_db_batch(const char *query, char *subj[], vector<QueryGroup> &q_grou
                 CUDA_CALL(cudaMemcpyAsync(task_num_host[g_idx][s], task_num_dev[s], sizeof(uint32_t), cudaMemcpyDeviceToHost, streams[s]));
                 CUDA_CALL(cudaEventRecord(seeding_finished[g_idx][s]));
 #ifdef USE_GPU_SW
-                result_threads[g_idx][s] = thread(handle_results, ref(seeding_finished[g_idx][s]), task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), s_length[s], s, ref(res_s[g][s]), ref(sw_tasks_total));
+                result_threads[g_idx][s] = thread(handle_results, \
+                                        ref(seeding_finished[g_idx][s]),\
+                                        task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \
+                                        s_length[s], s, ref(res_s[g][s]), \
+                                        ref(sw_tasks_total));
+#elif GLF_GPU_SW
+                result_threads[g_idx][s] = thread(handle_results, \
+                                        ref(seeding_finished[g_idx][s]), query, subj[s], \
+                                        task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \
+                                        s_length[s], s, ref(res_s[g][s]),\
+                                         ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s]));
 #else
-                result_threads[g_idx][s] = thread(handle_results, ref(seeding_finished[g_idx][s]), query, subj[s], task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), s_length[s], s, ref(res_s[g][s]), ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s]));
+                result_threads[g_idx][s] = thread(handle_results, \
+                                        ref(seeding_finished[g_idx][s]), query, subj[s], \
+                                        task_host[g_idx][s], task_num_host[g_idx][s], ref(q_groups[g]), \
+                                        s_length[s], s, ref(res_s[g][s]),\
+                                         ref(sw_tasks[g][s]), pool, ref(rs[g_idx][s]));
 #endif 
             }
             s_begin += s_length_stream_byte;
@@ -441,10 +496,21 @@ void search_db_batch(const char *query, char *subj[], vector<QueryGroup> &q_grou
                 CUDA_CALL(cudaEventDestroy(seeding_finished[g_idx][s]));
                 hsp_count += res_s[g][s].size();
                 cout << "=";
-#ifndef USE_GPU_SW
+#ifdef GLF_GPU_SW
                 for (auto &r : rs[g_idx][s])
                     r.get();
-                proceed_result(res, res_s[g][s], query, subj[s], q_groups[g], s_name[s], s_offsets[s], sn_offsets[s], s_num[s], total_db_size);
+                proceed_result_glf(res, res_s[g][s], query, subj[s], \
+                    q_groups[g], s_name[s], s_offsets[s], \
+                    sn_offsets[s], s_num[s], total_db_size);
+                cout << "=";
+#elif defined(USE_GPU_SW)
+
+#else
+                for (auto &r : rs[g_idx][s])
+                    r.get();
+                proceed_result(res, res_s[g][s], query, subj[s], \
+                              q_groups[g], s_name[s], s_offsets[s], \
+                              sn_offsets[s], s_num[s], total_db_size);
                 cout << "=";
 #endif
             }
@@ -498,7 +564,22 @@ void search_db_batch(const char *query, char *subj[], vector<QueryGroup> &q_grou
         sw_tasks_total.c_all[s] = subj[s];
         sw_tasks_total.c_offs[s] = s==0? 0: sw_tasks_total.c_offs[s-1] +s_length[s-1];
     }
-    gasal_run(sw_tasks_total, res_s, query_dev, subj_dev, q_groups.size(), band_width);
+    
+    // 计数单位byte 
+    size_t total_len_target = 0;
+    for(int i = 0; i < NUM_STREAM; ++i){
+        total_len_target += s_length[i]*5/8; 
+    }
+    
+    char* target = (char*)malloc(sizeof(char) * total_len_target);
+    CUDA_CALL(cudaMemcpy(target, subj_dev, total_len_target, cudaMemcpyDeviceToHost));
+    for(int i = 0; i < total_len_target; ++i) {
+        char ch_tar = get_char(target,i) ;
+        target[i] = static_cast<char>(ch_tar+ 65);
+    }
+    total_gpu_SW(sw_tasks_total,res_s,query,target,q_groups.size(),band_width);
+    // gasal_run(sw_tasks_total, res_s, query_dev, subj_dev, q_groups.size(), band_width);
+
     cout << "Done.\t[";
 
     gettimeofday(&t_end, NULL);
diff --git a/src/createDB.cpp b/src/createDB.cpp
index 5e0d909..c1cce9a 100644
--- a/src/createDB.cpp
+++ b/src/createDB.cpp
@@ -1,7 +1,7 @@
 
 #include "params.h"
 #include "util.h"
-#define MASK5(v) (v & 0b11111)
+
 
 #define MAX_QUERY_LENGTH 10000
 
diff --git a/src/main.cpp b/src/main.cpp
index a7b7f4a..bdbfbb4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -91,6 +91,8 @@ void show_args()
     cout << "Alignment Type" << "\t";
 #ifdef USE_GPU_SW
     cout << "GPU SW" << endl;
+#elif defined(GLF_GPU_SW)
+     cout << "GLF SW" << endl;
 #else
     cout << "CPU BANDED SW" << endl;
 #endif
diff --git a/src/util.cpp b/src/util.cpp
index 2c38a33..6497746 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -1,6 +1,6 @@
 #include "util.h"
 
-#define MASK5(v) (v & 0b11111)
+// #define MASK5(v) (v & 0b11111)
 
 ArgumentParser arg_parser;
 int seed_length;
@@ -240,8 +240,10 @@ void load_seq(string db, int num, char *&str, size_t &len)
     str = str1;
     // cout << "Load db " << db << num << " successful. Total length = " << len << endl;
 }
-
-void proceed_result(vector<SWResult> *res_d, vector<SWResult> &res_t, const char *query, const char *subj, QueryGroup &q_group, const char *s_name, const size_t *s_offsets, const size_t *sn_offsets, const size_t s_num, size_t total_db_size)
+// TODO 
+void proceed_result(vector<SWResult> *res_d, vector<SWResult> &res_t, const char *query, const char *subj, \
+                    QueryGroup &q_group, const char *s_name, const size_t *s_offsets,\
+                    const size_t *sn_offsets, const size_t s_num, size_t total_db_size)
 {
     set<int> modified_resq;
 
@@ -395,6 +397,97 @@ void proceed_result(vector<SWResult> *res_d, vector<SWResult> &res_t, const char
     // cout<<endl;
 }
 
+void proceed_result_glf(vector<SWResult> *res_d, vector<SWResult> &res_t, \
+                        const char *query, const char *subj, QueryGroup &q_group,\
+                        const char *s_name, const size_t *s_offsets, const size_t *sn_offsets, const size_t s_num, \
+                        size_t total_db_size)
+{
+    set<int> modified_resq;
+    size_t n_subj = s_num / sizeof(size_t);
+// get res_t score,bitscore->evalue
+// #pragma omp parallel for
+    for (int t = 0; t < res_t.size(); t++)
+    {
+        if (!res_t[t].report)
+            continue;
+        double score = res_t[t].score;
+        if (score >= min_score)
+        {
+            int num_s = get_pos(res_t[t].s_res[0], s_offsets, n_subj);
+            int num_q = res_t[t].num_q;
+            res_t[t].num_q = q_group.id[num_q];
+            uint32_t s_len = s_offsets[num_s + 1] - s_offsets[num_s] - 1;
+
+            double e_value = total_db_size * (q_group.length[num_q]) * pow(2, -res_t[t].bitscore);
+            // cout << "\nEvalue " << e_value << endl;
+            if (e_value <= max_evalue)
+            {
+                res_t[t].e_value = e_value;
+                res_t[t].sn_offset = sn_offsets[num_s];
+                res_t[t].sn_len = sn_offsets[num_s + 1] - sn_offsets[num_s] - 1;
+                string sn(s_name + res_t[t].sn_offset, res_t[t].sn_len);
+                res_t[t].s_name = sn;
+                // size_t len = res_t[t].align_length;
+                res_t[t].s_len = s_len;
+            }
+            else{
+                res_t[t].report=false;
+                res_t[t].q.clear();
+                res_t[t].s.clear();
+                res_t[t].s_ori.clear();
+                res_t[t].match.clear();
+            }
+        }
+        else
+        {
+            res_t[t].report=false;
+            res_t[t].q.clear();
+            res_t[t].s.clear();
+            res_t[t].s_ori.clear();
+            res_t[t].match.clear();
+        }
+        res_t[t].s_res.clear();
+        res_t[t].q_res.clear();
+    }
+    for (int t = 0; t < res_t.size(); t++)
+    {
+        if (!res_t[t].report)
+            continue;
+        // SWResult& res = res_t[t];
+        res_d[res_t[t].num_q].emplace_back(res_t[t]);
+        push_heap(res_d[res_t[t].num_q].begin(), res_d[res_t[t].num_q].end(), [&](const SWResult &sw1, const SWResult &sw2)
+                          { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); });
+                // res_tmp.emplace_back(res);
+        modified_resq.insert(res_t[t].num_q);
+    }
+
+    vector<int> modified_resq_list;
+    modified_resq_list.assign(modified_resq.begin(), modified_resq.end());
+#pragma omp parallel for
+    for (int i=0;i<modified_resq_list.size();i++)
+    {
+        int mq = modified_resq_list[i];
+        vector<SWResult> &res = res_d[mq];
+
+        // sort(res.begin(), res.end(), [&](const DisplayResult &sw1, const DisplayResult &sw2)
+        //      { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); });
+        if (max_display_align > 0 && res.size() > max_display_align)
+        {
+            int pop_size = res.size() - max_display_align;
+            for (int i = 0; i < pop_size; i++)
+            {
+                pop_heap(res.begin(), res.end() - i, [&](const SWResult &sw1, const SWResult &sw2)
+                         { return (sw1.e_value == sw2.e_value) ? (sw1.score > sw2.score) : (sw1.e_value < sw2.e_value); });
+                // res.pop_back();
+            }
+            // res.erase(res.begin() + max_display_align, res.end());
+            res.resize(max_display_align);
+            assert(res.size() == max_display_align);
+        }
+    }
+    // cout<<endl;
+}
+
 inline char get_char(const char *s, size_t offset)
 {
     offset *= 5;