Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 58 additions & 47 deletions src/rapids_singlecell/_cuda/aggr/aggr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,28 @@ using namespace nb::literals;
constexpr int BLOCK_SIZE_SPARSE = 64;
constexpr int BLOCK_SIZE_DENSE = 256;

// Host launcher for the CSR aggregation kernel: one block per cell
// (BLOCK_SIZE_SPARSE threads each); every nonzero of a kept cell is
// atomically accumulated into the dense (n_groups x n_genes) output.
//
// T    — value dtype of the sparse matrix (float/double).
// IdxT — index dtype of indptr/index (int for int32, long long for int64).
// mask/cats are per-cell; masked-out cells contribute nothing.
// NOTE: diff artifact removed — the stale non-templated signature that
// preceded this definition has been dropped.
template <typename T, typename IdxT>
static inline void launch_csr_aggr(const IdxT* indptr, const IdxT* index,
                                   const T* data, double* out, const int* cats,
                                   const bool* mask, size_t n_cells,
                                   size_t n_genes, size_t n_groups,
                                   cudaStream_t stream) {
  dim3 grid((unsigned)n_cells);
  dim3 block(BLOCK_SIZE_SPARSE);
  csr_aggr_kernel<T, IdxT><<<grid, block, 0, stream>>>(
      indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
  // Launch errors (bad config, etc.) surface here, not from the launch itself.
  CUDA_CHECK_LAST_ERROR(csr_aggr_kernel);
}

// Host launcher for the CSC aggregation kernel: one block per gene
// (BLOCK_SIZE_SPARSE threads each); each thread walks part of the gene's
// column and accumulates masked-in cells into the group x gene output.
//
// T    — value dtype of the sparse matrix (float/double).
// IdxT — index dtype of indptr/index (int32 or int64 matrices).
// NOTE: diff artifact removed — the stale non-templated signature that
// preceded this definition has been dropped.
template <typename T, typename IdxT>
static inline void launch_csc_aggr(const IdxT* indptr, const IdxT* index,
                                   const T* data, double* out, const int* cats,
                                   const bool* mask, size_t n_cells,
                                   size_t n_genes, size_t n_groups,
                                   cudaStream_t stream) {
  dim3 grid((unsigned)n_genes);
  dim3 block(BLOCK_SIZE_SPARSE);
  csc_aggr_kernel<T, IdxT><<<grid, block, 0, stream>>>(
      indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups);
  CUDA_CHECK_LAST_ERROR(csc_aggr_kernel);
}
Expand Down Expand Up @@ -58,50 +58,51 @@ static inline void launch_dense_aggr_F(const T* data, double* out,
CUDA_CHECK_LAST_ERROR(dense_aggr_kernel_F);
}

// Host launcher for the CSR -> COO expansion kernel: one block per cell.
// For each kept cell, writes the cell's group id into `row`, the gene index
// into `col` (narrowed to int), and the value (as double) into `ndata`,
// all at the same nnz position `p` as the source matrix.
//
// T    — value dtype of the sparse matrix.
// IdxT — index dtype of indptr/index (int32 or int64).
// NOTE: diff artifact removed — stale non-templated signature dropped.
template <typename T, typename IdxT>
static inline void launch_csr_to_coo(const IdxT* indptr, const IdxT* index,
                                     const T* data, int* row, int* col,
                                     double* ndata, const int* cats,
                                     const bool* mask, int n_cells,
                                     cudaStream_t stream) {
  dim3 grid((unsigned)n_cells);
  dim3 block(BLOCK_SIZE_SPARSE);
  csr_to_coo_kernel<T, IdxT><<<grid, block, 0, stream>>>(
      indptr, index, data, row, col, ndata, cats, mask, n_cells);
  CUDA_CHECK_LAST_ERROR(csr_to_coo_kernel);
}

// Host launcher for the per-group variance adjustment kernel: one block per
// group; in-place turns the accumulated second moments in `data` into
// dof-corrected variances using the per-group means.
//
// IdxT — index dtype of the group-segment indptr/index (int32 or int64).
// NOTE: diff artifact removed — stale non-templated signature dropped.
template <typename IdxT>
static inline void launch_sparse_var(const IdxT* indptr, const IdxT* index,
                                     double* data, const double* mean_data,
                                     double* n_cells, int dof, int n_groups,
                                     cudaStream_t stream) {
  dim3 grid((unsigned)n_groups);
  dim3 block(BLOCK_SIZE_SPARSE);
  sparse_var_kernel<IdxT><<<grid, block, 0, stream>>>(
      indptr, index, data, mean_data, n_cells, dof, n_groups);
  CUDA_CHECK_LAST_ERROR(sparse_var_kernel);
}

template <typename T, typename Device>
template <typename T, typename IdxT, typename Device>
void def_sparse_aggr(nb::module_& m) {
m.def(
"sparse_aggr",
[](gpu_array_c<const int, Device> indptr,
gpu_array_c<const int, Device> index,
[](gpu_array_c<const IdxT, Device> indptr,
gpu_array_c<const IdxT, Device> index,
gpu_array_c<const T, Device> data, gpu_array_c<double, Device> out,
gpu_array_c<const int, Device> cats,
gpu_array_c<const bool, Device> mask, size_t n_cells, size_t n_genes,
size_t n_groups, bool is_csc, std::uintptr_t stream) {
if (is_csc) {
launch_csc_aggr<T>(indptr.data(), index.data(), data.data(),
out.data(), cats.data(), mask.data(),
n_cells, n_genes, n_groups,
(cudaStream_t)stream);
launch_csc_aggr<T, IdxT>(indptr.data(), index.data(),
data.data(), out.data(), cats.data(),
mask.data(), n_cells, n_genes,
n_groups, (cudaStream_t)stream);
} else {
launch_csr_aggr<T>(indptr.data(), index.data(), data.data(),
out.data(), cats.data(), mask.data(),
n_cells, n_genes, n_groups,
(cudaStream_t)stream);
launch_csr_aggr<T, IdxT>(indptr.data(), index.data(),
data.data(), out.data(), cats.data(),
mask.data(), n_cells, n_genes,
n_groups, (cudaStream_t)stream);
}
},
"indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out"_a, "cats"_a,
Expand Down Expand Up @@ -131,56 +132,66 @@ void def_dense_aggr(nb::module_& m) {
"n_genes"_a, "n_groups"_a, "is_fortran"_a, "stream"_a = 0);
}

// Registers the Python-visible "csr_to_coo" binding for one
// (value dtype, index dtype, device) combination. The lambda unpacks the
// nanobind array views to raw device pointers and forwards to the launcher;
// `stream` arrives as an integer handle and is reinterpreted as a
// cudaStream_t.
// NOTE: diff artifact removed — the stale 2-parameter template signature and
// the old non-templated launch call have been dropped.
template <typename T, typename IdxT, typename Device>
void def_csr_to_coo(nb::module_& m) {
  m.def(
      "csr_to_coo",
      [](gpu_array_c<const IdxT, Device> indptr,
         gpu_array_c<const IdxT, Device> index,
         gpu_array_c<const T, Device> data, gpu_array_c<int, Device> out_row,
         gpu_array_c<int, Device> out_col,
         gpu_array_c<double, Device> out_data,
         gpu_array_c<const int, Device> cats,
         gpu_array_c<const bool, Device> mask, int n_cells,
         std::uintptr_t stream) {
        launch_csr_to_coo<T, IdxT>(
            indptr.data(), index.data(), data.data(), out_row.data(),
            out_col.data(), out_data.data(), cats.data(), mask.data(),
            n_cells, (cudaStream_t)stream);
      },
      "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out_row"_a,
      "out_col"_a, "out_data"_a, "cats"_a, "mask"_a, "n_cells"_a,
      "stream"_a = 0);
}

// Registers the Python-visible "sparse_var" binding for one
// (index dtype, device) combination. Adjusts the per-group accumulated
// values in `data` in place using the matching `means` segments and a
// dof correction (see sparse_var_kernel). `stream` is a raw integer handle
// that is reinterpreted as a cudaStream_t before launch.
template <typename IdxT, typename Device>
void def_sparse_var(nb::module_& m) {
  m.def(
      "sparse_var",
      [](gpu_array_c<const IdxT, Device> indptr,
         gpu_array_c<const IdxT, Device> index,
         gpu_array_c<double, Device> data,
         gpu_array_c<const double, Device> means,
         gpu_array_c<double, Device> n_cells, int dof, int n_groups,
         std::uintptr_t stream) {
        // Unpack array views to raw device pointers before launching.
        auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
        launch_sparse_var<IdxT>(indptr.data(), index.data(), data.data(),
                                means.data(), n_cells.data(), dof, n_groups,
                                cuda_stream);
      },
      "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a,
      "dof"_a, "n_groups"_a, "stream"_a = 0);
}

// Registers every aggregation binding for one device tag, instantiating each
// binding for all supported (value dtype, index dtype) combinations so that
// both int32- and int64-indexed sparse matrices are accepted from Python.
// NOTE: diff artifact removed — the stale 2-argument instantiations and the
// old inline "sparse_var" m.def (now superseded by def_sparse_var) have been
// dropped; keeping them would register duplicate/conflicting overloads.
template <typename Device>
void register_bindings(nb::module_& m) {
  // Sparse aggregation: {float32, float64} values x {int32, int64} indices.
  def_sparse_aggr<float, int, Device>(m);
  def_sparse_aggr<float, long long, Device>(m);
  def_sparse_aggr<double, int, Device>(m);
  def_sparse_aggr<double, long long, Device>(m);

  // F-order must come before C-order for proper dispatch
  def_dense_aggr<float, nb::f_contig, Device>(m);
  def_dense_aggr<float, nb::c_contig, Device>(m);
  def_dense_aggr<double, nb::f_contig, Device>(m);
  def_dense_aggr<double, nb::c_contig, Device>(m);

  // CSR -> COO expansion, same dtype/index matrix of instantiations.
  def_csr_to_coo<float, int, Device>(m);
  def_csr_to_coo<float, long long, Device>(m);
  def_csr_to_coo<double, int, Device>(m);
  def_csr_to_coo<double, long long, Device>(m);

  // Variance adjustment works on double data; only the index type varies.
  def_sparse_var<int, Device>(m);
  def_sparse_var<long long, Device>(m);
}

NB_MODULE(_aggr_cuda, m) {
Expand Down
49 changes: 25 additions & 24 deletions src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
#include <cuda_runtime.h>

// sparse -> dense aggregate (CSR by cells), mask per cell, cats per cell
template <typename T>
__global__ void csr_aggr_kernel(const int* __restrict__ indptr,
const int* __restrict__ index,
template <typename T, typename IdxT>
__global__ void csr_aggr_kernel(const IdxT* __restrict__ indptr,
const IdxT* __restrict__ index,
const T* __restrict__ data,
double* __restrict__ out,
const int* __restrict__ cats,
const bool* __restrict__ mask, size_t n_cells,
size_t n_genes, size_t n_groups) {
size_t cell = blockIdx.x;
if (cell >= n_cells || !mask[cell]) return;
int cell_start = indptr[cell];
int cell_end = indptr[cell + 1];
IdxT cell_start = indptr[cell];
IdxT cell_end = indptr[cell + 1];
size_t group = static_cast<size_t>(cats[cell]);
for (int p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) {
for (IdxT p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) {
size_t gene_pos = static_cast<size_t>(index[p]);
double v = static_cast<double>(data[p]);
atomicAdd(&out[group * n_genes + gene_pos], v);
Expand All @@ -27,19 +27,19 @@ __global__ void csr_aggr_kernel(const int* __restrict__ indptr,
}

// sparse -> dense aggregate (CSC by genes), mask per cell, cats per cell
template <typename T>
__global__ void csc_aggr_kernel(const int* __restrict__ indptr,
const int* __restrict__ index,
template <typename T, typename IdxT>
__global__ void csc_aggr_kernel(const IdxT* __restrict__ indptr,
const IdxT* __restrict__ index,
const T* __restrict__ data,
double* __restrict__ out,
const int* __restrict__ cats,
const bool* __restrict__ mask, size_t n_cells,
size_t n_genes, size_t n_groups) {
size_t gene = blockIdx.x;
if (gene >= n_genes) return;
int gene_start = indptr[gene];
int gene_end = indptr[gene + 1];
for (int p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) {
IdxT gene_start = indptr[gene];
IdxT gene_end = indptr[gene + 1];
for (IdxT p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) {
size_t cell = static_cast<size_t>(index[p]);
if (!mask[cell]) continue;
size_t group = static_cast<size_t>(cats[cell]);
Expand All @@ -52,41 +52,42 @@ __global__ void csc_aggr_kernel(const int* __restrict__ indptr,

// sparse -> sparse copy (CSR by cells) row/col/value from one to another by
// cats/mask
// Expands a masked CSR matrix into COO triplets, one block per cell.
// For each nnz position p of a kept (mask[cell] == true) cell:
//   row[p]   <- the cell's group id (cats[cell])
//   col[p]   <- the gene index, narrowed to int
//   ndata[p] <- the value promoted to double
// Masked-out cells write nothing, so their slots are left untouched —
// callers are expected to compact/ignore those positions.
// NOTE: diff artifact removed — the stale int-indexed signature and loop
// lines that were interleaved with this templated version have been dropped.
template <typename T, typename IdxT>
__global__ void csr_to_coo_kernel(const IdxT* __restrict__ indptr,
                                  const IdxT* __restrict__ index,
                                  const T* __restrict__ data,
                                  int* __restrict__ row, int* __restrict__ col,
                                  double* __restrict__ ndata,
                                  const int* __restrict__ cats,
                                  const bool* __restrict__ mask, int n_cells) {
  int cell = blockIdx.x;
  if (cell >= n_cells || !mask[cell]) return;
  IdxT start = indptr[cell];
  IdxT end = indptr[cell + 1];
  int group = cats[cell];
  // Threads of the block stride over the cell's nonzeros cooperatively.
  for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) {
    // Output col is int even for int64 inputs; assumes n_genes < 2^31.
    int g = static_cast<int>(index[p]);
    ndata[p] = static_cast<double>(data[p]);
    row[p] = group;
    col[p] = g;
  }
}

// variance adjust per group (CSR-like segment)
__global__ void sparse_var_kernel(const int* __restrict__ indptr,
const int* __restrict__ index,
template <typename IdxT>
__global__ void sparse_var_kernel(const IdxT* __restrict__ indptr,
const IdxT* __restrict__ index,
double* __restrict__ data,
const double* __restrict__ mean_data,
double* __restrict__ n_cells, int dof,
int n_groups) {
int group = blockIdx.x;
if (group >= n_groups) return;
int start = indptr[group];
int end = indptr[group + 1];
IdxT start = indptr[group];
IdxT end = indptr[group + 1];
double doffer =
n_cells[group] / (n_cells[group] - static_cast<double>(dof));
for (int p = start + threadIdx.x; p < end; p += blockDim.x) {
for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) {
double var = data[p];
double mean_sq = mean_data[p] * mean_data[p];
var = var - mean_sq;
Expand Down
60 changes: 45 additions & 15 deletions src/rapids_singlecell/_cuda/autocorr/autocorr.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,16 @@ static inline void launch_gearys_sparse(
CUDA_CHECK_LAST_ERROR(gearys_C_num_sparse_kernel);
}

// Host launcher for pre_den_sparse: elementwise pass over all nnz entries.
// nnz is 64-bit so matrices with more than INT_MAX nonzeros are supported;
// the grid size is therefore computed in 64-bit arithmetic before being
// narrowed for dim3.
//
// T    — value dtype (float/double); IdxT — column-index dtype (int32/int64).
// NOTE: diff artifact removed — the stale (int nnz) signature and the old
// 32-bit grid computation have been dropped.
template <typename T, typename IdxT>
static inline void launch_pre_den_sparse(const IdxT* data_col_ind,
                                         const T* data_values, long long nnz,
                                         const T* mean_array, T* den,
                                         int* counter, cudaStream_t stream) {
  dim3 block(ELEMENTWISE_BLOCK_SIZE);
  // 64-bit ceil-div: (nnz + B - 1) / B would overflow in 32 bits for large nnz.
  long long grid_size =
      (nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE;
  // Explicit narrowing; grid.x is limited to 2^31-1 blocks by the hardware.
  dim3 grid(static_cast<unsigned>(grid_size));
  pre_den_sparse_kernel<T, IdxT><<<grid, block, 0, stream>>>(
      data_col_ind, data_values, nnz, mean_array, den, counter);
  CUDA_CHECK_LAST_ERROR(pre_den_sparse_kernel);
}
Expand Down Expand Up @@ -220,31 +222,59 @@ void register_bindings(nb::module_& m) {
"data_row_ptr"_a, "data_col_ind"_a, "data_values"_a, "n_samples"_a,
"n_features"_a, "num"_a, "stream"_a = 0);

// pre_den_sparse - float32 values, int32 column indices.
// NOTE: diff artifact removed — the stale `int nnz` lambda parameter line and
// the old non-templated launch call have been dropped (the old call would
// have launched the kernel a second time).
m.def(
    "pre_den_sparse",
    [](gpu_array_c<const int, Device> data_col_ind,
       gpu_array_c<const float, Device> data_values, long long nnz,
       gpu_array_c<const float, Device> mean_array,
       gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
       std::uintptr_t stream) {
      launch_pre_den_sparse<float, int>(
          data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
          den.data(), counter.data(), (cudaStream_t)stream);
    },
    "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
    "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
// pre_den_sparse - float64 values, int32 column indices.
// NOTE: diff artifact removed — the stale `int nnz` lambda parameter line
// duplicated next to the `long long nnz` one has been dropped.
m.def(
    "pre_den_sparse",
    [](gpu_array_c<const int, Device> data_col_ind,
       gpu_array_c<const double, Device> data_values, long long nnz,
       gpu_array_c<const double, Device> mean_array,
       gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
       std::uintptr_t stream) {
      launch_pre_den_sparse<double, int>(
          data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
          den.data(), counter.data(), (cudaStream_t)stream);
    },
    "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
    "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
// pre_den_sparse - float32 values, int64 indices.
// Overload selected when data_col_ind (presumably the sparse matrix's
// column indices — confirm against callers) arrives as int64, i.e. for
// matrices too large for 32-bit indexing.
m.def(
"pre_den_sparse",
[](gpu_array_c<const long long, Device> data_col_ind,
gpu_array_c<const float, Device> data_values, long long nnz,
gpu_array_c<const float, Device> mean_array,
gpu_array_c<float, Device> den, gpu_array_c<int, Device> counter,
std::uintptr_t stream) {
launch_pre_den_sparse<float, long long>(
data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
den.data(), counter.data(), (cudaStream_t)stream);
},
"data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
"mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
// pre_den_sparse - float64 values, int64 indices.
// NOTE: diff artifact removed — the old non-templated
// launch_pre_den_sparse(...) call that remained interleaved before the
// templated one has been dropped; keeping both would launch the kernel
// twice per Python call.
m.def(
    "pre_den_sparse",
    [](gpu_array_c<const long long, Device> data_col_ind,
       gpu_array_c<const double, Device> data_values, long long nnz,
       gpu_array_c<const double, Device> mean_array,
       gpu_array_c<double, Device> den, gpu_array_c<int, Device> counter,
       std::uintptr_t stream) {
      launch_pre_den_sparse<double, long long>(
          data_col_ind.data(), data_values.data(), nnz, mean_array.data(),
          den.data(), counter.data(), (cudaStream_t)stream);
    },
    "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a,
    "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0);
Expand Down
Loading
Loading