diff --git a/src/rapids_singlecell/_cuda/aggr/aggr.cu b/src/rapids_singlecell/_cuda/aggr/aggr.cu index 000a67e4..9dbf97ac 100644 --- a/src/rapids_singlecell/_cuda/aggr/aggr.cu +++ b/src/rapids_singlecell/_cuda/aggr/aggr.cu @@ -8,28 +8,28 @@ using namespace nb::literals; constexpr int BLOCK_SIZE_SPARSE = 64; constexpr int BLOCK_SIZE_DENSE = 256; -template -static inline void launch_csr_aggr(const int* indptr, const int* index, +template +static inline void launch_csr_aggr(const IdxT* indptr, const IdxT* index, const T* data, double* out, const int* cats, const bool* mask, size_t n_cells, size_t n_genes, size_t n_groups, cudaStream_t stream) { dim3 grid((unsigned)n_cells); dim3 block(BLOCK_SIZE_SPARSE); - csr_aggr_kernel<<>>( + csr_aggr_kernel<<>>( indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups); CUDA_CHECK_LAST_ERROR(csr_aggr_kernel); } -template -static inline void launch_csc_aggr(const int* indptr, const int* index, +template +static inline void launch_csc_aggr(const IdxT* indptr, const IdxT* index, const T* data, double* out, const int* cats, const bool* mask, size_t n_cells, size_t n_genes, size_t n_groups, cudaStream_t stream) { dim3 grid((unsigned)n_genes); dim3 block(BLOCK_SIZE_SPARSE); - csc_aggr_kernel<<>>( + csc_aggr_kernel<<>>( indptr, index, data, out, cats, mask, n_cells, n_genes, n_groups); CUDA_CHECK_LAST_ERROR(csc_aggr_kernel); } @@ -58,50 +58,51 @@ static inline void launch_dense_aggr_F(const T* data, double* out, CUDA_CHECK_LAST_ERROR(dense_aggr_kernel_F); } -template -static inline void launch_csr_to_coo(const int* indptr, const int* index, +template +static inline void launch_csr_to_coo(const IdxT* indptr, const IdxT* index, const T* data, int* row, int* col, double* ndata, const int* cats, const bool* mask, int n_cells, cudaStream_t stream) { dim3 grid((unsigned)n_cells); dim3 block(BLOCK_SIZE_SPARSE); - csr_to_coo_kernel<<>>( + csr_to_coo_kernel<<>>( indptr, index, data, row, col, ndata, cats, mask, n_cells); CUDA_CHECK_LAST_ERROR(csr_to_coo_kernel); } -static inline void launch_sparse_var(const int* indptr, const int* index, +template +static inline void launch_sparse_var(const IdxT* indptr, const IdxT* index, double* data, const double* mean_data, double* n_cells, int dof, int n_groups, cudaStream_t stream) { dim3 grid((unsigned)n_groups); dim3 block(BLOCK_SIZE_SPARSE); - sparse_var_kernel<<>>( + sparse_var_kernel<<>>( indptr, index, data, mean_data, n_cells, dof, n_groups); CUDA_CHECK_LAST_ERROR(sparse_var_kernel); } -template +template void def_sparse_aggr(nb::module_& m) { m.def( "sparse_aggr", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c out, gpu_array_c cats, gpu_array_c mask, size_t n_cells, size_t n_genes, size_t n_groups, bool is_csc, std::uintptr_t stream) { if (is_csc) { - launch_csc_aggr(indptr.data(), index.data(), data.data(), - out.data(), cats.data(), mask.data(), - n_cells, n_genes, n_groups, - (cudaStream_t)stream); + launch_csc_aggr(indptr.data(), index.data(), + data.data(), out.data(), cats.data(), + mask.data(), n_cells, n_genes, + n_groups, (cudaStream_t)stream); } else { - launch_csr_aggr(indptr.data(), index.data(), data.data(), - out.data(), cats.data(), mask.data(), - n_cells, n_genes, n_groups, - (cudaStream_t)stream); + launch_csr_aggr(indptr.data(), index.data(), + data.data(), out.data(), cats.data(), + mask.data(), n_cells, n_genes, + n_groups, (cudaStream_t)stream); } }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out"_a, "cats"_a, @@ -131,32 +132,52 @@ void def_dense_aggr(nb::module_& m) { "n_genes"_a, "n_groups"_a, "is_fortran"_a, "stream"_a = 0); } -template +template void def_csr_to_coo(nb::module_& m) { m.def( "csr_to_coo", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c out_row, gpu_array_c out_col, gpu_array_c out_data, gpu_array_c cats, gpu_array_c mask, int n_cells, std::uintptr_t stream) { - launch_csr_to_coo(indptr.data(), index.data(), data.data(), - out_row.data(), out_col.data(), - out_data.data(), cats.data(), mask.data(), - n_cells, (cudaStream_t)stream); + launch_csr_to_coo( + indptr.data(), index.data(), data.data(), out_row.data(), + out_col.data(), out_data.data(), cats.data(), mask.data(), + n_cells, (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out_row"_a, "out_col"_a, "out_data"_a, "cats"_a, "mask"_a, "n_cells"_a, "stream"_a = 0); } +template +void def_sparse_var(nb::module_& m) { + m.def( + "sparse_var", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c means, + gpu_array_c n_cells, int dof, int n_groups, + std::uintptr_t stream) { + launch_sparse_var(indptr.data(), index.data(), data.data(), + means.data(), n_cells.data(), dof, n_groups, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a, + "dof"_a, "n_groups"_a, "stream"_a = 0); +} + template void register_bindings(nb::module_& m) { - def_sparse_aggr(m); - def_sparse_aggr(m); + def_sparse_aggr(m); + def_sparse_aggr(m); + def_sparse_aggr(m); + def_sparse_aggr(m); // F-order must come before C-order for proper dispatch def_dense_aggr(m); @@ -164,23 +185,13 @@ void register_bindings(nb::module_& m) { def_dense_aggr(m); def_dense_aggr(m); - def_csr_to_coo(m); - def_csr_to_coo(m); + def_csr_to_coo(m); + def_csr_to_coo(m); + def_csr_to_coo(m); + def_csr_to_coo(m); - m.def( - "sparse_var", - [](gpu_array_c indptr, - gpu_array_c index, - gpu_array_c data, - gpu_array_c means, - gpu_array_c n_cells, int dof, int n_groups, - std::uintptr_t stream) { - launch_sparse_var(indptr.data(), index.data(), data.data(), - means.data(), n_cells.data(), dof, n_groups, - (cudaStream_t)stream); - }, - "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "n_cells"_a, - "dof"_a, "n_groups"_a, "stream"_a = 0); + def_sparse_var(m); + def_sparse_var(m); } NB_MODULE(_aggr_cuda, m) { diff --git a/src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh b/src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh index b2d1cd4d..9a91a9d0 100644 --- a/src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh +++ b/src/rapids_singlecell/_cuda/aggr/kernels_aggr.cuh @@ -3,9 +3,9 @@ #include // sparse -> dense aggregate (CSR by cells), mask per cell, cats per cell -template -__global__ void csr_aggr_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void csr_aggr_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, double* __restrict__ out, const int* __restrict__ cats, @@ -13,10 +13,10 @@ __global__ void csr_aggr_kernel(const int* __restrict__ indptr, size_t n_genes, size_t n_groups) { size_t cell = blockIdx.x; if (cell >= n_cells || !mask[cell]) return; - int cell_start = indptr[cell]; - int cell_end = indptr[cell + 1]; + IdxT cell_start = indptr[cell]; + IdxT cell_end = indptr[cell + 1]; size_t group = static_cast(cats[cell]); - for (int p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) { + for (IdxT p = cell_start + threadIdx.x; p < cell_end; p += blockDim.x) { size_t gene_pos = static_cast(index[p]); double v = static_cast(data[p]); atomicAdd(&out[group * n_genes + gene_pos], v); @@ -27,9 +27,9 @@ __global__ void csr_aggr_kernel(const int* __restrict__ indptr, } // sparse -> dense aggregate (CSC by genes), mask per cell, cats per cell -template -__global__ void csc_aggr_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void csc_aggr_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, double* __restrict__ out, const int* __restrict__ cats, @@ -37,9 +37,9 @@ __global__ void csc_aggr_kernel(const int* __restrict__ indptr, size_t n_genes, size_t n_groups) { size_t gene = blockIdx.x; if (gene >= n_genes) return; - int gene_start = indptr[gene]; - int gene_end = indptr[gene + 1]; - for (int p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) { + IdxT gene_start = indptr[gene]; + IdxT gene_end = indptr[gene + 1]; + for (IdxT p = gene_start + threadIdx.x; p < gene_end; p += blockDim.x) { size_t cell = static_cast(index[p]); if (!mask[cell]) continue; size_t group = static_cast(cats[cell]); @@ -52,9 +52,9 @@ __global__ void csc_aggr_kernel(const int* __restrict__ indptr, // sparse -> sparse copy (CSR by cells) row/col/value from one to another by // cats/mask -template -__global__ void csr_to_coo_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void csr_to_coo_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, int* __restrict__ row, int* __restrict__ col, double* __restrict__ ndata, @@ -62,11 +62,11 @@ __global__ void csr_to_coo_kernel(const int* __restrict__ indptr, const bool* __restrict__ mask, int n_cells) { int cell = blockIdx.x; if (cell >= n_cells || !mask[cell]) return; - int start = indptr[cell]; - int end = indptr[cell + 1]; + IdxT start = indptr[cell]; + IdxT end = indptr[cell + 1]; int group = cats[cell]; - for (int p = start + threadIdx.x; p < end; p += blockDim.x) { - int g = index[p]; + for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) { + int g = static_cast(index[p]); ndata[p] = static_cast(data[p]); row[p] = group; col[p] = g; @@ -74,19 +74,20 @@ __global__ void csr_to_coo_kernel(const int* __restrict__ indptr, } // variance adjust per group (CSR-like segment) -__global__ void sparse_var_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void sparse_var_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, double* __restrict__ data, const double* __restrict__ mean_data, double* __restrict__ n_cells, int dof, int n_groups) { int group = blockIdx.x; if (group >= n_groups) return; - int start = indptr[group]; - int end = indptr[group + 1]; + IdxT start = indptr[group]; + IdxT end = indptr[group + 1]; double doffer = n_cells[group] / (n_cells[group] - static_cast(dof)); - for (int p = start + threadIdx.x; p < end; p += blockDim.x) { + for (IdxT p = start + threadIdx.x; p < end; p += blockDim.x) { double var = data[p]; double mean_sq = mean_data[p] * mean_data[p]; var = var - mean_sq; diff --git a/src/rapids_singlecell/_cuda/autocorr/autocorr.cu b/src/rapids_singlecell/_cuda/autocorr/autocorr.cu index 4f2dcf47..0b7ec6e7 100644 --- a/src/rapids_singlecell/_cuda/autocorr/autocorr.cu +++ b/src/rapids_singlecell/_cuda/autocorr/autocorr.cu @@ -64,14 +64,16 @@ static inline void launch_gearys_sparse( CUDA_CHECK_LAST_ERROR(gearys_C_num_sparse_kernel); } -template -static inline void launch_pre_den_sparse(const int* data_col_ind, - const T* data_values, int nnz, +template +static inline void launch_pre_den_sparse(const IdxT* data_col_ind, + const T* data_values, long long nnz, const T* mean_array, T* den, int* counter, cudaStream_t stream) { dim3 block(ELEMENTWISE_BLOCK_SIZE); - dim3 grid((nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE); - pre_den_sparse_kernel<<>>( + long long grid_size = + (nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE; + dim3 grid(grid_size); + pre_den_sparse_kernel<<>>( data_col_ind, data_values, nnz, mean_array, den, counter); CUDA_CHECK_LAST_ERROR(pre_den_sparse_kernel); } @@ -220,31 +222,59 @@ void register_bindings(nb::module_& m) { "data_row_ptr"_a, "data_col_ind"_a, "data_values"_a, "n_samples"_a, "n_features"_a, "num"_a, "stream"_a = 0); - // pre_den_sparse - float32 + // pre_den_sparse - float32, int32 m.def( "pre_den_sparse", [](gpu_array_c data_col_ind, - gpu_array_c data_values, int nnz, + gpu_array_c data_values, long long nnz, gpu_array_c mean_array, gpu_array_c den, gpu_array_c counter, std::uintptr_t stream) { - launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz, - mean_array.data(), den.data(), counter.data(), - (cudaStream_t)stream); + launch_pre_den_sparse( + data_col_ind.data(), data_values.data(), nnz, mean_array.data(), + den.data(), counter.data(), (cudaStream_t)stream); }, "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a, "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0); - // pre_den_sparse - float64 + // pre_den_sparse - float64, int32 m.def( "pre_den_sparse", [](gpu_array_c data_col_ind, - gpu_array_c data_values, int nnz, + gpu_array_c data_values, long long nnz, + gpu_array_c mean_array, + gpu_array_c den, gpu_array_c counter, + std::uintptr_t stream) { + launch_pre_den_sparse( + data_col_ind.data(), data_values.data(), nnz, mean_array.data(), + den.data(), counter.data(), (cudaStream_t)stream); + }, + "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a, + "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0); + // pre_den_sparse - float32, int64 + m.def( + "pre_den_sparse", + [](gpu_array_c data_col_ind, + gpu_array_c data_values, long long nnz, + gpu_array_c mean_array, + gpu_array_c den, gpu_array_c counter, + std::uintptr_t stream) { + launch_pre_den_sparse( + data_col_ind.data(), data_values.data(), nnz, mean_array.data(), + den.data(), counter.data(), (cudaStream_t)stream); + }, + "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a, + "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0); + // pre_den_sparse - float64, int64 + m.def( + "pre_den_sparse", + [](gpu_array_c data_col_ind, + gpu_array_c data_values, long long nnz, gpu_array_c mean_array, gpu_array_c den, gpu_array_c counter, std::uintptr_t stream) { - launch_pre_den_sparse(data_col_ind.data(), data_values.data(), nnz, - mean_array.data(), den.data(), counter.data(), - (cudaStream_t)stream); + launch_pre_den_sparse( + data_col_ind.data(), data_values.data(), nnz, mean_array.data(), + den.data(), counter.data(), (cudaStream_t)stream); }, "data_col_ind"_a, "data_values"_a, nb::kw_only(), "nnz"_a, "mean_array"_a, "den"_a, "counter"_a, "stream"_a = 0); diff --git a/src/rapids_singlecell/_cuda/autocorr/kernels_autocorr.cuh b/src/rapids_singlecell/_cuda/autocorr/kernels_autocorr.cuh index 00e8113f..b98f3cc4 100644 --- a/src/rapids_singlecell/_cuda/autocorr/kernels_autocorr.cuh +++ b/src/rapids_singlecell/_cuda/autocorr/kernels_autocorr.cuh @@ -173,17 +173,18 @@ __global__ void gearys_C_num_sparse_kernel( } // Pre-denominator for sparse paths -template -__global__ void pre_den_sparse_kernel(const int* __restrict__ data_col_ind, +template +__global__ void pre_den_sparse_kernel(const IdxT* __restrict__ data_col_ind, const T* __restrict__ data_values, - int nnz, const T* __restrict__ mean_array, + long long nnz, + const T* __restrict__ mean_array, T* __restrict__ den, int* __restrict__ counter) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; if (i >= nnz) { return; } - int geneidx = data_col_ind[i]; + IdxT geneidx = data_col_ind[i]; T value = data_values[i] - mean_array[geneidx]; atomicAdd(&counter[geneidx], 1); atomicAdd(&den[geneidx], value * value); diff --git a/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh b/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh index 70f9e266..afc86149 100644 --- a/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh +++ b/src/rapids_singlecell/_cuda/ligrec/kernels_ligrec.cuh @@ -20,9 +20,9 @@ __global__ void sum_and_count_dense_kernel(const T* __restrict__ data, } } -template -__global__ void sum_and_count_sparse_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void sum_and_count_sparse_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, const int* __restrict__ clusters, T* __restrict__ sum_gt0, @@ -30,10 +30,10 @@ __global__ void sum_and_count_sparse_kernel(const int* __restrict__ indptr, int nrows, int n_cls) { int cell = blockDim.x * blockIdx.x + threadIdx.x; if (cell >= nrows) return; - int start_idx = indptr[cell]; - int stop_idx = indptr[cell + 1]; + IdxT start_idx = indptr[cell]; + IdxT stop_idx = indptr[cell + 1]; int cluster = clusters[cell]; - for (int gene = start_idx; gene < stop_idx; gene++) { + for (IdxT gene = start_idx; gene < stop_idx; gene++) { T value = data[gene]; int gene_number = index[gene]; if (value > (T)0) { @@ -54,19 +54,19 @@ __global__ void mean_dense_kernel(const T* __restrict__ data, atomicAdd(&g_cluster[j * n_cls + clusters[i]], data[i * num_cols + j]); } -template -__global__ void mean_sparse_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void mean_sparse_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, const int* __restrict__ clusters, T* __restrict__ sum_gt0, int nrows, int n_cls) { int cell = blockDim.x * blockIdx.x + threadIdx.x; if (cell >= nrows) return; - int start_idx = indptr[cell]; - int stop_idx = indptr[cell + 1]; + IdxT start_idx = indptr[cell]; + IdxT stop_idx = indptr[cell + 1]; int cluster = clusters[cell]; - for (int gene = start_idx; gene < stop_idx; gene++) { + for (IdxT gene = start_idx; gene < stop_idx; gene++) { T value = data[gene]; int gene_number = index[gene]; if (value > (T)0) { diff --git a/src/rapids_singlecell/_cuda/ligrec/ligrec.cu b/src/rapids_singlecell/_cuda/ligrec/ligrec.cu index 1aacc391..ce711151 100644 --- a/src/rapids_singlecell/_cuda/ligrec/ligrec.cu +++ b/src/rapids_singlecell/_cuda/ligrec/ligrec.cu @@ -21,14 +21,15 @@ static inline void launch_sum_count_dense(const T* data, const int* clusters, CUDA_CHECK_LAST_ERROR(sum_and_count_dense_kernel); } -template -static inline void launch_sum_count_sparse(const int* indptr, const int* index, - const T* data, const int* clusters, - T* sum, int* count, int rows, - int ncls, cudaStream_t stream) { +template +static inline void launch_sum_count_sparse(const IdxT* indptr, + const IdxT* index, const T* data, + const int* clusters, T* sum, + int* count, int rows, int ncls, + cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((rows + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - sum_and_count_sparse_kernel<<>>( + sum_and_count_sparse_kernel<<>>( indptr, index, data, clusters, sum, count, rows, ncls); CUDA_CHECK_LAST_ERROR(sum_and_count_sparse_kernel); } @@ -45,14 +46,14 @@ static inline void launch_mean_dense(const T* data, const int* clusters, T* g, CUDA_CHECK_LAST_ERROR(mean_dense_kernel); } -template -static inline void launch_mean_sparse(const int* indptr, const int* index, +template +static inline void launch_mean_sparse(const IdxT* indptr, const IdxT* index, const T* data, const int* clusters, T* g, int rows, int ncls, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((rows + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - mean_sparse_kernel<<>>(indptr, index, data, - clusters, g, rows, ncls); + mean_sparse_kernel<<>>( + indptr, index, data, clusters, g, rows, ncls); CUDA_CHECK_LAST_ERROR(mean_sparse_kernel); } @@ -128,7 +129,7 @@ void register_bindings(nb::module_& m) { "data"_a, nb::kw_only(), "clusters"_a, "sum"_a, "count"_a, "rows"_a, "cols"_a, "ncls"_a, "stream"_a = 0); - // sum_count_sparse - float32 + // sum_count_sparse - float32, int indices m.def( "sum_count_sparse", [](gpu_array_c indptr, @@ -144,7 +145,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "sum"_a, "count"_a, "rows"_a, "ncls"_a, "stream"_a = 0); - // sum_count_sparse - float64 + // sum_count_sparse - float64, int indices m.def( "sum_count_sparse", [](gpu_array_c indptr, @@ -160,6 +161,38 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "sum"_a, "count"_a, "rows"_a, "ncls"_a, "stream"_a = 0); + // sum_count_sparse - float32, int64 indices + m.def( + "sum_count_sparse", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c clusters, + gpu_array_c sum, gpu_array_c count, + int rows, int ncls, std::uintptr_t stream) { + launch_sum_count_sparse( + indptr.data(), index.data(), data.data(), clusters.data(), + sum.data(), count.data(), rows, ncls, (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "sum"_a, + "count"_a, "rows"_a, "ncls"_a, "stream"_a = 0); + + // sum_count_sparse - float64, int64 indices + m.def( + "sum_count_sparse", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c clusters, + gpu_array_c sum, gpu_array_c count, + int rows, int ncls, std::uintptr_t stream) { + launch_sum_count_sparse( + indptr.data(), index.data(), data.data(), clusters.data(), + sum.data(), count.data(), rows, ncls, (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "sum"_a, + "count"_a, "rows"_a, "ncls"_a, "stream"_a = 0); + // mean_dense - float32 m.def( "mean_dense", @@ -186,7 +219,7 @@ void register_bindings(nb::module_& m) { "data"_a, nb::kw_only(), "clusters"_a, "g"_a, "rows"_a, "cols"_a, "ncls"_a, "stream"_a = 0); - // mean_sparse - float32 + // mean_sparse - float32, int indices m.def( "mean_sparse", [](gpu_array_c indptr, @@ -202,7 +235,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "g"_a, "rows"_a, "ncls"_a, "stream"_a = 0); - // mean_sparse - float64 + // mean_sparse - float64, int indices m.def( "mean_sparse", [](gpu_array_c indptr, @@ -218,6 +251,38 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "g"_a, "rows"_a, "ncls"_a, "stream"_a = 0); + // mean_sparse - float32, int64 indices + m.def( + "mean_sparse", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c clusters, + gpu_array_c g, int rows, int ncls, + std::uintptr_t stream) { + launch_mean_sparse(indptr.data(), index.data(), data.data(), + clusters.data(), g.data(), rows, ncls, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "g"_a, + "rows"_a, "ncls"_a, "stream"_a = 0); + + // mean_sparse - float64, int64 indices + m.def( + "mean_sparse", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c clusters, + gpu_array_c g, int rows, int ncls, + std::uintptr_t stream) { + launch_mean_sparse(indptr.data(), index.data(), data.data(), + clusters.data(), g.data(), rows, ncls, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "clusters"_a, "g"_a, + "rows"_a, "ncls"_a, "stream"_a = 0); + // elementwise_diff - float32 m.def( "elementwise_diff", diff --git a/src/rapids_singlecell/_cuda/mean_var/kernels_mv.cuh b/src/rapids_singlecell/_cuda/mean_var/kernels_mv.cuh index e754719e..ae2aa38a 100644 --- a/src/rapids_singlecell/_cuda/mean_var/kernels_mv.cuh +++ b/src/rapids_singlecell/_cuda/mean_var/kernels_mv.cuh @@ -2,9 +2,9 @@ #include -template -__global__ void mean_var_major_kernel(const int* __restrict__ indptr, - const int* __restrict__ indices, +template +__global__ void mean_var_major_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ indices, const T* __restrict__ data, double* __restrict__ means, double* __restrict__ vars, int major, @@ -12,8 +12,8 @@ __global__ void mean_var_major_kernel(const int* __restrict__ indptr, int major_idx = blockIdx.x; if (major_idx >= major) return; - int start_idx = indptr[major_idx]; - int stop_idx = indptr[major_idx + 1]; + IdxT start_idx = indptr[major_idx]; + IdxT stop_idx = indptr[major_idx + 1]; __shared__ double mean_place[64]; __shared__ double var_place[64]; @@ -22,7 +22,7 @@ __global__ void mean_var_major_kernel(const int* __restrict__ indptr, var_place[threadIdx.x] = 0.0; __syncthreads(); - for (int minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; + for (IdxT minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; minor_idx += blockDim.x) { double value = static_cast(data[minor_idx]); mean_place[threadIdx.x] += value; @@ -43,15 +43,16 @@ __global__ void mean_var_major_kernel(const int* __restrict__ indptr, } } -template -__global__ void mean_var_minor_kernel(const int* __restrict__ indices, +template +__global__ void mean_var_minor_kernel(const IdxT* __restrict__ indices, const T* __restrict__ data, double* __restrict__ means, - double* __restrict__ vars, int nnz) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; + double* __restrict__ vars, + long long nnz) { + long long idx = (long long)blockDim.x * blockIdx.x + threadIdx.x; if (idx >= nnz) return; double value = static_cast(data[idx]); - int minor_pos = indices[idx]; + IdxT minor_pos = indices[idx]; atomicAdd(&means[minor_pos], value); atomicAdd(&vars[minor_pos], value * value); } diff --git a/src/rapids_singlecell/_cuda/mean_var/mean_var.cu b/src/rapids_singlecell/_cuda/mean_var/mean_var.cu index 97b5d6ff..60e5fca3 100644 --- a/src/rapids_singlecell/_cuda/mean_var/mean_var.cu +++ b/src/rapids_singlecell/_cuda/mean_var/mean_var.cu @@ -8,32 +8,32 @@ using namespace nb::literals; constexpr int BLOCK_SIZE_MAJOR = 64; constexpr int BLOCK_SIZE_MINOR = 256; -template -static inline void launch_mean_var_major(const int* indptr, const int* indices, - const T* data, double* means, - double* vars, int major, int minor, - cudaStream_t stream) { +template +static inline void launch_mean_var_major(const IdxT* indptr, + const IdxT* indices, const T* data, + double* means, double* vars, int major, + int minor, cudaStream_t stream) { dim3 block(BLOCK_SIZE_MAJOR); dim3 grid(major); - mean_var_major_kernel<<>>( + mean_var_major_kernel<<>>( indptr, indices, data, means, vars, major, minor); CUDA_CHECK_LAST_ERROR(mean_var_major_kernel); } -template -static inline void launch_mean_var_minor(const int* indices, const T* data, - double* means, double* vars, int nnz, - cudaStream_t stream) { +template +static inline void launch_mean_var_minor(const IdxT* indices, const T* data, + double* means, double* vars, + long long nnz, cudaStream_t stream) { int block = BLOCK_SIZE_MINOR; - int grid = (nnz + block - 1) / block; - mean_var_minor_kernel + long long grid = (nnz + block - 1) / block; + mean_var_minor_kernel <<>>(indices, data, means, vars, nnz); CUDA_CHECK_LAST_ERROR(mean_var_minor_kernel); } template void register_bindings(nb::module_& m) { - // Float32 major + // Float32 major, int32 m.def( "mean_var_major", [](gpu_array_c indptr, @@ -48,7 +48,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), "major"_a, "minor"_a, "stream"_a = 0); - // Float64 major + // Float64 major, int32 m.def( "mean_var_major", [](gpu_array_c indptr, @@ -63,13 +63,43 @@ void register_bindings(nb::module_& m) { "indptr"_a, "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), "major"_a, "minor"_a, "stream"_a = 0); - // Float32 minor + // Float32 major, int64 + m.def( + "mean_var_major", + [](gpu_array_c indptr, + gpu_array_c indices, + gpu_array_c data, + gpu_array_c means, gpu_array_c vars, + int major, int minor, std::uintptr_t stream) { + launch_mean_var_major(indptr.data(), indices.data(), + data.data(), means.data(), vars.data(), + major, minor, (cudaStream_t)stream); + }, + "indptr"_a, "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), + "major"_a, "minor"_a, "stream"_a = 0); + + // Float64 major, int64 + m.def( + "mean_var_major", + [](gpu_array_c indptr, + gpu_array_c indices, + gpu_array_c data, + gpu_array_c means, gpu_array_c vars, + int major, int minor, std::uintptr_t stream) { + launch_mean_var_major( + indptr.data(), indices.data(), data.data(), means.data(), + vars.data(), major, minor, (cudaStream_t)stream); + }, + "indptr"_a, "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), + "major"_a, "minor"_a, "stream"_a = 0); + + // Float32 minor, int32 m.def( "mean_var_minor", [](gpu_array_c indices, gpu_array_c data, gpu_array_c means, gpu_array_c vars, - int nnz, std::uintptr_t stream) { + long long nnz, std::uintptr_t stream) { launch_mean_var_minor(indices.data(), data.data(), means.data(), vars.data(), nnz, (cudaStream_t)stream); @@ -77,13 +107,41 @@ void register_bindings(nb::module_& m) { "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), "nnz"_a, "stream"_a = 0); - // Float64 minor + // Float64 minor, int32 m.def( "mean_var_minor", [](gpu_array_c indices, gpu_array_c data, gpu_array_c means, gpu_array_c vars, - int nnz, std::uintptr_t stream) { + long long nnz, std::uintptr_t stream) { + launch_mean_var_minor(indices.data(), data.data(), + means.data(), vars.data(), nnz, + (cudaStream_t)stream); + }, + "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), "nnz"_a, + "stream"_a = 0); + + // Float32 minor, int64 + m.def( + "mean_var_minor", + [](gpu_array_c indices, + gpu_array_c data, + gpu_array_c means, gpu_array_c vars, + long long nnz, std::uintptr_t stream) { + launch_mean_var_minor(indices.data(), data.data(), + means.data(), vars.data(), nnz, + (cudaStream_t)stream); + }, + "indices"_a, "data"_a, "means"_a, "vars"_a, nb::kw_only(), "nnz"_a, + "stream"_a = 0); + + // Float64 minor, int64 + m.def( + "mean_var_minor", + [](gpu_array_c indices, + gpu_array_c data, + gpu_array_c means, gpu_array_c vars, + long long nnz, std::uintptr_t stream) { launch_mean_var_minor(indices.data(), data.data(), means.data(), vars.data(), nnz, (cudaStream_t)stream); diff --git a/src/rapids_singlecell/_cuda/nanmean/kernels_nanmean.cuh b/src/rapids_singlecell/_cuda/nanmean/kernels_nanmean.cuh index 0ef21d30..652bfae3 100644 --- a/src/rapids_singlecell/_cuda/nanmean/kernels_nanmean.cuh +++ b/src/rapids_singlecell/_cuda/nanmean/kernels_nanmean.cuh @@ -2,17 +2,18 @@ #include -template -__global__ void nan_mean_minor_kernel(const int* __restrict__ index, +template +__global__ void nan_mean_minor_kernel(const IdxT* __restrict__ index, const T* __restrict__ data, double* __restrict__ means, int* __restrict__ nans, - const bool* __restrict__ mask, int nnz) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; + const bool* __restrict__ mask, + long long nnz) { + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; if (idx >= nnz) { return; } - int minor_pos = index[idx]; + IdxT minor_pos = index[idx]; if (mask[minor_pos] == false) { return; } @@ -24,9 +25,9 @@ __global__ void nan_mean_minor_kernel(const int* __restrict__ index, } } -template -__global__ void nan_mean_major_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void nan_mean_major_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, double* __restrict__ means, int* __restrict__ nans, @@ -36,8 +37,8 @@ __global__ void nan_mean_major_kernel(const int* __restrict__ indptr, if (major_idx >= major) { return; } - int start_idx = indptr[major_idx]; - int stop_idx = indptr[major_idx + 1]; + IdxT start_idx = indptr[major_idx]; + IdxT stop_idx = indptr[major_idx + 1]; __shared__ double mean_place[64]; __shared__ int nan_place[64]; @@ -46,9 +47,9 @@ __global__ void nan_mean_major_kernel(const int* __restrict__ indptr, nan_place[threadIdx.x] = 0; __syncthreads(); - for (int minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; + for (IdxT minor_idx = start_idx + threadIdx.x; minor_idx < stop_idx; minor_idx += blockDim.x) { - int gene_number = index[minor_idx]; + IdxT gene_number = index[minor_idx]; if (mask[gene_number]) { T v = data[minor_idx]; if (isnan((double)v)) { diff --git a/src/rapids_singlecell/_cuda/nanmean/nanmean.cu b/src/rapids_singlecell/_cuda/nanmean/nanmean.cu index 3a9d660c..45517255 100644 --- a/src/rapids_singlecell/_cuda/nanmean/nanmean.cu +++ b/src/rapids_singlecell/_cuda/nanmean/nanmean.cu @@ -8,93 +8,74 @@ using namespace nb::literals; constexpr int BLOCK_SIZE_MINOR = 32; constexpr int BLOCK_SIZE_MAJOR = 64; -template -static inline void launch_nan_mean_minor(const int* index, const T* data, +template +static inline void launch_nan_mean_minor(const IdxT* index, const T* data, double* means, int* nans, - const bool* mask, int nnz, + const bool* mask, long long nnz, cudaStream_t stream) { dim3 block(BLOCK_SIZE_MINOR); dim3 grid((nnz + BLOCK_SIZE_MINOR - 1) / BLOCK_SIZE_MINOR); - nan_mean_minor_kernel + nan_mean_minor_kernel <<>>(index, data, means, nans, mask, nnz); CUDA_CHECK_LAST_ERROR(nan_mean_minor_kernel); } -template -static inline void launch_nan_mean_major(const int* indptr, const int* index, +template +static inline void launch_nan_mean_major(const IdxT* indptr, const IdxT* index, const T* data, double* means, int* nans, const bool* mask, int major, int minor, cudaStream_t stream) { dim3 block(BLOCK_SIZE_MAJOR); dim3 grid(major); - nan_mean_major_kernel<<>>( + nan_mean_major_kernel<<>>( indptr, index, data, means, nans, mask, major, minor); CUDA_CHECK_LAST_ERROR(nan_mean_major_kernel); } -template -void register_bindings(nb::module_& m) { - // nan_mean_minor - float32 - m.def( - "nan_mean_minor", - [](gpu_array_c index, - gpu_array_c data, - gpu_array_c means, gpu_array_c nans, - gpu_array_c mask, int nnz, - std::uintptr_t stream) { - launch_nan_mean_minor(index.data(), data.data(), - means.data(), nans.data(), mask.data(), - nnz, (cudaStream_t)stream); - }, - "index"_a, "data"_a, nb::kw_only(), "means"_a, "nans"_a, "mask"_a, - "nnz"_a, "stream"_a = 0); - - // nan_mean_minor - float64 +template +void def_nan_mean_minor(nb::module_& m) { m.def( "nan_mean_minor", - [](gpu_array_c index, - gpu_array_c data, - gpu_array_c means, gpu_array_c nans, - gpu_array_c mask, int nnz, - std::uintptr_t stream) { - launch_nan_mean_minor( + [](gpu_array_c index, + gpu_array_c data, gpu_array_c means, + gpu_array_c nans, gpu_array_c mask, + long long nnz, std::uintptr_t stream) { + launch_nan_mean_minor( index.data(), data.data(), means.data(), nans.data(), mask.data(), nnz, (cudaStream_t)stream); }, "index"_a, "data"_a, nb::kw_only(), "means"_a, "nans"_a, "mask"_a, "nnz"_a, "stream"_a = 0); +} - // nan_mean_major - float32 +template +void def_nan_mean_major(nb::module_& m) { m.def( "nan_mean_major", - [](gpu_array_c indptr, - gpu_array_c index, - gpu_array_c data, - gpu_array_c means, gpu_array_c nans, - gpu_array_c mask, int major, int minor, - std::uintptr_t stream) { - launch_nan_mean_major( + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, gpu_array_c means, + gpu_array_c nans, gpu_array_c mask, + int major, int minor, std::uintptr_t stream) { + launch_nan_mean_major( indptr.data(), index.data(), data.data(), means.data(), nans.data(), mask.data(), major, minor, (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "nans"_a, "mask"_a, "major"_a, "minor"_a, "stream"_a = 0); +} - // nan_mean_major - float64 - m.def( - "nan_mean_major", - [](gpu_array_c indptr, - gpu_array_c index, - gpu_array_c data, - gpu_array_c means, gpu_array_c nans, - gpu_array_c mask, int major, int minor, - std::uintptr_t stream) { - launch_nan_mean_major( - indptr.data(), index.data(), data.data(), means.data(), - nans.data(), mask.data(), major, minor, (cudaStream_t)stream); - }, - "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "means"_a, "nans"_a, - "mask"_a, "major"_a, "minor"_a, "stream"_a = 0); +template +void register_bindings(nb::module_& m) { + def_nan_mean_minor(m); + def_nan_mean_minor(m); + def_nan_mean_minor(m); + def_nan_mean_minor(m); + + def_nan_mean_major(m); + def_nan_mean_major(m); + def_nan_mean_major(m); + def_nan_mean_major(m); } NB_MODULE(_nanmean_cuda, m) { diff --git a/src/rapids_singlecell/_cuda/norm/kernels_norm.cuh b/src/rapids_singlecell/_cuda/norm/kernels_norm.cuh index 96d0e838..638b75c3 100644 --- a/src/rapids_singlecell/_cuda/norm/kernels_norm.cuh +++ b/src/rapids_singlecell/_cuda/norm/kernels_norm.cuh @@ -48,19 +48,19 @@ __global__ void dense_row_scale_kernel(T* __restrict__ data, int nrows, } // One block per row. Parallel row sum + scale in one pass. -template -__global__ void csr_row_scale_kernel(const int* __restrict__ indptr, +template +__global__ void csr_row_scale_kernel(const IdxT* __restrict__ indptr, T* __restrict__ data, int nrows, T target_sum) { int row = blockIdx.x; if (row >= nrows) return; - int start = indptr[row]; - int end = indptr[row + 1]; + IdxT start = indptr[row]; + IdxT end = indptr[row + 1]; // Parallel row sum T val = (T)0; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) val += data[i]; + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) val += data[i]; // Warp-level reduction for (int offset = warpSize / 2; offset > 0; offset >>= 1) @@ -88,24 +88,24 @@ __global__ void csr_row_scale_kernel(const int* __restrict__ indptr, // Parallel multiply T scale = warp_sums[0]; if (scale > (T)0) { - for (int i = start + threadIdx.x; i < end; i += blockDim.x) + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) data[i] *= scale; } } // One block per row. Warp-shuffle reduction for row sum. -template -__global__ void csr_sum_major_kernel(const int* __restrict__ indptr, +template +__global__ void csr_sum_major_kernel(const IdxT* __restrict__ indptr, const T* __restrict__ data, T* __restrict__ sums, int major) { int major_idx = blockIdx.x; if (major_idx >= major) return; - int start_idx = indptr[major_idx]; - int stop_idx = indptr[major_idx + 1]; + IdxT start_idx = indptr[major_idx]; + IdxT stop_idx = indptr[major_idx + 1]; T val = (T)0; - for (int i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) + for (IdxT i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) val += data[i]; // Warp-level reduction @@ -132,21 +132,21 @@ __global__ void csr_sum_major_kernel(const int* __restrict__ indptr, // ---------- find_hi_genes_csr ---------- // One block per row. Computes row sum, then marks genes whose value > // max_fraction * row_sum. -template -__global__ void find_hi_genes_csr_kernel(const int* __restrict__ indptr, - const int* __restrict__ indices, +template +__global__ void find_hi_genes_csr_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ indices, const T* __restrict__ data, bool* __restrict__ gene_is_hi, T max_fraction, int nrows) { int row = blockIdx.x; if (row >= nrows) return; - int start = indptr[row]; - int end = indptr[row + 1]; + IdxT start = indptr[row]; + IdxT end = indptr[row + 1]; // Phase 1: partial row sum T val = (T)0; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) val += data[i]; + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) val += data[i]; // Warp-level reduction for (int offset = warpSize / 2; offset > 0; offset >>= 1) @@ -172,7 +172,7 @@ __global__ void find_hi_genes_csr_kernel(const int* __restrict__ indptr, // Phase 2: check elements against threshold T threshold = warp_sums[0]; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) { + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) { if (data[i] > threshold) gene_is_hi[indices[i]] = true; } } @@ -180,21 +180,21 @@ __global__ void find_hi_genes_csr_kernel(const int* __restrict__ indptr, // ---------- masked_mul_csr ---------- // One block per row. Computes masked row sum (skipping masked genes), then // scales all data. -template -__global__ void masked_mul_csr_kernel(const int* __restrict__ indptr, - const int* __restrict__ indices, +template +__global__ void masked_mul_csr_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ indices, T* __restrict__ data, const bool* __restrict__ gene_mask, int nrows, T tsum) { int row = blockIdx.x; if (row >= nrows) return; - int start = indptr[row]; - int end = indptr[row + 1]; + IdxT start = indptr[row]; + IdxT end = indptr[row + 1]; // Phase 1: masked row sum T val = (T)0; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) { + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) { if (!gene_mask[indices[i]]) val += data[i]; } @@ -223,27 +223,27 @@ __global__ void masked_mul_csr_kernel(const int* __restrict__ indptr, // Phase 2: multiply T scale = warp_sums[0]; if (scale > (T)0) { - for (int i = start + threadIdx.x; i < end; i += blockDim.x) + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) data[i] *= scale; } } // ---------- masked_sum_major ---------- // One block per row. Computes sum skipping masked genes. -template -__global__ void masked_sum_major_kernel(const int* __restrict__ indptr, - const int* __restrict__ indices, +template +__global__ void masked_sum_major_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ indices, const T* __restrict__ data, const bool* __restrict__ gene_mask, T* __restrict__ sums, int major) { int major_idx = blockIdx.x; if (major_idx >= major) return; - int start_idx = indptr[major_idx]; - int stop_idx = indptr[major_idx + 1]; + IdxT start_idx = indptr[major_idx]; + IdxT stop_idx = indptr[major_idx + 1]; T val = (T)0; - for (int i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) { + for (IdxT i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) { if (!gene_mask[indices[i]]) val += data[i]; } @@ -270,8 +270,8 @@ __global__ void masked_sum_major_kernel(const int* __restrict__ indptr, // ---------- prescaled_mul_csr ---------- // One block per row. Multiplies each element by a pre-computed per-row scale. -template -__global__ void prescaled_mul_csr_kernel(const int* __restrict__ indptr, +template +__global__ void prescaled_mul_csr_kernel(const IdxT* __restrict__ indptr, T* __restrict__ data, const T* __restrict__ scales, int nrows) { @@ -279,10 +279,10 @@ __global__ void prescaled_mul_csr_kernel(const int* __restrict__ indptr, if (row >= nrows) return; T scale = scales[row]; - int start = indptr[row]; - int end = indptr[row + 1]; + IdxT start = indptr[row]; + IdxT end = indptr[row + 1]; - for (int i = start + threadIdx.x; i < end; i += blockDim.x) + for (IdxT i = start + threadIdx.x; i < end; i += blockDim.x) data[i] *= scale; } diff --git a/src/rapids_singlecell/_cuda/norm/norm.cu b/src/rapids_singlecell/_cuda/norm/norm.cu index d589d308..3ebfd9f5 100644 --- a/src/rapids_singlecell/_cuda/norm/norm.cu +++ b/src/rapids_singlecell/_cuda/norm/norm.cu @@ -17,70 +17,70 @@ static inline void launch_dense_row_scale(T* data, int nrows, int ncols, CUDA_CHECK_LAST_ERROR(dense_row_scale_kernel); } -template -static inline void launch_csr_row_scale(const int* indptr, T* data, int nrows, +template +static inline void launch_csr_row_scale(const IdxT* indptr, T* data, int nrows, T target_sum, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(nrows); - csr_row_scale_kernel + csr_row_scale_kernel <<>>(indptr, data, nrows, target_sum); CUDA_CHECK_LAST_ERROR(csr_row_scale_kernel); } -template -static inline void launch_csr_sum_major(const int* indptr, const T* data, +template +static inline void launch_csr_sum_major(const IdxT* indptr, const T* data, T* sums, int major, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(major); - csr_sum_major_kernel + csr_sum_major_kernel <<>>(indptr, data, sums, major); CUDA_CHECK_LAST_ERROR(csr_sum_major_kernel); } -template -static inline void launch_find_hi_genes_csr(const int* indptr, - const int* indices, const T* data, +template +static inline void launch_find_hi_genes_csr(const IdxT* indptr, + const IdxT* indices, const T* data, bool* gene_is_hi, T max_fraction, int nrows, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(nrows); - find_hi_genes_csr_kernel<<>>( + find_hi_genes_csr_kernel<<>>( indptr, indices, data, gene_is_hi, max_fraction, nrows); CUDA_CHECK_LAST_ERROR(find_hi_genes_csr_kernel); } -template -static inline void launch_masked_mul_csr(const int* indptr, const int* indices, - T* data, const bool* gene_mask, - int nrows, T tsum, - cudaStream_t stream) { +template +static inline void launch_masked_mul_csr(const IdxT* indptr, + const IdxT* indices, T* data, + const bool* gene_mask, int nrows, + T tsum, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(nrows); - masked_mul_csr_kernel<<>>( + masked_mul_csr_kernel<<>>( indptr, indices, data, gene_mask, nrows, tsum); CUDA_CHECK_LAST_ERROR(masked_mul_csr_kernel); } -template -static inline void launch_masked_sum_major(const int* indptr, - const int* indices, const T* data, +template +static inline void launch_masked_sum_major(const IdxT* indptr, + const IdxT* indices, const T* data, const bool* gene_mask, T* sums, int major, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(major); - masked_sum_major_kernel<<>>( + masked_sum_major_kernel<<>>( indptr, indices, data, gene_mask, sums, major); CUDA_CHECK_LAST_ERROR(masked_sum_major_kernel); } -template -static inline void launch_prescaled_mul_csr(const int* indptr, T* data, +template +static inline void launch_prescaled_mul_csr(const IdxT* indptr, T* data, const T* scales, int nrows, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(nrows); - prescaled_mul_csr_kernel + prescaled_mul_csr_kernel <<>>(indptr, data, scales, nrows); CUDA_CHECK_LAST_ERROR(prescaled_mul_csr_kernel); } @@ -96,197 +96,159 @@ static inline void launch_prescaled_mul_dense(T* data, const T* scales, CUDA_CHECK_LAST_ERROR(prescaled_mul_dense_kernel); } -template -void register_bindings(nb::module_& m) { - // mul_dense - float32 - m.def( - "mul_dense", - [](gpu_array_c data, int nrows, int ncols, - float target_sum, std::uintptr_t stream) { - launch_dense_row_scale(data.data(), nrows, ncols, target_sum, - (cudaStream_t)stream); - }, - "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "target_sum"_a, - "stream"_a = 0); - - // mul_dense - float64 - m.def( - "mul_dense", - [](gpu_array_c data, int nrows, int ncols, - double target_sum, std::uintptr_t stream) { - launch_dense_row_scale(data.data(), nrows, ncols, - target_sum, (cudaStream_t)stream); - }, - "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "target_sum"_a, - "stream"_a = 0); - - // mul_csr - float32 +template +void def_mul_csr(nb::module_& m) { m.def( "mul_csr", - [](gpu_array_c indptr, - gpu_array_c data, int nrows, float target_sum, - std::uintptr_t stream) { - launch_csr_row_scale(indptr.data(), data.data(), nrows, - target_sum, (cudaStream_t)stream); + [](gpu_array_c indptr, gpu_array_c data, + int nrows, T target_sum, std::uintptr_t stream) { + launch_csr_row_scale(indptr.data(), data.data(), nrows, + target_sum, (cudaStream_t)stream); }, "indptr"_a, "data"_a, nb::kw_only(), "nrows"_a, "target_sum"_a, "stream"_a = 0); +} - // mul_csr - float64 - m.def( - "mul_csr", - [](gpu_array_c indptr, - gpu_array_c data, int nrows, double target_sum, - std::uintptr_t stream) { - launch_csr_row_scale(indptr.data(), data.data(), nrows, - target_sum, (cudaStream_t)stream); - }, - "indptr"_a, "data"_a, nb::kw_only(), "nrows"_a, "target_sum"_a, - "stream"_a = 0); - - // sum_major - float32 - m.def( - "sum_major", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c sums, int major, std::uintptr_t stream) { - launch_csr_sum_major(indptr.data(), data.data(), sums.data(), - major, (cudaStream_t)stream); - }, - "indptr"_a, "data"_a, nb::kw_only(), "sums"_a, "major"_a, - "stream"_a = 0); - - // sum_major - float64 +template +void def_sum_major(nb::module_& m) { m.def( "sum_major", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c sums, int major, std::uintptr_t stream) { - launch_csr_sum_major(indptr.data(), data.data(), - sums.data(), major, - (cudaStream_t)stream); + [](gpu_array_c indptr, + gpu_array_c data, gpu_array_c sums, + int major, std::uintptr_t stream) { + launch_csr_sum_major(indptr.data(), data.data(), + sums.data(), major, + (cudaStream_t)stream); }, "indptr"_a, "data"_a, nb::kw_only(), "sums"_a, "major"_a, "stream"_a = 0); +} - // find_hi_genes_csr - float32 - m.def( - "find_hi_genes_csr", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c gene_is_hi, float max_fraction, int nrows, - std::uintptr_t stream) { - launch_find_hi_genes_csr( - indptr.data(), indices.data(), data.data(), gene_is_hi.data(), - max_fraction, nrows, (cudaStream_t)stream); - }, - "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_is_hi"_a, - "max_fraction"_a, "nrows"_a, "stream"_a = 0); - - // find_hi_genes_csr - float64 +template +void def_find_hi_genes_csr(nb::module_& m) { m.def( "find_hi_genes_csr", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c gene_is_hi, double max_fraction, int nrows, + [](gpu_array_c indptr, + gpu_array_c indices, + gpu_array_c data, + gpu_array_c gene_is_hi, T max_fraction, int nrows, std::uintptr_t stream) { - launch_find_hi_genes_csr( + launch_find_hi_genes_csr( indptr.data(), indices.data(), data.data(), gene_is_hi.data(), max_fraction, nrows, (cudaStream_t)stream); }, "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_is_hi"_a, "max_fraction"_a, "nrows"_a, "stream"_a = 0); +} - // masked_mul_csr - float32 - m.def( - "masked_mul_csr", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c gene_mask, int nrows, float tsum, - std::uintptr_t stream) { - launch_masked_mul_csr(indptr.data(), indices.data(), - data.data(), gene_mask.data(), nrows, - tsum, (cudaStream_t)stream); - }, - "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_mask"_a, - "nrows"_a, "tsum"_a, "stream"_a = 0); - - // masked_mul_csr - float64 +template +void def_masked_mul_csr(nb::module_& m) { m.def( "masked_mul_csr", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c gene_mask, int nrows, double tsum, + [](gpu_array_c indptr, + gpu_array_c indices, gpu_array_c data, + gpu_array_c gene_mask, int nrows, T tsum, std::uintptr_t stream) { - launch_masked_mul_csr(indptr.data(), indices.data(), - data.data(), gene_mask.data(), nrows, - tsum, (cudaStream_t)stream); + launch_masked_mul_csr(indptr.data(), indices.data(), + data.data(), gene_mask.data(), nrows, + tsum, (cudaStream_t)stream); }, "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_mask"_a, "nrows"_a, "tsum"_a, "stream"_a = 0); +} - // masked_sum_major - float32 - m.def( - "masked_sum_major", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c gene_mask, - gpu_array_c sums, int major, std::uintptr_t stream) { - launch_masked_sum_major( - indptr.data(), indices.data(), data.data(), gene_mask.data(), - sums.data(), major, (cudaStream_t)stream); - }, - "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_mask"_a, - "sums"_a, "major"_a, "stream"_a = 0); - - // masked_sum_major - float64 +template +void def_masked_sum_major(nb::module_& m) { m.def( "masked_sum_major", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, + [](gpu_array_c indptr, + gpu_array_c indices, + gpu_array_c data, gpu_array_c gene_mask, - gpu_array_c sums, int major, std::uintptr_t stream) { - launch_masked_sum_major( + gpu_array_c sums, int major, std::uintptr_t stream) { + launch_masked_sum_major( indptr.data(), indices.data(), data.data(), gene_mask.data(), sums.data(), major, (cudaStream_t)stream); }, "indptr"_a, "indices"_a, "data"_a, nb::kw_only(), "gene_mask"_a, "sums"_a, "major"_a, "stream"_a = 0); +} - // prescaled_mul_csr - float32 +template +void def_prescaled_mul_csr(nb::module_& m) { m.def( "prescaled_mul_csr", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c scales, int nrows, + [](gpu_array_c indptr, gpu_array_c data, + gpu_array_c scales, int nrows, std::uintptr_t stream) { - launch_prescaled_mul_csr(indptr.data(), data.data(), - scales.data(), nrows, - (cudaStream_t)stream); + launch_prescaled_mul_csr(indptr.data(), data.data(), + scales.data(), nrows, + (cudaStream_t)stream); }, "indptr"_a, "data"_a, nb::kw_only(), "scales"_a, "nrows"_a, "stream"_a = 0); +} + +template +void register_bindings(nb::module_& m) { + // mul_dense - float32 + m.def( + "mul_dense", + [](gpu_array_c data, int nrows, int ncols, + float target_sum, std::uintptr_t stream) { + launch_dense_row_scale(data.data(), nrows, ncols, target_sum, + (cudaStream_t)stream); + }, + "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "target_sum"_a, + "stream"_a = 0); - // prescaled_mul_csr - float64 + // mul_dense - float64 m.def( - "prescaled_mul_csr", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c scales, int nrows, - std::uintptr_t stream) { - launch_prescaled_mul_csr(indptr.data(), data.data(), - scales.data(), nrows, - (cudaStream_t)stream); + "mul_dense", + [](gpu_array_c data, int nrows, int ncols, + double target_sum, std::uintptr_t stream) { + launch_dense_row_scale(data.data(), nrows, ncols, + target_sum, (cudaStream_t)stream); }, - "indptr"_a, "data"_a, nb::kw_only(), "scales"_a, "nrows"_a, + "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "target_sum"_a, "stream"_a = 0); + // mul_csr - float32/float64 x int32/int64 + def_mul_csr(m); + def_mul_csr(m); + def_mul_csr(m); + def_mul_csr(m); + + // sum_major - float32/float64 x int32/int64 + def_sum_major(m); + def_sum_major(m); + def_sum_major(m); + def_sum_major(m); + + // find_hi_genes_csr - float32/float64 x int32/int64 + def_find_hi_genes_csr(m); + def_find_hi_genes_csr(m); + def_find_hi_genes_csr(m); + def_find_hi_genes_csr(m); + + // masked_mul_csr - float32/float64 x int32/int64 + def_masked_mul_csr(m); + def_masked_mul_csr(m); + def_masked_mul_csr(m); + def_masked_mul_csr(m); + + // masked_sum_major - float32/float64 x int32/int64 + def_masked_sum_major(m); + def_masked_sum_major(m); + def_masked_sum_major(m); + def_masked_sum_major(m); + + // prescaled_mul_csr - float32/float64 x int32/int64 + def_prescaled_mul_csr(m); + def_prescaled_mul_csr(m); + def_prescaled_mul_csr(m); + def_prescaled_mul_csr(m); + // prescaled_mul_dense - float32 m.def( "prescaled_mul_dense", diff --git a/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh b/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh index e31936e4..285aec18 100644 --- a/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh +++ b/src/rapids_singlecell/_cuda/pr/kernels_pr.cuh @@ -2,9 +2,9 @@ #include -template +template __global__ void sparse_norm_res_csc_kernel( - const int* __restrict__ indptr, const int* __restrict__ index, + const IdxT* __restrict__ indptr, const IdxT* __restrict__ index, const T* __restrict__ data, const T* __restrict__ sums_cells, const T* __restrict__ sums_genes, T* __restrict__ residuals, const T inv_sum_total, const T clip, const T inv_theta, int n_cells, @@ -13,9 +13,9 @@ __global__ void sparse_norm_res_csc_kernel( if (gene >= n_genes) { return; } - int start = indptr[gene]; - int stop = indptr[gene + 1]; - int sparse_idx = start; + IdxT start = indptr[gene]; + IdxT stop = indptr[gene + 1]; + IdxT sparse_idx = start; for (int cell = 0; cell < n_cells; ++cell) { T mu = sums_genes[gene] * sums_cells[cell] * inv_sum_total; long long res_index = static_cast(cell) * n_genes + gene; @@ -31,9 +31,9 @@ __global__ void sparse_norm_res_csc_kernel( } } -template +template __global__ void sparse_norm_res_csr_kernel( - const int* __restrict__ indptr, const int* __restrict__ index, + const IdxT* __restrict__ indptr, const IdxT* __restrict__ index, const T* __restrict__ data, const T* __restrict__ sums_cells, const T* __restrict__ sums_genes, T* __restrict__ residuals, const T inv_sum_total, const T clip, const T inv_theta, int n_cells, @@ -42,9 +42,9 @@ __global__ void sparse_norm_res_csr_kernel( if (cell >= n_cells) { return; } - int start = indptr[cell]; - int stop = indptr[cell + 1]; - int sparse_idx = start; + IdxT start = indptr[cell]; + IdxT stop = indptr[cell + 1]; + IdxT sparse_idx = start; for (int gene = 0; gene < n_genes; ++gene) { long long res_index = static_cast(cell) * n_genes + gene; T mu = sums_genes[gene] * sums_cells[cell] * inv_sum_total; diff --git a/src/rapids_singlecell/_cuda/pr/kernels_pr_hvg.cuh b/src/rapids_singlecell/_cuda/pr/kernels_pr_hvg.cuh index 9e8c91f4..03b06774 100644 --- a/src/rapids_singlecell/_cuda/pr/kernels_pr_hvg.cuh +++ b/src/rapids_singlecell/_cuda/pr/kernels_pr_hvg.cuh @@ -4,9 +4,9 @@ // Compute column sums (sums_genes) and row sums (sums_cells) from CSC sparse // matrix One thread per column (gene), atomicAdd for row sums -template -__global__ void sparse_sum_csc_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void sparse_sum_csc_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_genes, T* __restrict__ sums_cells, int n_genes) { @@ -14,10 +14,10 @@ __global__ void sparse_sum_csc_kernel(const int* __restrict__ indptr, if (gene >= n_genes) { return; } - int start = indptr[gene]; - int stop = indptr[gene + 1]; + IdxT start = indptr[gene]; + IdxT stop = indptr[gene + 1]; T col_sum = (T)0; - for (int i = start; i < stop; ++i) { + for (IdxT i = start; i < stop; ++i) { T val = data[i]; col_sum += val; atomicAdd(&sums_cells[index[i]], val); @@ -27,9 +27,9 @@ __global__ void sparse_sum_csc_kernel(const int* __restrict__ indptr, // Welford's single-pass algorithm for variance of clipped Pearson residuals // (CSC sparse) -template +template __global__ void csc_hvg_res_kernel( - const int* __restrict__ indptr, const int* __restrict__ index, + const IdxT* __restrict__ indptr, const IdxT* __restrict__ index, const T* __restrict__ data, const T* __restrict__ sums_genes, const T* __restrict__ sums_cells, T* __restrict__ residuals, const T inv_sum_total, const T clip, const T inv_theta, int n_genes, @@ -37,15 +37,15 @@ __global__ void csc_hvg_res_kernel( int gene = blockDim.x * blockIdx.x + threadIdx.x; if (gene >= n_genes) return; - int start = indptr[gene]; - int stop = indptr[gene + 1]; + IdxT start = indptr[gene]; + IdxT stop = indptr[gene + 1]; T gene_sum = sums_genes[gene]; // Welford's online algorithm: single pass for mean and variance T mean = (T)0; T M2 = (T)0; - int sparse_idx = start; + IdxT sparse_idx = start; for (int cell = 0; cell < n_cells; ++cell) { T mu = gene_sum * sums_cells[cell] * inv_sum_total; diff --git a/src/rapids_singlecell/_cuda/pr/pr.cu b/src/rapids_singlecell/_cuda/pr/pr.cu index e9c3f077..72ed50d7 100644 --- a/src/rapids_singlecell/_cuda/pr/pr.cu +++ b/src/rapids_singlecell/_cuda/pr/pr.cu @@ -10,27 +10,27 @@ constexpr int SPARSE_BLOCK_SIZE = 32; constexpr int CSR_NORM_BLOCK_SIZE = 8; constexpr int DENSE_BLOCK_DIM = 8; -template +template static inline void launch_sparse_norm_res_csc( - const int* indptr, const int* index, const T* data, const T* sums_cells, + const IdxT* indptr, const IdxT* index, const T* data, const T* sums_cells, const T* sums_genes, T* residuals, T inv_sum_total, T clip, T inv_theta, int n_cells, int n_genes, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_genes + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - sparse_norm_res_csc_kernel<<>>( + sparse_norm_res_csc_kernel<<>>( indptr, index, data, sums_cells, sums_genes, residuals, inv_sum_total, clip, inv_theta, n_cells, n_genes); CUDA_CHECK_LAST_ERROR(sparse_norm_res_csc_kernel); } -template +template static inline void launch_sparse_norm_res_csr( - const int* indptr, const int* index, const T* data, const T* sums_cells, + const IdxT* indptr, const IdxT* index, const T* data, const T* sums_cells, const T* sums_genes, T* residuals, T inv_sum_total, T clip, T inv_theta, int n_cells, int n_genes, cudaStream_t stream) { dim3 block(CSR_NORM_BLOCK_SIZE); dim3 grid((n_cells + CSR_NORM_BLOCK_SIZE - 1) / CSR_NORM_BLOCK_SIZE); - sparse_norm_res_csr_kernel<<>>( + sparse_norm_res_csr_kernel<<>>( indptr, index, data, sums_cells, sums_genes, residuals, inv_sum_total, clip, inv_theta, n_cells, n_genes); CUDA_CHECK_LAST_ERROR(sparse_norm_res_csr_kernel); @@ -51,20 +51,20 @@ static inline void launch_dense_norm_res(const T* X, T* residuals, CUDA_CHECK_LAST_ERROR(dense_norm_res_kernel); } -template -static inline void launch_sparse_sum_csc(const int* indptr, const int* index, +template +static inline void launch_sparse_sum_csc(const IdxT* indptr, const IdxT* index, const T* data, T* sums_genes, T* sums_cells, int n_genes, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_genes + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - sparse_sum_csc_kernel<<>>( + sparse_sum_csc_kernel<<>>( indptr, index, data, sums_genes, sums_cells, n_genes); CUDA_CHECK_LAST_ERROR(sparse_sum_csc_kernel); } -template -static inline void launch_csc_hvg_res(const int* indptr, const int* index, +template +static inline void launch_csc_hvg_res(const IdxT* indptr, const IdxT* index, const T* data, const T* sums_genes, const T* sums_cells, T* residuals, T inv_sum_total, T clip, T inv_theta, @@ -72,7 +72,7 @@ static inline void launch_csc_hvg_res(const int* indptr, const int* index, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_genes + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - csc_hvg_res_kernel<<>>( + csc_hvg_res_kernel<<>>( indptr, index, data, sums_genes, sums_cells, residuals, inv_sum_total, clip, inv_theta, n_genes, n_cells); CUDA_CHECK_LAST_ERROR(csc_hvg_res_kernel); @@ -92,19 +92,19 @@ static inline void launch_dense_hvg_res(const T* data, const T* sums_genes, CUDA_CHECK_LAST_ERROR(dense_hvg_res_kernel); } -// Helper to define sparse_norm_res_csc for a given dtype -template +// Helper to define sparse_norm_res_csc for a given dtype and index type +template void def_sparse_norm_res_csc(nb::module_& m) { m.def( "sparse_norm_res_csc", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c sums_cells, gpu_array_c sums_genes, gpu_array_c residuals, T inv_sum_total, T clip, T inv_theta, int n_cells, int n_genes, std::uintptr_t stream) { - launch_sparse_norm_res_csc( + launch_sparse_norm_res_csc( indptr.data(), index.data(), data.data(), sums_cells.data(), sums_genes.data(), residuals.data(), inv_sum_total, clip, inv_theta, n_cells, n_genes, (cudaStream_t)stream); @@ -114,19 +114,19 @@ void def_sparse_norm_res_csc(nb::module_& m) { "inv_theta"_a, "n_cells"_a, "n_genes"_a, "stream"_a = 0); } -// Helper to define sparse_norm_res_csr for a given dtype -template +// Helper to define sparse_norm_res_csr for a given dtype and index type +template void def_sparse_norm_res_csr(nb::module_& m) { m.def( "sparse_norm_res_csr", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c sums_cells, gpu_array_c sums_genes, gpu_array_c residuals, T inv_sum_total, T clip, T inv_theta, int n_cells, int n_genes, std::uintptr_t stream) { - launch_sparse_norm_res_csr( + launch_sparse_norm_res_csr( indptr.data(), index.data(), data.data(), sums_cells.data(), sums_genes.data(), residuals.data(), inv_sum_total, clip, inv_theta, n_cells, n_genes, (cudaStream_t)stream); @@ -155,37 +155,37 @@ void def_dense_norm_res(nb::module_& m) { "stream"_a = 0); } -// Helper to define sparse_sum_csc for a given dtype -template +// Helper to define sparse_sum_csc for a given dtype and index type +template void def_sparse_sum_csc(nb::module_& m) { m.def( "sparse_sum_csc", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c sums_genes, gpu_array_c sums_cells, int n_genes, std::uintptr_t stream) { - launch_sparse_sum_csc(indptr.data(), index.data(), data.data(), - sums_genes.data(), sums_cells.data(), - n_genes, (cudaStream_t)stream); + launch_sparse_sum_csc( + indptr.data(), index.data(), data.data(), sums_genes.data(), + sums_cells.data(), n_genes, (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_genes"_a, "sums_cells"_a, "n_genes"_a, "stream"_a = 0); } -// Helper to define csc_hvg_res for a given dtype -template +// Helper to define csc_hvg_res for a given dtype and index type +template void def_csc_hvg_res(nb::module_& m) { m.def( "csc_hvg_res", - [](gpu_array_c indptr, - gpu_array_c index, + [](gpu_array_c indptr, + gpu_array_c index, gpu_array_c data, gpu_array_c sums_genes, gpu_array_c sums_cells, gpu_array_c residuals, T inv_sum_total, T clip, T inv_theta, int n_genes, int n_cells, std::uintptr_t stream) { - launch_csc_hvg_res( + launch_csc_hvg_res( indptr.data(), index.data(), data.data(), sums_genes.data(), sums_cells.data(), residuals.data(), inv_sum_total, clip, inv_theta, n_genes, n_cells, (cudaStream_t)stream); @@ -218,24 +218,32 @@ void def_dense_hvg_res(nb::module_& m) { template void register_bindings(nb::module_& m) { // sparse_norm_res_csc - def_sparse_norm_res_csc(m); - def_sparse_norm_res_csc(m); + def_sparse_norm_res_csc(m); + def_sparse_norm_res_csc(m); + def_sparse_norm_res_csc(m); + def_sparse_norm_res_csc(m); // sparse_norm_res_csr - def_sparse_norm_res_csr(m); - def_sparse_norm_res_csr(m); + def_sparse_norm_res_csr(m); + def_sparse_norm_res_csr(m); + def_sparse_norm_res_csr(m); + def_sparse_norm_res_csr(m); // dense_norm_res def_dense_norm_res(m); def_dense_norm_res(m); // sparse_sum_csc - def_sparse_sum_csc(m); - def_sparse_sum_csc(m); + def_sparse_sum_csc(m); + def_sparse_sum_csc(m); + def_sparse_sum_csc(m); + def_sparse_sum_csc(m); // csc_hvg_res - def_csc_hvg_res(m); - def_csc_hvg_res(m); + def_csc_hvg_res(m); + def_csc_hvg_res(m); + def_csc_hvg_res(m); + def_csc_hvg_res(m); // dense_hvg_res - always F-contiguous (Python calls cp.asfortranarray) def_dense_hvg_res(m); diff --git a/src/rapids_singlecell/_cuda/qc/kernels_qc.cuh b/src/rapids_singlecell/_cuda/qc/kernels_qc.cuh index 96860272..c9b5ddb3 100644 --- a/src/rapids_singlecell/_cuda/qc/kernels_qc.cuh +++ b/src/rapids_singlecell/_cuda/qc/kernels_qc.cuh @@ -2,9 +2,9 @@ #include -template -__global__ void qc_csc_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void qc_csc_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_cells, T* __restrict__ sums_genes, @@ -12,13 +12,13 @@ __global__ void qc_csc_kernel(const int* __restrict__ indptr, int* __restrict__ gene_ex, int n_genes) { int gene = blockDim.x * blockIdx.x + threadIdx.x; if (gene >= n_genes) return; - int start_idx = indptr[gene]; - int stop_idx = indptr[gene + 1]; + IdxT start_idx = indptr[gene]; + IdxT stop_idx = indptr[gene + 1]; T sums_genes_i = T(0); int gene_ex_i = 0; - for (int p = start_idx; p < stop_idx; ++p) { + for (IdxT p = start_idx; p < stop_idx; ++p) { T v = data[p]; - int cell = index[p]; + IdxT cell = index[p]; sums_genes_i += v; atomicAdd(&sums_cells[cell], v); ++gene_ex_i; @@ -28,9 +28,9 @@ __global__ void qc_csc_kernel(const int* __restrict__ indptr, gene_ex[gene] = gene_ex_i; } -template -__global__ void qc_csr_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void qc_csr_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_cells, T* __restrict__ sums_genes, @@ -38,13 +38,13 @@ __global__ void qc_csr_kernel(const int* __restrict__ indptr, int* __restrict__ gene_ex, int n_cells) { int cell = blockDim.x * blockIdx.x + threadIdx.x; if (cell >= n_cells) return; - int start_idx = indptr[cell]; - int stop_idx = indptr[cell + 1]; + IdxT start_idx = indptr[cell]; + IdxT stop_idx = indptr[cell + 1]; T sums_cells_i = T(0); int cell_ex_i = 0; - for (int p = start_idx; p < stop_idx; ++p) { + for (IdxT p = start_idx; p < stop_idx; ++p) { T v = data[p]; - int gene = index[p]; + IdxT gene = index[p]; atomicAdd(&sums_genes[gene], v); sums_cells_i += v; atomicAdd(&gene_ex[gene], 1); @@ -74,36 +74,36 @@ __global__ void qc_dense_kernel(const T* __restrict__ data, } } -template -__global__ void qc_csc_sub_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void qc_csc_sub_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_cells, const bool* __restrict__ mask, int n_genes) { int gene = blockDim.x * blockIdx.x + threadIdx.x; if (gene >= n_genes) return; if (!mask[gene]) return; - int start_idx = indptr[gene]; - int stop_idx = indptr[gene + 1]; - for (int p = start_idx; p < stop_idx; ++p) { - int cell = index[p]; + IdxT start_idx = indptr[gene]; + IdxT stop_idx = indptr[gene + 1]; + for (IdxT p = start_idx; p < stop_idx; ++p) { + IdxT cell = index[p]; atomicAdd(&sums_cells[cell], data[p]); } } -template -__global__ void qc_csr_sub_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void qc_csr_sub_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_cells, const bool* __restrict__ mask, int n_cells) { int cell = blockDim.x * blockIdx.x + threadIdx.x; if (cell >= n_cells) return; - int start_idx = indptr[cell]; - int stop_idx = indptr[cell + 1]; + IdxT start_idx = indptr[cell]; + IdxT stop_idx = indptr[cell + 1]; T sums_cells_i = T(0); - for (int p = start_idx; p < stop_idx; ++p) { - int gene = index[p]; + for (IdxT p = start_idx; p < stop_idx; ++p) { + IdxT gene = index[p]; if (mask[gene]) sums_cells_i += data[p]; } sums_cells[cell] = sums_cells_i; diff --git a/src/rapids_singlecell/_cuda/qc/qc.cu b/src/rapids_singlecell/_cuda/qc/qc.cu index b8b714bc..f61de010 100644 --- a/src/rapids_singlecell/_cuda/qc/qc.cu +++ b/src/rapids_singlecell/_cuda/qc/qc.cu @@ -8,26 +8,26 @@ using namespace nb::literals; constexpr int SPARSE_BLOCK_SIZE = 32; constexpr int DENSE_BLOCK_DIM = 16; -template -static inline void launch_qc_csc(const int* indptr, const int* index, +template +static inline void launch_qc_csc(const IdxT* indptr, const IdxT* index, const T* data, T* sums_cells, T* sums_genes, int* cell_ex, int* gene_ex, int n_genes, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_genes + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - qc_csc_kernel<<>>( + qc_csc_kernel<<>>( indptr, index, data, sums_cells, sums_genes, cell_ex, gene_ex, n_genes); CUDA_CHECK_LAST_ERROR(qc_csc_kernel); } -template -static inline void launch_qc_csr(const int* indptr, const int* index, +template +static inline void launch_qc_csr(const IdxT* indptr, const IdxT* index, const T* data, T* sums_cells, T* sums_genes, int* cell_ex, int* gene_ex, int n_cells, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_cells + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - qc_csr_kernel<<>>( + qc_csr_kernel<<>>( indptr, index, data, sums_cells, sums_genes, cell_ex, gene_ex, n_cells); CUDA_CHECK_LAST_ERROR(qc_csr_kernel); } @@ -44,27 +44,27 @@ static inline void launch_qc_dense(const T* data, T* sums_cells, T* sums_genes, CUDA_CHECK_LAST_ERROR(qc_dense_kernel); } -template -static inline void launch_qc_csc_sub(const int* indptr, const int* index, +template +static inline void launch_qc_csc_sub(const IdxT* indptr, const IdxT* index, const T* data, T* sums_cells, const bool* mask, int n_genes, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_genes + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - qc_csc_sub_kernel<<>>(indptr, index, data, - sums_cells, mask, n_genes); + qc_csc_sub_kernel<<>>( + indptr, index, data, sums_cells, mask, n_genes); CUDA_CHECK_LAST_ERROR(qc_csc_sub_kernel); } -template -static inline void launch_qc_csr_sub(const int* indptr, const int* index, +template +static inline void launch_qc_csr_sub(const IdxT* indptr, const IdxT* index, const T* data, T* sums_cells, const bool* mask, int n_cells, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_cells + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - qc_csr_sub_kernel<<>>(indptr, index, data, - sums_cells, mask, n_cells); + qc_csr_sub_kernel<<>>( + indptr, index, data, sums_cells, mask, n_cells); CUDA_CHECK_LAST_ERROR(qc_csr_sub_kernel); } @@ -82,7 +82,7 @@ static inline void launch_qc_dense_sub(const T* data, T* sums_cells, template void register_bindings(nb::module_& m) { - // sparse_qc_csc - float32 + // sparse_qc_csc - float32, int32 m.def( "sparse_qc_csc", [](gpu_array_c indptr, @@ -100,7 +100,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_genes"_a, "stream"_a = 0); - // sparse_qc_csc - float64 + // sparse_qc_csc - float64, int32 m.def( "sparse_qc_csc", [](gpu_array_c indptr, @@ -118,7 +118,43 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_genes"_a, "stream"_a = 0); - // sparse_qc_csr - float32 + // sparse_qc_csc - float32, int64 + m.def( + "sparse_qc_csc", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c sums_genes, + gpu_array_c cell_ex, gpu_array_c gene_ex, + int n_genes, std::uintptr_t stream) { + launch_qc_csc(indptr.data(), index.data(), data.data(), + sums_cells.data(), sums_genes.data(), + cell_ex.data(), gene_ex.data(), n_genes, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_genes"_a, "stream"_a = 0); + + // sparse_qc_csc - float64, int64 + m.def( + "sparse_qc_csc", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c sums_genes, + gpu_array_c cell_ex, gpu_array_c gene_ex, + int n_genes, std::uintptr_t stream) { + launch_qc_csc(indptr.data(), index.data(), data.data(), + sums_cells.data(), sums_genes.data(), + cell_ex.data(), gene_ex.data(), n_genes, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_genes"_a, "stream"_a = 0); + + // sparse_qc_csr - float32, int32 m.def( "sparse_qc_csr", [](gpu_array_c indptr, @@ -136,7 +172,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_cells"_a, "stream"_a = 0); - // sparse_qc_csr - float64 + // sparse_qc_csr - float64, int32 m.def( "sparse_qc_csr", [](gpu_array_c indptr, @@ -154,6 +190,42 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_cells"_a, "stream"_a = 0); + // sparse_qc_csr - float32, int64 + m.def( + "sparse_qc_csr", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c sums_genes, + gpu_array_c cell_ex, gpu_array_c gene_ex, + int n_cells, std::uintptr_t stream) { + launch_qc_csr(indptr.data(), index.data(), data.data(), + sums_cells.data(), sums_genes.data(), + cell_ex.data(), gene_ex.data(), n_cells, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_cells"_a, "stream"_a = 0); + + // sparse_qc_csr - float64, int64 + m.def( + "sparse_qc_csr", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c sums_genes, + gpu_array_c cell_ex, gpu_array_c gene_ex, + int n_cells, std::uintptr_t stream) { + launch_qc_csr(indptr.data(), index.data(), data.data(), + sums_cells.data(), sums_genes.data(), + cell_ex.data(), gene_ex.data(), n_cells, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_cells"_a, "stream"_a = 0); + // sparse_qc_dense - float32 m.def( "sparse_qc_dense", @@ -186,7 +258,7 @@ void register_bindings(nb::module_& m) { "data"_a, nb::kw_only(), "sums_cells"_a, "sums_genes"_a, "cell_ex"_a, "gene_ex"_a, "n_cells"_a, "n_genes"_a, "stream"_a = 0); - // sparse_qc_csc_sub - float32 + // sparse_qc_csc_sub - float32, int32 m.def( "sparse_qc_csc_sub", [](gpu_array_c indptr, @@ -202,7 +274,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "mask"_a, "n_genes"_a, "stream"_a = 0); - // sparse_qc_csc_sub - float64 + // sparse_qc_csc_sub - float64, int32 m.def( "sparse_qc_csc_sub", [](gpu_array_c indptr, @@ -218,7 +290,39 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "mask"_a, "n_genes"_a, "stream"_a = 0); - // sparse_qc_csr_sub - float32 + // sparse_qc_csc_sub - float32, int64 + m.def( + "sparse_qc_csc_sub", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c mask, int n_genes, + std::uintptr_t stream) { + launch_qc_csc_sub(indptr.data(), index.data(), data.data(), + sums_cells.data(), mask.data(), n_genes, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "mask"_a, "n_genes"_a, "stream"_a = 0); + + // sparse_qc_csc_sub - float64, int64 + m.def( + "sparse_qc_csc_sub", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c mask, int n_genes, + std::uintptr_t stream) { + launch_qc_csc_sub(indptr.data(), index.data(), data.data(), + sums_cells.data(), mask.data(), n_genes, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "mask"_a, "n_genes"_a, "stream"_a = 0); + + // sparse_qc_csr_sub - float32, int32 m.def( "sparse_qc_csr_sub", [](gpu_array_c indptr, @@ -234,7 +338,7 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "mask"_a, "n_cells"_a, "stream"_a = 0); - // sparse_qc_csr_sub - float64 + // sparse_qc_csr_sub - float64, int32 m.def( "sparse_qc_csr_sub", [](gpu_array_c indptr, @@ -250,6 +354,38 @@ void register_bindings(nb::module_& m) { "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "mask"_a, "n_cells"_a, "stream"_a = 0); + // sparse_qc_csr_sub - float32, int64 + m.def( + "sparse_qc_csr_sub", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c mask, int n_cells, + std::uintptr_t stream) { + launch_qc_csr_sub(indptr.data(), index.data(), data.data(), + sums_cells.data(), mask.data(), n_cells, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "mask"_a, "n_cells"_a, "stream"_a = 0); + + // sparse_qc_csr_sub - float64, int64 + m.def( + "sparse_qc_csr_sub", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, + gpu_array_c sums_cells, + gpu_array_c mask, int n_cells, + std::uintptr_t stream) { + launch_qc_csr_sub(indptr.data(), index.data(), data.data(), + sums_cells.data(), mask.data(), n_cells, + (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, + "mask"_a, "n_cells"_a, "stream"_a = 0); + // sparse_qc_dense_sub - float32 m.def( "sparse_qc_dense_sub", diff --git a/src/rapids_singlecell/_cuda/qc_dask/kernels_qcd.cuh b/src/rapids_singlecell/_cuda/qc_dask/kernels_qcd.cuh index 59bdfa5f..b5700905 100644 --- a/src/rapids_singlecell/_cuda/qc_dask/kernels_qcd.cuh +++ b/src/rapids_singlecell/_cuda/qc_dask/kernels_qcd.cuh @@ -2,19 +2,19 @@ #include -template -__global__ void qc_csr_cells_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void qc_csr_cells_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_cells, int* __restrict__ cell_ex, int n_cells) { int cell = blockDim.x * blockIdx.x + threadIdx.x; if (cell >= n_cells) return; - int start_idx = indptr[cell]; - int stop_idx = indptr[cell + 1]; + IdxT start_idx = indptr[cell]; + IdxT stop_idx = indptr[cell + 1]; T sums = T(0); int ex = 0; - for (int p = start_idx; p < stop_idx; ++p) { + for (IdxT p = start_idx; p < stop_idx; ++p) { sums += data[p]; ++ex; } @@ -22,14 +22,14 @@ __global__ void qc_csr_cells_kernel(const int* __restrict__ indptr, cell_ex[cell] = ex; } -template -__global__ void qc_csr_genes_kernel(const int* __restrict__ index, +template +__global__ void qc_csr_genes_kernel(const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ sums_genes, - int* __restrict__ gene_ex, int nnz) { - int i = blockDim.x * blockIdx.x + threadIdx.x; + int* __restrict__ gene_ex, long long nnz) { + long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x; if (i >= nnz) return; - int g = index[i]; + IdxT g = index[i]; T v = data[i]; atomicAdd(&sums_genes[g], v); atomicAdd(&gene_ex[g], 1); diff --git a/src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu b/src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu index e609326a..a482d921 100644 --- a/src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu +++ b/src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu @@ -9,25 +9,25 @@ constexpr int SPARSE_BLOCK_SIZE = 32; constexpr int GENES_BLOCK_SIZE = 256; constexpr int DENSE_BLOCK_DIM = 16; -template -static inline void launch_qc_csr_cells(const int* indptr, const int* index, +template +static inline void launch_qc_csr_cells(const IdxT* indptr, const IdxT* index, const T* data, T* sums_cells, int* cell_ex, int n_cells, cudaStream_t stream) { dim3 block(SPARSE_BLOCK_SIZE); dim3 grid((n_cells + SPARSE_BLOCK_SIZE - 1) / SPARSE_BLOCK_SIZE); - qc_csr_cells_kernel<<>>( + qc_csr_cells_kernel<<>>( indptr, index, data, sums_cells, cell_ex, n_cells); CUDA_CHECK_LAST_ERROR(qc_csr_cells_kernel); } -template -static inline void launch_qc_csr_genes(const int* index, const T* data, - T* sums_genes, int* gene_ex, int nnz, - cudaStream_t stream) { +template +static inline void launch_qc_csr_genes(const IdxT* index, const T* data, + T* sums_genes, int* gene_ex, + long long nnz, cudaStream_t stream) { int block = GENES_BLOCK_SIZE; - int grid = (nnz + GENES_BLOCK_SIZE - 1) / GENES_BLOCK_SIZE; - qc_csr_genes_kernel + long long grid = (nnz + GENES_BLOCK_SIZE - 1) / GENES_BLOCK_SIZE; + qc_csr_genes_kernel <<>>(index, data, sums_genes, gene_ex, nnz); CUDA_CHECK_LAST_ERROR(qc_csr_genes_kernel); } @@ -56,67 +56,50 @@ static inline void launch_qc_dense_genes(const T* data, T* sums_genes, CUDA_CHECK_LAST_ERROR(qc_dense_genes_kernel); } -template -void register_bindings(nb::module_& m) { - // sparse_qc_csr_cells - float32 - m.def( - "sparse_qc_csr_cells", - [](gpu_array_c indptr, - gpu_array_c index, - gpu_array_c data, - gpu_array_c sums_cells, - gpu_array_c cell_ex, int n_cells, - std::uintptr_t stream) { - launch_qc_csr_cells(indptr.data(), index.data(), data.data(), - sums_cells.data(), cell_ex.data(), - n_cells, (cudaStream_t)stream); - }, - "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, - "cell_ex"_a, "n_cells"_a, "stream"_a = 0); - - // sparse_qc_csr_cells - float64 +template +void def_sparse_qc_csr_cells(nb::module_& m) { m.def( "sparse_qc_csr_cells", - [](gpu_array_c indptr, - gpu_array_c index, - gpu_array_c data, - gpu_array_c sums_cells, + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, gpu_array_c sums_cells, gpu_array_c cell_ex, int n_cells, std::uintptr_t stream) { - launch_qc_csr_cells( + launch_qc_csr_cells( indptr.data(), index.data(), data.data(), sums_cells.data(), cell_ex.data(), n_cells, (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "sums_cells"_a, "cell_ex"_a, "n_cells"_a, "stream"_a = 0); +} - // sparse_qc_csr_genes - float32 +template +void def_sparse_qc_csr_genes(nb::module_& m) { m.def( "sparse_qc_csr_genes", - [](gpu_array_c index, - gpu_array_c data, - gpu_array_c sums_genes, - gpu_array_c gene_ex, int nnz, std::uintptr_t stream) { - launch_qc_csr_genes(index.data(), data.data(), - sums_genes.data(), gene_ex.data(), nnz, - (cudaStream_t)stream); + [](gpu_array_c index, + gpu_array_c data, gpu_array_c sums_genes, + gpu_array_c gene_ex, long long nnz, + std::uintptr_t stream) { + launch_qc_csr_genes(index.data(), data.data(), + sums_genes.data(), gene_ex.data(), nnz, + (cudaStream_t)stream); }, "index"_a, "data"_a, nb::kw_only(), "sums_genes"_a, "gene_ex"_a, "nnz"_a, "stream"_a = 0); +} - // sparse_qc_csr_genes - float64 - m.def( - "sparse_qc_csr_genes", - [](gpu_array_c index, - gpu_array_c data, - gpu_array_c sums_genes, - gpu_array_c gene_ex, int nnz, std::uintptr_t stream) { - launch_qc_csr_genes(index.data(), data.data(), - sums_genes.data(), gene_ex.data(), nnz, - (cudaStream_t)stream); - }, - "index"_a, "data"_a, nb::kw_only(), "sums_genes"_a, "gene_ex"_a, - "nnz"_a, "stream"_a = 0); +template +void register_bindings(nb::module_& m) { + def_sparse_qc_csr_cells(m); + def_sparse_qc_csr_cells(m); + def_sparse_qc_csr_cells(m); + def_sparse_qc_csr_cells(m); + + def_sparse_qc_csr_genes(m); + def_sparse_qc_csr_genes(m); + def_sparse_qc_csr_genes(m); + def_sparse_qc_csr_genes(m); // sparse_qc_dense_cells - float32 m.def( diff --git a/src/rapids_singlecell/_cuda/scale/kernels_scale.cuh b/src/rapids_singlecell/_cuda/scale/kernels_scale.cuh index 99277d72..d13d01ef 100644 --- a/src/rapids_singlecell/_cuda/scale/kernels_scale.cuh +++ b/src/rapids_singlecell/_cuda/scale/kernels_scale.cuh @@ -5,23 +5,23 @@ // All scale kernels assume std[col] > 0, guaranteed by the Python caller // (_scale.py clips std to a minimum value before invoking these kernels). -template -__global__ void csc_scale_diff_kernel(const int* __restrict__ indptr, +template +__global__ void csc_scale_diff_kernel(const IdxT* __restrict__ indptr, T* __restrict__ data, const T* __restrict__ std, int ncols) { int col = blockIdx.x; if (col >= ncols) return; - int start_idx = indptr[col]; - int stop_idx = indptr[col + 1]; + long long start_idx = (long long)indptr[col]; + long long stop_idx = (long long)indptr[col + 1]; T diver = T(1) / std[col]; - for (int i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) { + for (long long i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) { data[i] *= diver; } } -template -__global__ void csr_scale_diff_kernel(const int* __restrict__ indptr, - const int* __restrict__ indices, +template +__global__ void csr_scale_diff_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ indices, T* __restrict__ data, const T* __restrict__ std, const int* __restrict__ mask, T clipper, @@ -29,10 +29,11 @@ __global__ void csr_scale_diff_kernel(const int* __restrict__ indptr, int row = blockIdx.x; if (row >= nrows) return; if (mask[row]) { - int start_idx = indptr[row]; - int stop_idx = indptr[row + 1]; - for (int i = start_idx + threadIdx.x; i < stop_idx; i += blockDim.x) { - int idx = indices[i]; + long long start_idx = (long long)indptr[row]; + long long stop_idx = (long long)indptr[row + 1]; + for (long long i = start_idx + threadIdx.x; i < stop_idx; + i += blockDim.x) { + long long idx = (long long)indices[i]; T res = data[i] / std[idx]; data[i] = res < clipper ? res : clipper; } @@ -61,8 +62,8 @@ __global__ void dense_scale_diff_kernel(T* __restrict__ data, const T* __restrict__ std, const int* __restrict__ mask, T clipper, long long nrows, long long ncols) { - long long row = (long long)(blockIdx.x * blockDim.x + threadIdx.x); - long long col = (long long)(blockIdx.y * blockDim.y + threadIdx.y); + long long row = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long col = (long long)blockIdx.y * blockDim.y + threadIdx.y; if (row < nrows && col < ncols) { if (mask[row]) { T res = data[row * ncols + col] / std[col]; diff --git a/src/rapids_singlecell/_cuda/scale/scale.cu b/src/rapids_singlecell/_cuda/scale/scale.cu index b798ab4c..22ecbd48 100644 --- a/src/rapids_singlecell/_cuda/scale/scale.cu +++ b/src/rapids_singlecell/_cuda/scale/scale.cu @@ -8,25 +8,26 @@ using namespace nb::literals; constexpr int BLOCK_SIZE_SPARSE = 64; constexpr int BLOCK_SIZE_DENSE_TILE = 32; -template -static inline void launch_csc_scale_diff(const int* indptr, T* data, +template +static inline void launch_csc_scale_diff(const IdxT* indptr, T* data, const T* std, int ncols, cudaStream_t stream) { dim3 block(BLOCK_SIZE_SPARSE); dim3 grid(ncols); - csc_scale_diff_kernel + csc_scale_diff_kernel <<>>(indptr, data, std, ncols); CUDA_CHECK_LAST_ERROR(csc_scale_diff_kernel); } -template -static inline void launch_csr_scale_diff(const int* indptr, const int* indices, - T* data, const T* std, const int* mask, +template +static inline void launch_csr_scale_diff(const IdxT* indptr, + const IdxT* indices, T* data, + const T* std, const int* mask, T clipper, int nrows, cudaStream_t stream) { dim3 block(BLOCK_SIZE_SPARSE); dim3 grid(nrows); - csr_scale_diff_kernel<<>>( + csr_scale_diff_kernel<<>>( indptr, indices, data, std, mask, clipper, nrows); CUDA_CHECK_LAST_ERROR(csr_scale_diff_kernel); } @@ -58,66 +59,52 @@ static inline void launch_dense_scale_diff(T* data, const T* std, CUDA_CHECK_LAST_ERROR(dense_scale_diff_kernel); } -template -void register_bindings(nb::module_& m) { - // csc_scale_diff - float32 +template +void def_csc_scale_diff(nb::module_& m) { m.def( "csc_scale_diff", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c std, int ncols, - std::uintptr_t stream) { - launch_csc_scale_diff(indptr.data(), data.data(), std.data(), - ncols, (cudaStream_t)stream); - }, - "indptr"_a, "data"_a, "std"_a, nb::kw_only(), "ncols"_a, - "stream"_a = 0); - - // csc_scale_diff - float64 - m.def( - "csc_scale_diff", - [](gpu_array_c indptr, - gpu_array_c data, - gpu_array_c std, int ncols, - std::uintptr_t stream) { - launch_csc_scale_diff(indptr.data(), data.data(), - std.data(), ncols, - (cudaStream_t)stream); + [](gpu_array_c indptr, gpu_array_c data, + gpu_array_c std, int ncols, std::uintptr_t stream) { + launch_csc_scale_diff(indptr.data(), data.data(), + std.data(), ncols, + (cudaStream_t)stream); }, "indptr"_a, "data"_a, "std"_a, nb::kw_only(), "ncols"_a, "stream"_a = 0); +} - // csr_scale_diff - float32 +template +void def_csr_scale_diff(nb::module_& m) { m.def( "csr_scale_diff", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c std, - gpu_array_c mask, float clipper, int nrows, + [](gpu_array_c indptr, + gpu_array_c indices, gpu_array_c data, + gpu_array_c std, + gpu_array_c mask, T clipper, int nrows, std::uintptr_t stream) { - launch_csr_scale_diff(indptr.data(), indices.data(), - data.data(), std.data(), mask.data(), - clipper, nrows, (cudaStream_t)stream); + launch_csr_scale_diff( + indptr.data(), indices.data(), data.data(), std.data(), + mask.data(), clipper, nrows, (cudaStream_t)stream); }, "indptr"_a, "indices"_a, "data"_a, "std"_a, "mask"_a, nb::kw_only(), "clipper"_a, "nrows"_a, "stream"_a = 0); +} - // csr_scale_diff - float64 - m.def( - "csr_scale_diff", - [](gpu_array_c indptr, - gpu_array_c indices, - gpu_array_c data, - gpu_array_c std, - gpu_array_c mask, double clipper, int nrows, - std::uintptr_t stream) { - launch_csr_scale_diff(indptr.data(), indices.data(), - data.data(), std.data(), mask.data(), - clipper, nrows, (cudaStream_t)stream); - }, - "indptr"_a, "indices"_a, "data"_a, "std"_a, "mask"_a, nb::kw_only(), - "clipper"_a, "nrows"_a, "stream"_a = 0); +template +void register_bindings(nb::module_& m) { + // csc_scale_diff - int32 indices + def_csc_scale_diff(m); + def_csc_scale_diff(m); + // csc_scale_diff - int64 indices + def_csc_scale_diff(m); + def_csc_scale_diff(m); + + // csr_scale_diff - int32 indices + def_csr_scale_diff(m); + def_csr_scale_diff(m); + // csr_scale_diff - int64 indices + def_csr_scale_diff(m); + def_csr_scale_diff(m); // dense_scale_center_diff - float32 m.def( diff --git a/src/rapids_singlecell/_cuda/sparse2dense/kernels_s2d.cuh b/src/rapids_singlecell/_cuda/sparse2dense/kernels_s2d.cuh index d1272b3d..4ea27c40 100644 --- a/src/rapids_singlecell/_cuda/sparse2dense/kernels_s2d.cuh +++ b/src/rapids_singlecell/_cuda/sparse2dense/kernels_s2d.cuh @@ -2,9 +2,9 @@ #include -template -__global__ void sparse2dense_kernel(const int* __restrict__ indptr, - const int* __restrict__ index, +template +__global__ void sparse2dense_kernel(const IdxT* __restrict__ indptr, + const IdxT* __restrict__ index, const T* __restrict__ data, T* __restrict__ out, long long major, long long minor) { diff --git a/src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu b/src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu index 1a4361b2..99aaacd9 100644 --- a/src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu +++ b/src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu @@ -6,8 +6,8 @@ using namespace nb::literals; // Fully templated kernel launch - no runtime branches -template -static inline void launch_sparse2dense(const int* indptr, const int* index, +template +static inline void launch_sparse2dense(const IdxT* indptr, const IdxT* index, const T* data, T* out, long long major, long long minor, int max_nnz, cudaStream_t stream) { @@ -29,40 +29,41 @@ static inline void launch_sparse2dense(const int* indptr, const int* index, grid_y = (unsigned)max_grid_y; } dim3 grid(grid_x, grid_y); - sparse2dense_kernel + sparse2dense_kernel <<>>(indptr, index, data, out, major, minor); CUDA_CHECK_LAST_ERROR(sparse2dense_kernel); } // Runtime dispatch wrapper - c_switch depends on both sparse format and output // order -template -static inline void dispatch_sparse2dense(const int* indptr, const int* index, +template +static inline void dispatch_sparse2dense(const IdxT* indptr, const IdxT* index, const T* data, T* out, long long major, long long minor, bool c_switch, int max_nnz, cudaStream_t stream) { if (c_switch) { - launch_sparse2dense(indptr, index, data, out, major, minor, - max_nnz, stream); + launch_sparse2dense(indptr, index, data, out, major, + minor, max_nnz, stream); } else { - launch_sparse2dense(indptr, index, data, out, major, minor, - max_nnz, stream); + launch_sparse2dense(indptr, index, data, out, major, + minor, max_nnz, stream); } } -// Helper to define sparse2dense for a given dtype and output contiguity -template +// Helper to define sparse2dense for a given index type, dtype and output +// contiguity +template void def_sparse2dense(nb::module_& m) { m.def( "sparse2dense", - [](gpu_array_contig indptr, - gpu_array_contig index, + [](gpu_array_contig indptr, + gpu_array_contig index, gpu_array_contig data, gpu_array_contig out, long long major, long long minor, bool c_switch, int max_nnz, std::uintptr_t stream) { - dispatch_sparse2dense(indptr.data(), index.data(), data.data(), - out.data(), major, minor, c_switch, - max_nnz, (cudaStream_t)stream); + dispatch_sparse2dense( + indptr.data(), index.data(), data.data(), out.data(), major, + minor, c_switch, max_nnz, (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "out"_a, "major"_a, "minor"_a, "c_switch"_a, "max_nnz"_a, "stream"_a = 0); @@ -71,10 +72,14 @@ void def_sparse2dense(nb::module_& m) { template void register_bindings(nb::module_& m) { // F-order output must come before C-order for proper dispatch - def_sparse2dense(m); - def_sparse2dense(m); - def_sparse2dense(m); - def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); + def_sparse2dense(m); } NB_MODULE(_sparse2dense_cuda, m) { diff --git a/src/rapids_singlecell/_cuda/spca/kernels_spca.cuh b/src/rapids_singlecell/_cuda/spca/kernels_spca.cuh index b7ab555c..8c5f5ed5 100644 --- a/src/rapids_singlecell/_cuda/spca/kernels_spca.cuh +++ b/src/rapids_singlecell/_cuda/spca/kernels_spca.cuh @@ -2,22 +2,22 @@ #include -template -__global__ void gram_csr_upper_kernel(const int* indptr, const int* index, +template +__global__ void gram_csr_upper_kernel(const IdxT* indptr, const IdxT* index, const T* data, int nrows, int ncols, T* out) { int row = blockIdx.x; int col_offset = threadIdx.x; if (row >= nrows) return; - int start = indptr[row]; - int end = indptr[row + 1]; + IdxT start = indptr[row]; + IdxT end = indptr[row + 1]; - for (int idx1 = start; idx1 < end; ++idx1) { - int index1 = index[idx1]; + for (IdxT idx1 = start; idx1 < end; ++idx1) { + IdxT index1 = index[idx1]; T data1 = data[idx1]; - for (int idx2 = idx1 + col_offset; idx2 < end; idx2 += blockDim.x) { - int index2 = index[idx2]; + for (IdxT idx2 = idx1 + col_offset; idx2 < end; idx2 += blockDim.x) { + IdxT index2 = index[idx2]; T data2 = data[idx2]; size_t lo = min(index1, index2); size_t hi = max(index1, index2); @@ -47,11 +47,12 @@ __global__ void cov_from_gram_kernel(T* cov_values, const T* gram_matrix, gram_matrix[rid * ncols + cid] - mean_x[rid] * mean_y[cid]; } -__global__ void check_zero_genes_kernel(const int* indices, int* genes, int nnz, - int num_genes) { - int value = blockIdx.x * blockDim.x + threadIdx.x; +template +__global__ void check_zero_genes_kernel(const IdxT* indices, int* genes, + long long nnz, int num_genes) { + long long value = (long long)blockIdx.x * blockDim.x + threadIdx.x; if (value >= nnz) return; - int gene_index = indices[value]; + int gene_index = static_cast(indices[value]); if (gene_index >= 0 && gene_index < num_genes) { atomicAdd(&genes[gene_index], 1); } diff --git a/src/rapids_singlecell/_cuda/spca/spca.cu b/src/rapids_singlecell/_cuda/spca/spca.cu index 04eb4e2e..3b3ced96 100644 --- a/src/rapids_singlecell/_cuda/spca/spca.cu +++ b/src/rapids_singlecell/_cuda/spca/spca.cu @@ -9,13 +9,13 @@ constexpr int GRAM_BLOCK_SIZE = 128; constexpr int MATRIX_BLOCK_DIM = 32; constexpr int ELEMENTWISE_BLOCK_SIZE = 32; -template -static inline void launch_gram_csr_upper(const int* indptr, const int* index, +template +static inline void launch_gram_csr_upper(const IdxT* indptr, const IdxT* index, const T* data, int nrows, int ncols, T* out, cudaStream_t stream) { dim3 block(GRAM_BLOCK_SIZE); dim3 grid(nrows); - gram_csr_upper_kernel + gram_csr_upper_kernel <<>>(indptr, index, data, nrows, ncols, out); CUDA_CHECK_LAST_ERROR(gram_csr_upper_kernel); } @@ -42,44 +42,71 @@ static inline void launch_cov_from_gram(T* cov, const T* gram, const T* meanx, CUDA_CHECK_LAST_ERROR(cov_from_gram_kernel); } -static inline void launch_check_zero_genes(const int* indices, int* genes, - int nnz, int num_genes, +template +static inline void launch_check_zero_genes(const IdxT* indices, int* genes, + long long nnz, int num_genes, cudaStream_t stream) { if (nnz > 0) { dim3 block(ELEMENTWISE_BLOCK_SIZE); dim3 grid((nnz + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE); - check_zero_genes_kernel<<>>(indices, genes, nnz, - num_genes); + check_zero_genes_kernel + <<>>(indices, genes, nnz, num_genes); CUDA_CHECK_LAST_ERROR(check_zero_genes_kernel); } } template void register_bindings(nb::module_& m) { - // gram_csr_upper - float32 + // gram_csr_upper - int32 indices m.def( "gram_csr_upper", [](gpu_array_c indptr, gpu_array_c index, gpu_array_c data, int nrows, int ncols, gpu_array_c out, std::uintptr_t stream) { - launch_gram_csr_upper(indptr.data(), index.data(), - data.data(), nrows, ncols, out.data(), - (cudaStream_t)stream); + launch_gram_csr_upper(indptr.data(), index.data(), + data.data(), nrows, ncols, + out.data(), (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "out"_a, "stream"_a = 0); - // gram_csr_upper - float64 m.def( "gram_csr_upper", [](gpu_array_c indptr, gpu_array_c index, gpu_array_c data, int nrows, int ncols, gpu_array_c out, std::uintptr_t stream) { - launch_gram_csr_upper(indptr.data(), index.data(), - data.data(), nrows, ncols, out.data(), - (cudaStream_t)stream); + launch_gram_csr_upper( + indptr.data(), index.data(), data.data(), nrows, ncols, + out.data(), (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, + "out"_a, "stream"_a = 0); + + // gram_csr_upper - int64 indices + m.def( + "gram_csr_upper", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, int nrows, int ncols, + gpu_array_c out, std::uintptr_t stream) { + launch_gram_csr_upper( + indptr.data(), index.data(), data.data(), nrows, ncols, + out.data(), (cudaStream_t)stream); + }, + "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, + "out"_a, "stream"_a = 0); + + m.def( + "gram_csr_upper", + [](gpu_array_c indptr, + gpu_array_c index, + gpu_array_c data, int nrows, int ncols, + gpu_array_c out, std::uintptr_t stream) { + launch_gram_csr_upper( + indptr.data(), index.data(), data.data(), nrows, ncols, + out.data(), (cudaStream_t)stream); }, "indptr"_a, "index"_a, "data"_a, nb::kw_only(), "nrows"_a, "ncols"_a, "out"_a, "stream"_a = 0); @@ -130,13 +157,25 @@ void register_bindings(nb::module_& m) { "gram"_a, "meanx"_a, "meany"_a, nb::kw_only(), "cov"_a, "ncols"_a, "stream"_a = 0); - // check_zero_genes + // check_zero_genes - int32 indices m.def( "check_zero_genes", [](gpu_array_c indices, gpu_array_c out, - int nnz, int num_genes, std::uintptr_t stream) { - launch_check_zero_genes(indices.data(), out.data(), nnz, num_genes, - (cudaStream_t)stream); + long long nnz, int num_genes, std::uintptr_t stream) { + launch_check_zero_genes(indices.data(), out.data(), nnz, + num_genes, (cudaStream_t)stream); + }, + "indices"_a, nb::kw_only(), "out"_a, "nnz"_a, "num_genes"_a, + "stream"_a = 0); + + // check_zero_genes - int64 indices + m.def( + "check_zero_genes", + [](gpu_array_c indices, + gpu_array_c out, long long nnz, int num_genes, + std::uintptr_t stream) { + launch_check_zero_genes(indices.data(), out.data(), nnz, + num_genes, (cudaStream_t)stream); }, "indices"_a, nb::kw_only(), "out"_a, "nnz"_a, "num_genes"_a, "stream"_a = 0); diff --git a/src/rapids_singlecell/_cuda/wilcoxon_binned/kernels_wilcoxon_binned.cuh b/src/rapids_singlecell/_cuda/wilcoxon_binned/kernels_wilcoxon_binned.cuh index 4eb4981c..c7f98ec1 100644 --- a/src/rapids_singlecell/_cuda/wilcoxon_binned/kernels_wilcoxon_binned.cuh +++ b/src/rapids_singlecell/_cuda/wilcoxon_binned/kernels_wilcoxon_binned.cuh @@ -30,10 +30,10 @@ __global__ void dense_hist_kernel(const T* __restrict__ X, // ---------- csr_hist ---------- // One block per cell row. Threads stride over nonzeros. -template +template __global__ void csr_hist_kernel( - const T* __restrict__ data, const int* __restrict__ indices, - const int* __restrict__ indptr, const int* __restrict__ gcodes, + const T* __restrict__ data, const IdxT* __restrict__ indices, + const IdxT* __restrict__ indptr, const int* __restrict__ gcodes, unsigned int* __restrict__ hist, int n_cells, int n_genes, int n_groups, int n_bins, double bin_low, double inv_bin_width, int gene_start) { int row = blockIdx.x; @@ -43,11 +43,11 @@ __global__ void csr_hist_kernel( if (grp >= n_groups) return; // skip unselected cells int nbt = n_bins + 1; - int row_start = indptr[row]; - int row_end = indptr[row + 1]; + IdxT row_start = indptr[row]; + IdxT row_end = indptr[row + 1]; int gene_stop = gene_start + n_genes; - for (int i = row_start + threadIdx.x; i < row_end; i += blockDim.x) { + for (IdxT i = row_start + threadIdx.x; i < row_end; i += blockDim.x) { int col = indices[i]; if (col < gene_start || col >= gene_stop) continue; @@ -65,10 +65,10 @@ __global__ void csr_hist_kernel( // ---------- csc_hist ---------- // One block per gene column. Threads stride over nonzeros. -template +template __global__ void csc_hist_kernel( - const T* __restrict__ data, const int* __restrict__ indices, - const int* __restrict__ indptr, const int* __restrict__ gcodes, + const T* __restrict__ data, const IdxT* __restrict__ indices, + const IdxT* __restrict__ indptr, const int* __restrict__ gcodes, unsigned int* __restrict__ hist, int n_cells, int n_genes, int n_groups, int n_bins, double bin_low, double inv_bin_width, int gene_start) { int gene = blockIdx.x; @@ -78,10 +78,10 @@ __global__ void csc_hist_kernel( unsigned int* dst = hist + (long long)gene * n_groups * nbt; int col = gene_start + gene; - int col_start = indptr[col]; - int col_end = indptr[col + 1]; + IdxT col_start = indptr[col]; + IdxT col_end = indptr[col + 1]; - for (int i = col_start + threadIdx.x; i < col_end; i += blockDim.x) { + for (IdxT i = col_start + threadIdx.x; i < col_end; i += blockDim.x) { double val = (double)data[i]; if (val == 0.0) continue; diff --git a/src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu b/src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu index 6f69b27e..6119cbc1 100644 --- a/src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu +++ b/src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu @@ -21,31 +21,31 @@ static inline void launch_dense_hist(const T* X, const int* gcodes, CUDA_CHECK_LAST_ERROR(dense_hist_kernel); } -template -static inline void launch_csr_hist(const T* data, const int* indices, - const int* indptr, const int* gcodes, +template +static inline void launch_csr_hist(const T* data, const IdxT* indices, + const IdxT* indptr, const int* gcodes, unsigned int* hist, int n_cells, int n_genes, int n_groups, int n_bins, double bin_low, double inv_bin_width, int gene_start, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(n_cells); - csr_hist_kernel<<>>( + csr_hist_kernel<<>>( data, indices, indptr, gcodes, hist, n_cells, n_genes, n_groups, n_bins, bin_low, inv_bin_width, gene_start); CUDA_CHECK_LAST_ERROR(csr_hist_kernel); } -template -static inline void launch_csc_hist(const T* data, const int* indices, - const int* indptr, const int* gcodes, +template +static inline void launch_csc_hist(const T* data, const IdxT* indices, + const IdxT* indptr, const int* gcodes, unsigned int* hist, int n_cells, int n_genes, int n_groups, int n_bins, double bin_low, double inv_bin_width, int gene_start, cudaStream_t stream) { dim3 block(BLOCK_SIZE); dim3 grid(n_genes); - csc_hist_kernel<<>>( + csc_hist_kernel<<>>( data, indices, indptr, gcodes, hist, n_cells, n_genes, n_groups, n_bins, bin_low, inv_bin_width, gene_start); CUDA_CHECK_LAST_ERROR(csc_hist_kernel); @@ -85,7 +85,7 @@ void register_bindings(nb::module_& m) { "n_groups"_a, "n_bins"_a, "bin_low"_a, "inv_bin_width"_a, "stream"_a = 0); - // csr_hist - float32 + // csr_hist - float32, int indices m.def( "csr_hist", [](gpu_array_c data, @@ -104,7 +104,7 @@ void register_bindings(nb::module_& m) { "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); - // csr_hist - float64 + // csr_hist - float64, int indices m.def( "csr_hist", [](gpu_array_c data, @@ -123,7 +123,45 @@ void register_bindings(nb::module_& m) { "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); - // csc_hist - float32 + // csr_hist - float32, int64 indices + m.def( + "csr_hist", + [](gpu_array_c data, + gpu_array_c indices, + gpu_array_c indptr, + gpu_array_c gcodes, + gpu_array_c hist, int n_cells, int n_genes, + int n_groups, int n_bins, double bin_low, double inv_bin_width, + int gene_start, std::uintptr_t stream) { + launch_csr_hist(data.data(), indices.data(), indptr.data(), + gcodes.data(), hist.data(), n_cells, n_genes, + n_groups, n_bins, bin_low, inv_bin_width, + gene_start, (cudaStream_t)stream); + }, + "data"_a, "indices"_a, "indptr"_a, "gcodes"_a, "hist"_a, nb::kw_only(), + "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, + "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); + + // csr_hist - float64, int64 indices + m.def( + "csr_hist", + [](gpu_array_c data, + gpu_array_c indices, + gpu_array_c indptr, + gpu_array_c gcodes, + gpu_array_c hist, int n_cells, int n_genes, + int n_groups, int n_bins, double bin_low, double inv_bin_width, + int gene_start, std::uintptr_t stream) { + launch_csr_hist( + data.data(), indices.data(), indptr.data(), gcodes.data(), + hist.data(), n_cells, n_genes, n_groups, n_bins, bin_low, + inv_bin_width, gene_start, (cudaStream_t)stream); + }, + "data"_a, "indices"_a, "indptr"_a, "gcodes"_a, "hist"_a, nb::kw_only(), + "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, + "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); + + // csc_hist - float32, int indices m.def( "csc_hist", [](gpu_array_c data, @@ -142,7 +180,7 @@ void register_bindings(nb::module_& m) { "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); - // csc_hist - float64 + // csc_hist - float64, int indices m.def( "csc_hist", [](gpu_array_c data, @@ -160,6 +198,44 @@ void register_bindings(nb::module_& m) { "data"_a, "indices"_a, "indptr"_a, "gcodes"_a, "hist"_a, nb::kw_only(), "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); + + // csc_hist - float32, int64 indices + m.def( + "csc_hist", + [](gpu_array_c data, + gpu_array_c indices, + gpu_array_c indptr, + gpu_array_c gcodes, + gpu_array_c hist, int n_cells, int n_genes, + int n_groups, int n_bins, double bin_low, double inv_bin_width, + int gene_start, std::uintptr_t stream) { + launch_csc_hist(data.data(), indices.data(), indptr.data(), + gcodes.data(), hist.data(), n_cells, n_genes, + n_groups, n_bins, bin_low, inv_bin_width, + gene_start, (cudaStream_t)stream); + }, + "data"_a, "indices"_a, "indptr"_a, "gcodes"_a, "hist"_a, nb::kw_only(), + "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, + "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); + + // csc_hist - float64, int64 indices + m.def( + "csc_hist", + [](gpu_array_c data, + gpu_array_c indices, + gpu_array_c indptr, + gpu_array_c gcodes, + gpu_array_c hist, int n_cells, int n_genes, + int n_groups, int n_bins, double bin_low, double inv_bin_width, + int gene_start, std::uintptr_t stream) { + launch_csc_hist( + data.data(), indices.data(), indptr.data(), gcodes.data(), + hist.data(), n_cells, n_genes, n_groups, n_bins, bin_low, + inv_bin_width, gene_start, (cudaStream_t)stream); + }, + "data"_a, "indices"_a, "indptr"_a, "gcodes"_a, "hist"_a, nb::kw_only(), + "n_cells"_a, "n_genes"_a, "n_groups"_a, "n_bins"_a, "bin_low"_a, + "inv_bin_width"_a, "gene_start"_a, "stream"_a = 0); } NB_MODULE(_wilcoxon_binned_cuda, m) {