diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..304a2878 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -48,6 +48,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -55,6 +56,7 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index f14ddac8..432bb2f6 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,8 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). 
## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0fa1711d..bdbb60b4 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2024 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -70,6 +70,154 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param threads (integer) Number of threads to use. +#' @return +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @export +lsi <- function( + mat, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) + idf_ <- 1 / mat_stats$row_stats["mean",] + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) + # Run pca + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + unused_pcs = which(pca_corrs >= corr_cutoff), + idf = idf_ + )) + } + return(pca_res) +} + +#' Get the most variable features within a matrix. +#' @param num_feats (integer) Number of features to return.
If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. +#' @returns +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- data.frame( + name = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order + if (save_feat_selection) { + # get rownames that are in features_df$name 
+ return(list( + mat = mat[feats_of_interest,], + feature_selection = features_df + )) + } + return(mat[feats_of_interest,]) +} + + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..8d36e2e1 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -922,4 +922,4 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { global_params = numeric(), vars_to_regress = vars_to_regress ) -} +} \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..005a4e37 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix.} +\usage{ +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +\itemize{ +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} +} +} +\description{ +Get the most variable features within a matrix. +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satija et al. 2015). + +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..ea2687f9 --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + scale_factor = 10000, + save_lsi = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_lsi}{(logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +\itemize{ +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}.
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} +} +} +\description{ +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index b1459570..c08f55f0 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -122,6 +122,8 @@ reference: - checksum - apply_by_row - regress_out + - lsi + - highly_variable_features - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..084645d3 --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,70 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. 
+#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 20, + binarize = FALSE, + outlierQuantiles = NULL + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + 
testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpreation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + 
expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8bae8966..8968d574 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -8,6 +8,12 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) { m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow) + rownames(m) <- paste0("feat", seq_len(nrow(m))) + colnames(m) <- paste0("cell", seq_len(ncol(m))) + as(m, "dgCMatrix") +} +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) rownames(m) <- paste0("row", seq_len(nrow(m))) colnames(m) <- paste0("col", seq_len(ncol(m))) as(m, "dgCMatrix") @@ -125,6 +131,39 @@ test_that("Pseudobulk aggregation works", { } }) +test_that("C++ UMAP graph calculation works", { + # This uses a pre-calculated graph from the umap-learn python package. + # See ../data/generate_iris_geodesic_graph.R for the generation code + test_data <- readRDS("../data/iris_geodesic_graph.rds") + knn <- test_data$knn + res <- knn_to_geodesic_graph(knn) + expect_equal(as.matrix(res + t(res)), as.matrix(test_data$graph), tolerance=1e-6) +}) + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) +}) + test_that("Pseudobulk aggregation works with multiple return types", { m0 <- generate_sparse_matrix(20, 10, max_val = 10) m1 <- m0 |> as("IterableMatrix")