From ed158200d3dc58196ef8c33f3a081700bce065fe Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 30 Oct 2024 19:00:33 -0700 Subject: [PATCH 01/15] [r] add lsi, var feature selection --- r/R/transforms.R | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d9ea388 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,76 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +#' Compute LSI For a peak matrix +#' @param mat PeakMatrix +#' @param n_dimensions Number of dimensions to keep during PCA +#' @param scale_factor Scale factor for the tf-idf log transform +#' @param verbose Whether to print out progress +#' @param threads Number of threads to use +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) +#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. +#' Tf-idf implementation is from Stuart & Butler et al. 2019. +#' @export +compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { + assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
+ assert_is(n_dimensions, "integer") + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + + # Signac implementation + npeaks <- colSums(mat) + tf <- mat %>% multiply_cols(1/npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + + # run z-score norm + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) + pca_res <- t(svd_attr$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get most variable features, given a non-log normalized matrix +highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { + assert_is(mat, "IterableMatrix") + assert_is(num_feats, "integer") + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is(n_bins, "integer") + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), + returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate the mean and variance of each feature + # should we set entries that are 0 to NA? 
+ feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] + feature_dispersion <- feature_vars / feature_means + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + # Calculate the mean and variance of dispersion of each bin + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) + # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 + bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] + # map mean_bins indices to bin_stats + feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] + names(feature_dispersion_norm) <- names(feature_dispersion) + variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] + return(mat[names(variable_features_), ]) +} \ No newline at end of file From d8df4cd9d5f0e26ed26d4da8473eb00244231fb6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 02/15] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/NEWS.md | 2 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/R/transforms.R | 73 ---------------- r/man/highly_variable_features.Rd | 36 ++++++++ r/man/lsi.Rd | 46 ++++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 33 +++++++ 8 files changed, 227 insertions(+), 73 deletions(-) create mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/lsi.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index d9f0d36b..4a90558e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -44,6 +44,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) 
export(import_matrix_market_10x) export(knn_annoy) @@ -51,6 +52,7 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index 2837c230..cef29f7f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,8 @@ Contributions welcome :) Thanks to @ycli1995 for pull request #110 - Add `trackplot_genome_annotation()` for plotting peaks, with options for directional arrows, colors, labels, and peak widths. (pull request #113) - Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 42e3933f..8471884d 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -68,4 +68,110 @@ marker_features <- function(mat, groups, method="wilcoxon") { foreground_mean = as.numeric(foreground_means), background_mean = as.numeric(background_means) ) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. +#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). 
+#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = 
n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins,
function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d9ea388..8d36e2e1 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -922,77 +922,4 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { global_params = numeric(), vars_to_regress = vars_to_regress ) -} - -#' Compute LSI For a peak matrix -#' @param mat PeakMatrix -#' @param n_dimensions Number of dimensions to keep during PCA -#' @param scale_factor Scale factor for the tf-idf log transform -#' @param verbose Whether to print out progress -#' @param threads Number of threads to use -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) -#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. -#' Tf-idf implementation is from Stuart & Butler et al. 2019. 
-#' @export -compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { - assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? - assert_is(n_dimensions, "integer") - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - - # Signac implementation - npeaks <- colSums(mat) - tf <- mat %>% multiply_cols(1/npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - - # run z-score norm - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) - pca_res <- t(svd_attr$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get most variable features, given a non-log normalized matrix -highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { - assert_is(mat, "IterableMatrix") - assert_is(num_feats, "integer") - assert_greater_than_zero(num_feats) - assert_len(num_feats, 1) - assert_is(n_bins, "integer") - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), - returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate the mean and variance of each feature - # should we set entries that are 0 to NA? 
-feature_means <- matrix_stats(mat, row_stats = c('mean'))$row_stats['mean', ] - feature_vars <- matrix_stats(mat, row_stats = c('variance'))$row_stats['variance', ] - feature_dispersion <- feature_vars / feature_means - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - # Calculate the mean and variance of dispersion of each bin - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) - # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 - bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] - # map mean_bins indices to bin_stats - feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] - names(feature_dispersion_norm) <- names(feature_dispersion) - variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression.
Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). + +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..38c0d73b --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + scale_factor = 10000, + save_in_memory = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. +If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +dgCMatrix of shape (n_dimensions, ncol(mat)). +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. 
Tf-idf implementation is from Stuart & Butler et al. 2019. + +** Saving in memory vs disk: ** +Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +This is done to prevent re-calculation of queued operations during PCA optimization. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime +\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 6e1b584b..f7530216 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -122,6 +122,8 @@ reference: - checksum - apply_by_row - regress_out + - lsi + - highly_variable_features - IterableMatrix-methods - title: "Reference Annotations" diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 962d0048..5ef6e5f3 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -6,6 +6,15 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
+generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) { + m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow) + rownames(m) <- paste0("feat", seq_len(nrow(m))) + colnames(m) <- paste0("cell", seq_len(ncol(m))) + as(m, "dgCMatrix") +} +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -114,4 +123,28 @@ test_that("C++ UMAP graph calculation works", { knn <- test_data$knn res <- knn_to_geodesic_graph(knn) expect_equal(as.matrix(res + t(res)), as.matrix(test_data$graph), tolerance=1e-6) +}) + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) }) \ No newline at end of file From 16047ae65779c97b46f0a785e232b8a8c3880d24 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:53:53 -0800 Subject: [PATCH 03/15] [r] parametrize z_score_norm, create temp option to return more info `save_lsi`, remove `save_in_memory` in `lsi()` --- r/R/singlecell_utils.R | 105 ++++++++++++++++++++++++----------------- r/man/lsi.Rd | 24 +++++----- 2 files changed, 74 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8b797513..95097c09 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -73,25 +73,28 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Perform latent semantic indexing (LSI) on a matrix. #' @param mat (IterableMatrix) dimensions features x cells +#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. #' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @return +#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). 
+#' - If save_lsi is TRUE, return a list with the following elements: +#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **svd_attr**: List of SVD attributes +#' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' +#' - 17.1 MB memory usage, 25.1 seconds runtime #' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { +lsi <- function( + mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -100,27 +103,34 @@ lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FA assert_is_wholenumber(threads) # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - if (save_in_memory) { - mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - 
mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) + if (z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + idf = idf_ + )) + } return(pca_res) } @@ -151,31 +161,38 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] feature_means[feature_means == 0] <- 1e-12 feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) - mean_bins <- 
cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) + features_df <- data.frame( + name = names(feature_means), + vars = feature_vars, + means = feature_means, + dispersion = feature_dispersion + ) + features_df <- features_df %>% + dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + bin_mean = mean(dispersion, na.rm = TRUE), + bin_sd = sd(dispersion, na.rm = TRUE), + bin_sd_is_na = is.na(bin_sd), + bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 + bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), + feature_dispersion_norm = (dispersion - bin_mean) / bin_sd + ) %>% + dplyr::ungroup() %>% + dplyr::select(name, feature_dispersion_norm) %>% + dplyr::arrange(desc(feature_dispersion_norm)) %>% + dplyr::slice(1:min(num_feats, 
nrow(.))) + return(mat[features_df$name,]) } + #' Aggregate counts matrices by cell group or feature. #' diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 38c0d73b..137e4437 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -6,27 +6,34 @@ \usage{ lsi( mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 10000, - save_in_memory = FALSE, + save_lsi = FALSE, threads = 1L ) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} +\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} + \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} \item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} -\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. -If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} +\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ -dgCMatrix of shape (n_dimensions, ncol(mat)). +\itemize{ +\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is TRUE, return a list with the following elements: +\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{svd_attr}: List of SVD attributes +\item \strong{idf}: Inverse document frequency vector +} } \description{ Perform latent semantic indexing (LSI) on a matrix. @@ -34,13 +41,8 @@ Perform latent semantic indexing (LSI) on a matrix. \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
-** Saving in memory vs disk: ** -Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -This is done to prevent re-calculation of queued operations during PCA optimization. - Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \itemize{ -\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime -\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +\item 17.1 MB memory usage, 25.1 seconds runtime } } From d6c674b5b5937d2ade9de22893cc2cf445e3558c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:54:18 -0800 Subject: [PATCH 04/15] [r] add test case for LSI comparing to archr --- r/tests/real_data/ArchR_LSI.R | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 r/tests/real_data/ArchR_LSI.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..b78d700f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,50 @@ +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. 
+test_lsi_similarity_to_archr <- function() { + # Set up temp dir + int_dir <- file.path(tempdir(), "insertion_test") + dir.create(int_dir) + setwd(int_dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 2, + binarize = FALSE, + outlierQuantiles = NULL, + test_mat = test_mat + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + z_score_norm = FALSE, + n_dimensions = 2, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) +} +test_lsi_similarity_to_archr() \ No newline at end of file From 7bda387819288ba6e64116cd8fb368bc6ea7f14b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:29:32 -0800 Subject: [PATCH 05/15] [r] clean up var gene selection, lsi docstring --- r/R/singlecell_utils.R | 38 ++++++++++++++++++++++++------- r/man/highly_variable_features.Rd | 17 ++++++++++++-- r/man/lsi.Rd | 6 ++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 95097c09..94e5fe53 
100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -72,6 +72,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -79,9 +81,9 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @param threads (integer) Number of threads to use. #' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` #' - **svd_attr**: List of SVD attributes #' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -140,7 +142,12 @@ lsi <- function( #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. +#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. 
+#' @returns +#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is True, return a list with the following elements: +#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - **feature_selection**: Dataframe with the following columns: #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). #' @@ -149,7 +156,11 @@ lsi <- function( #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin #' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -161,22 +172,27 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } + # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] + # Give a small value to features with 0 mean, helps with 0 division feature_means[feature_means == 0] <- 1e-12 + # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), - vars = feature_vars, - means = feature_means, + var = feature_vars, + mean = feature_means, dispersion = 
feature_dispersion ) + + # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( bin_mean = mean(dispersion, na.rm = TRUE), @@ -187,9 +203,15 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { feature_dispersion_norm = (dispersion - bin_mean) / bin_sd ) %>% dplyr::ungroup() %>% - dplyr::select(name, feature_dispersion_norm) %>% + dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% dplyr::arrange(desc(feature_dispersion_norm)) %>% dplyr::slice(1:min(num_feats, nrow(.))) + if (save_feat_selection) { + return(list( + mat = mat[features_df$name,], + feature_selection = features_df + )) + } return(mat[features_df$name,]) } diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 54e67599..943d986b 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -4,7 +4,13 @@ \alias{highly_variable_features} \title{Get the most variable features within a matrix} \usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -16,10 +22,17 @@ ll features will be returned.} and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} + \item{threads}{(integer) Number of threads to use.} } \value{ -IterableMatrix subset of the most variable features. +\itemize{ +\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is True, return a list with the following elements: +\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \strong{feature_selection}: Dataframe with the following columns: +} } \description{ Get the most variable features within a matrix diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 137e4437..299513df 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -28,15 +28,15 @@ lsi( } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. \item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` \item \strong{svd_attr}: List of SVD attributes \item \strong{idf}: Inverse document frequency vector } } \description{ -Perform latent semantic indexing (LSI) on a matrix. +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
From 16237b4f6a451e3515353a130ffcdf60df1ce35b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:30:00 -0800 Subject: [PATCH 06/15] [r] add variable gene selection test --- r/tests/real_data/ArchR_LSI.R | 9 +++- .../Scanpy_variable_feat_selection.py | 46 ++++++++++++++++++ .../scanpy_variable_feat_selection.R | 47 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 r/tests/real_data/Scanpy_variable_feat_selection.py create mode 100644 r/tests/real_data/scanpy_variable_feat_selection.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index b78d700f..2ad74610 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -1,3 +1,11 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") @@ -47,4 +55,3 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } -test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpretation of + the Seurat implementation. + Write the input anndata object csv at `<temp_dir>/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `<temp_dir>/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..da873a7b --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,47 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +compare_feat_selection_to_scanpy <- function(dir = NULL) { + # Set up temp dir + if (is.null(dir)) { + dir <- file.path(tempdir(), "feat_selection_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + + # Call python script + system2("python", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + # unlog the input_mat + input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() From fa1eb58c2386ca4b9ddbd0f2de54eebdfdf269a2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:35:03 -0800 Subject: [PATCH 07/15] [r] provide more colour to scanpy feat selection test --- 
r/tests/real_data/scanpy_variable_feat_selection.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index da873a7b..3bac07ab 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -8,6 +8,11 @@ devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. compare_feat_selection_to_scanpy <- function(dir = NULL) { # Set up temp dir if (is.null(dir)) { @@ -29,8 +34,10 @@ compare_feat_selection_to_scanpy <- function(dir = NULL) { dplyr::select(-highly_variable) %>% # convert rownames to a column tibble::rownames_to_column("name") %>% dplyr::as_tibble() - # unlog the input_mat + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), num_feats = 50, From b895cbd5fe666aae831de836f16867418b00324d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 15:57:13 -0800 Subject: [PATCH 08/15] [r] cleanup real data tests --- r/tests/real_data/ArchR_LSI.R | 17 +++++++------- r/tests/real_data/ArchR_insertions.R | 13 +++++++++-- r/tests/real_data/config.csv | 3 +++ .../scanpy_variable_feat_selection.R | 13 ++++------- r/tests/real_data/test_helpers.R | 22 +++++++++++++++++++ 5 files changed, 48 
insertions(+), 20 deletions(-) create mode 100644 r/tests/real_data/config.csv create mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 2ad74610..e79ee98b 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,8 +6,9 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. @@ -15,11 +16,9 @@ devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") #' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation #' we can compare between the two packages. #' @param proj An archr project. 
-test_lsi_similarity_to_archr <- function() { - # Set up temp dir - int_dir <- file.path(tempdir(), "insertion_test") - dir.create(int_dir) - setwd(int_dir) +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) # add iterative lsi for dim reduction proj <- getTestProject() proj <- addPeakMatrix(proj) @@ -33,8 +32,7 @@ test_lsi_similarity_to_archr <- function() { LSIMethod = 2, nDimensions = 2, binarize = FALSE, - outlierQuantiles = NULL, - test_mat = test_mat + outlierQuantiles = NULL ) svd_archr <- lsi_archr$svd lsi_mat_archr <- t(lsi_archr$matSVD) @@ -55,3 +53,4 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..810133c1 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,14 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv new file mode 100644 index 00000000..915de4af --- /dev/null +++ b/r/tests/real_data/config.csv @@ -0,0 +1,3 @@ +key,value +path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ +path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 3bac07ab..8b608773 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,23 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") - +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. # This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. # It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. 
compare_feat_selection_to_scanpy <- function(dir = NULL) { - # Set up temp dir - if (is.null(dir)) { - dir <- file.path(tempdir(), "feat_selection_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } + dir <- create_temp_dir(dir) # Call python script - system2("python", c("Scanpy_variable_feat_selection.py", dir)) + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) # read in input csv input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R new file mode 100644 index 00000000..2da80072 --- /dev/null +++ b/r/tests/real_data/test_helpers.R @@ -0,0 +1,22 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Set up k-v pairs for other tests +config_csv <- read.csv("config.csv") +config <- as.list(config_csv)$value +names(config) <- as.list(config_csv)$key + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} \ No newline at end of file From aee5d220fdac284d5d4d871b39102fdb4bcda5c4 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 16:15:54 -0800 Subject: [PATCH 09/15] [r] clean up lsi, var features docstrings --- r/R/singlecell_utils.R | 28 ++++++++++++++-------------- r/man/highly_variable_features.Rd | 18 ++++++++++-------- r/man/lsi.Rd | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 94e5fe53..d8c6e39f 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -75,17 +75,17 @@ marker_features <- function(mat, groups, 
method="wilcoxon") { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. -#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -#' - **svd_attr**: List of SVD attributes -#' - **idf**: Inverse document frequency vector +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: @@ -136,18 +136,18 @@ lsi <- function( return(pca_res) } -#' Get the most variable features within a matrix +#' Get the most variable features within a matrix. 
#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. +#' all features will be returned. #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. #' @returns -#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is True, return a list with the following elements: -#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - **feature_selection**: Dataframe with the following columns: +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
#' diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 943d986b..005a4e37 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{highly_variable_features} \alias{highly_variable_features} -\title{Get the most variable features within a matrix} +\title{Get the most variable features within a matrix.} \usage{ highly_variable_features( mat, @@ -16,26 +16,28 @@ highly_variable_features( \item{mat}{(IterableMatrix) dimensions features x cells} \item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} +all features will be returned.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, and if the number of features within a bin is less than 2, the dispersion is set to 1.} -\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item If \code{save_feat_selection} is True, return a list with the following elements: -\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \strong{feature_selection}: Dataframe with the following columns: +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} } } \description{ -Get the most variable features within a matrix +Get the most variable features within a matrix. } \details{ The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 299513df..ea2687f9 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -16,23 +16,24 @@ lsi( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} +\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} - -\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. -\item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -\item \strong{svd_attr}: List of SVD attributes -\item \strong{idf}: Inverse document frequency vector +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} } } \description{ From f8eec12f0dc112638169438b23a52457e389d83a Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 18:45:32 -0800 Subject: [PATCH 10/15] [r] add in more lsi real data tests --- r/tests/real_data/ArchR_LSI.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index e79ee98b..09c04f03 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -30,7 +30,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_archr <- .computeLSI( mat = test_mat, LSIMethod = 2, - nDimensions = 2, + nDimensions = 20, binarize = FALSE, outlierQuantiles = NULL ) @@ -47,10 +47,16 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), z_score_norm = FALSE, - n_dimensions = 2, + n_dimensions = 20, save_lsi = TRUE ) pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res - testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file From 
0810635041ea9fb10d97fb41e626d4ac9a4cfcf0 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 12:56:18 -0800 Subject: [PATCH 11/15] [r] remove unused variable from `lsi()` --- r/R/singlecell_utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index d8c6e39f..8ba8b803 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -105,8 +105,6 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) From 21f53f40e297372317c5f71aafb6205a9b7bf4ea Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 02:12:19 -0800 Subject: [PATCH 12/15] [r] add requested changes --- r/R/singlecell_utils.R | 37 ++++++++++--------- r/tests/real_data/ArchR_LSI.R | 15 ++++++-- r/tests/real_data/config.csv | 3 -- .../scanpy_variable_feat_selection.R | 13 ++++++- r/tests/real_data/test_helpers.R | 22 ----------- 5 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 r/tests/real_data/config.csv delete mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8ba8b803..9b45cdb9 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -75,8 +75,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. 
#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -93,7 +93,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @export lsi <- function( mat, - z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -105,13 +105,17 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) + idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) # Z-score normalization if (z_score_norm) { cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats @@ -174,12 +178,11 @@ highly_variable_features <- function( mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] - # Give a small value to 
features with 0 mean, helps with 0 division - feature_means[feature_means == 0] <- 1e-12 # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), @@ -193,24 +196,22 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - bin_mean = mean(dispersion, na.rm = TRUE), - bin_sd = sd(dispersion, na.rm = TRUE), - bin_sd_is_na = is.na(bin_sd), - bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 - bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), - feature_dispersion_norm = (dispersion - bin_mean) / bin_sd - ) %>% + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% dplyr::ungroup() %>% dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::arrange(desc(feature_dispersion_norm)) %>% - dplyr::slice(1:min(num_feats, nrow(.))) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) if (save_feat_selection) { + # get rownames that are in features_df$name + feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( - mat = mat[features_df$name,], + mat = mat[feats_of_interest,], feature_selection = features_df )) } - return(mat[features_df$name,]) + return(mat[feats_of_interest,]) + #return(mat[features_df$name,]) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 09c04f03..ff697b4c 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,9 +6,18 @@ # option. 
This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv deleted file mode 100644 index 915de4af..00000000 --- a/r/tests/real_data/config.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,value -path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ -path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 8b608773..d2eb08b2 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,8 +6,17 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. 
diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R deleted file mode 100644 index 2da80072..00000000 --- a/r/tests/real_data/test_helpers.R +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 BPCells contributors -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# Set up k-v pairs for other tests -config_csv <- read.csv("config.csv") -config <- as.list(config_csv)$value -names(config) <- as.list(config_csv)$key - -# Set up temp dir in case it's not already set -create_temp_dir <- function(dir = NULL) { - if (is.null(dir)) { - dir <- file.path(tempdir(), "lsi_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } - return(dir) -} \ No newline at end of file From ec2c1ed7fcb0924a624a820ad8154c00af070e0e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 13:54:25 -0800 Subject: [PATCH 13/15] [r] fix requested changes --- r/R/singlecell_utils.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9b45cdb9..48c42933 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -106,6 +106,7 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) @@ -196,22 +197,20 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + feature_dispersion_norm = 
(dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order if (save_feat_selection) { # get rownames that are in features_df$name - feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( mat = mat[feats_of_interest,], feature_selection = features_df )) } return(mat[feats_of_interest,]) - #return(mat[features_df$name,]) } From 5280318944647bc254bb42811116e85c3127be01 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 3 Dec 2024 15:56:25 -0800 Subject: [PATCH 14/15] [r] fix lsi docstring, idf_ logic --- r/R/singlecell_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 48c42933..fea4d5dc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -78,7 +78,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. #' @return #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. 
@@ -106,10 +106,9 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) + idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations From 2d05edb9cfe2f6ec45d18af8c3efb2d89b517b8c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 6 Dec 2024 16:32:58 -0800 Subject: [PATCH 15/15] [r] replace z-score norm with corr cutoffs --- r/R/singlecell_utils.R | 31 ++++++++++++++++------------ r/tests/real_data/ArchR_LSI.R | 1 - r/tests/real_data/ArchR_insertions.R | 15 +++++++++++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index fea4d5dc..bdbb60b4 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -76,7 +76,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. 
@@ -84,6 +85,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If `save_lsi` is `TRUE`, return a list with the following elements: #' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth #' - `svd_attr`: List of SVD attributes #' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -93,7 +95,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @export lsi <- function( mat, - n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -102,12 +104,13 @@ lsi <- function( assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / npeaks) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) @@ -116,22 +119,24 @@ lsi <- function( convert_matrix_type(mat_log_tfidf, type = "float"), tempfile("mat_log_tfidf"), compress = TRUE ) - # Z-score normalization - if (z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- 
mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } # Run pca svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + if(save_lsi) { return(list( pca_res = pca_res, svd_attr = svd_attr_, + unused_pcs = which(pca_corrs >= corr_cutoff), idf = idf_ )) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index ff697b4c..084645d3 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -55,7 +55,6 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Do not use z-score normalization, as this isn't done with ArchR lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - z_score_norm = FALSE, n_dimensions = 20, save_lsi = TRUE ) diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 810133c1..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -6,9 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
-source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG)