2 changes: 2 additions & 0 deletions r/NAMESPACE
@@ -48,13 +48,15 @@ export(gene_score_archr)
export(gene_score_tiles_archr)
export(gene_score_weights_archr)
export(get_trackplot_height)
export(highly_variable_features)
export(import_matrix_market)
export(import_matrix_market_10x)
export(knn_annoy)
export(knn_hnsw)
export(knn_to_geodesic_graph)
export(knn_to_snn_graph)
export(log1p_slow)
export(lsi)
export(marker_features)
export(match_gene_symbol)
export(matrix_stats)
2 changes: 2 additions & 0 deletions r/NEWS.md
@@ -22,6 +22,8 @@ Contributions welcome :)
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156).
- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156).

## Improvements
- `trackplot_loop()` now accepts discrete color scales
150 changes: 149 additions & 1 deletion r/R/singlecell_utils.R
@@ -1,4 +1,4 @@
-# Copyright 2023 BPCells contributors
+# Copyright 2024 BPCells contributors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
@@ -70,6 +70,154 @@ marker_features <- function(mat, groups, method="wilcoxon") {
)
}


#' Perform latent semantic indexing (LSI) on a matrix.
#'
#' Given a `(features x cells)` matrix, perform LSI by applying a tf-idf transform, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`.
#' @param mat (IterableMatrix) dimensions features x cells
#' @param n_dimensions (integer) Number of dimensions to keep during PCA.
#' @param corr_cutoff (numeric) Filter on the correlation of each PC with sequencing depth. If a PC has an absolute correlation greater than or equal to
#' `corr_cutoff`, it is excluded from the final PCA matrix.
#' @param scale_factor (integer) Scale factor for the tf-idf log transform.
#' @param save_lsi (logical) If `TRUE`, also save the SVD attributes for the matrix, as well as the idf normalization vector.
#' @param threads (integer) Number of threads to use.
#' @return
#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`.
#' - If `save_lsi` is `TRUE`, return a list with the following elements:
#'    - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))`
#'    - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth
#'    - `svd_attr`: List of SVD attributes
#'    - `idf`: Inverse document frequency vector
#' @details LSI is computed by first applying a log(tf-idf) transform, then z-score normalization, then PCA. The tf-idf implementation is from Stuart & Butler et al. 2019.
#'
#' As an example, running on a 2600 cell dataset with 50000 peaks and 4 threads:
#' - 17.1 MB memory usage, 25.1 seconds runtime
#' @export
lsi <- function(
mat,
n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4,
save_lsi = FALSE,
threads = 1L
) {
assert_is(mat, "IterableMatrix")
assert_is_wholenumber(n_dimensions)
assert_len(n_dimensions, 1)
assert_greater_than_zero(n_dimensions)
assert_true(n_dimensions < min(ncol(mat), nrow(mat)))
assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1))
assert_is_wholenumber(threads)

# log(tf-idf) transform
mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
read_depth <- mat_stats$col_stats["mean",] * nrow(mat)
tf <- mat %>% multiply_cols(1 / read_depth)
idf_ <- 1 / mat_stats$row_stats["mean",]
mat_tfidf <- tf %>% multiply_rows(idf_)
mat_log_tfidf <- log1p(scale_factor * mat_tfidf)
# Save to prevent re-calculation of queued operations
mat_log_tfidf <- write_matrix_dir(
convert_matrix_type(mat_log_tfidf, type = "float"),
tempfile("mat_log_tfidf"), compress = TRUE
)
# Run pca
svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads)
pca_res <- t(svd_attr_$u) %*% mat_log_tfidf

# Filter out PCs that are highly correlated with sequencing depth
pca_corrs <- abs(cor(read_depth, t(pca_res)))
pca_feats_to_keep <- which(pca_corrs < corr_cutoff)
if (length(pca_feats_to_keep) != n_dimensions) {
# not sure if this is the route we want to take. Should we just leave the PCs in,
# and not use them in downstream analysis?
pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf
}

if (save_lsi) {
return(list(
pca_res = pca_res,
svd_attr = svd_attr_,
unused_pcs = which(pca_corrs >= corr_cutoff),
idf = idf_
))
}
return(pca_res)
}
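A minimal usage sketch for `lsi()`, assuming `peak_mat` is a hypothetical `(peaks x cells)` IterableMatrix of accessibility counts (not defined in this diff):

```r
# Sketch only: `peak_mat` is a placeholder (peaks x cells) IterableMatrix.
lsi_res <- lsi(
  peak_mat,
  n_dimensions = 50L,
  corr_cutoff = 0.8, # drop PCs with absolute depth-correlation >= 0.8
  save_lsi = TRUE,
  threads = 4L
)
cell_embeddings <- t(lsi_res$pca_res) # (cells x n_dimensions) for clustering/UMAP
lsi_res$unused_pcs                    # PCs excluded as depth-correlated
```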

#' Get the most variable features within a matrix.
#' @param num_feats (integer) Number of features to return. If this is higher than the number of features in the matrix,
#' all features are returned.
#' @param n_bins (integer) Number of bins used to group features by mean expression. Dispersion is normalized within each bin,
#' and features in bins containing fewer than 2 features are given a normalized dispersion of 1.
#' @param save_feat_selection (logical) If `TRUE`, also save the dispersions, means, and the features selected.
#' @returns
#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset to the most variable features, of shape `(num_feats, ncol(mat))`.
#' - If `save_feat_selection` is `TRUE`, return a list with the following elements:
#'    - `mat`: IterableMatrix subset to the most variable features, of shape `(num_feats, ncol(mat))`.
#'    - `feature_selection`: Dataframe with columns `name`, `var`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`.
#' @inheritParams lsi
#' @details The formula for calculating the most variable features is from the Seurat package (Satija et al. 2015).
#'
#' The calculation proceeds as follows:
#' 1. Calculate the dispersion of each feature (variance / mean)
#' 2. Log-normalize dispersion and mean
#' 3. Bin the features by mean, and normalize dispersion within each bin
#' @export
highly_variable_features <- function(
mat, num_feats, n_bins = 20,
save_feat_selection = FALSE,
threads = 1L
) {
assert_is(mat, "IterableMatrix")
assert_greater_than_zero(num_feats)
assert_is_wholenumber(num_feats)
assert_len(num_feats, 1)
assert_is_wholenumber(n_bins)
assert_len(n_bins, 1)
assert_greater_than_zero(n_bins)
if (nrow(mat) <= num_feats) {
log_progress(sprintf("Number of features (%s) is less than or equal to num_feats (%s), returning all features", nrow(mat), num_feats))
return(mat)
}
# Calculate row information for dispersion
mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads)
feature_means <- mat_stats$row_stats["mean", ]
feature_vars <- mat_stats$row_stats["variance", ]
# Calculate dispersion, and log normalize
feature_dispersion <- feature_vars / feature_means
feature_dispersion[feature_dispersion == 0] <- NA
feature_dispersion <- log(feature_dispersion)
feature_dispersion[feature_means == 0] <- 0
feature_means <- log1p(feature_means)
features_df <- data.frame(
name = names(feature_means),
var = feature_vars,
mean = feature_means,
dispersion = feature_dispersion
)

# Bin by mean, and normalize dispersion within each bin
features_df <- features_df %>%
dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>%
Review comment (Owner):

This feature of `cut` gives me pause, though I think it's probably okay: `cut(c(1:100,1000), 5, labels=FALSE)` puts the numbers 1-100 in the lowest bin, then leaves the middle 3 bins empty and puts 1000 in the largest bin (see the binning sketch after this function).

The `cell_ranger` flavor in Scanpy bins by decile, which is the main alternative we could implement.

From a quick test on SeuratData's pbmc3k dataset, it appears that 92% of genes get put into one bin, though there's no huge enrichment of which bins genes get picked from vs. others. It's worth either a bit of follow-up analysis to see whether there is bias toward high/low expression within the huge first bin, or at least a function name that makes clear this is one variable-genes option among several possibilities.

dplyr::group_by(bin) %>%
dplyr::mutate(
feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion),
feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1
) %>%
dplyr::ungroup() %>%
dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE)
feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order
if (save_feat_selection) {
# get rownames that are in features_df$name
return(list(
mat = mat[feats_of_interest,],
feature_selection = features_df
))
}
return(mat[feats_of_interest,])
}
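A minimal usage sketch for `highly_variable_features()`, assuming `rna_mat` is a hypothetical `(genes x cells)` IterableMatrix with rownames set (not defined in this diff):

```r
# Sketch only: `rna_mat` is a placeholder (genes x cells) IterableMatrix.
hvf_res <- highly_variable_features(
  rna_mat,
  num_feats = 2000L,
  n_bins = 20,
  save_feat_selection = TRUE,
  threads = 4L
)
hvf_mat <- hvf_res$mat           # (2000 x ncol(rna_mat)) subset matrix
head(hvf_res$feature_selection)  # per-feature mean / dispersion / bin table
```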
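To illustrate the `cut()` behavior flagged in the review comment above, a short self-contained sketch of equal-width binning on skewed data:

```r
# Equal-width bins span the full data range, so a single outlier stretches
# the bins: values 1-100 all land in bin 1 and 1000 lands alone in bin 5.
x <- c(1:100, 1000)
bins <- cut(x, 5, labels = FALSE)
table(bins)
#> bins
#>   1   5
#> 100   1
```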


#' Aggregate counts matrices by cell group or feature.
#'
#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each
2 changes: 1 addition & 1 deletion r/R/transforms.R
@@ -922,4 +922,4 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
global_params = numeric(),
vars_to_regress = vars_to_regress
)
-}
+}
51 changes: 51 additions & 0 deletions r/man/highly_variable_features.Rd

(Generated documentation file; diff not rendered.)

49 changes: 49 additions & 0 deletions r/man/lsi.Rd

(Generated documentation file; diff not rendered.)

2 changes: 2 additions & 0 deletions r/pkgdown/_pkgdown.yml
@@ -122,6 +122,8 @@ reference:
- checksum
- apply_by_row
- regress_out
- lsi
- highly_variable_features
- IterableMatrix-methods
- pseudobulk_matrix

70 changes: 70 additions & 0 deletions r/tests/real_data/ArchR_LSI.R
@@ -0,0 +1,70 @@
# Copyright 2024 BPCells contributors
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

library("BPCells")
library("ArchR")

# Create a temp dir for the test if one isn't provided
create_temp_dir <- function(dir = NULL) {
if (is.null(dir)) {
dir <- file.path(tempdir(), "lsi_test")
if (dir.exists(dir)) unlink(dir, recursive = TRUE)
dir.create(dir)
}
return(dir)
}

#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on the same matrix in both ArchR and BPCells.
#' Because ArchR runs LSI iteratively, we compare against a single iteration using ArchR's private `.computeLSI()` function.
#' Because the SVD implementation is not necessarily the same between the two packages, we take the SVD output from
#' each and compare the product of the U matrix with the PC matrix, which reconstructs an approximation of the
#' pre-SVD matrix that is directly comparable between the two packages.
#' @param dir Directory to run the test in; a temporary directory is created if `NULL`.
test_lsi_similarity_to_archr <- function(dir = NULL) {
dir <- create_temp_dir(dir)
setwd(dir)
# Build a small ArchR test project and add a peak matrix
proj <- getTestProject()
proj <- addPeakMatrix(proj)
# Get the peak matrix
test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix"))
# Calculate LSI on ArchR
# running LSI without binarizing, as we don't do this in the BPCells implementation
# we also don't filter quantile outliers.
lsi_archr <- .computeLSI(
mat = test_mat,
LSIMethod = 2,
nDimensions = 20,
binarize = FALSE,
outlierQuantiles = NULL
)
svd_archr <- lsi_archr$svd
lsi_mat_archr <- t(lsi_archr$matSVD)
# set rownames to NULL, as we don't have rownames in the BPCells implementation
rownames(lsi_mat_archr) <- NULL
# PCA matrix = t(u) %*% pre-SVD matrix, so
# u %*% PCA matrix = (u %*% t(u)) %*% pre-SVD matrix,
# a rank-k approximation of the pre-SVD matrix (u %*% t(u) projects onto the
# top-k left singular vectors, so the reconstruction is approximate, not exact)
pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr
# Calculate LSI on BPCells
# Do not use z-score normalization, as this isn't done with ArchR
lsi_bpcells <- lsi(
test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"),
n_dimensions = 20,
save_lsi = TRUE
)
pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res
testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4))
# Flip signs of ArchR PCs where needed, since SVD is sign-ambiguous
lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`)
# Check for post-pca matrix similarity
testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4))
# also check for correlation between the two matrices in PC space
testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999)
}
test_lsi_similarity_to_archr()