From ed158200d3dc58196ef8c33f3a081700bce065fe Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 30 Oct 2024 19:00:33 -0700 Subject: [PATCH 01/15] [r] add lsi, var feature selection --- r/R/transforms.R | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d9ea388 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,76 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +#' Compute LSI For a peak matrix +#' @param mat PeakMatrix +#' @param n_dimensions Number of dimensions to keep during PCA +#' @param scale_factor Scale factor for the tf-idf log transform +#' @param verbose Whether to print out progress +#' @param threads Number of threads to use +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) +#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. +#' Tf-idf implementation is from Stuart & Butler et al. 2019. +#' @export +compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { + assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
+ assert_is(n_dimensions, "integer") + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + + # Signac implementation + npeaks <- colSums(mat) + tf <- mat %>% multiply_cols(1/npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + + # run z-score norm + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) + pca_res <- t(svd_attr$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get most variable features, given a non-log normalized matrix +highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { + assert_is(mat, "IterableMatrix") + assert_is(num_feats, "integer") + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is(n_bins, "integer") + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), + returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate the mean and variance of each feature + # should we set entries that are 0 to NA? 
+ feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] + feature_dispersion <- feature_vars / feature_means + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + # Calculate the mean and variance of dispersion of each bin + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) + # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 + bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] + # map mean_bins indices to bin_stats + feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] + names(feature_dispersion_norm) <- names(feature_dispersion) + variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] + return(mat[names(variable_features_), ]) +} \ No newline at end of file From d8df4cd9d5f0e26ed26d4da8473eb00244231fb6 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 02/15] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/NEWS.md | 2 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/R/transforms.R | 73 ---------------- r/man/highly_variable_features.Rd | 36 ++++++++ r/man/lsi.Rd | 46 ++++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 33 +++++++ 8 files changed, 227 insertions(+), 73 deletions(-) create mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/lsi.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index d9f0d36b..4a90558e 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -44,6 +44,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) 
export(import_matrix_market_10x) export(knn_annoy) @@ -51,6 +52,7 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index 2837c230..cef29f7f 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -19,6 +19,8 @@ Contributions welcome :) Thanks to @ycli1995 for pull request #110 - Add `trackplot_genome_annotation()` for plotting peaks, with options for directional arrows, colors, labels, and peak widths. (pull request #113) - Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 42e3933f..8471884d 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -68,4 +68,110 @@ marker_features <- function(mat, groups, method="wilcoxon") { foreground_mean = as.numeric(foreground_means), background_mean = as.numeric(background_means) ) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. +#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). 
+#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = 
n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins,
function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d9ea388..8d36e2e1 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -922,77 +922,4 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { global_params = numeric(), vars_to_regress = vars_to_regress ) -} - -#' Compute LSI For a peak matrix -#' @param mat PeakMatrix -#' @param n_dimensions Number of dimensions to keep during PCA -#' @param scale_factor Scale factor for the tf-idf log transform -#' @param verbose Whether to print out progress -#' @param threads Number of threads to use -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) -#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. -#' Tf-idf implementation is from Stuart & Butler et al. 2019. 
-#' @export -compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { - assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? - assert_is(n_dimensions, "integer") - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - - # Signac implementation - npeaks <- colSums(mat) - tf <- mat %>% multiply_cols(1/npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - - # run z-score norm - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) - pca_res <- t(svd_attr$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get most variable features, given a non-log normalized matrix -highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { - assert_is(mat, "IterableMatrix") - assert_is(num_feats, "integer") - assert_greater_than_zero(num_feats) - assert_len(num_feats, 1) - assert_is(n_bins, "integer") - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), - returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate the mean and variance of each feature - # should we set entries that are 0 to NA? 
-feature_means <- matrix_stats(mat, row_stats = c('mean'))$row_stats['mean', ] - feature_vars <- matrix_stats(mat, row_stats = c('variance'))$row_stats['variance', ] - feature_dispersion <- feature_vars / feature_means - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - # Calculate the mean and variance of dispersion of each bin - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) - # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 - bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] - # map mean_bins indices to bin_stats - feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] - names(feature_dispersion_norm) <- names(feature_dispersion) - variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression.
Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). + +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..38c0d73b --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + scale_factor = 10000, + save_in_memory = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. +If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +dgCMatrix of shape (n_dimensions, ncol(mat)). +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. 
Tf-idf implementation is from Stuart & Butler et al. 2019. + +** Saving in memory vs disk: ** +Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +This is done to prevent re-calculation of queued operations during PCA optimization. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime +\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 6e1b584b..f7530216 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -122,6 +122,8 @@ reference: - checksum - apply_by_row - regress_out + - lsi + - highly_variable_features - IterableMatrix-methods - title: "Reference Annotations" diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 962d0048..5ef6e5f3 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -6,6 +6,15 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
+generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) { + m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow) + rownames(m) <- paste0("feat", seq_len(nrow(m))) + colnames(m) <- paste0("cell", seq_len(ncol(m))) + as(m, "dgCMatrix") +} +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -114,4 +123,28 @@ test_that("C++ UMAP graph calculation works", { knn <- test_data$knn res <- knn_to_geodesic_graph(knn) expect_equal(as.matrix(res + t(res)), as.matrix(test_data$graph), tolerance=1e-6) +}) + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) }) \ No newline at end of file From 16047ae65779c97b46f0a785e232b8a8c3880d24 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:53:53 -0800 Subject: [PATCH 03/15] [r] parametrize z_score_norm, create temp option to return more info `save_lsi`, remove `save_in_memory` in `lsi()` --- r/R/singlecell_utils.R | 105 ++++++++++++++++++++++++----------------- r/man/lsi.Rd | 24 +++++----- 2 files changed, 74 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8b797513..95097c09 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -73,25 +73,28 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Perform latent semantic indexing (LSI) on a matrix. #' @param mat (IterableMatrix) dimensions features x cells +#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. #' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @return +#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). 
+#' - If save_lsi is TRUE, return a list with the following elements: +#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **svd_attr**: List of SVD attributes +#' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' +#' - 17.1 MB memory usage, 25.1 seconds runtime #' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { +lsi <- function( + mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -100,27 +103,34 @@ lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FA assert_is_wholenumber(threads) # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - if (save_in_memory) { - mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - 
mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) + if (z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + idf = idf_ + )) + } return(pca_res) } @@ -151,31 +161,38 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] feature_means[feature_means == 0] <- 1e-12 feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) - mean_bins <- 
cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) + features_df <- data.frame( + name = names(feature_means), + vars = feature_vars, + means = feature_means, + dispersion = feature_dispersion + ) + features_df <- features_df %>% + dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + bin_mean = mean(dispersion, na.rm = TRUE), + bin_sd = sd(dispersion, na.rm = TRUE), + bin_sd_is_na = is.na(bin_sd), + bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 + bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), + feature_dispersion_norm = (dispersion - bin_mean) / bin_sd + ) %>% + dplyr::ungroup() %>% + dplyr::select(name, feature_dispersion_norm) %>% + dplyr::arrange(desc(feature_dispersion_norm)) %>% + dplyr::slice(1:min(num_feats, 
nrow(.))) + return(mat[features_df$name,]) } + #' Aggregate counts matrices by cell group or feature. #' diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 38c0d73b..137e4437 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -6,27 +6,34 @@ \usage{ lsi( mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 10000, - save_in_memory = FALSE, + save_lsi = FALSE, threads = 1L ) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} +\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} + \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} \item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} -\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. -If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} +\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ -dgCMatrix of shape (n_dimensions, ncol(mat)). +\itemize{ +\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is TRUE, return a list with the following elements: +\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{svd_attr}: List of SVD attributes +\item \strong{idf}: Inverse document frequency vector +} } \description{ Perform latent semantic indexing (LSI) on a matrix. @@ -34,13 +41,8 @@ Perform latent semantic indexing (LSI) on a matrix. \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
-** Saving in memory vs disk: ** -Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -This is done to prevent re-calculation of queued operations during PCA optimization. - Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \itemize{ -\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime -\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +\item 17.1 MB memory usage, 25.1 seconds runtime } } From d6c674b5b5937d2ade9de22893cc2cf445e3558c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:54:18 -0800 Subject: [PATCH 04/15] [r] add test case for LSI comparing to archr --- r/tests/real_data/ArchR_LSI.R | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 r/tests/real_data/ArchR_LSI.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..b78d700f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,50 @@ +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. 
+test_lsi_similarity_to_archr <- function() { + # Set up temp dir + int_dir <- file.path(tempdir(), "insertion_test") + dir.create(int_dir) + setwd(int_dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 2, + binarize = FALSE, + outlierQuantiles = NULL, + test_mat = test_mat + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + z_score_norm = FALSE, + n_dimensions = 2, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) +} +test_lsi_similarity_to_archr() \ No newline at end of file From 7bda387819288ba6e64116cd8fb368bc6ea7f14b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:29:32 -0800 Subject: [PATCH 05/15] [r] clean up var gene selection, lsi docstring --- r/R/singlecell_utils.R | 38 ++++++++++++++++++++++++------- r/man/highly_variable_features.Rd | 17 ++++++++++++-- r/man/lsi.Rd | 6 ++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 95097c09..94e5fe53 
100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -72,6 +72,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -79,9 +81,9 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @param threads (integer) Number of threads to use. #' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` #' - **svd_attr**: List of SVD attributes #' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -140,7 +142,12 @@ lsi <- function( #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. +#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. 
+#' @returns +#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is True, return a list with the following elements: +#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - **feature_selection**: Dataframe with the following columns: #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). #' @@ -149,7 +156,11 @@ lsi <- function( #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin #' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -161,22 +172,27 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } + # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] + # Give a small value to features with 0 mean, helps with 0 division feature_means[feature_means == 0] <- 1e-12 + # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), - vars = feature_vars, - means = feature_means, + var = feature_vars, + mean = feature_means, dispersion = 
feature_dispersion ) + + # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( bin_mean = mean(dispersion, na.rm = TRUE), @@ -187,9 +203,15 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { feature_dispersion_norm = (dispersion - bin_mean) / bin_sd ) %>% dplyr::ungroup() %>% - dplyr::select(name, feature_dispersion_norm) %>% + dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% dplyr::arrange(desc(feature_dispersion_norm)) %>% dplyr::slice(1:min(num_feats, nrow(.))) + if (save_feat_selection) { + return(list( + mat = mat[features_df$name,], + feature_selection = features_df + )) + } return(mat[features_df$name,]) } diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 54e67599..943d986b 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -4,7 +4,13 @@ \alias{highly_variable_features} \title{Get the most variable features within a matrix} \usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -16,10 +22,17 @@ ll features will be returned.} and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} + \item{threads}{(integer) Number of threads to use.} } \value{ -IterableMatrix subset of the most variable features. +\itemize{ +\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is True, return a list with the following elements: +\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \strong{feature_selection}: Dataframe with the following columns: +} } \description{ Get the most variable features within a matrix diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 137e4437..299513df 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -28,15 +28,15 @@ lsi( } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. \item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` \item \strong{svd_attr}: List of SVD attributes \item \strong{idf}: Inverse document frequency vector } } \description{ -Perform latent semantic indexing (LSI) on a matrix. +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
From 16237b4f6a451e3515353a130ffcdf60df1ce35b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:30:00 -0800 Subject: [PATCH 06/15] [r] add variable gene selection test --- r/tests/real_data/ArchR_LSI.R | 9 +++- .../Scanpy_variable_feat_selection.py | 46 ++++++++++++++++++ .../scanpy_variable_feat_selection.R | 47 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 r/tests/real_data/Scanpy_variable_feat_selection.py create mode 100644 r/tests/real_data/scanpy_variable_feat_selection.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index b78d700f..2ad74610 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -1,3 +1,11 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") @@ -47,4 +55,3 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } -test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpretation of + the Seurat implementation. + Write the input anndata object csv at `<temp_dir>/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `<temp_dir>/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..da873a7b --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,47 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +compare_feat_selection_to_scanpy <- function(dir = NULL) { + # Set up temp dir + if (is.null(dir)) { + dir <- file.path(tempdir(), "feat_selection_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + + # Call python script + system2("python", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + # unlog the input_mat + input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() From fa1eb58c2386ca4b9ddbd0f2de54eebdfdf269a2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:35:03 -0800 Subject: [PATCH 07/15] [r] provide more colour to scanpy feat selection test --- 
r/tests/real_data/scanpy_variable_feat_selection.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index da873a7b..3bac07ab 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -8,6 +8,11 @@ devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. compare_feat_selection_to_scanpy <- function(dir = NULL) { # Set up temp dir if (is.null(dir)) { @@ -29,8 +34,10 @@ compare_feat_selection_to_scanpy <- function(dir = NULL) { dplyr::select(-highly_variable) %>% # convert rownames to a column tibble::rownames_to_column("name") %>% dplyr::as_tibble() - # unlog the input_mat + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), num_feats = 50, From b895cbd5fe666aae831de836f16867418b00324d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 15:57:13 -0800 Subject: [PATCH 08/15] [r] cleanup real data tests --- r/tests/real_data/ArchR_LSI.R | 17 +++++++------- r/tests/real_data/ArchR_insertions.R | 13 +++++++++-- r/tests/real_data/config.csv | 3 +++ .../scanpy_variable_feat_selection.R | 13 ++++------- r/tests/real_data/test_helpers.R | 22 +++++++++++++++++++ 5 files changed, 48 
insertions(+), 20 deletions(-) create mode 100644 r/tests/real_data/config.csv create mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 2ad74610..e79ee98b 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,8 +6,9 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. @@ -15,11 +16,9 @@ devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") #' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation #' we can compare between the two packages. #' @param proj An archr project. 
-test_lsi_similarity_to_archr <- function() { - # Set up temp dir - int_dir <- file.path(tempdir(), "insertion_test") - dir.create(int_dir) - setwd(int_dir) +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) # add iterative lsi for dim reduction proj <- getTestProject() proj <- addPeakMatrix(proj) @@ -33,8 +32,7 @@ test_lsi_similarity_to_archr <- function() { LSIMethod = 2, nDimensions = 2, binarize = FALSE, - outlierQuantiles = NULL, - test_mat = test_mat + outlierQuantiles = NULL ) svd_archr <- lsi_archr$svd lsi_mat_archr <- t(lsi_archr$matSVD) @@ -55,3 +53,4 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..810133c1 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,14 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv new file mode 100644 index 00000000..915de4af --- /dev/null +++ b/r/tests/real_data/config.csv @@ -0,0 +1,3 @@ +key,value +path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ +path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 3bac07ab..8b608773 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,23 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") - +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. # This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. # It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. 
compare_feat_selection_to_scanpy <- function(dir = NULL) { - # Set up temp dir - if (is.null(dir)) { - dir <- file.path(tempdir(), "feat_selection_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } + dir <- create_temp_dir(dir) # Call python script - system2("python", c("Scanpy_variable_feat_selection.py", dir)) + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) # read in input csv input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R new file mode 100644 index 00000000..2da80072 --- /dev/null +++ b/r/tests/real_data/test_helpers.R @@ -0,0 +1,22 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Set up k-v pairs for other tests +config_csv <- read.csv("config.csv") +config <- as.list(config_csv)$value +names(config) <- as.list(config_csv)$key + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} \ No newline at end of file From aee5d220fdac284d5d4d871b39102fdb4bcda5c4 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 16:15:54 -0800 Subject: [PATCH 09/15] [r] clean up lsi, var features docstrings --- r/R/singlecell_utils.R | 28 ++++++++++++++-------------- r/man/highly_variable_features.Rd | 18 ++++++++++-------- r/man/lsi.Rd | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 94e5fe53..d8c6e39f 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -75,17 +75,17 @@ marker_features <- function(mat, groups, 
method="wilcoxon") { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. -#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -#' - **svd_attr**: List of SVD attributes -#' - **idf**: Inverse document frequency vector +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: @@ -136,18 +136,18 @@ lsi <- function( return(pca_res) } -#' Get the most variable features within a matrix +#' Get the most variable features within a matrix. 
#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. +#' all features will be returned. #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. #' @returns -#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is True, return a list with the following elements: -#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - **feature_selection**: Dataframe with the following columns: +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
#' diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 943d986b..005a4e37 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{highly_variable_features} \alias{highly_variable_features} -\title{Get the most variable features within a matrix} +\title{Get the most variable features within a matrix.} \usage{ highly_variable_features( mat, @@ -16,26 +16,28 @@ highly_variable_features( \item{mat}{(IterableMatrix) dimensions features x cells} \item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} +all features will be returned.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, and if the number of features within a bin is less than 2, the dispersion is set to 1.} -\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item If \code{save_feat_selection} is True, return a list with the following elements: -\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \strong{feature_selection}: Dataframe with the following columns: +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} } } \description{ -Get the most variable features within a matrix +Get the most variable features within a matrix. } \details{ The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 299513df..ea2687f9 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -16,23 +16,24 @@ lsi( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} +\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} - -\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. -\item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -\item \strong{svd_attr}: List of SVD attributes -\item \strong{idf}: Inverse document frequency vector +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} } } \description{ From f8eec12f0dc112638169438b23a52457e389d83a Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 18:45:32 -0800 Subject: [PATCH 10/15] [r] add in more lsi real data tests --- r/tests/real_data/ArchR_LSI.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index e79ee98b..09c04f03 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -30,7 +30,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_archr <- .computeLSI( mat = test_mat, LSIMethod = 2, - nDimensions = 2, + nDimensions = 20, binarize = FALSE, outlierQuantiles = NULL ) @@ -47,10 +47,16 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), z_score_norm = FALSE, - n_dimensions = 2, + n_dimensions = 20, save_lsi = TRUE ) pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res - testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file From 
0810635041ea9fb10d97fb41e626d4ac9a4cfcf0 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 12:56:18 -0800 Subject: [PATCH 11/15] [r] remove unused variable from `lsi()` --- r/R/singlecell_utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index d8c6e39f..8ba8b803 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -105,8 +105,6 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) From 21f53f40e297372317c5f71aafb6205a9b7bf4ea Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 02:12:19 -0800 Subject: [PATCH 12/15] [r] add requested changes --- r/R/singlecell_utils.R | 37 ++++++++++--------- r/tests/real_data/ArchR_LSI.R | 15 ++++++-- r/tests/real_data/config.csv | 3 -- .../scanpy_variable_feat_selection.R | 13 ++++++- r/tests/real_data/test_helpers.R | 22 ----------- 5 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 r/tests/real_data/config.csv delete mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 8ba8b803..9b45cdb9 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -75,8 +75,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. 
#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -93,7 +93,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @export lsi <- function( mat, - z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -105,13 +105,17 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) + idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) # Z-score normalization if (z_score_norm) { cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats @@ -174,12 +178,11 @@ highly_variable_features <- function( mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] - # Give a small value to 
features with 0 mean, helps with 0 division - feature_means[feature_means == 0] <- 1e-12 # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), @@ -193,24 +196,22 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - bin_mean = mean(dispersion, na.rm = TRUE), - bin_sd = sd(dispersion, na.rm = TRUE), - bin_sd_is_na = is.na(bin_sd), - bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 - bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), - feature_dispersion_norm = (dispersion - bin_mean) / bin_sd - ) %>% + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% dplyr::ungroup() %>% dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::arrange(desc(feature_dispersion_norm)) %>% - dplyr::slice(1:min(num_feats, nrow(.))) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) if (save_feat_selection) { + # get rownames that are in features_df$name + feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( - mat = mat[features_df$name,], + mat = mat[feats_of_interest,], feature_selection = features_df )) } - return(mat[features_df$name,]) + return(mat[feats_of_interest,]) + #return(mat[features_df$name,]) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 09c04f03..ff697b4c 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,9 +6,18 @@ # option. 
This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv deleted file mode 100644 index 915de4af..00000000 --- a/r/tests/real_data/config.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,value -path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ -path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 8b608773..d2eb08b2 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,8 +6,17 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. 
diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R deleted file mode 100644 index 2da80072..00000000 --- a/r/tests/real_data/test_helpers.R +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 BPCells contributors -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# Set up k-v pairs for other tests -config_csv <- read.csv("config.csv") -config <- as.list(config_csv)$value -names(config) <- as.list(config_csv)$key - -# Set up temp dir in case it's not already set -create_temp_dir <- function(dir = NULL) { - if (is.null(dir)) { - dir <- file.path(tempdir(), "lsi_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } - return(dir) -} \ No newline at end of file From ec2c1ed7fcb0924a624a820ad8154c00af070e0e Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 13:54:25 -0800 Subject: [PATCH 13/15] [r] fix requested changes --- r/R/singlecell_utils.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9b45cdb9..48c42933 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -106,6 +106,7 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) @@ -196,22 +197,20 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + feature_dispersion_norm = 
(dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order if (save_feat_selection) { # get rownames that are in features_df$name - feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( mat = mat[feats_of_interest,], feature_selection = features_df )) } return(mat[feats_of_interest,]) - #return(mat[features_df$name,]) } From 5280318944647bc254bb42811116e85c3127be01 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 3 Dec 2024 15:56:25 -0800 Subject: [PATCH 14/15] [r] fix lsi docstring, idf_ logic --- r/R/singlecell_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 48c42933..fea4d5dc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -78,7 +78,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. #' @return #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. 
@@ -106,10 +106,9 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) + idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations From 2d05edb9cfe2f6ec45d18af8c3efb2d89b517b8c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 6 Dec 2024 16:32:58 -0800 Subject: [PATCH 15/15] [r] replace z-score norm with corr cutoffs --- r/R/singlecell_utils.R | 31 ++++++++++++++++------------ r/tests/real_data/ArchR_LSI.R | 1 - r/tests/real_data/ArchR_insertions.R | 15 +++++++++++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index fea4d5dc..bdbb60b4 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -76,7 +76,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. 
@@ -84,6 +85,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If `save_lsi` is `TRUE`, return a list with the following elements: #' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth #' - `svd_attr`: List of SVD attributes #' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -93,7 +95,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' @export lsi <- function( mat, - n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -102,12 +104,13 @@ lsi <- function( assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / npeaks) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) @@ -116,22 +119,24 @@ lsi <- function( convert_matrix_type(mat_log_tfidf, type = "float"), tempfile("mat_log_tfidf"), compress = TRUE ) - # Z-score normalization - if (z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- 
mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } # Run pca svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + if(save_lsi) { return(list( pca_res = pca_res, svd_attr = svd_attr_, + unused_pcs = which(pca_corrs >= corr_cutoff), idf = idf_ )) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index ff697b4c..084645d3 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -55,7 +55,6 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Do not use z-score normalization, as this isn't done with ArchR lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - z_score_norm = FALSE, n_dimensions = 20, save_lsi = TRUE ) diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 810133c1..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -6,9 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
-source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG)