diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..304a2878 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -48,6 +48,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -55,6 +56,7 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index f14ddac8..432bb2f6 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,8 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). 
## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0fa1711d..bdbb60b4 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -1,4 +1,4 @@ -# Copyright 2023 BPCells contributors +# Copyright 2024 BPCells contributors # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -70,6 +70,154 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param threads (integer) Number of threads to use. +#' @return +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @export +lsi <- function( + mat, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) + idf_ <- 1 / mat_stats$row_stats["mean",] + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) + # Run pca + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + unused_pcs = which(pca_corrs >= corr_cutoff), + idf = idf_ + )) + } + return(pca_res) +} + +#' Get the most variable features within a matrix. +#' @param num_feats (integer) Number of features to return.
If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. +#' @returns +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- data.frame( + name = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order + if (save_feat_selection) { + # get rownames that are in features_df$name 
+ return(list( + mat = mat[feats_of_interest,], + feature_selection = features_df + )) + } + return(mat[feats_of_interest,]) +} + + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..8d36e2e1 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -922,4 +922,4 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { global_params = numeric(), vars_to_regress = vars_to_regress ) -} +} \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..005a4e37 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix.} +\usage{ +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +\itemize{ +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} +} +} +\description{ +Get the most variable features within a matrix. +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satija et al. 2015). + +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..ea2687f9 --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + scale_factor = 10000, + save_lsi = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_lsi}{(logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +\itemize{ +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}.
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} +} +} +\description{ +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index b1459570..c08f55f0 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -122,6 +122,8 @@ reference: - checksum - apply_by_row - regress_out + - lsi + - highly_variable_features - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..084645d3 --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,70 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. 
+#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 20, + binarize = FALSE, + outlierQuantiles = NULL + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + 
testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpreation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. + """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + 
expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8bae8966..8968d574 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -8,6 +8,12 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) { m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow) + rownames(m) <- paste0("feat", seq_len(nrow(m))) + colnames(m) <- paste0("cell", seq_len(ncol(m))) + as(m, "dgCMatrix") +} +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) rownames(m) <- paste0("row", seq_len(nrow(m))) colnames(m) <- paste0("col", seq_len(ncol(m))) as(m, "dgCMatrix") @@ -125,6 +131,39 @@ test_that("Pseudobulk aggregation works", { } }) +test_that("C++ UMAP graph calculation works", { + # This uses a pre-calculated graph from the umap-learn python package. + # See ../data/generate_iris_geodesic_graph.R for the generation code + test_data <- readRDS("../data/iris_geodesic_graph.rds") + knn <- test_data$knn + res <- knn_to_geodesic_graph(knn) + expect_equal(as.matrix(res + t(res)), as.matrix(test_data$graph), tolerance=1e-6) +}) + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) +}) + test_that("Pseudobulk aggregation works with multiple return types", { m0 <- generate_sparse_matrix(20, 10, max_val = 10) m1 <- m0 |> as("IterableMatrix")