From e4bf41d686aa3fb24c52bffb56d6002ec73cc150 Mon Sep 17 00:00:00 2001
From: immanuelazn <immanuelazn@gmail.com>
Date: Wed, 13 Nov 2024 18:42:26 -0800
Subject: [PATCH 1/2] [r] add initial poc

---
 r/R/matrix_transformers.R | 84 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 r/R/matrix_transformers.R
diff --git a/r/R/matrix_transformers.R b/r/R/matrix_transformers.R
new file mode 100644
index 00000000..e9d8133a
--- /dev/null
+++ b/r/R/matrix_transformers.R
@@ -0,0 +1,84 @@
+# Copyright 2024 BPCells contributors
+# 
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# #' Perform latent semantic indexing (LSI) on a matrix.
+# #' @name LSITransformer
+# #' @export
+# setClass("LSITransformer",
+#   contains = "Transformer",
+#   slots = list(
+#     idf_ = "numeric",
+#     svd_attr_ = "list",
+#     z_score_norm = "logical",
+#     n_dimensions = "integer",
+#     scale_factor = "integer",
+#     threads = "integer"
+#   ),
+#   prototype = list(
+#     idf_ = numeric(0),
+#     svd_attr_ = list(),
+#     z_score_norm = FALSE,
+#     n_dimensions = 20L,
+#     scale_factor = 1e4L,
+#     threads = 1L
+#   )
+# )
+
+# #' Create a new LSITransformer object
+# #' @export
+# LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) {
+#   return(new(
+#     "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, 
+#     scale_factor = scale_factor, threads = threads, step_name = "LSITransformer"))
+# }
+
+# setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
+#   ret <- lsi(
+#     x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, 
+#     scale_factor = object@scale_factor, threads = object@threads,
+#     save_lsi = TRUE
+#   )
+#   object@idf_ <- ret$idf
+#   object@svd_attr_ <- ret$svd_attr
+#   object@fitted <- TRUE
+#   return(object)
+# })
+
+# setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
+#   # rudimentary implementation -- Works but is duplicate code.  
+#   assert_true(object@fitted)
+#   # Wait until LSI PR has been reviewed
+#   npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h`
+#   tf <- x %>% multiply_cols(1 / npeaks)
+#   mat_tfidf <- tf %>% multiply_rows(object@idf_)
+#   mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf)
+#   mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE)
+#   if (object@z_score_norm) {
+#     cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats
+#     cell_means <- cell_peak_stats["mean",]
+#     cell_vars <- cell_peak_stats["variance",]
+#     mat_log_tfidf <- mat_log_tfidf %>%
+#       add_cols(-cell_means) %>%
+#       multiply_cols(1 / cell_vars)
+#   }
+#   pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf
+#   return(pca_res)
+# })
+
+# setMethod("short_description", "LSITransformer", function(x) {
+#   return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)",
+#     x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads))
+# })
+
+# setClass("VarFeatSelectorTransformer",
+#   contains = "Transformer",
+#   slots = list(
+#     num_feats = "integer",
+#     n_bins = "integer"
+#   )
+# )
\ No newline at end of file

From ab3d71da73a0f74cbea820066e52f566be884d42 Mon Sep 17 00:00:00 2001
From: immanuelazn <immanuelazn@gmail.com>
Date: Mon, 18 Nov 2024 15:20:12 -0800
Subject: [PATCH 2/2] [r] add in feat-selection, lsi pipeline

---
 r/NAMESPACE                         |   6 +
 r/R/matrix_transformers.R           | 198 ++++++++++++++++++----------
 r/man/LSITransformer.Rd             |  21 +++
 r/man/VarFeatSelectorTransformer.Rd |  36 +++++
 r/pkgdown/_pkgdown.yml              |   5 +
 r/tests/testthat/test-pipelines.R   |  53 ++++++++
 6 files changed, 248 insertions(+), 71 deletions(-)
 create mode 100644 r/man/LSITransformer.Rd
 create mode 100644 r/man/VarFeatSelectorTransformer.Rd
 create mode 100644 r/tests/testthat/test-pipelines.R

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 55d4adef..f7e7e679 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -7,7 +7,9 @@ S3method(svds,default)
 export("all_matrix_inputs<-")
 export("cellNames<-")
 export("chrNames<-")
+export(LSITransformer)
 export(Pipeline)
+export(VarFeatSelectorTransformer)
 export(add_cols)
 export(add_rows)
 export(all_matrix_inputs)
@@ -121,6 +123,7 @@ export(shift_fragments)
 export(storage_order)
 export(subset_lengths)
 export(svds)
+export(t)
 export(tile_matrix)
 export(trackplot_bulk)
 export(trackplot_combine)
@@ -141,15 +144,18 @@ export(write_matrix_dir)
 export(write_matrix_hdf5)
 export(write_matrix_memory)
 exportClasses(Estimator)
+exportClasses(LSITransformer)
 exportClasses(PipelineBase)
 exportClasses(PipelineStep)
 exportClasses(Transformer)
+exportClasses(VarFeatSelectorTransformer)
 exportMethods(as.data.frame)
 exportMethods(as.matrix)
 exportMethods(c)
 exportMethods(estimate)
 exportMethods(fit)
 exportMethods(project)
+exportMethods(short_description)
 exportMethods(show)
 exportMethods(t)
 importClassesFrom(Matrix,dgCMatrix)
diff --git a/r/R/matrix_transformers.R b/r/R/matrix_transformers.R
index e9d8133a..ba9cfb84 100644
--- a/r/R/matrix_transformers.R
+++ b/r/R/matrix_transformers.R
@@ -6,79 +6,135 @@
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.
 
-# #' Perform latent semantic indexing (LSI) on a matrix.
-# #' @name LSITransformer
-# #' @export
-# setClass("LSITransformer",
-#   contains = "Transformer",
-#   slots = list(
-#     idf_ = "numeric",
-#     svd_attr_ = "list",
-#     z_score_norm = "logical",
-#     n_dimensions = "integer",
-#     scale_factor = "integer",
-#     threads = "integer"
-#   ),
-#   prototype = list(
-#     idf_ = numeric(0),
-#     svd_attr_ = list(),
-#     z_score_norm = FALSE,
-#     n_dimensions = 20L,
-#     scale_factor = 1e4L,
-#     threads = 1L
-#   )
-# )
 
-# #' Create a new LSITransformer object
-# #' @export
-# LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) {
-#   return(new(
-#     "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, 
-#     scale_factor = scale_factor, threads = threads, step_name = "LSITransformer"))
-# }
+#' Perform latent semantic indexing (LSI) on a matrix.
+#' TODO: Add more details from when upstream LSI PR is reviewed.
+#' @name LSITransformer
+#' @export
+setClass("LSITransformer",
+  contains = "Transformer",
+  slots = list(
+    idf_ = "numeric",
+    svd_attr_ = "list",
+    z_score_norm = "logical",
+    n_dimensions = "numeric",
+    scale_factor = "numeric",
+    threads = "integer"
+  )
+)
 
-# setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
-#   ret <- lsi(
-#     x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, 
-#     scale_factor = object@scale_factor, threads = object@threads,
-#     save_lsi = TRUE
-#   )
-#   object@idf_ <- ret$idf
-#   object@svd_attr_ <- ret$svd_attr
-#   object@fitted <- TRUE
-#   return(object)
-# })
+#' Create a new LSITransformer object
+#' @export
+LSITransformer <- function(z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4L, threads = 1L) {
+  return(new(
+    "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, 
+    scale_factor = scale_factor, threads = threads, step_name = "LSITransformer"))
+}
 
-# setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
-#   # rudimentary implementation -- Works but is duplicate code.  
-#   assert_true(object@fitted)
-#   # Wait until LSI PR has been reviewed
-#   npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h`
-#   tf <- x %>% multiply_cols(1 / npeaks)
-#   mat_tfidf <- tf %>% multiply_rows(object@idf_)
-#   mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf)
-#   mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE)
-#   if (object@z_score_norm) {
-#     cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats
-#     cell_means <- cell_peak_stats["mean",]
-#     cell_vars <- cell_peak_stats["variance",]
-#     mat_log_tfidf <- mat_log_tfidf %>%
-#       add_cols(-cell_means) %>%
-#       multiply_cols(1 / cell_vars)
-#   }
-#   pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf
-#   return(pca_res)
-# })
+#' @noRd
+#' @export
+setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
+  ret <- lsi(
+    x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, 
+    scale_factor = object@scale_factor, threads = object@threads,
+    save_lsi = TRUE
+  )
+  object@idf_ <- ret$idf
+  object@svd_attr_ <- ret$svd_attr
+  object@fitted <- TRUE
+  return(object)
+})
 
-# setMethod("short_description", "LSITransformer", function(x) {
-#   return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)",
-#     x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads))
-# })
 
-# setClass("VarFeatSelectorTransformer",
-#   contains = "Transformer",
-#   slots = list(
-#     num_feats = "integer",
-#     n_bins = "integer"
-#   )
-# )
\ No newline at end of file
+#' @noRd
+#' @export
+setMethod("project", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) {
+  # rudimentary implementation -- Works but is duplicate code.
+  assert_true(object@fitted)
+  # Wait until LSI PR has been reviewed
+  npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h`
+  tf <- x %>% multiply_cols(1 / npeaks)
+  mat_tfidf <- tf %>% multiply_rows(object@idf_)
+  mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf)
+  mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE)
+  if (object@z_score_norm) {
+    cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats
+    cell_means <- cell_peak_stats["mean",]
+    cell_vars <- cell_peak_stats["variance",]
+    mat_log_tfidf <- mat_log_tfidf %>%
+      add_cols(-cell_means) %>%
+      multiply_cols(1 / cell_vars)
+  }
+  pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf
+  return(pca_res)
+})
+
+
+setMethod("short_description", "LSITransformer", function(x) {
+  return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)",
+    x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads))
+})
+
+
+#' Perform feature selection on a matrix using dispersion.
+#' TODO: Add more details from when upstream lsi PR is reviewed.
+#' @name VarFeatSelectorTransformer
+#' @export
+setClass("VarFeatSelectorTransformer",
+  contains = "Transformer",
+  slots = list(
+    features_ = "character",
+    num_feats = "numeric",
+    n_bins = "numeric",
+    threads = "integer"
+  )
+)
+
+#' Get the most variable features within a matrix.
+#' @param num_feats (integer) Number of features to return.  If the number is higher than the number of features in the matrix,
+#' all features will be returned.
+#' @param n_bins (integer) Number of bins for binning mean gene expression.  Normalizing dispersion is done with respect to each bin,
+#' and if the number of features
+#' within a bin is less than 2, the dispersion is set to 1.
+#' @param threads (integer) Number of threads to use.
+#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015).
+#'
+#' Calculate using the following process:
+#'  1. Calculate the dispersion of each feature (variance / mean)
+#'  2. Log normalize dispersion and mean
+#'  3. Bin the features by their means, and normalize dispersion within each bin
+#' @export
+VarFeatSelectorTransformer <- function(num_feats, n_bins = 20L, threads = 1L) {
+  return(new(
+    "VarFeatSelectorTransformer",
+    num_feats = num_feats,
+    n_bins = n_bins,
+    threads = threads,
+    step_name = "VarFeatSelectorTransformer")
+  )
+}
+
+#' @noRd
+#' @export
+setMethod("fit", signature(object = "VarFeatSelectorTransformer", x = "IterableMatrix"), function(object, x, ...) {
+  # Not sure what we should do in the case that x does not have rownames
+  assert_true(!is.null(rownames(x)))
+  ret <- highly_variable_features(x, num_feats = object@num_feats, n_bins = object@n_bins, save_feat_selection = TRUE, threads = object@threads)
+  object@features_ <- ret$feature_selection$name
+  object@fitted <- TRUE
+  return(object)
+})
+
+#' @noRd
+#' @export
+setMethod("project", signature(object = "VarFeatSelectorTransformer", x = "IterableMatrix"), function(object, x, ...) {
+  assert_true(object@fitted)
+  return(x[object@features_,])
+})
+
+#' @noRd
+#' @export
+setMethod("short_description", "VarFeatSelectorTransformer", function(x) {
+  return(sprintf("VarFeatSelectorTransformer(num_feats=%d, n_bins=%d, threads=%d)",
+    x@num_feats, x@n_bins, x@threads))
+})
diff --git a/r/man/LSITransformer.Rd b/r/man/LSITransformer.Rd
new file mode 100644
index 00000000..f8c273a0
--- /dev/null
+++ b/r/man/LSITransformer.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/matrix_transformers.R
+\docType{class}
+\name{LSITransformer}
+\alias{LSITransformer}
+\title{Perform latent semantic indexing (LSI) on a matrix.
+TODO: Add more details from when upstream LSI PR is reviewed.}
+\usage{
+LSITransformer(
+  z_score_norm = TRUE,
+  n_dimensions = 50L,
+  scale_factor = 10000L,
+  threads = 1L
+)
+}
+\description{
+Perform latent semantic indexing (LSI) on a matrix.
+TODO: Add more details from when upstream LSI PR is reviewed.
+
+Create a new LSITransformer object
+}
diff --git a/r/man/VarFeatSelectorTransformer.Rd b/r/man/VarFeatSelectorTransformer.Rd
new file mode 100644
index 00000000..19f7648b
--- /dev/null
+++ b/r/man/VarFeatSelectorTransformer.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/matrix_transformers.R
+\docType{class}
+\name{VarFeatSelectorTransformer}
+\alias{VarFeatSelectorTransformer}
+\title{Perform feature selection on a matrix using dispersion.
+TODO: Add more details from when upstream lsi PR is reviewed.}
+\usage{
+VarFeatSelectorTransformer(num_feats, n_bins = 20L, threads = 1L)
+}
+\arguments{
+\item{num_feats}{(integer) Number of features to return.  If the number is higher than the number of features in the matrix,
+all features will be returned.}
+
+\item{n_bins}{(integer) Number of bins for binning mean gene expression.  Normalizing dispersion is done with respect to each bin,
+and if the number of features
+within a bin is less than 2, the dispersion is set to 1.}
+
+\item{threads}{(integer) Number of threads to use.}
+}
+\description{
+Perform feature selection on a matrix using dispersion.
+TODO: Add more details from when upstream lsi PR is reviewed.
+
+Get the most variable features within a matrix.
+}
+\details{
+The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015).
+
+Calculate using the following process:
+\enumerate{
+\item Calculate the dispersion of each feature (variance / mean)
+\item Log normalize dispersion and mean
+\item Bin the features by their means, and normalize dispersion within each bin
+}
+}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index 038b1cdb..6d8096a7 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -192,3 +192,8 @@ reference:
   - estimate(PipelineBase,IterableMatrix)
   - project(PipelineBase,IterableMatrix)
   - c(PipelineBase)
+
+- subtitle: "Transformers"
+- contents:
+  - LSITransformer
+  - VarFeatSelectorTransformer
diff --git a/r/tests/testthat/test-pipelines.R b/r/tests/testthat/test-pipelines.R
new file mode 100644
index 00000000..a212dd6e
--- /dev/null
+++ b/r/tests/testthat/test-pipelines.R
@@ -0,0 +1,53 @@
+# Copyright 2024 BPCells contributors
+# 
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) {
+  m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow)
+  rownames(m) <- paste0("feat", seq_len(nrow(m)))
+  colnames(m) <- paste0("cell", seq_len(ncol(m)))
+  as(m, "dgCMatrix")
+}
+
+test_that("Highly variable feature pipeline works", {
+  mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix")
+  rownames(mat) <- paste0("feat", seq_len(nrow(mat)))
+  colnames(mat) <- paste0("cell", seq_len(ncol(mat)))
+  # Test only that outputs are reasonable.  There is a full comparison in `tests/real_data/` that compares implementation to Seurat
+  hvf_transformer <- VarFeatSelectorTransformer(num_feats = 10, n_bins = 5)
+  res <- fit(hvf_transformer, mat) %>% project(mat)
+  expect_equal(nrow(res), 10)
+  expect_equal(ncol(res), 26)
+})
+
+test_that("LSI Pipeline works", {
+  mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix")
+  rownames(mat) <- paste0("feat", seq_len(nrow(mat)))
+  colnames(mat) <- paste0("cell", seq_len(ncol(mat)))
+  lsi_transformer <- LSITransformer(n_dimensions = 5)
+  res <- fit(lsi_transformer, mat) %>% project(mat)
+  expect_equal(nrow(res), 5)
+  expect_equal(ncol(res), ncol(mat))
+})
+
+test_that("Pipeline with var feat selection and LSI works", {
+  mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix")
+  rownames(mat) <- paste0("feat", seq_len(nrow(mat)))
+  colnames(mat) <- paste0("cell", seq_len(ncol(mat)))
+  # Test only that outputs are reasonable.  There is a full comparison in `tests/real_data/` that compares implementation to Seurat
+  hvf_transformer <- VarFeatSelectorTransformer(num_feats = 25, n_bins = 5)
+  lsi_transformer <- LSITransformer(n_dimensions = 5)
+  # test pipeline creation with both `c()` and `Pipeline()` constructor
+  pipeline_1 <- c(hvf_transformer, lsi_transformer)
+  pipeline_2 <- Pipeline(
+    steps = list(hvf_transformer, lsi_transformer)
+  )
+  assert_true(all.equal(pipeline_1, pipeline_2))
+  res <- fit(pipeline_1, mat) %>% project(mat)
+  expect_equal(nrow(res), 5)
+  expect_equal(ncol(res), 26)
+})
\ No newline at end of file