From e4bf41d686aa3fb24c52bffb56d6002ec73cc150 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 13 Nov 2024 18:42:26 -0800 Subject: [PATCH 1/2] [r] add initial poc --- r/R/matrix_transformers.R | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 r/R/matrix_transformers.R diff --git a/r/R/matrix_transformers.R b/r/R/matrix_transformers.R new file mode 100644 index 00000000..e9d8133a --- /dev/null +++ b/r/R/matrix_transformers.R @@ -0,0 +1,84 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# #' Perform latent semantic indexing (LSI) on a matrix. +# #' @name LSITransformer +# #' @export +# setClass("LSITransformer", +# contains = "Transformer", +# slots = list( +# idf_ = "numeric", +# svd_attr_ = "list", +# z_score_norm = "logical", +# n_dimensions = "integer", +# scale_factor = "integer", +# threads = "integer" +# ), +# prototype = list( +# idf_ = numeric(0), +# svd_attr_ = list(), +# z_score_norm = FALSE, +# n_dimensions = 20L, +# scale_factor = 1e4L, +# threads = 1L +# ) +# ) + +# #' Create a new LSITransformer object +# #' @export +# LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) { +# return(new( +# "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, +# scale_factor = scale_factor, threads = threads, step_name = "LSITransformer")) +# } + +# setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { +# ret <- lsi( +# x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, +# scale_factor = object@scale_factor, threads = object@threads, +# save_lsi = TRUE +# ) +# object@idf_ <- ret$idf +# object@svd_attr_ <- ret$svd_attr +# object@fitted <- TRUE +# return(object) +# }) + +# setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { +# # rudimentary implementation -- Works but is duplicate code. +# assert_true(object@fitted) +# # Wait until LSI PR has been reviewed +# npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` +# tf <- x %>% multiply_cols(1 / npeaks) +# mat_tfidf <- tf %>% multiply_rows(object@idf_) +# mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf) +# mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) +# if (object@z_score_norm) { +# cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats +# cell_means <- cell_peak_stats["mean",] +# cell_vars <- cell_peak_stats["variance",] +# mat_log_tfidf <- mat_log_tfidf %>% +# add_cols(-cell_means) %>% +# multiply_cols(1 / cell_vars) +# } +# pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf +# return(pca_res) +# }) + +# setMethod("short_description", "LSITransformer", function(x) { +# return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)", +# x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads)) +# }) + +# setClass("VarFeatSelectorTransformer", +# contains = "Transformer", +# slots = list( +# num_feats = "integer", +# n_bins = "integer" +# ) +# ) \ No newline at end of file From ab3d71da73a0f74cbea820066e52f566be884d42 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 15:20:12 -0800 Subject: [PATCH 2/2] [r] add in feat-selection, lsi pipeline --- r/NAMESPACE | 6 + r/R/matrix_transformers.R | 198 ++++++++++++++++++---------- r/man/LSITransformer.Rd | 21 +++ r/man/VarFeatSelectorTransformer.Rd | 36 +++++ r/pkgdown/_pkgdown.yml | 5 + r/tests/testthat/test-pipelines.R | 53 ++++++++ 6 files changed, 248 insertions(+), 71 deletions(-) create mode 100644 r/man/LSITransformer.Rd create mode 100644 r/man/VarFeatSelectorTransformer.Rd create mode 100644 r/tests/testthat/test-pipelines.R diff --git a/r/NAMESPACE b/r/NAMESPACE index 55d4adef..f7e7e679 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -7,7 +7,9 @@ S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(LSITransformer) export(Pipeline) +export(VarFeatSelectorTransformer) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -121,6 +123,7 @@ export(shift_fragments) export(storage_order) export(subset_lengths) export(svds) +export(t) export(tile_matrix) export(trackplot_bulk) export(trackplot_combine) @@ -141,15 +144,18 @@ export(write_matrix_dir) export(write_matrix_hdf5) export(write_matrix_memory) exportClasses(Estimator) +exportClasses(LSITransformer) exportClasses(PipelineBase) exportClasses(PipelineStep) exportClasses(Transformer) +exportClasses(VarFeatSelectorTransformer) exportMethods(as.data.frame) exportMethods(as.matrix) exportMethods(c) exportMethods(estimate) exportMethods(fit) exportMethods(project) +exportMethods(short_description) exportMethods(show) exportMethods(t) importClassesFrom(Matrix,dgCMatrix) diff --git a/r/R/matrix_transformers.R b/r/R/matrix_transformers.R index e9d8133a..ba9cfb84 100644 --- a/r/R/matrix_transformers.R +++ b/r/R/matrix_transformers.R @@ -6,79 +6,135 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -# #' Perform latent semantic indexing (LSI) on a matrix. -# #' @name LSITransformer -# #' @export -# setClass("LSITransformer", -# contains = "Transformer", -# slots = list( -# idf_ = "numeric", -# svd_attr_ = "list", -# z_score_norm = "logical", -# n_dimensions = "integer", -# scale_factor = "integer", -# threads = "integer" -# ), -# prototype = list( -# idf_ = numeric(0), -# svd_attr_ = list(), -# z_score_norm = FALSE, -# n_dimensions = 20L, -# scale_factor = 1e4L, -# threads = 1L -# ) -# ) -# #' Create a new LSITransformer object -# #' @export -# LSITransformer <- function(z_score_norm, n_dimensions, scale_factor, threads) { -# return(new( -# "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, -# scale_factor = scale_factor, threads = threads, step_name = "LSITransformer")) -# } +#' Perform latent semantic indexing (LSI) on a matrix. +#' TODO: Add more details from when upstream LSI PR is reviewed. +#' @name LSITransformer +#' @export +setClass("LSITransformer", + contains = "Transformer", + slots = list( + idf_ = "numeric", + svd_attr_ = "list", + z_score_norm = "logical", + n_dimensions = "numeric", + scale_factor = "numeric", + threads = "integer" + ) +) -# setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { -# ret <- lsi( -# x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, -# scale_factor = object@scale_factor, threads = object@threads, -# save_lsi = TRUE -# ) -# object@idf_ <- ret$idf -# object@svd_attr_ <- ret$svd_attr -# object@fitted <- TRUE -# return(object) -# }) +#' Create a new LSITransformer object +#' @export +LSITransformer <- function(z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4L, threads = 1L) { + return(new( + "LSITransformer", z_score_norm = z_score_norm, n_dimensions = n_dimensions, + scale_factor = scale_factor, threads = threads, step_name = "LSITransformer")) +} -# setMethod("transform", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { -# # rudimentary implementation -- Works but is duplicate code. -# assert_true(object@fitted) -# # Wait until LSI PR has been reviewed -# npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` -# tf <- x %>% multiply_cols(1 / npeaks) -# mat_tfidf <- tf %>% multiply_rows(object@idf_) -# mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf) -# mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) -# if (object@z_score_norm) { -# cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats -# cell_means <- cell_peak_stats["mean",] -# cell_vars <- cell_peak_stats["variance",] -# mat_log_tfidf <- mat_log_tfidf %>% -# add_cols(-cell_means) %>% -# multiply_cols(1 / cell_vars) -# } -# pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf -# return(pca_res) -# }) +#' @noRd +#' @export +setMethod("fit", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { + ret <- lsi( + x, z_score_norm = object@z_score_norm, n_dimensions = object@n_dimensions, + scale_factor = object@scale_factor, threads = object@threads, + save_lsi = TRUE + ) + object@idf_ <- ret$idf + object@svd_attr_ <- ret$svd_attr + object@fitted <- TRUE + return(object) +}) -# setMethod("short_description", "LSITransformer", function(x) { -# return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)", -# x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads)) -# }) -# setClass("VarFeatSelectorTransformer", -# contains = "Transformer", -# slots = list( -# num_feats = "integer", -# n_bins = "integer" -# ) -# ) \ No newline at end of file +#' @noRd +#' @export +setMethod("project", signature(object = "LSITransformer", x = "IterableMatrix"), function(object, x, ...) { + # rudimentary implementation -- Works but is duplicate code. + assert_true(object@fitted) + # Wait until LSI PR has been reviewed + npeaks <- colSums(x) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- x %>% multiply_cols(1 / npeaks) + mat_tfidf <- tf %>% multiply_rows(object@idf_) + mat_log_tfidf <- log1p(object@scale_factor * mat_tfidf) + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + if (object@z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = object@threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } + pca_res <- t(object@svd_attr_$u) %*% mat_log_tfidf + return(pca_res) +}) + + +setMethod("short_description", "LSITransformer", function(x) { + return(sprintf("LSITransformer(z_score_norm=%s, n_dimensions=%d, scale_factor=%d, threads=%d)", + x@z_score_norm, x@n_dimensions, x@scale_factor, x@threads)) +}) + + +#' Perform feature selection on a matrix using dispersion. +#' TODO: Add more details from when upstream lsi PR is reviewed. +#' @name VarFeatSelectorTransformer +#' @export +setClass("VarFeatSelectorTransformer", + contains = "Transformer", + slots = list( + features_ = "character", + num_feats = "numeric", + n_bins = "numeric", + threads = "integer" + ) +) + +#' Get the most variable features within a matrix. +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @param threads (integer) Number of threads to use. +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +VarFeatSelectorTransformer <- function(num_feats, n_bins = 20L, threads = 1L) { + return(new( + "VarFeatSelectorTransformer", + num_feats = num_feats, + n_bins = n_bins, + threads = threads, + step_name = "VarFeatSelectorTransformer") + ) +} + +#' @noRd +#' @export +setMethod("fit", signature(object = "VarFeatSelectorTransformer", x = "IterableMatrix"), function(object, x, ...) { + # Not sure what we should do in the case that x does not have rownames + assert_true(!is.null(rownames(x))) + ret <- highly_variable_features(x, num_feats = object@num_feats, n_bins = object@n_bins, save_feat_selection = TRUE, threads = object@threads) + object@features_ <- ret$feature_selection$name + object@fitted <- TRUE + return(object) +}) + +#' @noRd +#' @export +setMethod("project", signature(object = "VarFeatSelectorTransformer", x = "IterableMatrix"), function(object, x, ...) { + assert_true(object@fitted) + return(x[object@features_,]) +}) + +#' @noRd +#' @export +setMethod("short_description", "VarFeatSelectorTransformer", function(x) { + return(sprintf("VarFeatSelectorTransformer(num_feats=%d, n_bins=%d, threads=%d)", + x@num_feats, x@n_bins, x@threads)) +}) diff --git a/r/man/LSITransformer.Rd b/r/man/LSITransformer.Rd new file mode 100644 index 00000000..f8c273a0 --- /dev/null +++ b/r/man/LSITransformer.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_transformers.R +\docType{class} +\name{LSITransformer} +\alias{LSITransformer} +\title{Perform latent semantic indexing (LSI) on a matrix. +TODO: Add more details from when upstream LSI PR is reviewed.} +\usage{ +LSITransformer( + z_score_norm = TRUE, + n_dimensions = 50L, + scale_factor = 10000L, + threads = 1L +) +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +TODO: Add more details from when upstream LSI PR is reviewed. + +Create a new LSITransformer object +} diff --git a/r/man/VarFeatSelectorTransformer.Rd b/r/man/VarFeatSelectorTransformer.Rd new file mode 100644 index 00000000..19f7648b --- /dev/null +++ b/r/man/VarFeatSelectorTransformer.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/matrix_transformers.R +\docType{class} +\name{VarFeatSelectorTransformer} +\alias{VarFeatSelectorTransformer} +\title{Perform feature selection on a matrix using dispersion. +TODO: Add more details from when upstream lsi PR is reviewed.} +\usage{ +VarFeatSelectorTransformer(num_feats, n_bins = 20L, threads = 1L) +} +\arguments{ +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\description{ +Perform feature selection on a matrix using dispersion. +TODO: Add more details from when upstream lsi PR is reviewed. + +Get the most variable features within a matrix. +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). + +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 038b1cdb..6d8096a7 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -192,3 +192,8 @@ reference: - estimate(PipelineBase,IterableMatrix) - project(PipelineBase,IterableMatrix) - c(PipelineBase) + +- subtitle: "Transformers" +- contents: + - LSITransformer + - VarFeatSelectorTransformer diff --git a/r/tests/testthat/test-pipelines.R b/r/tests/testthat/test-pipelines.R new file mode 100644 index 00000000..a212dd6e --- /dev/null +++ b/r/tests/testthat/test-pipelines.R @@ -0,0 +1,53 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = 10) { + m <- matrix(rbinom(nrow * ncol, 1, fraction_nonzero) * sample.int(max_val, nrow * ncol, replace = TRUE), nrow = nrow) + rownames(m) <- paste0("feat", seq_len(nrow(m))) + colnames(m) <- paste0("cell", seq_len(ncol(m))) + as(m, "dgCMatrix") +} + +test_that("Highly variable feature pipeline works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + hvf_transformer <- VarFeatSelectorTransformer(num_feats = 10, n_bins = 5) + res <- fit(hvf_transformer, mat) %>% project(mat) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) +}) + +test_that("LSI Pipeline works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + lsi_transformer <- LSITransformer(n_dimensions = 5) + res <- fit(lsi_transformer, mat) %>% project(mat) + expect_equal(nrow(res), 5) + expect_equal(ncol(res), ncol(mat)) +}) + +test_that("Pipeline with var feat selection and LSI works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + hvf_transformer <- VarFeatSelectorTransformer(num_feats = 25, n_bins = 5) + lsi_transformer <- LSITransformer(n_dimensions = 5) + # test pipeline creation with both `c()` and `Pipeline()` constructor + pipeline_1 <- c(hvf_transformer, lsi_transformer) + pipeline_2 <- Pipeline( + steps = list(hvf_transformer, lsi_transformer) + ) + assert_true(all.equal(pipeline_1, pipeline_2)) + res <- fit(pipeline_1, mat) %>% project(mat) + expect_equal(nrow(res), 5) + expect_equal(ncol(res), 26) +}) \ No newline at end of file