diff --git a/R/transformTFIDF.R b/R/transformTFIDF.R index e177c094..394d0ba1 100644 --- a/R/transformTFIDF.R +++ b/R/transformTFIDF.R @@ -7,12 +7,8 @@ #' #' @param mat n x p input matrix (n = samples/cells; p = rna counts) #' @param scale.factor Scaling factor for the term-frequency (TF) -#' @param count.min The minimum expression count used for TF-IDF. Binarizes -#' when `count.min` = 0 and `count.max` = 1. -#' @param count.max The maximum expression count used for TF-IDF. Binarizes -#' when `count.min` = 0 and `count.max` = 1. -#' binarizes the matrix. A `cap` value greater than 1 will cap counts at that -#' value. +#' @param binarize Whether to binarize the input matrix: any value > 0 is set +#' to 1 #' #' @return A TF-IDF transformed matrix of the same dimensions as the input #' @@ -26,9 +22,7 @@ #' tfidf <- transformTFIDF(mat) #' #' @export -transformTFIDF <- function(mat, scale.factor = 1e5, count.min = 0, count.max = 1) { - stopifnot("'count.min' must be less than 'count.max'" = count.min < count.max) - +transformTFIDF <- function(mat, scale.factor = 1e5, binarize = FALSE) { if (!is(mat, "matrix") & !is(mat, "Matrix")) { stop("Input needs to be a matrix.") } @@ -41,25 +35,20 @@ transformTFIDF <- function(mat, scale.factor = 1e5, count.min = 0, count.max = 1 mat.capped <- t(Matrix(mat, sparse = TRUE)) } - # constrain the matrix - mat.capped@x <- .constrain(mat.capped@x, count.min, count.max) + if (binarize) { + mat.capped@x <- .binarize(mat.capped@x) + } tf <- t(t(mat.capped) / Matrix::colSums(mat.capped)) # compute term-frequency tf@x <- log1p(tf@x * scale.factor) # scale idf <- log(1 + ncol(mat.capped) / Matrix::rowSums(mat.capped)) # inverse-document frequency smooth tfidf <- .tfidf(tf, idf) # transform # cast back to a matrix since things like UMAP don't like sparse matrices - as.matrix(t(tfidf)) + as.matrix(tfidf) } -# binarize when lower is 0 and upper is 1, constrain otherwise -.constrain <- function(v, lower, upper) { - if (lower == 0 & upper == 1) { - v[v > 0] <- 1 - } else { - v[v < lower] <- 0 - v[v > upper] <- upper - } +.binarize <- function(v) { + v[v > 0] <- 1 v } diff --git a/man/transformTFIDF.Rd b/man/transformTFIDF.Rd index f00771c3..7c12267e 100644 --- a/man/transformTFIDF.Rd +++ b/man/transformTFIDF.Rd @@ -4,20 +4,15 @@ \alias{transformTFIDF} \title{Transform/normalize counts using TF-IDF} \usage{ -transformTFIDF(mat, scale.factor = 100000, count.min = 0, count.max = 1) +transformTFIDF(mat, scale.factor = 100000, binarize = FALSE) } \arguments{ \item{mat}{n x p input matrix (n = samples/cells; p = rna counts)} \item{scale.factor}{Scaling factor for the term-frequency (TF)} -\item{count.min}{The minimum expression count used for TF-IDF. Binarizes -when \code{count.min} = 0 and \code{count.max} = 1.} - -\item{count.max}{The maximum expression count used for TF-IDF. Binarizes -when \code{count.min} = 0 and \code{count.max} = 1. -binarizes the matrix. A \code{cap} value greater than 1 will cap counts at that -value.} +\item{binarize}{Whether to binarize the input matrix: any value > 0 is set +to 1} } \value{ A TF-IDF transformed matrix of the same dimensions as the input diff --git a/tests/testthat/test-transformTFIDF.R b/tests/testthat/test-transformTFIDF.R index 27f97f2a..f4a8b7f2 100644 --- a/tests/testthat/test-transformTFIDF.R +++ b/tests/testthat/test-transformTFIDF.R @@ -5,22 +5,14 @@ test_that("transformTFIDF", { "Input needs to be a matrix", fixed = TRUE ) - expect_error( - transformTFIDF(0:10, count.min = 1, count.max = 0), - "'count.min' must be less than 'count.max'" - ) }) # }}} # .constrain {{{ -test_that(".constrain", { - expect_equal( - compartmap:::.constrain(0:10, lower = 0, upper = 1), - c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) - ) +test_that(".binarize", { expect_equal( - compartmap:::.constrain(0:10, lower = 2, upper = 5), - c(0, 0, 2, 3, 4, 5, 5, 5, 5, 5, 5) + compartmap:::.binarize(seq(0, 10, by = 0.5)), + c(0, rep(1, 20)) ) }) # }}} diff --git a/vignettes/compartmap.Rmd b/vignettes/compartmap.Rmd index 86324010..493b1ba1 100644 --- a/vignettes/compartmap.Rmd +++ b/vignettes/compartmap.Rmd @@ -270,7 +270,7 @@ data("k562_scrna_raw", package = "compartmap") k562_scrna_chr14_tfidf <- transformTFIDF(assay(k562_scrna_se_chr14)) # Add back the TF-IDF counts to the object in the counts slot -assay(k562_scrna_se_chr14, "counts") <- t(k562_scrna_chr14_tfidf) +assay(k562_scrna_se_chr14, "counts") <- k562_scrna_chr14_tfidf # Compute chromatin domains at the group level k562_scrna_chr14_raw_domains <- scCompartments(k562_scrna_se_chr14,