From 6c907e6a234e7e2368104f432d160ae2305877b5 Mon Sep 17 00:00:00 2001 From: James Eapen Date: Tue, 28 Oct 2025 12:37:55 -0400 Subject: [PATCH 1/2] fix(transformTFIDF): revert count.min/max for just binarizing option allows for a raw tfidf or binarized tfidf --- R/transformTFIDF.R | 27 ++++++++------------------- man/transformTFIDF.Rd | 11 +++-------- tests/testthat/test-transformTFIDF.R | 14 +++----------- 3 files changed, 14 insertions(+), 38 deletions(-) diff --git a/R/transformTFIDF.R b/R/transformTFIDF.R index e177c094..75268c50 100644 --- a/R/transformTFIDF.R +++ b/R/transformTFIDF.R @@ -7,12 +7,8 @@ #' #' @param mat n x p input matrix (n = samples/cells; p = rna counts) #' @param scale.factor Scaling factor for the term-frequency (TF) -#' @param count.min The minimum expression count used for TF-IDF. Binarizes -#' when `count.min` = 0 and `count.max` = 1. -#' @param count.max The maximum expression count used for TF-IDF. Binarizes -#' when `count.min` = 0 and `count.max` = 1. -#' binarizes the matrix. A `cap` value greater than 1 will cap counts at that -#' value. +#' @param binarize Whether to binarize the input matrix: any value > 0 is set +#' to 1 #' #' @return A TF-IDF transformed matrix of the same dimensions as the input #' @@ -26,9 +22,7 @@ #' tfidf <- transformTFIDF(mat) #' #' @export -transformTFIDF <- function(mat, scale.factor = 1e5, count.min = 0, count.max = 1) { - stopifnot("'count.min' must be less than 'count.max'" = count.min < count.max) - +transformTFIDF <- function(mat, scale.factor = 1e5, binarize = FALSE) { if (!is(mat, "matrix") & !is(mat, "Matrix")) { stop("Input needs to be a matrix.") } @@ -41,8 +35,9 @@ transformTFIDF <- function(mat, scale.factor = 1e5, count.min = 0, count.max = 1 mat.capped <- t(Matrix(mat, sparse = TRUE)) } - # constrain the matrix - mat.capped@x <- .constrain(mat.capped@x, count.min, count.max) + if (binarize) { + mat.capped@x <- .binarize(mat.capped@x) + } tf <- t(t(mat.capped) / Matrix::colSums(mat.capped)) # compute term-frequency tf@x <- log1p(tf@x * scale.factor) # scale idf <- log(1 + ncol(mat.capped) / Matrix::rowSums(mat.capped)) # inverse-document frequency smooth @@ -52,14 +47,8 @@ transformTFIDF <- function(mat, scale.factor = 1e5, count.min = 0, count.max = 1 as.matrix(t(tfidf)) } -# binarize when lower is 0 and upper is 1, constrain otherwise -.constrain <- function(v, lower, upper) { - if (lower == 0 & upper == 1) { - v[v > 0] <- 1 - } else { - v[v < lower] <- 0 - v[v > upper] <- upper - } +.binarize <- function(v) { + v[v > 0] <- 1 v } diff --git a/man/transformTFIDF.Rd b/man/transformTFIDF.Rd index f00771c3..7c12267e 100644 --- a/man/transformTFIDF.Rd +++ b/man/transformTFIDF.Rd @@ -4,20 +4,15 @@ \alias{transformTFIDF} \title{Transform/normalize counts using TF-IDF} \usage{ -transformTFIDF(mat, scale.factor = 100000, count.min = 0, count.max = 1) +transformTFIDF(mat, scale.factor = 100000, binarize = FALSE) } \arguments{ \item{mat}{n x p input matrix (n = samples/cells; p = rna counts)} \item{scale.factor}{Scaling factor for the term-frequency (TF)} -\item{count.min}{The minimum expression count used for TF-IDF. Binarizes -when \code{count.min} = 0 and \code{count.max} = 1.} - -\item{count.max}{The maximum expression count used for TF-IDF. Binarizes -when \code{count.min} = 0 and \code{count.max} = 1. -binarizes the matrix. A \code{cap} value greater than 1 will cap counts at that -value.} +\item{binarize}{Whether to binarize the input matrix: any value > 0 is set +to 1} } \value{ A TF-IDF transformed matrix of the same dimensions as the input diff --git a/tests/testthat/test-transformTFIDF.R b/tests/testthat/test-transformTFIDF.R index 27f97f2a..f4a8b7f2 100644 --- a/tests/testthat/test-transformTFIDF.R +++ b/tests/testthat/test-transformTFIDF.R @@ -5,22 +5,14 @@ test_that("transformTFIDF", { "Input needs to be a matrix", fixed = TRUE ) - expect_error( - transformTFIDF(0:10, count.min = 1, count.max = 0), - "'count.min' must be less than 'count.max'" - ) }) # }}} # .constrain {{{ -test_that(".constrain", { - expect_equal( - compartmap:::.constrain(0:10, lower = 0, upper = 1), - c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) - ) +test_that(".binarize", { expect_equal( - compartmap:::.constrain(0:10, lower = 2, upper = 5), - c(0, 0, 2, 3, 4, 5, 5, 5, 5, 5, 5) + compartmap:::.binarize(seq(0, 10, by = 0.5)), + c(0, rep(1, 20)) ) }) # }}} From 55394cd5775c043ea7acf11db86a56fc9dea772e Mon Sep 17 00:00:00 2001 From: James Eapen Date: Tue, 28 Oct 2025 12:38:40 -0400 Subject: [PATCH 2/2] fix(transformTFIDF): no transpose for result of tfidf it would need to be transposed again to put in the assay slot --- R/transformTFIDF.R | 2 +- vignettes/compartmap.Rmd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/transformTFIDF.R b/R/transformTFIDF.R index 75268c50..394d0ba1 100644 --- a/R/transformTFIDF.R +++ b/R/transformTFIDF.R @@ -44,7 +44,7 @@ transformTFIDF <- function(mat, scale.factor = 1e5, binarize = FALSE) { tfidf <- .tfidf(tf, idf) # transform # cast back to a matrix since things like UMAP don't like sparse matrices - as.matrix(t(tfidf)) + as.matrix(tfidf) } .binarize <- function(v) { diff --git a/vignettes/compartmap.Rmd b/vignettes/compartmap.Rmd index 86324010..493b1ba1 100644 --- a/vignettes/compartmap.Rmd +++ b/vignettes/compartmap.Rmd @@ -270,7 +270,7 @@ data("k562_scrna_raw", package = "compartmap") k562_scrna_chr14_tfidf <- transformTFIDF(assay(k562_scrna_se_chr14)) # Add back the TF-IDF counts to the object in the counts slot -assay(k562_scrna_se_chr14, "counts") <- t(k562_scrna_chr14_tfidf) +assay(k562_scrna_se_chr14, "counts") <- k562_scrna_chr14_tfidf # Compute chromatin domains at the group level k562_scrna_chr14_raw_domains <- scCompartments(k562_scrna_se_chr14,