From 6a3d3ac972f96ccae8b391bd692718cc563e9faf Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Thu, 15 Jan 2026 15:13:32 +0200 Subject: [PATCH 01/11] Add implementation of binning transformation --- R/transformCounts.R | 62 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index 40bdbbef3..857ce1a53 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -36,6 +36,8 @@ #' to fill reference sample's column in returned assay when calculating alr. #' (Default: \code{NA}) #' \item \code{ref_vals} Deprecated. Use \code{reference} instead. +#' \item \code{bins}: \code{Numeric scalar}. For \code{"binning"}, specifies +#' the number of bins to use. (Default: \code{4}) #' \item \code{percentile}: \code{Numeric scalar} or \code{NULL} (css). Used #' to set the percentile value that calculates the scaling factors in the css #' normalization. If \code{NULL}, percentile is estimated from the data by @@ -269,7 +271,7 @@ setMethod("transformAssay", signature = c(x = "SingleCellExperiment"), .transform_assay <- function( x, assay.type = "counts", assay_name = NULL, method = c( - "alr", "chi.square", "clr", "css", "cutoff", "difference", "-", + "alr", "binning", "chi.square", "clr", "css", "cutoff", "difference", "-", "division", "/", "frequency", "hellinger", "invnorm", "log", "log10", "log2", "max", "normalize", "pa", "philr", "pseudocount", "range", "rank", "rclr", "relabundance", "rrank", "standardize", @@ -325,8 +327,8 @@ setMethod("transformAssay", signature = c(x = "SingleCellExperiment"), attr(assay, "pseudocount") <- NULL # Calls help function that does the transformation # Help function is different for mia and vegan transformations - if( method %in% c( - "log10", "log2", "css", "difference", "division", "invnorm") ){ + if( method %in% c("binning", "log10", "log2", "css", "difference", + "division", "invnorm") ){ transformed_table <- .apply_transformation( assay, 
method, MARGIN, ...) } else if( method %in% c("philr") ){ @@ -360,6 +362,7 @@ setMethod("transformAssay", signature = c(x = "SingleCellExperiment"), # Function is selected based on the "method" variable FUN <- switch( method, + binning = .apply_binning, log10 = .calc_log, log2 = .calc_log, css = .calc_css, @@ -907,6 +910,59 @@ NULL return(res) } +################################ .apply_binning ################################ +# This function divides the data into a specified number of bins. +.apply_binning <- function(mat, bins = 4, ...){ + # Check that bins is a single positive numeric value + if( !.is_a_numeric(bins) || bins <= 0 ){ + stop("'bins' must be a single positive numeric value.", call. = FALSE) + } + bins <- as.integer(bins) + + # Apply binning + res <- apply(mat, MARGIN = 2, function(x) { + # Initialize result with 0 (for zero values) + res <- rep(0, length(x)) + + # Identify non-zero values + is_nonzero <- x != 0 + n_nonzero <- sum(is_nonzero) + + if (n_nonzero > 0) { + # Get indices of non-zero values + nonzero_indices <- which(is_nonzero) + nonzero_values <- x[nonzero_indices] + + # Sort indices based on values (descending) + ord <- order(nonzero_values, decreasing = TRUE) + + # Calculate bin assignments + if( n_nonzero < bins ){ + # For samples with fewer than B non-zero abundance species, + # species are distributed proportionally across bins 1 through B + bin_values <- round(seq(from = bins, to = 1, length.out = n_nonzero)) + } else { + # Ranks 1 to n_nonzero + # Formula: bin = B - ceiling(rank * B / N) + 1 + ranks <- seq_len(n_nonzero) + bin_values <- bins - ceiling(ranks * bins / n_nonzero) + 1 + } + + # Assign back + res[nonzero_indices[ord]] <- bin_values + } + return(res) + }) + + # Ensure dimensions are preserved (apply simplifies to vector if dim is 1) + if( is.null(dim(res)) && !is.null(dim(mat)) ){ + dim(res) <- dim(mat) + } + dimnames(res) <- dimnames(mat) + + return(res) +} + # This function is used to add transformed table back to 
TreeSE. With most of # the methods it is simple: it is added to assay. However, with transformations # that change the dimensionality (e.g. philr, difference, division), the From b7b03ddb406a8855ba82ca25b7c193b694c327e4 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Thu, 15 Jan 2026 15:14:19 +0200 Subject: [PATCH 02/11] Add unit tests for binning transformation --- tests/testthat/test-5transformCounts.R | 43 ++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/testthat/test-5transformCounts.R b/tests/testthat/test-5transformCounts.R index bb9587179..be536104c 100644 --- a/tests/testthat/test-5transformCounts.R +++ b/tests/testthat/test-5transformCounts.R @@ -423,6 +423,49 @@ test_that("transformAssay", { "'value' must be a single numeric value or NA" ) + ############################## BINNING ############################### + # Test that binning transformation works + tse_bin <- transformAssay(tse, method = "binning", bins = 3) + # Check that the assay was created + expect_true("binning" %in% assayNames(tse_bin)) + + # Check that values are between 0 and 3 + binned_assay <- assay(tse_bin, "binning") + expect_true(all(binned_assay >= 0 & binned_assay <= 3, na.rm = TRUE)) + + # Check that 0s are 0 + counts <- assay(tse, "counts") + expect_true(all(binned_assay[counts == 0] == 0)) + + # Check non-zeros are > 0 + expect_true(all(binned_assay[counts != 0] > 0)) + + # Manual check for N < B case + # 2 non-zero values, 4 bins. Should map to 4 and 1. 
+ test_mat <- matrix(c(10, 5, 0, 0), ncol=1) + tse_test <- SummarizedExperiment(assays = list(counts = test_mat)) + tse_test <- transformAssay(tse_test, method = "binning", bins = 4) + expect_equal(as.vector(assay(tse_test, "binning")), c(4, 1, 0, 0)) + + # Test error + expect_error(transformAssay(tse, method = "binning", bins = "a")) + expect_error(transformAssay(tse, method = "binning", bins = 0)) + + # Test feature-wise binning + tse_bin_feat <- transformAssay(tse, method = "binning", bins = 3, MARGIN = "features") + expect_equal(dim(assay(tse_bin_feat, "binning")), dim(assay(tse, "counts"))) + + # Manual check for feature-wise + # 2 features, 3 samples + mat_feat <- matrix(c(10, 5, 0, 20, 0, 10), nrow=2, byrow=TRUE) + tse_feat <- SummarizedExperiment(assays = list(counts = mat_feat)) + tse_feat <- transformAssay(tse_feat, method = "binning", bins = 3, MARGIN = "features") + res_feat <- assay(tse_feat, "binning") + + # Check rows (features) + expect_equal(as.vector(res_feat[1,]), c(3, 1, 0)) + expect_equal(as.vector(res_feat[2,]), c(3, 0, 1)) + ############################## DIFFERENCE ############################# # Test that difference transformation works on GlobalPatterns subset # Load data From a586a75fe0510d547a430b15d9347556064e98d0 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 21 Jan 2026 13:59:32 +0200 Subject: [PATCH 03/11] rename argument bins to nbins this renaming aligns with other functions in the package --- R/transformCounts.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index 857ce1a53..56d7d4e53 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -912,12 +912,12 @@ NULL ################################ .apply_binning ################################ # This function divides the data into a specified number of bins. 
-.apply_binning <- function(mat, bins = 4, ...){ - # Check that bins is a single positive numeric value - if( !.is_a_numeric(bins) || bins <= 0 ){ - stop("'bins' must be a single positive numeric value.", call. = FALSE) +.apply_binning <- function(mat, nbins = 4, ...){ + # Check that nbins is a single positive numeric value + if( !.is_a_numeric(nbins) || nbins <= 0 ){ + stop("'nbins' must be a single positive numeric value.", call. = FALSE) } - bins <- as.integer(bins) + nbins <- as.integer(nbins) # Apply binning res <- apply(mat, MARGIN = 2, function(x) { @@ -937,15 +937,15 @@ NULL ord <- order(nonzero_values, decreasing = TRUE) # Calculate bin assignments - if( n_nonzero < bins ){ + if( n_nonzero < nbins ){ # For samples with fewer than B non-zero abundance species, - # species are distributed proportionally across bins 1 through B - bin_values <- round(seq(from = bins, to = 1, length.out = n_nonzero)) + # species are distributed proportionally across nbins 1 through B + bin_values <- round(seq(from = nbins, to = 1, length.out = n_nonzero)) } else { # Ranks 1 to n_nonzero # Formula: bin = B - ceiling(rank * B / N) + 1 ranks <- seq_len(n_nonzero) - bin_values <- bins - ceiling(ranks * bins / n_nonzero) + 1 + bin_values <- nbins - ceiling(ranks * nbins / n_nonzero) + 1 } # Assign back From 0e0a48d700cd6f3d101d7a67afad12a6492496bc Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 21 Jan 2026 14:01:07 +0200 Subject: [PATCH 04/11] missed rename of bins -> nbins --- R/transformCounts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index 56d7d4e53..7cdf96761 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -36,7 +36,7 @@ #' to fill reference sample's column in returned assay when calculating alr. #' (Default: \code{NA}) #' \item \code{ref_vals} Deprecated. Use \code{reference} instead. -#' \item \code{bins}: \code{Numeric scalar}. 
For \code{"binning"}, specifies +#' \item \code{nbins}: \code{Numeric scalar}. For \code{"binning"}, specifies #' the number of bins to use. (Default: \code{4}) #' \item \code{percentile}: \code{Numeric scalar} or \code{NULL} (css). Used #' to set the percentile value that calculates the scaling factors in the css From 4691df1b74a756de100607a8dd47e4fb4d9d4486 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 21 Jan 2026 14:42:18 +0200 Subject: [PATCH 05/11] rename bins to nbins in unit test as well --- man/transformAssay.Rd | 2 ++ tests/testthat/test-5transformCounts.R | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/man/transformAssay.Rd b/man/transformAssay.Rd index e13f18532..a9b543ebf 100644 --- a/man/transformAssay.Rd +++ b/man/transformAssay.Rd @@ -31,6 +31,8 @@ or \code{philr::philr}. to fill reference sample's column in returned assay when calculating alr. (Default: \code{NA}) \item \code{ref_vals} Deprecated. Use \code{reference} instead. +\item \code{nbins}: \code{Numeric scalar}. For \code{"binning"}, specifies +the number of bins to use. (Default: \code{4}) \item \code{percentile}: \code{Numeric scalar} or \code{NULL} (css). Used to set the percentile value that calculates the scaling factors in the css normalization. 
If \code{NULL}, percentile is estimated from the data by diff --git a/tests/testthat/test-5transformCounts.R b/tests/testthat/test-5transformCounts.R index be536104c..487ec4036 100644 --- a/tests/testthat/test-5transformCounts.R +++ b/tests/testthat/test-5transformCounts.R @@ -425,7 +425,7 @@ test_that("transformAssay", { ############################## BINNING ############################### # Test that binning transformation works - tse_bin <- transformAssay(tse, method = "binning", bins = 3) + tse_bin <- transformAssay(tse, method = "binning", nbins = 3) # Check that the assay was created expect_true("binning" %in% assayNames(tse_bin)) @@ -441,25 +441,25 @@ test_that("transformAssay", { expect_true(all(binned_assay[counts != 0] > 0)) # Manual check for N < B case - # 2 non-zero values, 4 bins. Should map to 4 and 1. + # 2 non-zero values, 4 nbins. Should map to 4 and 1. test_mat <- matrix(c(10, 5, 0, 0), ncol=1) tse_test <- SummarizedExperiment(assays = list(counts = test_mat)) - tse_test <- transformAssay(tse_test, method = "binning", bins = 4) + tse_test <- transformAssay(tse_test, method = "binning", nbins = 4) expect_equal(as.vector(assay(tse_test, "binning")), c(4, 1, 0, 0)) # Test error - expect_error(transformAssay(tse, method = "binning", bins = "a")) - expect_error(transformAssay(tse, method = "binning", bins = 0)) + expect_error(transformAssay(tse, method = "binning", nbins = "a")) + expect_error(transformAssay(tse, method = "binning", nbins = 0)) # Test feature-wise binning - tse_bin_feat <- transformAssay(tse, method = "binning", bins = 3, MARGIN = "features") + tse_bin_feat <- transformAssay(tse, method = "binning", nbins = 3, MARGIN = "features") expect_equal(dim(assay(tse_bin_feat, "binning")), dim(assay(tse, "counts"))) # Manual check for feature-wise # 2 features, 3 samples mat_feat <- matrix(c(10, 5, 0, 20, 0, 10), nrow=2, byrow=TRUE) tse_feat <- SummarizedExperiment(assays = list(counts = mat_feat)) - tse_feat <- transformAssay(tse_feat, method 
= "binning", bins = 3, MARGIN = "features") + tse_feat <- transformAssay(tse_feat, method = "binning", nbins = 3, MARGIN = "features") res_feat <- assay(tse_feat, "binning") # Check rows (features) From 6ed957cb33c9c350a9e42e55f5665ee3b91e116a Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Wed, 21 Jan 2026 14:52:58 +0200 Subject: [PATCH 06/11] Add unittest for binning negative values --- R/transformCounts.R | 5 +++++ tests/testthat/test-5transformCounts.R | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/R/transformCounts.R b/R/transformCounts.R index 7cdf96761..f30e5cc50 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -919,6 +919,11 @@ NULL } nbins <- as.integer(nbins) + # Check does not contain negative numeric values + if( !all(mat >= 0) ) { + warning("The assay contains negative numeric values. Resulting bins will not make sense.") + } + # Apply binning res <- apply(mat, MARGIN = 2, function(x) { # Initialize result with 0 (for zero values) diff --git a/tests/testthat/test-5transformCounts.R b/tests/testthat/test-5transformCounts.R index 487ec4036..5f5494010 100644 --- a/tests/testthat/test-5transformCounts.R +++ b/tests/testthat/test-5transformCounts.R @@ -426,6 +426,11 @@ test_that("transformAssay", { ############################## BINNING ############################### # Test that binning transformation works tse_bin <- transformAssay(tse, method = "binning", nbins = 3) + + # Expect warning trying to bin negative values + tse <- transformAssay(tse, method = "rclr") + expect_warning(transformAssay(tse, method = "binning", assay.type = "rclr")) + # Check that the assay was created expect_true("binning" %in% assayNames(tse_bin)) From 9a4f8646496cc91fe4881e88e6488b55c61c8a46 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Thu, 22 Jan 2026 11:49:51 +0200 Subject: [PATCH 07/11] add mention of option to function documentation --- R/transformCounts.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/transformCounts.R 
b/R/transformCounts.R index f30e5cc50..fa3e80c4b 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -128,6 +128,12 @@ #' are taken into account. This method keeps only values greater than #' \code{threshold} and replaces all other values with \code{value}. #' +#' \item 'binning': Binning of the abundance values into a specified number of +#' bins. The non-zero values are sorted and divided into \code{nbins} groups of +#' equal size (quantiles). The group with the highest abundances is assigned to +#' bin \code{nbins}, while the group with the lowest non-zero abundances is +#' assigned to bin 1. Zero values are assigned to bin 0. +#' #' } #' #' @return From 3d7cef6fa32f824d76e91fd622c4ffa5fa1d3509 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Thu, 22 Jan 2026 12:04:56 +0200 Subject: [PATCH 08/11] warning to error on attempt to bin negative values --- R/transformCounts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index fa3e80c4b..9bdbf0a86 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -927,7 +927,7 @@ NULL # Check does not contain negative numeric values if( !all(mat >= 0) ) { - warning("The assay contains negative numeric values. Resulting bins will not make sense.") + stop("The assay contains negative numeric values. Resulting bins will not make sense.", call. 
= FALSE) } # Apply binning From 3cd1f4034b3c8e55ece3653aec913545f587fe51 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Thu, 22 Jan 2026 13:21:16 +0200 Subject: [PATCH 09/11] fix test --- tests/testthat/test-5transformCounts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-5transformCounts.R b/tests/testthat/test-5transformCounts.R index 5f5494010..e61f0068a 100644 --- a/tests/testthat/test-5transformCounts.R +++ b/tests/testthat/test-5transformCounts.R @@ -429,7 +429,7 @@ test_that("transformAssay", { # Expect warning trying to bin negative values tse <- transformAssay(tse, method = "rclr") - expect_warning(transformAssay(tse, method = "binning", assay.type = "rclr")) + expect_error(transformAssay(tse, method = "binning", assay.type = "rclr")) # Check that the assay was created expect_true("binning" %in% assayNames(tse_bin)) From 1d4c092a769067ed31473f6fbe7deae72f7c115c Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 10 Feb 2026 15:56:23 +0200 Subject: [PATCH 10/11] add lit ref --- R/transformCounts.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index 9bdbf0a86..6e1b37128 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -132,7 +132,8 @@ #' bins. The non-zero values are sorted and divided into \code{nbins} groups of #' equal size (quantiles). The group with the highest abundances is assigned to #' bin \code{nbins}, while the group with the lowest non-zero abundances is -#' assigned to bin 1. Zero values are assigned to bin 0. +#' assigned to bin 1. Zero values are assigned to bin 0. This binning approach +#' is based on the binning strategy described by Medearis et al. (2026). #' #' } #' @@ -148,6 +149,11 @@ #' _Nature Methods_ 10, 1200–1202. #' doi:10.1038/nmeth.2658 #' +#' Medearis, N. A., Zhu, S., & Zomorrodi, A. R. (2026). 
+#' BiomeGPT: A foundation model for the human gut microbiome +#' _bioRxiv_ +#' doi:10.64898/2026.01.05.697599 +#' #' @seealso #' \itemize{ #' \item \code{\link[vegan:decostand]{vegan::decostand}} From 7597721ceeb41133d116adb5f876a4a3c8da2878 Mon Sep 17 00:00:00 2001 From: raivo-otus Date: Tue, 10 Feb 2026 16:18:35 +0200 Subject: [PATCH 11/11] implementation of logic with cut --- R/transformCounts.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R/transformCounts.R b/R/transformCounts.R index 6e1b37128..9cb7d49cc 100644 --- a/R/transformCounts.R +++ b/R/transformCounts.R @@ -959,10 +959,13 @@ NULL # species are distributed proportionally across nbins 1 through B bin_values <- round(seq(from = nbins, to = 1, length.out = n_nonzero)) } else { - # Ranks 1 to n_nonzero - # Formula: bin = B - ceiling(rank * B / N) + 1 - ranks <- seq_len(n_nonzero) - bin_values <- nbins - ceiling(ranks * nbins / n_nonzero) + 1 + # Get cut points + bin_idx <- cut( + seq_len(n_nonzero), + breaks = seq(0, n_nonzero, length.out = nbins + 1), + labels = FALSE + ) + bin_values <- nbins - bin_idx + 1 } # Assign back