From c28f1c8aca69b660afd0c0fb9c0e8258e48faf09 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:25:24 +0200 Subject: [PATCH 01/44] Remove dependency on deprecated zlibbioc --- DESCRIPTION | 1 - NAMESPACE | 1 - R/digestFastqs.R | 1 - src/Makevars | 1 + src/Makevars.win | 1 + 5 files changed, 2 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 790ad41..60698a0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,6 @@ Imports: DT, ggrepel, IRanges, - zlibbioc, utils, DelayedArray, lifecycle diff --git a/NAMESPACE b/NAMESPACE index 230375d..445df33 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,6 @@ export(plotTotals) export(plotVolcano) export(relabelMutPositions) export(summarizeExperiment) -import(zlibbioc) importFrom(BiocGenerics,paste) importFrom(BiocGenerics,rownames) importFrom(DT,datatable) diff --git a/R/digestFastqs.R b/R/digestFastqs.R index fe43ec0..b70d1c5 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -333,7 +333,6 @@ #' out$errorStatistics #' #' @export -#' @import zlibbioc #' @importFrom lifecycle deprecated is_present deprecate_warn digestFastqs <- function(fastqForward, fastqReverse = NULL, mergeForwardReverse = FALSE, minOverlap = 0, maxOverlap = 0, diff --git a/src/Makevars b/src/Makevars index a4b6954..8b30577 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,3 +1,4 @@ +PKG_LIBS+=-lz CXX_STD=CXX11 CXX1X=$(shell ${R_HOME}/bin/R CMD config CXX11) ifeq ($(shell $(CXX1X) -fopenmp -E -xc++ - 2>&1 >/dev/null && echo 'true'), true) diff --git a/src/Makevars.win b/src/Makevars.win index 350dcb9..9f7bdc6 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,3 +1,4 @@ +PKG_LIBS+=-lz CXX_STD=CXX11 CXX1X=$(shell ${R_HOME}/bin${R_ARCH_BIN}/R CMD config CXX11) PKG_LIBS+=$(shell $(R_HOME)/bin/Rscript.exe -e "Rcpp:::LdFlags()") From bf206bb64c5c978c8c317a29ee8407592c263a17 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:27:25 +0200 Subject: [PATCH 02/44] Add 
BiocManager to Suggests to address R CMD check warning --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 60698a0..b8e6b13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -55,7 +55,8 @@ Suggests: Biostrings, pwalign, plotly, - scattermore + scattermore, + BiocManager SystemRequirements: GNU make biocViews: GeneticVariability, GenomicVariation, Preprocessing License: MIT + file LICENSE From 370e6ab3b3959985c130ebf594ba7b1294e23a42 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:41:46 +0200 Subject: [PATCH 03/44] Escape braces in documentation --- R/digestFastqs.R | 8 ++++---- R/relabelMutPositions.R | 6 +++--- man/digestFastqs.Rd | 8 ++++---- man/relabelMutPositions.Rd | 6 +++--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/digestFastqs.R b/R/digestFastqs.R index b70d1c5..704094d 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -37,9 +37,9 @@ #' the codons encoded by #' \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. #' \item Assign a 'mutation name' to the read (pair). This name is a -#' combination of parts of the form XX{.}YY{.}NNN, where XX is the name of the +#' combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of the #' most similar reference sequence, YY is the mutated codon number, and NNN is -#' the mutated codon. {.} is a delimiter, specified via +#' the mutated codon. \{.\} is a delimiter, specified via #' \code{mutNameDelimiter}. If no wildtype sequences are provided, the #' variable sequence will be used as the mutation name'. #'} @@ -146,11 +146,11 @@ #' any mutated base has a Phred score lower than \code{mutatedPhredMin}, #' the read (pair) will be discarded. #' @param mutNameDelimiter Character scalar, the delimiter used in the naming -#' of mutants. Generally, mutants will be named as XX{.}YY{.}NNN, where XX +#' of mutants. 
Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where XX #' is the closest provided reference sequence, YY is the mutated base or #' codon number (depending on whether \code{nbrMutatedBases*} or #' \code{nbrMutatedCodons*} is specified), and NNN is the -#' mutated base or codon. Here, {.} is the provided \code{mutNameDelimiter}. +#' mutated base or codon. Here, \{.\} is the provided \code{mutNameDelimiter}. #' The delimiter must be a single character (not "_"), and can not appear #' in any of the provided reference sequence names. #' @param constantMaxDistForward,constantMaxDistReverse Numeric scalars, the diff --git a/R/relabelMutPositions.R b/R/relabelMutPositions.R index 2a14d6f..c9247ab 100644 --- a/R/relabelMutPositions.R +++ b/R/relabelMutPositions.R @@ -1,9 +1,9 @@ #' Relabel the positions of mutations in the designated ID #' #' @param se SummarizedExperiment object, with row names of the form -#' XX{.}AA{.}NNN, where XX is the name of the reference sequence, AA is the +#' XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, AA is the #' position of the mutated codon, and NNN is the mutated codon or amino -#' acid. {.} is the delimiter, to be specified in the +#' acid. \{.\} is the delimiter, to be specified in the #' \code{mutNameDelimiter} argument. For rows corresponding to sequences #' with multiple mutated codons, the row names contain multiple names of #' the form above in a single string, separated by "_". @@ -15,7 +15,7 @@ #' \item name The new name for the codon (will replace AA in the mutation #' name, if the reference sequence matches seqname) #' } -#' @param mutNameDelimiter The delimiter used in the mutation name ({.} above). +#' @param mutNameDelimiter The delimiter used in the mutation name (\{.\} above). 
#' #' @author Charlotte Soneson #' @export diff --git a/man/digestFastqs.Rd b/man/digestFastqs.Rd index a871108..38052a3 100644 --- a/man/digestFastqs.Rd +++ b/man/digestFastqs.Rd @@ -181,11 +181,11 @@ any mutated base has a Phred score lower than \code{mutatedPhredMin}, the read (pair) will be discarded.} \item{mutNameDelimiter}{Character scalar, the delimiter used in the naming -of mutants. Generally, mutants will be named as XX{.}YY{.}NNN, where XX +of mutants. Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where XX is the closest provided reference sequence, YY is the mutated base or codon number (depending on whether \code{nbrMutatedBases*} or \code{nbrMutatedCodons*} is specified), and NNN is the -mutated base or codon. Here, {.} is the provided \code{mutNameDelimiter}. +mutated base or codon. Here, \{.\} is the provided \code{mutNameDelimiter}. The delimiter must be a single character (not "_"), and can not appear in any of the provided reference sequence names.} @@ -302,9 +302,9 @@ The processing of a read pair goes as follows: the codons encoded by \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. \item Assign a 'mutation name' to the read (pair). This name is a - combination of parts of the form XX{.}YY{.}NNN, where XX is the name of the + combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of the most similar reference sequence, YY is the mutated codon number, and NNN is - the mutated codon. {.} is a delimiter, specified via + the mutated codon. \{.\} is a delimiter, specified via \code{mutNameDelimiter}. If no wildtype sequences are provided, the variable sequence will be used as the mutation name'. 
} diff --git a/man/relabelMutPositions.Rd b/man/relabelMutPositions.Rd index f52db8b..1c32607 100644 --- a/man/relabelMutPositions.Rd +++ b/man/relabelMutPositions.Rd @@ -8,9 +8,9 @@ relabelMutPositions(se, conversionTable, mutNameDelimiter = ".") } \arguments{ \item{se}{SummarizedExperiment object, with row names of the form -XX{.}AA{.}NNN, where XX is the name of the reference sequence, AA is the +XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, AA is the position of the mutated codon, and NNN is the mutated codon or amino -acid. {.} is the delimiter, to be specified in the +acid. \{.\} is the delimiter, to be specified in the \code{mutNameDelimiter} argument. For rows corresponding to sequences with multiple mutated codons, the row names contain multiple names of the form above in a single string, separated by "_".} @@ -24,7 +24,7 @@ mutation name) name, if the reference sequence matches seqname) }} -\item{mutNameDelimiter}{The delimiter used in the mutation name ({.} above).} +\item{mutNameDelimiter}{The delimiter used in the mutation name (\{.\} above).} } \value{ A SummarizedExperiment object with modified row names. 
From 7e790a34f27cb788bf03efb0d32e4b68d7273889 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:42:02 +0200 Subject: [PATCH 04/44] Test against Bioc devel on GHA --- .github/workflows/R-CMD-check.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 021392e..db676a0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,9 +18,9 @@ jobs: fail-fast: false matrix: config: - - { os: macOS-latest, bioc: 'release', curlConfigPath: '/usr/bin/'} - - { os: windows-latest, bioc: 'release'} - - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_20", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} + - { os: macOS-latest, bioc: 'devel', curlConfigPath: '/usr/bin/'} + - { os: windows-latest, bioc: 'devel'} + - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:devel", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true From 9d6d7c9b11951aa56e3363f73420624da4942172 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:42:31 +0200 Subject: [PATCH 05/44] Depend on R>=4.5.0 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b8e6b13..40085c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,7 @@ Description: Provides functionality for processing and statistical analysis files to publication-ready visualizations. A broad range of library designs can be processed with a single, unified interface. 
Depends: - R (>= 3.5) + R (>= 4.5.0) Imports: BiocGenerics, S4Vectors, From dcf21367334ee159434f04a044c9bc7e1a00218d Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 16:42:42 +0200 Subject: [PATCH 06/44] Add URL and BugReports fields --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 40085c2..5c545e7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,3 +66,5 @@ VignetteBuilder: knitr LinkingTo: Rcpp Config/testthat/edition: 3 +URL: https://github.com/fmicompbio/mutscan +BugReports: https://github.com/fmicompbio/mutscan/issues From 03bab67cc7c3e230f175502880f2898f29de3cd4 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 19:22:59 +0200 Subject: [PATCH 07/44] Cleanup, utils.R --- R/utils.R | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/R/utils.R b/R/utils.R index 38852e2..7e40be8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -74,11 +74,13 @@ } args <- lapply(mycall, as.character)[-1] xname <- if ("x" %in% names(args)) args$x else "argument" - + ## Check arguments stopifnot(is.null(type) || (length(type) == 1L && is.character(type))) - stopifnot(is.null(rngIncl) || (length(rngIncl) == 2L && is.numeric(rngIncl))) - stopifnot(is.null(rngExcl) || (length(rngExcl) == 2L && is.numeric(rngExcl))) + stopifnot(is.null(rngIncl) || (length(rngIncl) == 2L && + is.numeric(rngIncl))) + stopifnot(is.null(rngExcl) || (length(rngExcl) == 2L && + is.numeric(rngExcl))) stopifnot(is.null(len) || (length(len) == 1L && is.numeric(len))) stopifnot(is.null(rngLen) || (length(rngLen) == 2L && is.numeric(rngLen))) stopifnot(is.logical(allowNULL) && length(allowNULL) == 1L) @@ -107,13 +109,14 @@ type <- "numeric" } - if (!is.null(type) && !methods::is(x, type)) { + if (!is.null(type) && !is(x, type)) { stop("'", xname, "' must be of class '", type, "'", call. 
= FALSE) } if (!is.null(rngIncl)) { if (!is.null(validValues)) { - if (any((x < rngIncl[1] | x > rngIncl[2]) & !(x %in% validValues))) { + if (any((x < rngIncl[1] | x > rngIncl[2]) & + !(x %in% validValues))) { stop("'", xname, "' must be within [", rngIncl[1], ",", rngIncl[2], "] (inclusive), or one of: ", vvPrint, call. = FALSE) @@ -126,7 +129,8 @@ } } else if (!is.null(rngExcl)) { if (!is.null(validValues)) { - if (any((x <= rngExcl[1] | x >= rngExcl[2]) & !(x %in% validValues))) { + if (any((x <= rngExcl[1] | x >= rngExcl[2]) & + !(x %in% validValues))) { stop("'", xname, "' must be within (", rngExcl[1], ",", rngExcl[2], ") (exclusive), or one of: ", vvPrint, call. = FALSE) @@ -144,11 +148,11 @@ } } - + if (!is.null(len) && length(x) != len) { stop("'", xname, "' must have length ", len, call. = FALSE) } - + if (!is.null(rngLen) && (length(x) < rngLen[1] || length(x) > rngLen[2])) { stop("length of '", xname, "' must be within [", rngLen[1], ",", rngLen[2], "] (inclusive)", call. = FALSE) @@ -190,20 +194,22 @@ caller <- deparse(sys.calls()[[sys.nframe() - 1]]) callerfunc <- sub("\\(.+$", "", caller) haveBioc <- requireNamespace("BiocManager", quietly = TRUE) - msg <- paste0("The package", ifelse(sum(!avail) > 1, "s '", " '"), - paste(sub("^[^/]+/", "", pkgs[!avail]), collapse = "', '"), - "' ", - ifelse(sum(!avail) > 1, "are", "is"), " required for ", - callerfunc, "(), but not installed.\n") + msg <- paste0( + "The package", ifelse(sum(!avail) > 1, "s '", " '"), + paste(sub("^[^/]+/", "", pkgs[!avail]), collapse = "', '"), + "' ", + ifelse(sum(!avail) > 1, "are", "is"), " required for ", + callerfunc, "(), but not installed.\n") if (suggestInstallation) { - msg <- paste0(msg, - "Install ", ifelse(sum(!avail) > 1, "them", "it"), " using:\n", - ifelse(haveBioc, "", "install.packages(\"BiocManager\")\n"), - "BiocManager::install(c(\"", - paste(pkgs[!avail], collapse = "\", \""), "\"))") + msg <- paste0( + msg, + "Install ", ifelse(sum(!avail) > 1, "them", "it"), 
" using:\n", + ifelse(haveBioc, "", "install.packages(\"BiocManager\")\n"), + "BiocManager::install(c(\"", + paste(pkgs[!avail], collapse = "\", \""), "\"))") } stop(msg, call. = FALSE) } invisible(TRUE) -} \ No newline at end of file +} From 70643385ce092c9c58be63d30b4b157e293c2a8e Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:22:32 +0200 Subject: [PATCH 08/44] Cleanup, vignette --- vignettes/mutscan.Rmd | 124 ++++++++++++++++++++++-------------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/vignettes/mutscan.Rmd b/vignettes/mutscan.Rmd index a83c002..0d1dec7 100644 --- a/vignettes/mutscan.Rmd +++ b/vignettes/mutscan.Rmd @@ -102,33 +102,34 @@ As an example, a read with the following composition: would be specified to `digestFastqs()` by an elements string `"SUCV"`, and an element length vector `c(1, 10, 18, 96)`. -The package can also accommodate designs with primer sequences. In this situation, -the primer acts as an 'anchor', and the read composition before and after the -primer is specified. For example, a read with the following composition: +The package can also accommodate designs with primer sequences. In this +situation, the primer acts as an 'anchor', and the read composition before and +after the primer is specified. For example, a read with the following +composition: [unknown sequence] - [10 nt primer] - [variable region, constituting the remainder of the read] would be represented by an elements string `"SPV"`, and an element length vector `c(-1, 10, -1)`, where the -1 indicates that the corresponding read part -consists of the remaining part of the read, not accounted for by any of the other -specified parts. In addition, the sequence of the primer must be specified, and -any read where the primer is not present will be discarded. +consists of the remaining part of the read, not accounted for by any of the +other specified parts. 
In addition, the sequence of the primer must be +specified, and any read where the primer is not present will be discarded. The forward and reverse reads can have different compositions. The user can also specify whether the variable parts of the forward and/or -reverse reads should be reverse complemented before being processed, and whether the -variable regions of the forward and reverse reads should be merged into a single -consensus sequence. +reverse reads should be reverse complemented before being processed, and +whether the variable regions of the forward and reverse reads should be merged +into a single consensus sequence. ## Filtering -In addition to reading the FASTQ files, the `digestFastqs()` function will perform -a series of filtering steps, in the following order: +In addition to reading the FASTQ files, the `digestFastqs()` function will +perform a series of filtering steps, in the following order: * Search for perfect matches to forward/reverse adapter sequences, filter out the read pair if a match is found in either the forward or reverse read. -* Search for perfect matches to forward/reverse primer sequences, filter out the -read pair if not both are found. +* Search for perfect matches to forward/reverse primer sequences, filter out +the read pair if not both are found. * Filter out reads whose length is not compatible with the indicated composition. * If forward and reverse variable regions should be merged, filter out read @@ -161,9 +162,9 @@ other reasons). # Processing TRANS data Here, we illustrate the processing of the provided TRANS experiment example -data. We filter out reads with any adapter match, an average phred quality below -20, any Ns in the UMI or variable sequence, more than one mutated codon, or a -mutated codon ending with A or T (represented by the `NNW` value of the +data. 
We filter out reads with any adapter match, an average phred quality +below 20, any Ns in the UMI or variable sequence, more than one mutated codon, +or a mutated codon ending with A or T (represented by the `NNW` value of the `forbiddenMutatedCodonsForward` and `forbiddenMutatedCodonsReverse` arguments). In order to identify the number of mutations, we have to provide the wild type sequences of _FOS_ and _JUN_ (forward and reverse reads, respectively). This @@ -203,9 +204,9 @@ transInput <- digestFastqs( ) ``` -The `digestFastqs()` function returns a list with four elements. The `parameters` -list records all parameter values used during the processing, as well as the -`mutscan` version and time of processing. +The `digestFastqs()` function returns a list with four elements. The +`parameters` list records all parameter values used during the processing, as +well as the `mutscan` version and time of processing. ```{r} transInput$parameters @@ -213,8 +214,8 @@ transInput$parameters The `filterSummary` data.frame contains a summary of the number of reads filtered out in each step. Note that for any given filtering step, only the -reads retained by the previous filters are considered. The numbers in the filter -column names indicate the order of the filters. +reads retained by the previous filters are considered. The numbers in the +filter column names indicate the order of the filters. ```{r} transInput$filterSummary @@ -225,13 +226,14 @@ each variable sequence pair. The underscore in the strings in the sequence column indicate the separation between the forward and reverse wild type sequences. In addition, the table contains a column `mutantName`, which provides a shorthand notation for each mutant. 
The format of the values in this -column is a combination of `WTS.xx.NNN` (separated by `_`), where `WTS` provides -the name of the closest matching wild type sequence (if only one, unnamed wild -type sequence is provided, the name will be `f/r` depending on if it corresponds -to the forward/reverse read, respectively). The `xx` part indicates the mutated -codon or nucleotide number, and `NNN` gives the observed sequence for the mutated -codon or nucleotide. A sequence without mutations is named `WTS.0.WT`, where, -again, `WTS` is the name of the matching wild type sequence. +column is a combination of `WTS.xx.NNN` (separated by `_`), where `WTS` +provides the name of the closest matching wild type sequence (if only one, +unnamed wild type sequence is provided, the name will be `f/r` depending on +if it corresponds to the forward/reverse read, respectively). The `xx` part +indicates the mutated codon or nucleotide number, and `NNN` gives the +observed sequence for the mutated codon or nucleotide. A sequence without +mutations is named `WTS.0.WT`, where, again, `WTS` is the name of the +matching wild type sequence. 
```{r} head(transInput$summaryTable) @@ -249,9 +251,11 @@ The `errorStatistics` output can be used to estimate the sequencing error rate: ```{r errStats} (propErrorsConstantF <- sum(transInput$errorStatistics$nbrMismatchForward) / - (nchar(transInput$parameters$constantForward) * transInput$filterSummary$nbrRetained)) + (nchar(transInput$parameters$constantForward) * + transInput$filterSummary$nbrRetained)) (propErrorsConstantR <- sum(transInput$errorStatistics$nbrMismatchReverse) / - (nchar(transInput$parameters$constantReverse) * transInput$filterSummary$nbrRetained)) + (nchar(transInput$parameters$constantReverse) * + transInput$filterSummary$nbrRetained)) ``` # Processing CIS data @@ -314,8 +318,8 @@ cisInput$filterSummary cisInput$errorStatistics[rowSums(cisInput$errorStatistics[, -1]) != 0, ] ``` -The `summaryTable` now provides the number of reads and unique UMIs observed for -each variable sequence, and all values in the `mutNames` column will start +The `summaryTable` now provides the number of reads and unique UMIs observed +for each variable sequence, and all values in the `mutNames` column will start with `FOS`. ```{r} @@ -382,12 +386,12 @@ leu <- c( ) ``` -Next we process the data, not allowing any mismatches in the forward read, but 1 -mismatching codon in the reverse read. Now, we assume that the variable sequence -starts immediately after the provided primers, and hence we don't specify any -UMI/constant sequence lengths. For the forward read, the variable region is -taken to be the remainder of the read (after the primer), whereas for the -reverse read, we specify the variable sequence length to 96. +Next we process the data, not allowing any mismatches in the forward read, but +1 mismatching codon in the reverse read. Now, we assume that the variable +sequence starts immediately after the provided primers, and hence we don't +specify any UMI/constant sequence lengths. 
For the forward read, the variable +region is taken to be the remainder of the read (after the primer), whereas +for the reverse read, we specify the variable sequence length to 96. ```{r} leujunt0 <- digestFastqs( @@ -503,8 +507,8 @@ metadata(se)$countType # Collapsing count matrix to amino acids -The \link{\code[SummarizedExperiment]{SummarizedExperiment}} object generated by -`summarizeExperiment()` contains one row for each observed variant +The \link{\code[SummarizedExperiment]{SummarizedExperiment}} object generated +by `summarizeExperiment()` contains one row for each observed variant (combination). This can be further collapsed by replacing the mutated codon by the corresponding amino acid, and aggregating the counts corresponding to the same mutated amino acid (combination). @@ -567,15 +571,15 @@ generateQCReport(se, outFile = tempfile(fileext = ".html")) # Calculating fitness scores -The function `calculateFitnessScore()` can be used to calculate fitness scores as -described in [@Diss2018]. The function requires the user to specify a `pairingCol`, -containing the replicate ID for each sample; one or more `ODCols`, containing -the optical density for each sample, and a `comparison`, which is a character -vector of length 3 specifying the comparison to perform, of the form -(`groupColumn`, `numerator`, `denominator`). Here, `groupColumn` is the name of -the column in `colData(se)` that contains the grouping information, and -`numerator` and `denominator` specify the values of this column representing the -two groups to be compared. +The function `calculateFitnessScore()` can be used to calculate fitness scores +as described in [@Diss2018]. 
The function requires the user to specify a +`pairingCol`, containing the replicate ID for each sample; one or more +`ODCols`, containing the optical density for each sample, and a +`comparison`, which is a character vector of length 3 specifying the +comparison to perform, of the form (`groupColumn`, `numerator`, +`denominator`). Here, `groupColumn` is the name of the column in `colData(se)` +that contains the grouping information, and `numerator` and `denominator` +specify the values of this column representing the two groups to be compared. Here, we illustrate the application of `calculateFitnessScore()` on the SummarizedExperiment object containing all the three CIS @@ -583,10 +587,12 @@ replicates from [@Diss2018]. ```{r} se_collapsed <- collapseMutantsByAA(se) -ppis <- calculateFitnessScore(se = se_collapsed, pairingCol = "Replicate", - ODCols = c("OD1", "OD2"), - comparison = c("Condition", "cis_output", "cis_input"), - WTrows = "f.0.WT") +ppis <- calculateFitnessScore( + se = se_collapsed, pairingCol = "Replicate", + ODCols = c("OD1", "OD2"), + comparison = c("Condition", "cis_output", "cis_input"), + WTrows = "f.0.WT" +) head(ppis[order(abs(rowMeans(ppis)), decreasing = TRUE), , drop = FALSE]) ## The PPI score for the WT sequence is 1, by construction @@ -595,12 +601,12 @@ ppis["f.0.WT", , drop = FALSE] # Scoring mutations with `edgeR` or `limma` -As an alternative to the fitness scoring, `mutscan` can be used to model the observed -counts using a generalized linear model (with `edgeR`) or a general linear model -(with `limma`) and estimate a logFC and a p-value for the enrichment of each -variant betwen two conditions (or more generally, in association with any -predictor), compared to one or more WT sequences. Note that for this, at least -two replicates are required. 
+As an alternative to the fitness scoring, `mutscan` can be used to model the +observed counts using a generalized linear model (with `edgeR`) or a general +linear model (with `limma`) and estimate a logFC and a p-value for the +enrichment of each variant betwen two conditions (or more generally, in +association with any predictor), compared to one or more WT sequences. Note +that for this, at least two replicates are required. We start by looking at the design matrix, in order to determine which of the coefficients to specify for the testing in `calculateRelativeFC()`. For more From ef8df50ffc02aab6bb5a190b6c79979b883f9f60 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:27:27 +0200 Subject: [PATCH 09/44] Cleanup, generateQCReport.R --- DESCRIPTION | 3 +- NAMESPACE | 1 + R/generateQCReport.R | 14 +++++---- inst/templates/qc_report_template.Rmd | 45 +++++++++++++++------------ 4 files changed, 36 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5c545e7..1d88953 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,8 @@ Imports: IRanges, utils, DelayedArray, - lifecycle + lifecycle, + tools Suggests: testthat (>= 3.0.0), BiocStyle, diff --git a/NAMESPACE b/NAMESPACE index 445df33..422b793 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -119,6 +119,7 @@ importFrom(tidyr,separate) importFrom(tidyr,separate_rows) importFrom(tidyr,unite) importFrom(tidyselect,matches) +importFrom(tools,file_ext) importFrom(utils,globalVariables) importFrom(utils,relist) importFrom(xfun,Rscript_call) diff --git a/R/generateQCReport.R b/R/generateQCReport.R index e0d973f..6338977 100644 --- a/R/generateQCReport.R +++ b/R/generateQCReport.R @@ -23,6 +23,7 @@ #' @importFrom SummarizedExperiment colData #' @importFrom dplyr bind_rows full_join #' @importFrom S4Vectors metadata +#' @importFrom tools file_ext #' #' @examples #' ## Load SummarizedExperiment object @@ -43,7 +44,7 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan 
QC report", .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = outFile, type = "character") .assertScalar(x = forceOverwrite, type = "logical") - if (tools::file_ext(outFile) != "html") { + if (file_ext(outFile) != "html") { stop("'outFile' must have the file extension '.html'.") } outDir <- dirname(outFile) @@ -52,9 +53,11 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan QC report", } if (file.exists(outFile)) { if (!forceOverwrite) { - stop(outFile, " already exists and forceOverwrite = FALSE, stopping.") + stop(outFile, " already exists and forceOverwrite = FALSE, ", + "stopping.") } else { - message(outFile, " already exists but forceOverwrite = TRUE, overwriting.") + message(outFile, " already exists but forceOverwrite = TRUE, ", + "overwriting.") } } @@ -77,9 +80,8 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan QC report", } args <- c(args, list(...)[setdiff(...names(), names(args))]) - outputReport <- xfun::Rscript_call( - rmarkdown::render, - args + outputReport <- Rscript_call( + render, args ) ## --------------------------------------------------------------------- ## diff --git a/inst/templates/qc_report_template.Rmd b/inst/templates/qc_report_template.Rmd index 1279198..6ebe3d8 100644 --- a/inst/templates/qc_report_template.Rmd +++ b/inst/templates/qc_report_template.Rmd @@ -53,14 +53,15 @@ extradf <- do.call( function(x) paste(x, collapse = "; "))) }) ) -DT::datatable(dplyr::full_join(tmpdf, extradf, by = "Name"), - extensions = "FixedColumns", - rownames = FALSE, - options = list( - paging = TRUE, searching = TRUE, info = FALSE, - pageLength = 20, - sort = TRUE, scrollX = TRUE, fixedColumns = list(leftColumns = 1) - )) %>% +DT::datatable( + dplyr::full_join(tmpdf, extradf, by = "Name"), + extensions = "FixedColumns", + rownames = FALSE, + options = list( + paging = TRUE, searching = TRUE, info = FALSE, + pageLength = 20, + sort = TRUE, scrollX = TRUE, fixedColumns = list(leftColumns = 1) + )) 
%>% formatStyle("Name", "vertical-align" = "center") ``` @@ -69,15 +70,17 @@ reads remaining after each filtering step performed by `mutscan`, as well as the fraction of reads that are filtered out by each step. ```{r, fig.height = 1.5 * ncol(params$se), fig.width = 10} -tryCatch({mutscan::plotFiltering(params$se, valueType = "reads", onlyActiveFilters = TRUE, - displayNumbers = TRUE, plotType = "remaining", facetBy = "sample")}, - error = function(e) message("Couldn't generate filtering plots.")) +tryCatch({mutscan::plotFiltering( + params$se, valueType = "reads", onlyActiveFilters = TRUE, + displayNumbers = TRUE, plotType = "remaining", facetBy = "sample")}, + error = function(e) message("Couldn't generate filtering plots.")) ``` ```{r, fig.height = 1.5 * ncol(params$se), fig.width = 10} -tryCatch({mutscan::plotFiltering(params$se, valueType = "fractions", onlyActiveFilters = TRUE, - displayNumbers = TRUE, plotType = "filtered", facetBy = "step")}, - error = function(e) message("Couldn't generate filtering plots.")) +tryCatch({mutscan::plotFiltering( + params$se, valueType = "fractions", onlyActiveFilters = TRUE, + displayNumbers = TRUE, plotType = "filtered", facetBy = "step")}, + error = function(e) message("Couldn't generate filtering plots.")) ``` # Pairs plot @@ -109,15 +112,17 @@ arranged in decreasing order, and as a density plot. Note that both plots display the values on a log scale; hence any zero values are not shown. 
```{r, warning = FALSE} -tryCatch({mutscan::plotDistributions(params$se, selAssay = "counts", pseudocount = 1, - groupBy = NULL, plotType = "knee", facet = FALSE)}, - error = function(e) message("Couldn't generate distribution plot.")) +tryCatch({mutscan::plotDistributions( + params$se, selAssay = "counts", pseudocount = 1, + groupBy = NULL, plotType = "knee", facet = FALSE)}, + error = function(e) message("Couldn't generate distribution plot.")) ``` ```{r, warning = FALSE} -tryCatch({mutscan::plotDistributions(params$se, selAssay = "counts", pseudocount = 1, - groupBy = NULL, plotType = "density", facet = FALSE)}, - error = function(e) message("Couldn't generate distribution plot.")) +tryCatch({mutscan::plotDistributions( + params$se, selAssay = "counts", pseudocount = 1, + groupBy = NULL, plotType = "density", facet = FALSE)}, + error = function(e) message("Couldn't generate distribution plot.")) ``` # Session info From 838bf64a440d6812046914caada74da785d4197b Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:32:08 +0200 Subject: [PATCH 10/44] Cleanup, plotTotals.R --- R/plotTotals.R | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/R/plotTotals.R b/R/plotTotals.R index 7bcf732..30649d9 100644 --- a/R/plotTotals.R +++ b/R/plotTotals.R @@ -15,7 +15,7 @@ #' #' @importFrom ggplot2 ggplot theme_minimal theme element_text labs #' geom_bar scale_fill_discrete aes -#' @importFrom SummarizedExperiment assay rowData +#' @importFrom SummarizedExperiment assay rowData assayNames #' @importFrom rlang .data #' #' @examples @@ -26,23 +26,25 @@ plotTotals <- function(se, selAssay = "counts", groupBy = NULL) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) if (!is.null(groupBy)) { - .assertScalar(x = groupBy, type = "character", - validValues = 
colnames(SummarizedExperiment::rowData(se))) + .assertScalar( + x = groupBy, type = "character", + validValues = colnames(rowData(se))) } - + ## Extract the assay matrix and reformat relevant parts of it for plotting - selAssayMat <- SummarizedExperiment::assay(se, selAssay) + selAssayMat <- assay(se, selAssay) if (!is.null(groupBy)) { df <- do.call( rbind, - lapply(unique(SummarizedExperiment::rowData(se)[[groupBy]]), + lapply(unique(rowData(se)[[groupBy]]), function(rn) { data.frame( names = colnames(se), category = rn, - total = colSums(selAssayMat[SummarizedExperiment::rowData(se)[[groupBy]] == rn, ]) + total = colSums( + selAssayMat[rowData(se)[[groupBy]] == rn, ]) ) }) ) @@ -52,21 +54,22 @@ plotTotals <- function(se, selAssay = "counts", groupBy = NULL) { total = colSums(selAssayMat)) } - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$names, y = .data$total)) + - ggplot2::theme_minimal() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, - vjust = 0.5, size = 12), - axis.text.y = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14)) + - ggplot2::labs(x = "", y = paste0("Total (", selAssay, ")")) + gg <- ggplot(df, aes(x = .data$names, y = .data$total)) + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 90, hjust = 1, + vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), + axis.title = element_text(size = 14)) + + labs(x = "", y = paste0("Total (", selAssay, ")")) if (!is.null(groupBy)) { gg <- gg + - ggplot2::geom_bar(stat = "identity", position = "stack", - ggplot2::aes(fill = .data$category)) + - ggplot2::scale_fill_discrete(name = groupBy) + geom_bar(stat = "identity", position = "stack", + aes(fill = .data$category)) + + scale_fill_discrete(name = groupBy) } else { gg <- gg + - ggplot2::geom_bar(stat = "identity", position = "stack") + geom_bar(stat = "identity", position = "stack") } gg From bed42f6f07bd7ac711b521d452888aa1030c0b5c Mon Sep 17 00:00:00 2001 From: 
Charlotte Soneson Date: Sun, 1 Jun 2025 21:35:39 +0200 Subject: [PATCH 11/44] Cleanup, relabelMutPositions.R --- NAMESPACE | 1 + R/relabelMutPositions.R | 28 +++++++++++++++------------- man/relabelMutPositions.Rd | 9 +++++---- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 422b793..2cb15f0 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(plotTotals) export(plotVolcano) export(relabelMutPositions) export(summarizeExperiment) +importFrom(BiocGenerics,"rownames<-") importFrom(BiocGenerics,paste) importFrom(BiocGenerics,rownames) importFrom(DT,datatable) diff --git a/R/relabelMutPositions.R b/R/relabelMutPositions.R index c9247ab..a96107d 100644 --- a/R/relabelMutPositions.R +++ b/R/relabelMutPositions.R @@ -1,9 +1,9 @@ #' Relabel the positions of mutations in the designated ID #' #' @param se SummarizedExperiment object, with row names of the form -#' XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, AA is the -#' position of the mutated codon, and NNN is the mutated codon or amino -#' acid. \{.\} is the delimiter, to be specified in the +#' XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, +#' AA is the position of the mutated codon, and NNN is the mutated codon +#' or amino acid. \{.\} is the delimiter, to be specified in the #' \code{mutNameDelimiter} argument. For rows corresponding to sequences #' with multiple mutated codons, the row names contain multiple names of #' the form above in a single string, separated by "_". @@ -15,7 +15,8 @@ #' \item name The new name for the codon (will replace AA in the mutation #' name, if the reference sequence matches seqname) #' } -#' @param mutNameDelimiter The delimiter used in the mutation name (\{.\} above). +#' @param mutNameDelimiter The delimiter used in the mutation name +#' (\{.\} above). 
#' #' @author Charlotte Soneson #' @export @@ -24,7 +25,7 @@ #' #' @importFrom S4Vectors unstrsplit #' @importFrom utils relist -#' @importFrom BiocGenerics rownames +#' @importFrom BiocGenerics rownames rownames<- #' #' @examples #' x <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", @@ -38,12 +39,13 @@ relabelMutPositions <- function(se, conversionTable, mutNameDelimiter = ".") { .assertVector(x = se, type = "SummarizedExperiment") .assertVector(x = colnames(conversionTable), type = "character") .assertScalar(x = mutNameDelimiter, type = "character") - stopifnot(all(c("position", "name", "seqname") %in% colnames(conversionTable))) + stopifnot(all(c("position", "name", "seqname") %in% + colnames(conversionTable))) conversionTable$position <- as.character(conversionTable$position) - spl <- base::strsplit(rownames(se), "_") - unl <- base::unlist(spl) - unl <- lapply(base::strsplit(unl, mutNameDelimiter, fixed = TRUE), + spl <- strsplit(rownames(se), "_") + unl <- unlist(spl) + unl <- lapply(strsplit(unl, mutNameDelimiter, fixed = TRUE), function(w) { idx <- which(conversionTable$seqname == w[1] & conversionTable$position == w[2]) @@ -52,9 +54,9 @@ relabelMutPositions <- function(se, conversionTable, mutNameDelimiter = ".") { } w }) - unl <- S4Vectors::unstrsplit(unl, sep = mutNameDelimiter) - spl <- utils::relist(unl, skeleton = spl) - spl <- S4Vectors::unstrsplit(spl, "_") - BiocGenerics::rownames(se) <- spl + unl <- unstrsplit(unl, sep = mutNameDelimiter) + spl <- relist(unl, skeleton = spl) + spl <- unstrsplit(spl, "_") + rownames(se) <- spl se } diff --git a/man/relabelMutPositions.Rd b/man/relabelMutPositions.Rd index 1c32607..bbc5468 100644 --- a/man/relabelMutPositions.Rd +++ b/man/relabelMutPositions.Rd @@ -8,9 +8,9 @@ relabelMutPositions(se, conversionTable, mutNameDelimiter = ".") } \arguments{ \item{se}{SummarizedExperiment object, with row names of the form -XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, AA is the -position of 
the mutated codon, and NNN is the mutated codon or amino -acid. \{.\} is the delimiter, to be specified in the +XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, +AA is the position of the mutated codon, and NNN is the mutated codon +or amino acid. \{.\} is the delimiter, to be specified in the \code{mutNameDelimiter} argument. For rows corresponding to sequences with multiple mutated codons, the row names contain multiple names of the form above in a single string, separated by "_".} @@ -24,7 +24,8 @@ mutation name) name, if the reference sequence matches seqname) }} -\item{mutNameDelimiter}{The delimiter used in the mutation name (\{.\} above).} +\item{mutNameDelimiter}{The delimiter used in the mutation name +(\{.\} above).} } \value{ A SummarizedExperiment object with modified row names. From 5e2ca1e70378a8e43d8697215ed45129e0fe0da5 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:48:56 +0200 Subject: [PATCH 12/44] Cleanup, summarizeExperiment.R --- NAMESPACE | 2 - R/summarizeExperiment.R | 136 ++++++++++++++++++++----------------- man/summarizeExperiment.Rd | 28 ++++---- 3 files changed, 90 insertions(+), 76 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2cb15f0..ea80d3d 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,7 +19,6 @@ export(plotVolcano) export(relabelMutPositions) export(summarizeExperiment) importFrom(BiocGenerics,"rownames<-") -importFrom(BiocGenerics,paste) importFrom(BiocGenerics,rownames) importFrom(DT,datatable) importFrom(DelayedArray,rowsum) @@ -118,7 +117,6 @@ importFrom(tibble,rownames_to_column) importFrom(tidyr,gather) importFrom(tidyr,separate) importFrom(tidyr,separate_rows) -importFrom(tidyr,unite) importFrom(tidyselect,matches) importFrom(tools,file_ext) importFrom(utils,globalVariables) diff --git a/R/summarizeExperiment.R b/R/summarizeExperiment.R index 125ae48..cc8a7a8 100644 --- a/R/summarizeExperiment.R +++ b/R/summarizeExperiment.R @@ -18,8 +18,8 @@ #' \code{coldata}. 
#' @param coldata A \code{data.frame} with at least one column "Name", which #' will be used to link to objects in \code{x}. A potentially subset and -#' reordered version of \code{coldata} is stored in the \code{colData} of the -#' returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}. +#' reordered version of \code{coldata} is stored in the \code{colData} of +#' the returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}. #' @param countType Either "reads" or "umis". If "reads", the "count" assay of #' the returned object will contain the observed number of reads for each #' sequence (pair). If "umis", the "count" assay will contain the number of @@ -28,26 +28,26 @@ #' @return A \code{\link[SummarizedExperiment]{SummarizedExperiment}} \code{x} #' with #' \describe{ -#' \item{assays(x)$counts}{containing the observed number of sequences or -#' sequence pairs (if \code{countType} = "reads"), or the observed number of -#' unique UMIs for each sequence or sequence pair (if \code{countType} = -#' "umis").} -#' \item{rowData(x)}{containing the unique sequences or sequence pairs.} -#' \item{colData(x)}{containing the metadata provided by \code{coldata}.} +#' \item{assays(x)$counts}{containing the observed number of sequences +#' or sequence pairs (if \code{countType} = "reads"), or the observed +#' number of unique UMIs for each sequence or sequence pair (if +#' \code{countType} = "umis").} +#' \item{rowData(x)}{containing the unique sequences or sequence +#' pairs.} +#' \item{colData(x)}{containing the metadata provided by +#' \code{coldata}.} #' } #' #' @author Michael Stadler, Charlotte Soneson #' #' @export #' -#' @importFrom SummarizedExperiment SummarizedExperiment colData -#' @importFrom BiocGenerics paste +#' @importFrom SummarizedExperiment SummarizedExperiment #' @importFrom S4Vectors DataFrame #' @importFrom IRanges IntegerList #' @importFrom methods is new as -#' @importFrom dplyr bind_rows distinct left_join %>% mutate filter group_by 
+#' @importFrom dplyr bind_rows distinct left_join mutate filter group_by #' summarize -#' @importFrom tidyr unite separate_rows #' @importFrom rlang .data #' @importFrom stats setNames #' @@ -58,8 +58,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Output sample @@ -68,8 +69,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Combine @@ -83,14 +85,15 @@ #' se #' summarizeExperiment <- function(x, coldata, countType = "umis") { - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ ## Pre-flight checks - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ if (!is(x, "list") || is.null(names(x))) { stop("'x' must be a named list") } if (any(duplicated(names(x)))) { - stop("duplicated names in 'x' (e.g. technical replicated to be merged) is not supported yet") + stop("duplicated names in 'x' (e.g. 
technical replicated to be ", + "merged) is not supported yet") } if (!is(coldata, "data.frame") || !("Name" %in% colnames(coldata))) { stop("'coldata' must be a data.frame with at least one column, named ", @@ -98,25 +101,29 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { } .assertScalar(x = countType, type = "character", validValues = c("umis", "reads")) - ## If no UMI sequences were given, then countType = "umis" should not be allowed + ## If no UMI sequences were given, then countType = "umis" should not be + ## allowed if (countType == "umis" && - !(all(vapply(x, function(w) .hasReadComponent(w$parameters$elementsForward, "U") || - .hasReadComponent(w$parameters$elementsReverse, "U"), FALSE)))) { + !(all(vapply(x, function(w) { + .hasReadComponent(w$parameters$elementsForward, "U") || + .hasReadComponent(w$parameters$elementsReverse, "U") + }, FALSE)))) { stop("'countType' is set to 'umis', but no UMI sequences ", "were provided when quantifying. ", "Set 'countType' to 'reads' instead.") } ## Get the mutNameDelimiters, and make sure that they are the same - mutnamedel <- unique(vapply(x, function(w) w$parameters$mutNameDelimiter, "")) + mutnamedel <- unique(vapply(x, + function(w) w$parameters$mutNameDelimiter, "")) if (length(mutnamedel) > 1) { stop("All samples must have the same 'mutNameDelimiter'") } - + coldata$Name <- as.character(coldata$Name) - - ## -------------------------------------------------------------------------- + + ## ------------------------------------------------------------------------ ## Link elements in x with coldata - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ nms <- intersect(names(x), coldata$Name) if (length(nms) == 0) { stop("names in 'x' do not match 'coldata$Name'") @@ -128,7 +135,7 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { } x <- x[nms] coldata <- coldata[match(nms, 
coldata$Name), , drop = FALSE] - + ## All sample names allSamples <- names(x) names(allSamples) <- allSamples @@ -138,11 +145,11 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { ## For a given sample, the same mutant name can correspond to multiple ## sequences, separated by , ## ------------------------------------------------------------------------ - tmpdf <- do.call(dplyr::bind_rows, lapply(x, function(w) w$summaryTable)) + tmpdf <- do.call(bind_rows, lapply(x, function(w) w$summaryTable)) - allSequences <- mergeValues(tmpdf$mutantName, tmpdf$sequence) %>% - stats::setNames(c("mutantName", "sequence")) - allSequences <- S4Vectors::DataFrame(allSequences) + allSequences <- mergeValues(tmpdf$mutantName, tmpdf$sequence) |> + setNames(c("mutantName", "sequence")) + allSequences <- DataFrame(allSequences) ## ------------------------------------------------------------------------ ## Add info about nbr mutated bases/codons/AAs, @@ -152,64 +159,69 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { ## ------------------------------------------------------------------------ for (v in intersect(c("nbrMutBases", "nbrMutCodons", "nbrMutAAs", "mutantNameBase", "mutantNameBaseHGVS", - "mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS", - "sequenceAA", "mutationTypes", + "mutantNameCodon", "mutantNameAA", + "mutantNameAAHGVS", "sequenceAA", "mutationTypes", "varLengths"), colnames(tmpdf))) { - tmp <- mergeValues(tmpdf$mutantName, tmpdf[[v]]) %>% - stats::setNames(c("mutantName", v)) + tmp <- mergeValues(tmpdf$mutantName, tmpdf[[v]]) |> + setNames(c("mutantName", v)) allSequences[[v]] <- tmp[[v]][match(allSequences$mutantName, tmp$mutantName)] if (v %in% c("nbrMutBases", "nbrMutCodons", "nbrMutAAs")) { - tmpList <- methods::as( - lapply(strsplit(allSequences[[v]], ","), function(w) sort(as.integer(w))), + tmpList <- as( + lapply(strsplit(allSequences[[v]], ","), + function(w) sort(as.integer(w))), "IntegerList") 
allSequences[[paste0("min", sub("^n", "N", v))]] <- min(tmpList) allSequences[[paste0("max", sub("^n", "N", v))]] <- max(tmpList) } } - - ## -------------------------------------------------------------------------- + + ## ------------------------------------------------------------------------ ## Create a sparse matrix - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ countCol <- ifelse(countType == "umis", "nbrUmis", "nbrReads") - tmp <- do.call(dplyr::bind_rows, lapply(allSamples, function(s) { + tmp <- do.call(bind_rows, lapply(allSamples, function(s) { st <- x[[s]]$summaryTable data.frame(i = match(st$mutantName, allSequences$mutantName), j = match(s, allSamples), x = as.numeric(st[, countCol])) })) - countMat <- methods::new("dgTMatrix", i = tmp$i - 1L, j = tmp$j - 1L, - x = tmp$x, Dim = c(nrow(allSequences), length(allSamples))) + countMat <- new("dgTMatrix", i = tmp$i - 1L, j = tmp$j - 1L, + x = tmp$x, Dim = c(nrow(allSequences), + length(allSamples))) ## Convert to dgCMatrix for easier processing downstream - countMat <- methods::as(countMat, "CsparseMatrix") - - ## -------------------------------------------------------------------------- + countMat <- as(countMat, "CsparseMatrix") + + ## ------------------------------------------------------------------------ ## Create the colData - ## -------------------------------------------------------------------------- - addMeta <- do.call(dplyr::bind_rows, lapply(allSamples, function(s) { - x[[s]]$filterSummary %>% dplyr::mutate(Name = s) + ## ------------------------------------------------------------------------ + addMeta <- do.call(bind_rows, lapply(allSamples, function(s) { + x[[s]]$filterSummary |> mutate(Name = s) })) - coldata <- dplyr::left_join(coldata, addMeta, by = "Name") - - ## -------------------------------------------------------------------------- + coldata <- left_join(coldata, addMeta, 
by = "Name") + + ## ------------------------------------------------------------------------ ## Create SummarizedExperiment object - ## -------------------------------------------------------------------------- - se <- SummarizedExperiment::SummarizedExperiment( + ## ------------------------------------------------------------------------ + se <- SummarizedExperiment( assays = list(counts = countMat), colData = coldata[match(allSamples, coldata$Name), , drop = FALSE], rowData = allSequences, - metadata = list(parameters = lapply(allSamples, function(w) x[[w]]$parameters), - errorStatistics = lapply(allSamples, function(w) x[[w]]$errorStatistics), - countType = countType, - mutNameDelimiter = mutnamedel) + metadata = list( + parameters = lapply(allSamples, + function(w) x[[w]]$parameters), + errorStatistics = lapply(allSamples, + function(w) x[[w]]$errorStatistics), + countType = countType, + mutNameDelimiter = mutnamedel) ) - + if (!any(allSequences$mutantName == "")) { rownames(se) <- allSequences$mutantName } colnames(se) <- allSamples - + return(se) -} \ No newline at end of file +} diff --git a/man/summarizeExperiment.Rd b/man/summarizeExperiment.Rd index 4ee4cdc..141fa5e 100644 --- a/man/summarizeExperiment.Rd +++ b/man/summarizeExperiment.Rd @@ -13,8 +13,8 @@ Names are used to link the objects to the metadata provided in \item{coldata}{A \code{data.frame} with at least one column "Name", which will be used to link to objects in \code{x}. A potentially subset and -reordered version of \code{coldata} is stored in the \code{colData} of the -returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}.} +reordered version of \code{coldata} is stored in the \code{colData} of +the returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}.} \item{countType}{Either "reads" or "umis". 
If "reads", the "count" assay of the returned object will contain the observed number of reads for each @@ -25,12 +25,14 @@ unique UMIs observed for each sequence (pair).} A \code{\link[SummarizedExperiment]{SummarizedExperiment}} \code{x} with \describe{ - \item{assays(x)$counts}{containing the observed number of sequences or - sequence pairs (if \code{countType} = "reads"), or the observed number of - unique UMIs for each sequence or sequence pair (if \code{countType} = - "umis").} - \item{rowData(x)}{containing the unique sequences or sequence pairs.} - \item{colData(x)}{containing the metadata provided by \code{coldata}.} + \item{assays(x)$counts}{containing the observed number of sequences + or sequence pairs (if \code{countType} = "reads"), or the observed + number of unique UMIs for each sequence or sequence pair (if + \code{countType} = "umis").} + \item{rowData(x)}{containing the unique sequences or sequence + pairs.} + \item{colData(x)}{containing the metadata provided by + \code{coldata}.} } } \description{ @@ -45,8 +47,9 @@ inp <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Output sample @@ -55,8 +58,9 @@ outp <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + 
"TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Combine From cbd2cd5c2809666728a756ab21fe531f2e2e1016 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:51:47 +0200 Subject: [PATCH 13/44] Cleanup, calculateFitnessScore.R --- R/calculateFitnessScore.R | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/R/calculateFitnessScore.R b/R/calculateFitnessScore.R index ae0e8d8..a5c54d6 100644 --- a/R/calculateFitnessScore.R +++ b/R/calculateFitnessScore.R @@ -59,22 +59,23 @@ calculateFitnessScore <- function(se, pairingCol, ODCols, comparison, WTrows, ## pairingCol is in colData(se) .assertScalar(x = pairingCol, type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) + validValues = colnames(colData(se))) ## ODCols are all in colData(se) and contain numeric values .assertVector(x = ODCols, type = "character", rngLen = c(1, Inf), - validValues = colnames(SummarizedExperiment::colData(se))) + validValues = colnames(colData(se))) for (odc in ODCols) { - .assertVector(x = SummarizedExperiment::colData(se)[[odc]], + .assertVector(x = colData(se)[[odc]], type = "numeric") } ## comparison is length(3)-character with column and values in colData(se) .assertVector(x = comparison, type = "character", len = 3) .assertScalar(x = comparison[1], type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) - .assertVector(x = comparison[2:3], type = "character", - validValues = SummarizedExperiment::colData(se)[[comparison[1]]]) + validValues = colnames(colData(se))) + .assertVector( + x = comparison[2:3], type = "character", + validValues = colData(se)[[comparison[1]]]) ## there is exactly one observation per pairing and condition if (any(table(colData(se)[colData(se)[, comparison[1]] %in% @@ -100,25 +101,29 @@ calculateFitnessScore <- function(se, pairingCol, ODCols, comparison, WTrows, colData(se_denominator)[, pairingCol]) 
se_numerator <- se_numerator[, match(shared_repl, colData(se_numerator)[, pairingCol])] - se_denominator <- se_denominator[, match(shared_repl, - colData(se_denominator)[, pairingCol])] + se_denominator <- + se_denominator[, match(shared_repl, + colData(se_denominator)[, pairingCol])] ## ------------------------------------------------------------------------ ## calculate normalized counts (n_i) ## ------------------------------------------------------------------------ norm_counts_numerator <- sweep( as.matrix(assay(se_numerator, selAssay)), MARGIN = 2, - STATS = apply(colData(se_numerator)[, ODCols, drop = FALSE], MARGIN = 1, prod) / - Matrix::colSums(assay(se_numerator, selAssay)), + STATS = apply(colData(se_numerator)[, ODCols, drop = FALSE], + MARGIN = 1, prod) / + colSums(assay(se_numerator, selAssay)), FUN = "*") norm_counts_denominator <- sweep( as.matrix(assay(se_denominator, selAssay)), MARGIN = 2, - STATS = apply(colData(se_denominator)[, ODCols, drop = FALSE], MARGIN = 1, prod) / - Matrix::colSums(assay(se_denominator, selAssay)), + STATS = apply(colData(se_denominator)[, ODCols, drop = FALSE], + MARGIN = 1, prod) / + colSums(assay(se_denominator, selAssay)), FUN = "*") n <- log2(norm_counts_numerator/norm_counts_denominator) n[!is.finite(n)] <- NA - colnames(n) <- paste0(comparison[2], "_vs_", comparison[3], "_repl", shared_repl) + colnames(n) <- paste0(comparison[2], "_vs_", comparison[3], + "_repl", shared_repl) ## ------------------------------------------------------------------------ @@ -131,4 +136,4 @@ calculateFitnessScore <- function(se, pairingCol, ODCols, comparison, WTrows, } fitness <- sweep(n, MARGIN = 2, STATS = nWT, FUN = "/") return(fitness) -} \ No newline at end of file +} From f675a6459008d2e84289124cf9f740c801e61351 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 21:56:46 +0200 Subject: [PATCH 14/44] Cleanup, calculateRelativeFC.R --- R/calculateRelativeFC.R | 96 ++++++++++++++++++++------------------ 
man/calculateRelativeFC.Rd | 19 ++++---- 2 files changed, 60 insertions(+), 55 deletions(-) diff --git a/R/calculateRelativeFC.R b/R/calculateRelativeFC.R index 3c3f8e1..63c7554 100644 --- a/R/calculateRelativeFC.R +++ b/R/calculateRelativeFC.R @@ -10,10 +10,10 @@ #' @param coef Coefficient(s) to test with edgeR or limma. #' @param contrast Numeric contrast to test with edgeR or limma. #' @param WTrows Vector of row names that will be used as the reference when -#' calculating logFCs and statistics. If more than one value is provided, the -#' sum of the corresponding counts is used to generate offsets. If NULL, -#' offsets will be defined as the effective library sizes (using TMM -#' normalization factors). +#' calculating logFCs and statistics. If more than one value is provided, +#' the sum of the corresponding counts is used to generate offsets. If +#' \code{NULL}, offsets will be defined as the effective library sizes +#' (using TMM normalization factors). #' @param selAssay Assay to select from \code{se} for the analysis. #' @param pseudocount Pseudocount to add when calculating log-fold changes. #' @param method Either 'edgeR' or 'limma'. If set to 'limma', voom is used to @@ -21,9 +21,9 @@ #' limma. In this case, the results also contain the standard errors of the #' logFCs. #' @param normMethod Character scalar indicating which normalization method -#' should be used to calculate size factors. Should be either \code{"TMM"} or -#' \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} or -#' \code{"sum"} when \code{WTrows} is provided. +#' should be used to calculate size factors. Should be either \code{"TMM"} +#' or \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} +#' or \code{"sum"} when \code{WTrows} is provided. 
#' #' @author Charlotte Soneson, Michael Stadler #' @@ -34,15 +34,16 @@ #' #' @importFrom edgeR DGEList scaleOffset estimateDisp glmQLFit glmQLFTest #' topTags predFC topTags calcNormFactors getNormLibSizes -#' @importFrom SummarizedExperiment colData assay assayNames +#' @importFrom SummarizedExperiment colData assay assayNames assays #' @importFrom limma voom eBayes topTable lmFit contrasts.fit #' @importFrom csaw normOffsets #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] -#' design <- stats::model.matrix(~ Replicate + Condition, -#' data = SummarizedExperiment::colData(se)) +#' design <- model.matrix(~ Replicate + Condition, +#' data = colData(se)) #' #' ## Calculate "absolute" log-fold changes with edgeR #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output", @@ -71,9 +72,9 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, "TMM", "sum")) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character") - if (!(selAssay %in% SummarizedExperiment::assayNames(se))) { - if (is.null(SummarizedExperiment::assayNames(se)) && - length(SummarizedExperiment::assays(se)) == 1) { + if (!(selAssay %in% assayNames(se))) { + if (is.null(assayNames(se)) && + length(assays(se)) == 1) { warning("No assayNames provided in 'se', but only one ", "assay present - using that.") selAssay <- 1L @@ -81,83 +82,86 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, stop("The provided 'selAssay' not present in 'se'.") } } - + if (!is.null(WTrows)) { - .assertVector(x = WTrows, type = "character", validValues = rownames(se)) + .assertVector(x = WTrows, type = "character", + validValues = rownames(se)) } - + if (nrow(design) != ncol(se)) { stop("The number of rows in 'design' (", nrow(design), ") is not equal to the number", " of columns in 'se' (", ncol(se), ").") } - + .assertScalar(x = 
pseudocount, type = "numeric", rngIncl = c(0, Inf)) - + .assertScalar(x = method, type = "character", validValues = c("edgeR", "limma")) - + if (normMethod %in% c("csaw", "TMM") && !is.null(WTrows)) { stop("normMethod = '", normMethod, "' can only be used when WTrows is NULL.") } - + if (normMethod %in% c("sum", "geomean") && is.null(WTrows)) { stop("normMethod = '", normMethod, "' can only be used when WTrows is not NULL.") } - + if (normMethod == "csaw" && method == "limma") { stop("normMethod = 'csaw' can only be used with method = 'edgeR'.") } - + .assertScalar(x = normMethod, type = "character", validValues = c("csaw", "TMM", "geomean", "sum")) - + if (is.null(coef) && is.null(contrast)) { stop("'coef' and 'contrast' can not both be NULL.") } - + if (!is.null(contrast) && !is.null(dim(contrast))) { stop("'contrast' must be a vector.") } - + ## Create DGEList from SummarizedExperiment - dge <- edgeR::DGEList(counts = as.matrix(SummarizedExperiment::assay(se, selAssay)), - samples = SummarizedExperiment::colData(se)) + dge <- DGEList( + counts = as.matrix(assay(se, selAssay)), + samples = colData(se)) if (normMethod == "csaw") { ## csaw normalization - also calculate normalization factors since ## aveLogCPM does not use provided offsets ## In this case, we know that WTrows is NULL, so all features ## will be used for the normalization - dge <- edgeR::calcNormFactors(dge) - dge <- csaw::normOffsets(dge) + dge <- calcNormFactors(dge) + dge <- normOffsets(dge) } else if (normMethod == "TMM") { ## TMM normalization, with all features - dge <- edgeR::calcNormFactors(dge) + dge <- calcNormFactors(dge) } else if (normMethod == "geomean") { ## Use size factors (offsets) derived from the geometric mean ## of the WT rows tmp0 <- dge$counts[WTrows, , drop = FALSE] tmp0 <- tmp0[apply(tmp0, 1, min) > 0, , drop = FALSE] logoffsets <- apply(tmp0, 2, function(s) mean(log(s))) - dge <- edgeR::scaleOffset(dge, logoffsets) + dge <- scaleOffset(dge, logoffsets) } else if 
(normMethod == "sum") { ## Use size factors (offsets) derived from the sum of the ## WT rows tmp0 <- dge$counts[WTrows, , drop = FALSE] - dge <- edgeR::scaleOffset(dge, log(colSums(tmp0))) + dge <- scaleOffset(dge, log(colSums(tmp0))) } - + ## Fit model and perform test if (method == "edgeR") { - dge <- edgeR::estimateDisp(dge, design = design) - fit <- edgeR::glmQLFit(dge, design = design) - qlf <- edgeR::glmQLFTest(fit, coef = coef, contrast = contrast) - tt <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table + dge <- estimateDisp(dge, design = design) + fit <- glmQLFit(dge, design = design) + qlf <- glmQLFTest(fit, coef = coef, contrast = contrast) + tt <- topTags(qlf, n = Inf, sort.by = "none")$table ## Calculate shrunken fold changes. Only when testing ## a single coefficient or contrast - predfc <- edgeR::predFC(dge, design = design, prior.count = pseudocount) + predfc <- predFC(dge, design = design, + prior.count = pseudocount) if (length(coef) == 1 && is.null(contrast)) { tt$logFC_shrunk <- predfc[, coef] } else if (!is.null(contrast)) { @@ -168,19 +172,19 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, tt$df.test <- qlf$df.test } else if (method == "limma") { if (!is.null(dge$offset)) { - vm <- limma::voom(dge, design = design, lib.size = exp(dge$offset)) + vm <- voom(dge, design = design, lib.size = exp(dge$offset)) } else { - vm <- limma::voom(dge, design = design, - lib.size = edgeR::getNormLibSizes(dge)) + vm <- voom(dge, design = design, + lib.size = getNormLibSizes(dge)) } - fit <- limma::lmFit(vm, design = design) + fit <- lmFit(vm, design = design) if (!is.null(contrast)) { - fit <- limma::contrasts.fit(fit, contrasts = contrast) + fit <- contrasts.fit(fit, contrasts = contrast) coef <- 1 } - fit <- limma::eBayes(fit) - tt <- limma::topTable(fit, coef = coef, - confint = TRUE, number = Inf, sort.by = "none") + fit <- eBayes(fit) + tt <- topTable(fit, coef = coef, + confint = TRUE, number = Inf, sort.by = "none") if 
(length(coef) == 1) { tt$se.logFC <- sqrt(fit$s2.post) * fit$stdev.unscaled[, coef] } diff --git a/man/calculateRelativeFC.Rd b/man/calculateRelativeFC.Rd index a65684a..7d07a2d 100644 --- a/man/calculateRelativeFC.Rd +++ b/man/calculateRelativeFC.Rd @@ -27,10 +27,10 @@ same order as the columns in \code{se}.} \item{contrast}{Numeric contrast to test with edgeR or limma.} \item{WTrows}{Vector of row names that will be used as the reference when -calculating logFCs and statistics. If more than one value is provided, the -sum of the corresponding counts is used to generate offsets. If NULL, -offsets will be defined as the effective library sizes (using TMM -normalization factors).} +calculating logFCs and statistics. If more than one value is provided, +the sum of the corresponding counts is used to generate offsets. If +\code{NULL}, offsets will be defined as the effective library sizes +(using TMM normalization factors).} \item{selAssay}{Assay to select from \code{se} for the analysis.} @@ -42,9 +42,9 @@ limma. In this case, the results also contain the standard errors of the logFCs.} \item{normMethod}{Character scalar indicating which normalization method -should be used to calculate size factors. Should be either \code{"TMM"} or -\code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} or -\code{"sum"} when \code{WTrows} is provided.} +should be used to calculate size factors. Should be either \code{"TMM"} +or \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} +or \code{"sum"} when \code{WTrows} is provided.} } \value{ A \code{data.frame} with output from the statistical testing @@ -56,10 +56,11 @@ either limma or the Negative Binomial quasi-likelihood framework of edgeR. The observed counts for the WT variants can be used as offsets in the model. 
} \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] -design <- stats::model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) +design <- model.matrix(~ Replicate + Condition, + data = colData(se)) ## Calculate "absolute" log-fold changes with edgeR res <- calculateRelativeFC(se, design, coef = "Conditioncis_output", From 35e9172d358bb39e2913506616cba429ddaf1c98 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sun, 1 Jun 2025 22:03:32 +0200 Subject: [PATCH 15/44] Cleanup, plotResults.R --- R/plotResults.R | 70 ++++++++++++++++++++++++++------------------- man/plotMeanDiff.Rd | 3 +- man/plotVolcano.Rd | 3 +- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/R/plotResults.R b/R/plotResults.R index aa67536..6470054 100644 --- a/R/plotResults.R +++ b/R/plotResults.R @@ -60,7 +60,7 @@ .assertScalar(x = pointSize, type = "character", validValues = c("small", "large")) .assertScalar(x = interactivePlot, type = "logical") - + if (interactivePlot && !requireNamespace("plotly", quietly = TRUE)) { stop("The 'plotly' package is required to make interactive plots.") } @@ -72,37 +72,42 @@ if ("y" %in% centerAxis) { yr <- c(-max(abs(yr), na.rm = TRUE), max(abs(yr), na.rm = TRUE)) } - gg <- ggplot2::ggplot(res, ggplot2::aes(x = .data[[xCol]], y = .data[[yCol]])) + - ggplot2::theme_minimal() + ggplot2::coord_cartesian(xlim = xr, ylim = yr) + - ggplot2::theme(axis.text = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14), - title = ggplot2::element_text(size = 14)) + - ggplot2::labs(x = xLabel, y = yLabel) + gg <- ggplot(res, aes(x = .data[[xCol]], + y = .data[[yCol]])) + + theme_minimal() + + coord_cartesian(xlim = xr, ylim = yr) + + theme(axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + title = element_text(size = 14)) + + labs(x = xLabel, y = yLabel) if (!is.null(labelCol)) { gg <- gg + - 
ggplot2::aes(label = .data[[labelCol]]) + aes(label = .data[[labelCol]]) } if (pointSize == "large") { gg <- gg + - ggplot2::geom_point(fill = "lightgrey", color = "grey", pch = 21, size = 1.5) + geom_point(fill = "lightgrey", color = "grey", + pch = 21, size = 1.5) } else { gg <- gg + - ggplot2::geom_point(color = "black", size = 0.25, alpha = 0.5) + geom_point(color = "black", size = 0.25, alpha = 0.5) } if (any(res[[colorCol]] < colorThreshold)) { gg <- gg + - ggplot2::labs(subtitle = paste0("Points are indicated in red if ", - colorCol, "<", colorThreshold)) + labs(subtitle = paste0("Points are indicated in red if ", + colorCol, "<", colorThreshold)) if (pointSize == "large") { gg <- gg + - ggplot2::geom_point( - data = res[res[[colorCol]] < colorThreshold, , drop = FALSE], + geom_point( + data = res[res[[colorCol]] < colorThreshold, , + drop = FALSE], fill = "red", color = "grey", pch = 21, size = 1.5 ) } else { gg <- gg + - ggplot2::geom_point( - data = res[res[[colorCol]] < colorThreshold, , drop = FALSE], + geom_point( + data = res[res[[colorCol]] < colorThreshold, , + drop = FALSE], color = "red", size = 0.25 ) } @@ -111,10 +116,12 @@ plotly::ggplotly(gg) } else { if (!is.null(labelCol) && length(labelValues) > 0) { - .assertVector(x = labelValues, type = "character", validValues = res[[labelCol]]) + .assertVector(x = labelValues, type = "character", + validValues = res[[labelCol]]) gg <- gg + - ggrepel::geom_text_repel( - data = res[res[[labelCol]] %in% labelValues, , drop = FALSE], + geom_text_repel( + data = res[res[[labelCol]] %in% labelValues, , + drop = FALSE], max.overlaps = Inf, size = 4, min.segment.length = 0.1) } gg @@ -124,7 +131,7 @@ .getColName <- function(res, validValues, aspect) { .assertVector(x = res, type = "data.frame") colName <- grep(paste(paste0("^", validValues, "$"), collapse = "|"), - colnames(res), value = TRUE) + colnames(res), value = TRUE) if (length(colName) == 0) { stop("No suitable column found for ", aspect, ", one of the 
", "following expected: ", paste(validValues, collapse = ", ")) @@ -175,15 +182,17 @@ #' @export #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' design <- model.matrix(~ Replicate + Condition, -#' data = SummarizedExperiment::colData(se)) +#' data = colData(se)) #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") #' plotMeanDiff(res, pointSize = "large", nTopToLabel = 3) #' plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, - padjCol = NULL, padjThreshold = 0.05, pointSize = "small", + padjCol = NULL, padjThreshold = 0.05, + pointSize = "small", interactivePlot = FALSE, nTopToLabel = 0) { .assertScalar(x = nTopToLabel, type = "numeric", rngIncl = c(0, Inf)) .assertVector(x = res, type = "data.frame") @@ -202,16 +211,17 @@ plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, } if (is.null(padjCol)) { padjCol <- .getColName(res, validValues = c("FDR", "adj.P.Val"), - aspect = "highlighting") + aspect = "highlighting") } .assertScalar(pvalCol, type = "character", validValues = colnames(res)) res$feature <- rownames(res) if (nTopToLabel > 0) { - labelValues <- res[order(res[[pvalCol]]), "feature"][seq_len(nTopToLabel)] + labelValues <- res[order(res[[pvalCol]]), + "feature"][seq_len(nTopToLabel)] } else { labelValues <- c() } - + .plotScatter(res = res, xCol = meanCol, yCol = logFCCol, xLabel = meanCol, yLabel = logFCCol, colorCol = padjCol, @@ -256,10 +266,11 @@ plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, #' @export #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' design <- model.matrix(~ Replicate + Condition, -#' data = SummarizedExperiment::colData(se)) +#' data = colData(se)) #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") #' 
plotVolcano(res, pointSize = "large", nTopToLabel = 3) #' @@ -275,7 +286,7 @@ plotVolcano <- function(res, logFCCol = NULL, pvalCol = NULL, padjCol = NULL, } if (is.null(pvalCol)) { pvalCol <- .getColName(res, validValues = c("PValue", "P.Value"), - aspect = "y-axis") + aspect = "y-axis") } if (is.null(padjCol)) { padjCol <- .getColName(res, validValues = c("FDR", "adj.P.Val"), @@ -285,11 +296,12 @@ plotVolcano <- function(res, logFCCol = NULL, pvalCol = NULL, padjCol = NULL, res$negLog10P <- -log10(res[[pvalCol]]) res$feature <- rownames(res) if (nTopToLabel > 0) { - labelValues <- res[order(res[[pvalCol]]), "feature"][seq_len(nTopToLabel)] + labelValues <- res[order(res[[pvalCol]]), + "feature"][seq_len(nTopToLabel)] } else { labelValues <- c() } - + .plotScatter(res = res, xCol = logFCCol, yCol = "negLog10P", xLabel = logFCCol, yLabel = paste0("-log10(", pvalCol, ")"), colorCol = padjCol, diff --git a/man/plotMeanDiff.Rd b/man/plotMeanDiff.Rd index 57f417f..2e83f2b 100644 --- a/man/plotMeanDiff.Rd +++ b/man/plotMeanDiff.Rd @@ -59,10 +59,11 @@ If \code{interactivePlot} is \code{TRUE}, a \code{plotly} Construct an MA (mean-difference) plot } \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] design <- model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) + data = colData(se)) res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") plotMeanDiff(res, pointSize = "large", nTopToLabel = 3) diff --git a/man/plotVolcano.Rd b/man/plotVolcano.Rd index f0a47dc..0b94d83 100644 --- a/man/plotVolcano.Rd +++ b/man/plotVolcano.Rd @@ -55,10 +55,11 @@ If \code{interactivePlot} is \code{TRUE}, a \code{plotly} Construct a volcano plot } \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] design <- model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) + 
data = colData(se)) res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") plotVolcano(res, pointSize = "large", nTopToLabel = 3) From 2a6d658d4f93989425477412b567ed6999162865 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 19:55:06 +0200 Subject: [PATCH 16/44] Cleanup, collapseMutantsByAA.R --- R/collapseMutantsByAA.R | 107 +++++++++++++++-------------- man/collapseMutantsBySimilarity.Rd | 28 ++++---- 2 files changed, 70 insertions(+), 65 deletions(-) diff --git a/R/collapseMutantsByAA.R b/R/collapseMutantsByAA.R index c0892c7..78120bd 100644 --- a/R/collapseMutantsByAA.R +++ b/R/collapseMutantsByAA.R @@ -1,14 +1,15 @@ #' Collapse mutants by similarity #' #' These functions can be used to collapse variants, either by similarity or -#' according to a pre-defined grouping. The functions \code{collapseMutants} and -#' \code{collapseMutantsByAA} assume that a grouping variable is available as a -#' column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience -#' function for the case when this column is "mutantNameAA", and is provided -#' for backwards compatibility). The \code{collapseMutantsBySimilarity} will -#' generate the grouping variable based on user-provided thresholds on the -#' sequence similarity (defined by the Hamming distance), and subsequently -#' collapse based on the derived grouping. +#' according to a pre-defined grouping. The functions \code{collapseMutants} +#' and \code{collapseMutantsByAA} assume that a grouping variable is available +#' as a column in \code{rowData(se)} (\code{collapseMutantsByAA} is a +#' convenience function for the case when this column is "mutantNameAA", and +#' is provided for backwards compatibility). The +#' \code{collapseMutantsBySimilarity} will generate the grouping variable +#' based on user-provided thresholds on the sequence similarity (defined by +#' the Hamming distance), and subsequently collapse based on the derived +#' grouping. 
#' #' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}} #' generated by \code{\link{summarizeExperiment}} @@ -25,8 +26,8 @@ #' similar sequences. If the value is in [0, 1), it defines the maximal #' Hamming distance in terms of a fraction of sequence length: #' (\code{round(collapseMaxDist * nchar(sequence))}). -#' A value greater or equal to 1 is rounded and directly used as the maximum -#' allowed Hamming distance. Note that sequences can only be +#' A value greater or equal to 1 is rounded and directly used as the +#' maximum allowed Hamming distance. Note that sequences can only be #' collapsed if they are all of the same length. #' @param collapseMinScore Numeric scalar, indicating the minimum score for the #' sequence to be considered for collapsing with similar sequences. @@ -45,6 +46,7 @@ #' @importFrom Matrix rowSums rowMeans #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' ## The rows of this object correspond to individual codon variants @@ -58,7 +60,7 @@ #' head(rownames(sec)) #' ## The mutantName column contains the individual codon variants that were #' ## collapsed -#' head(SummarizedExperiment::rowData(sec)) +#' head(rowData(sec)) #' #' ## Collapse similar sequences #' sec2 <- collapseMutantsBySimilarity( @@ -67,9 +69,9 @@ #' collapseMinScore = 0, collapseMinRatio = 0) #' dim(sec2) #' head(rownames(sec2)) -#' head(SummarizedExperiment::rowData(sec2)) +#' head(rowData(sec2)) #' ## collapsed count matrix -#' SummarizedExperiment::assay(sec2, "counts") +#' assay(sec2, "counts") #' collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", sequenceCol = "sequence", @@ -80,18 +82,18 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ## Check input arguments .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = assayName, type = "character", - validValues = 
SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) .assertScalar(x = scoreMethod, type = "character", validValues = c("rowSum", "rowMean")) .assertScalar(x = sequenceCol, type = "character", - validValues = colnames(SummarizedExperiment::rowData(se))) + validValues = colnames(rowData(se))) .assertScalar(x = collapseMaxDist, type = "numeric") .assertScalar(x = collapseMinScore, type = "numeric") .assertScalar(x = collapseMinRatio, type = "numeric") .assertScalar(x = verbose, type = "logical") ## All sequences must be the same length, and can only consist of ACGT_ - seqs <- SummarizedExperiment::rowData(se)[, sequenceCol] + seqs <- rowData(se)[, sequenceCol] if (length(unique(nchar(seqs))) != 1) { stop("All entries in the sequence column must have the same length") } @@ -102,9 +104,9 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ## Get matching table if (scoreMethod == "rowSum") { - scores <- Matrix::rowSums(SummarizedExperiment::assay(se, assayName)) + scores <- rowSums(assay(se, assayName)) } else if (scoreMethod == "rowMean") { - scores <- Matrix::rowMeans(SummarizedExperiment::assay(se, assayName)) + scores <- rowMeans(assay(se, assayName)) } matchTable <- groupSimilarSequences( seqs = seqs, @@ -116,12 +118,13 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ) matchTable$representative <- rownames(se)[ match(matchTable$representative, - SummarizedExperiment::rowData(se)[, sequenceCol])] - + rowData(se)[, sequenceCol])] + ## Add grouping variable - SummarizedExperiment::rowData(se)$collapseCol <- - matchTable$representative[match(SummarizedExperiment::rowData(se)[, sequenceCol], - matchTable$sequence)] + rowData(se)$collapseCol <- + matchTable$representative[ + match(rowData(se)[, sequenceCol], + matchTable$sequence)] ## Collapse se <- collapseMutants(se, nameCol = "collapseCol") @@ -152,9 +155,9 @@ collapseMutantsByAA <- function(se) { #' @export #' #' @importFrom DelayedArray 
rowsum -#' @importFrom S4Vectors metadata +#' @importFrom S4Vectors metadata DataFrame #' @importFrom SummarizedExperiment assays rowData SummarizedExperiment colData -#' @importFrom dplyr across group_by summarize +#' @importFrom dplyr across group_by summarize full_join #' @importFrom stats setNames #' #' @rdname collapseMutantsBySimilarity @@ -162,51 +165,51 @@ collapseMutantsByAA <- function(se) { collapseMutants <- function(se, nameCol) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = nameCol, type = "character", - validValues = colnames(SummarizedExperiment::rowData(se))) + validValues = colnames(rowData(se))) ## Collapse assays ## Matrix.utils has been removed from CRAN - # aList <- lapply(SummarizedExperiment::assays(se), function(a) { + # aList <- lapply(assays(se), function(a) { # Matrix.utils::aggregate.Matrix( # x = a, - # groupings = factor(SummarizedExperiment::rowData(se)[[nameCol]]), + # groupings = factor(rowData(se)[[nameCol]]), # fun = "colSums") # }) - aList <- lapply(SummarizedExperiment::assays(se), function(a) { - DelayedArray::rowsum( + aList <- lapply(assays(se), function(a) { + rowsum( x = a, - group = factor(SummarizedExperiment::rowData(se)[[nameCol]]) + group = factor(rowData(se)[[nameCol]]) ) }) ## Collapse rowData - simple columns - rd <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]], - SummarizedExperiment::rowData(se)$sequence) %>% - stats::setNames(c(nameCol, "sequence")) + rd <- mergeValues(rowData(se)[[nameCol]], + rowData(se)$sequence) %>% + setNames(c(nameCol, "sequence")) for (v in setdiff( intersect(c("mutantName", "mutantNameBase", "mutantNameBaseHGVS", "mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS", "sequenceAA", "mutationTypes", "nbrMutBases", "nbrMutCodons", "nbrMutAAs", "varLengths"), - colnames(SummarizedExperiment::rowData(se))), + colnames(rowData(se))), nameCol)) { - tmp <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]], - SummarizedExperiment::rowData(se)[[v]]) + 
tmp <- mergeValues(rowData(se)[[nameCol]], + rowData(se)[[v]]) rd[[v]] <- tmp$valueColl[match(rd[[nameCol]], tmp$mutantNameColl)] } - rd0 <- as.data.frame(SummarizedExperiment::rowData(se)) %>% - dplyr::group_by(.data[[nameCol]]) %>% - dplyr::summarize( - dplyr::across(c("minNbrMutBases", "minNbrMutCodons", - "minNbrMutAAs"), - function(x) min(x)), - dplyr::across(c("maxNbrMutBases", "maxNbrMutCodons", - "maxNbrMutAAs"), - function(x) max(x))) + rd0 <- as.data.frame(rowData(se)) %>% + group_by(.data[[nameCol]]) %>% + summarize( + across(c("minNbrMutBases", "minNbrMutCodons", + "minNbrMutAAs"), + function(x) min(x)), + across(c("maxNbrMutBases", "maxNbrMutCodons", + "maxNbrMutAAs"), + function(x) max(x))) rd <- rd %>% - dplyr::full_join(rd0, by = nameCol) + full_join(rd0, by = nameCol) - rd <- S4Vectors::DataFrame(rd) + rd <- DataFrame(rd) rownames(rd) <- rd[[nameCol]] ## Create SummarizedExperiment @@ -216,11 +219,11 @@ collapseMutants <- function(se, nameCol) { stopifnot(all(rownames(aList[[i]]) == rn1)) stopifnot(all(colnames(aList[[i]]) == cn1)) } - stopifnot(cn1 == rownames(SummarizedExperiment::colData(se))) - SummarizedExperiment::SummarizedExperiment( + stopifnot(cn1 == rownames(colData(se))) + SummarizedExperiment( assays = aList, - colData = SummarizedExperiment::colData(se), - metadata = S4Vectors::metadata(se), + colData = colData(se), + metadata = metadata(se), rowData = rd[rn1, ] ) } diff --git a/man/collapseMutantsBySimilarity.Rd b/man/collapseMutantsBySimilarity.Rd index 527877b..d2b57bf 100644 --- a/man/collapseMutantsBySimilarity.Rd +++ b/man/collapseMutantsBySimilarity.Rd @@ -41,8 +41,8 @@ variants.} similar sequences. If the value is in [0, 1), it defines the maximal Hamming distance in terms of a fraction of sequence length: (\code{round(collapseMaxDist * nchar(sequence))}). -A value greater or equal to 1 is rounded and directly used as the maximum -allowed Hamming distance. 
Note that sequences can only be +A value greater or equal to 1 is rounded and directly used as the +maximum allowed Hamming distance. Note that sequences can only be collapsed if they are all of the same length.} \item{collapseMinScore}{Numeric scalar, indicating the minimum score for the @@ -66,16 +66,18 @@ A \code{\link[SummarizedExperiment]{SummarizedExperiment}} where } \description{ These functions can be used to collapse variants, either by similarity or -according to a pre-defined grouping. The functions \code{collapseMutants} and -\code{collapseMutantsByAA} assume that a grouping variable is available as a -column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience -function for the case when this column is "mutantNameAA", and is provided -for backwards compatibility). The \code{collapseMutantsBySimilarity} will -generate the grouping variable based on user-provided thresholds on the -sequence similarity (defined by the Hamming distance), and subsequently -collapse based on the derived grouping. +according to a pre-defined grouping. The functions \code{collapseMutants} +and \code{collapseMutantsByAA} assume that a grouping variable is available +as a column in \code{rowData(se)} (\code{collapseMutantsByAA} is a +convenience function for the case when this column is "mutantNameAA", and +is provided for backwards compatibility). The +\code{collapseMutantsBySimilarity} will generate the grouping variable +based on user-provided thresholds on the sequence similarity (defined by +the Hamming distance), and subsequently collapse based on the derived +grouping. 
} \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] ## The rows of this object correspond to individual codon variants @@ -89,7 +91,7 @@ dim(sec) head(rownames(sec)) ## The mutantName column contains the individual codon variants that were ## collapsed -head(SummarizedExperiment::rowData(sec)) +head(rowData(sec)) ## Collapse similar sequences sec2 <- collapseMutantsBySimilarity( @@ -98,9 +100,9 @@ sec2 <- collapseMutantsBySimilarity( collapseMinScore = 0, collapseMinRatio = 0) dim(sec2) head(rownames(sec2)) -head(SummarizedExperiment::rowData(sec2)) +head(rowData(sec2)) ## collapsed count matrix -SummarizedExperiment::assay(sec2, "counts") +assay(sec2, "counts") } \author{ From 82071a8ff3b4590ab9adc840c205114feca0bfb5 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:03:41 +0200 Subject: [PATCH 17/44] Cleanup, digestFastqs --- R/digestFastqs.R | 418 +++++++++++++++++++++++++------------------- man/digestFastqs.Rd | 115 ++++++------ 2 files changed, 302 insertions(+), 231 deletions(-) diff --git a/R/digestFastqs.R b/R/digestFastqs.R index 704094d..f3220e5 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -2,8 +2,8 @@ #' #' Read sequences for one or a pair of fastq files and digest them (extract #' umis, constant and variable parts, filter, extract mismatch information from -#' constant and count the observed unique variable parts). Alternatively, primer -#' sequences could be specified, in which case the sequence immediately +#' constant and count the observed unique variable parts). Alternatively, +#' primer sequences could be specified, in which case the sequence immediately #' following the primer will be considered the variable sequence. #' #' The processing of a read pair goes as follows: @@ -23,8 +23,8 @@ #' either the forward or reverse read (or the merged read). 
#' \item Filter out the read (pair) if the number of Ns in the variable region #' exceeds \code{variableNMaxForward}/\code{variableNMaxReverse}. -#' \item Filter out the read (pair) if the number of Ns in the combined forward -#' and reverse UMI sequence exceeds \code{umiNMax} +#' \item Filter out the read (pair) if the number of Ns in the combined +#' forward and reverse UMI sequence exceeds \code{umiNMax} #' \item If one or more wild type sequences (for the variable region) are #' provided, find the mismatches between the (forward/reverse) variable region #' and the provided wild type sequence (if more than one wild type sequence is @@ -37,9 +37,9 @@ #' the codons encoded by #' \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. #' \item Assign a 'mutation name' to the read (pair). This name is a -#' combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of the -#' most similar reference sequence, YY is the mutated codon number, and NNN is -#' the mutated codon. \{.\} is a delimiter, specified via +#' combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of +#' the most similar reference sequence, YY is the mutated codon number, and +#' NNN is the mutated codon. \{.\} is a delimiter, specified via #' \code{mutNameDelimiter}. If no wildtype sequences are provided, the #' variable sequence will be used as the mutation name'. #'} @@ -83,8 +83,8 @@ #' but there can be at most one occurrence of P. If a given letter is #' included multiple times, the corresponding sequences will be #' concatenated in the output. -#' @param elementLengthsForward,elementLengthsReverse Numeric vectors containing -#' the lengths of each read component from +#' @param elementLengthsForward,elementLengthsReverse Numeric vectors +#' containing the lengths of each read component from #' \code{elementsForward}/\code{elementsReverse}, respectively. 
If the #' length of one element is set to -1, it will be inferred from the other #' lengths (as the remainder of the read). At most one number (or one @@ -96,11 +96,11 @@ #' contains the corresponding adapter sequence, the sequence pair will be #' filtered out. If set to \code{NULL}, no adapter filtering is performed. #' The number of filtered read pairs are reported in the return value. -#' @param primerForward,primerReverse Character vectors, representing the primer -#' sequence(s) for forward/reverse reads, respectively. Only read pairs that -#' contain perfect matches to both the forward and reverse primers (if -#' given) will be retained. Multiple primers can be specified - they will be -#' considered in order and the first match will be used. +#' @param primerForward,primerReverse Character vectors, representing the +#' primer sequence(s) for forward/reverse reads, respectively. Only read +#' pairs that contain perfect matches to both the forward and reverse +#' primers (if given) will be retained. Multiple primers can be specified - +#' they will be considered in order and the first match will be used. #' @param wildTypeForward,wildTypeReverse Character scalars or named character #' vectors, the wild type sequence for the forward and reverse variable #' region. If given as a single string, the reference sequence will be @@ -146,13 +146,14 @@ #' any mutated base has a Phred score lower than \code{mutatedPhredMin}, #' the read (pair) will be discarded. #' @param mutNameDelimiter Character scalar, the delimiter used in the naming -#' of mutants. Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where XX -#' is the closest provided reference sequence, YY is the mutated base or +#' of mutants. 
Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where +#' XX is the closest provided reference sequence, YY is the mutated base or #' codon number (depending on whether \code{nbrMutatedBases*} or #' \code{nbrMutatedCodons*} is specified), and NNN is the -#' mutated base or codon. Here, \{.\} is the provided \code{mutNameDelimiter}. -#' The delimiter must be a single character (not "_"), and can not appear -#' in any of the provided reference sequence names. +#' mutated base or codon. Here, \{.\} is the provided +#' \code{mutNameDelimiter}. The delimiter must be a single character +#' (not "_"), and can not appear in any of the provided reference sequence +#' names. #' @param constantMaxDistForward,constantMaxDistReverse Numeric scalars, the #' maximum allowed Hamming distance between the extracted and expected #' constant sequence. If multiple constant sequences are provided, the most @@ -167,8 +168,8 @@ #' in [0, 1), it defines the maximal Hamming distance in terms of a #' fraction of sequence length: #' (\code{round(umiCollapseMaxDist * nchar(umiSeq))}). -#' A value greater or equal to 1 is rounded and directly used as the maximum -#' allowed Hamming distance. +#' A value greater or equal to 1 is rounded and directly used as the +#' maximum allowed Hamming distance. #' @param filteredReadsFastqForward,filteredReadsFastqReverse Character #' scalars, the names of a (pair of) FASTQ file(s) where filtered-out reads #' will be written. The name(s) should end in .gz (the output will always @@ -189,20 +190,21 @@ #' @return A list with four entries: #' \describe{ #' \item{summaryTable}{A \code{data.frame} that contains, for each observed -#' mutation combination, the corresponding variable region sequences (or pair of -#' sequences), the number of observed such sequences, and the number of unique -#' UMIs observed for the sequence. 
It also has additional columns: 'maxNbrReads'
-#' contains the number of reads for the most frequent observed sequence
-#' represented by the feature (only relevant if similar variable regions are
-#' collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give the number of
-#' mutated bases, codons or amino acids in each variant. Alternative variant
-#' names based on base, codon or amino acid sequence are provided in columns
-#' 'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In addition,
-#' 'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino acid-based
-#' names following the HGVS nomenclature (https://varnomen.hgvs.org/). Please
-#' note that the provided reference sequence names are used for the HGVS
-#' sequence identifiers. It is up to the user to use appropriately named
-#' reference sequences in order to obtain valid HGVS variant names.}
+#' mutation combination, the corresponding variable region sequences (or pair
+#' of sequences), the number of observed such sequences, and the number of
+#' unique UMIs observed for the sequence. It also has additional columns:
+#' 'maxNbrReads' contains the number of reads for the most frequent observed
+#' sequence represented by the feature (only relevant if similar variable
+#' regions are collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give
+#' the number of mutated bases, codons or amino acids in each variant.
+#' Alternative variant names based on base, codon or amino acid sequence are
+#' provided in columns 'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In
+#' addition, 'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino
+#' acid-based names following the HGVS nomenclature
+#' (https://varnomen.hgvs.org/). Please note that the provided reference
+#' sequence names are used for the HGVS sequence identifiers. 
It is up to the +#' user to use appropriately named reference sequences in order to obtain +#' valid HGVS variant names.} #' \item{filterSummary}{A \code{data.frame} that contains the number of input #' reads, the number of reads filtered out in the processing, and the number of #' retained reads. The filters are named according to the convention @@ -211,11 +213,11 @@ #' applied successively, and the reads filtered out in one step are not #' considered for successive filtering steps.} #' \item{errorStatistics}{A \code{data.frame} that contains, for each Phred -#' quality score between 0 and 99, the number of bases in the extracted constant -#' sequences with that quality score that match/mismatch with the provided -#' reference constant sequence.} -#' \item{parameters}{A \code{list} with all parameter settings that were used in -#' the processing. Also contains the version of the package and the time of +#' quality score between 0 and 99, the number of bases in the extracted +#' constant sequences with that quality score that match/mismatch with the +#' provided reference constant sequence.} +#' \item{parameters}{A \code{list} with all parameter settings that were used +#' in the processing. 
Also contains the version of the package and the time of #' processing.} #' } #' @@ -223,7 +225,7 @@ #' ## See the vignette for complete worked-out examples for different types of #' ## data sets #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, assume that the full read represents #' ## the variable region #' out <- digestFastqs( @@ -236,7 +238,7 @@ #' ## Filter summary #' out$filterSummary #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, specify the read as a combination of #' ## UMI, constant region and variable region (skip the first base) #' out <- digestFastqs( @@ -252,7 +254,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, specify the read as a combination of #' ## UMI, constant region and variable region (skip the first base), provide #' ## the wild type sequence to compare the variable region to and limit the @@ -262,8 +264,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Table with read counts and mutant information @@ -273,7 +276,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## 
----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a paired-end data set where both the forward and reverse reads #' ## contain the same variable region and thus should be merged to generate #' ## the final variable sequence, specify the reads as a combination of @@ -285,13 +288,15 @@ #' package = "mutscan"), #' fastqReverse = system.file("extdata", "cisInput_2.fastq.gz", #' package = "mutscan"), -#' mergeForwardReverse = TRUE, revComplForward = FALSE, revComplReverse = TRUE, +#' mergeForwardReverse = TRUE, +#' revComplForward = FALSE, revComplReverse = TRUE, #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' elementsReverse = "SUCVS", elementLengthsReverse = c(1, 7, 17, 96, -1), #' constantForward = "AACCGGAGGAGGGAGCTG", #' constantReverse = "GAGTTCATCCTGGCAGC", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Table with read counts and mutant information @@ -301,7 +306,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a paired-end data set where the forward and reverse reads #' ## contain variable regions corresponding to different proteins, and thus #' ## should not be merged, specify the reads as a combination of @@ -318,10 +323,12 @@ #' elementsReverse = "SUCV", elementLengthsReverse = c(1, 8, 20, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", #' constantReverse = "GAAAAAGGAAGCTGGAGAGA", -#' wildTypeForward = c(FOS = 
paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), -#' wildTypeReverse = c(JUN = paste0("ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", -#' "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeReverse = c(JUN = paste0( +#' "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", +#' "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), #' nbrMutatedCodonsMaxForward = 1, #' nbrMutatedCodonsMaxReverse = 1 #' ) @@ -334,8 +341,10 @@ #' #' @export #' @importFrom lifecycle deprecated is_present deprecate_warn +#' @importFrom utils packageVersion digestFastqs <- function(fastqForward, fastqReverse = NULL, - mergeForwardReverse = FALSE, minOverlap = 0, maxOverlap = 0, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, minMergedLength = 0, maxMergedLength = 0, maxFracMismatchOverlap = 1, greedyOverlap = TRUE, revComplForward = FALSE, revComplReverse = FALSE, @@ -376,12 +385,12 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## pre-flight checks --------------------------------------------------------- ## deprecated arguments deprecMessageColl <- paste0( - "Starting from mutscan v0.3.0, collapsing of variable sequences is no ", - "longer supported by digestFastqs(), and arguments ", + "Starting from mutscan v0.3.0, collapsing of variable sequences is ", + "no longer supported by digestFastqs(), and arguments ", "variableCollapseMaxDist, variableCollapseMinReads ", "and variableCollapseMinRatio will be ignored. 
Please run ", - "summarizeExperiment() to generate a SummarizedExperiment object, and ", - "then call collapseMutantsBySimilarity() to collapse variable ", + "summarizeExperiment() to generate a SummarizedExperiment object, ", + "and then call collapseMutantsBySimilarity() to collapse variable ", "sequences in a consistent way across all samples." ) if (lifecycle::is_present(variableCollapseMaxDist)) { @@ -416,14 +425,17 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## fastq files exist if (length(fastqForward) < 1 || !all(file.exists(fastqForward)) || - (!is.null(fastqReverse) && (length(fastqReverse) != length(fastqForward) || - !all(file.exists(fastqReverse))))) { - stop("'fastqForward' and 'fastqReverse' must point to one or several matching existing files"); + (!is.null(fastqReverse) && + (length(fastqReverse) != length(fastqForward) || + !all(file.exists(fastqReverse))))) { + stop("'fastqForward' and 'fastqReverse' must point to one or ", + "several matching existing files") } if (is.null(fastqReverse)) { fastqReverse <- rep("", length(fastqForward)) } - ## Convert file paths to canonical form, expand ~ and complete relative paths + ## Convert file paths to canonical form, expand ~ and complete + ## relative paths fastqForward <- normalizePath(fastqForward, mustWork = FALSE) fastqReverse <- normalizePath(fastqReverse, mustWork = FALSE) @@ -432,8 +444,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, .assertScalar(x = revComplForward, type = "logical") .assertScalar(x = revComplReverse, type = "logical") if (any(fastqReverse == "") && mergeForwardReverse) { - stop("Both forward and reverse FASTQ files must be given in order to merge ", - "forward and reverse reads") + stop("Both forward and reverse FASTQ files must be given in order to ", + "merge forward and reverse reads") } ## check numeric inputs @@ -445,10 +457,14 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, .assertScalar(x = maxOverlap, type = "numeric", 
rngIncl = c(0, Inf)) .assertScalar(x = minMergedLength, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = maxMergedLength, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = avePhredMinForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = avePhredMinReverse, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = variableNMaxForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = variableNMaxReverse, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = avePhredMinForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = avePhredMinReverse, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = variableNMaxForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = variableNMaxReverse, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = umiNMax, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = nbrMutatedCodonsMaxForward, type = "numeric", rngIncl = c(0, Inf), validValues = -1) @@ -458,38 +474,51 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, rngIncl = c(0, Inf), validValues = -1) .assertScalar(x = nbrMutatedBasesMaxReverse, type = "numeric", rngIncl = c(0, Inf), validValues = -1) - .assertScalar(x = mutatedPhredMinForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = mutatedPhredMinReverse, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = mutatedPhredMinForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = mutatedPhredMinReverse, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = constantMaxDistForward, type = "numeric", rngIncl = c(0, Inf), validValues = -1) .assertScalar(x = constantMaxDistReverse, type = "numeric", rngIncl = c(0, Inf), validValues = -1) - .assertScalar(x = umiCollapseMaxDist, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = umiCollapseMaxDist, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = maxNReads, type = "numeric", rngIncl = c(0, Inf), validValues = -1) 
.assertScalar(x = nThreads, type = "numeric", rngExcl = c(0, Inf)) .assertScalar(x = chunkSize, type = "numeric", rngExcl = c(0, Inf)) .assertScalar(x = maxReadLength, type = "numeric", rngExcl = c(0, Inf)) - ## If a wildtype sequence is provided, it must be unambiguous how to identify and name mutants + ## If a wildtype sequence is provided, + ## it must be unambiguous how to identify and name mutants if (any(wildTypeForward != "")) { - if ((nbrMutatedCodonsMaxForward == (-1) && nbrMutatedBasesMaxForward == (-1)) || - (nbrMutatedCodonsMaxForward != (-1) && nbrMutatedBasesMaxForward != (-1))) { - stop("Exactly one of 'nbrMutatedCodonsMaxForward' and 'nbrMutatedBasesMaxForward' must be -1") + if ((nbrMutatedCodonsMaxForward == (-1) && + nbrMutatedBasesMaxForward == (-1)) || + (nbrMutatedCodonsMaxForward != (-1) && + nbrMutatedBasesMaxForward != (-1))) { + stop("Exactly one of 'nbrMutatedCodonsMaxForward' and ", + "'nbrMutatedBasesMaxForward' must be -1") } } if (any(wildTypeReverse != "")) { - if ((nbrMutatedCodonsMaxReverse == (-1) && nbrMutatedBasesMaxReverse == (-1)) || - (nbrMutatedCodonsMaxReverse != (-1) && nbrMutatedBasesMaxReverse != (-1))) { - stop("Exactly one of 'nbrMutatedCodonsMaxReverse' and 'nbrMutatedBasesMaxReverse' must be -1") + if ((nbrMutatedCodonsMaxReverse == (-1) && + nbrMutatedBasesMaxReverse == (-1)) || + (nbrMutatedCodonsMaxReverse != (-1) && + nbrMutatedBasesMaxReverse != (-1))) { + stop("Exactly one of 'nbrMutatedCodonsMaxReverse' and ", + "'nbrMutatedBasesMaxReverse' must be -1") } } ## adapters must be strings, valid DNA characters if (length(adapterForward) != 1 || !is.character(adapterForward) || - !grepl("^[AaCcGgTt]*$", adapterForward) || length(adapterReverse) != 1 || - !is.character(adapterReverse) || !grepl("^[AaCcGgTt]*$", adapterReverse)) { - stop("Adapters must be character strings, only containing valid DNA characters") + !grepl("^[AaCcGgTt]*$", adapterForward) || + length(adapterReverse) != 1 || + 
!is.character(adapterReverse) || + !grepl("^[AaCcGgTt]*$", adapterReverse)) { + stop("Adapters must be character strings, only containing ", + "valid DNA characters") } else { adapterForward <- toupper(adapterForward) adapterReverse <- toupper(adapterReverse) @@ -510,18 +539,22 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, stop("'elementsReverse' must be a non-empty character scalar") } - if (!all(strsplit(elementsForward, "")[[1]] %in% c("C", "U", "S", "V", "P"))) { + if (!all(strsplit(elementsForward, "")[[1]] %in% + c("C", "U", "S", "V", "P"))) { stop("'elementsForward' can only contain letters 'CUSVP'") } - if (!all(strsplit(elementsReverse, "")[[1]] %in% c("C", "U", "S", "V", "P"))) { + if (!all(strsplit(elementsReverse, "")[[1]] %in% + c("C", "U", "S", "V", "P"))) { stop("'elementsReverse' can only contain letters 'CUSVP'") } if (nchar(elementsForward) != length(elementLengthsForward)) { - stop("'elementsForward' and 'elementsLengthsForward' must have the same length") + stop("'elementsForward' and 'elementsLengthsForward' must ", + "have the same length") } if (nchar(elementsReverse) != length(elementLengthsReverse)) { - stop("'elementsReverse' and 'elementsLengthsReverse' must have the same length") + stop("'elementsReverse' and 'elementsLengthsReverse' must ", + "have the same length") } ## Max one 'P' @@ -547,24 +580,31 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## If a 'P', max one -1 length on each side if (length(PposFwd) == 1 && PposFwd != -1) { - if ((PposFwd != 1 && sum(elementLengthsForward[seq_len(PposFwd - 1)] == -1) > 1) || + if ((PposFwd != 1 && + sum(elementLengthsForward[seq_len(PposFwd - 1)] == -1) > 1) || (PposFwd != nchar(elementsForward) && sum(elementLengthsForward[(PposFwd + 1):nchar(elementsForward)] == -1) > 1)) { - stop("Max one element length (forward) on each side of the primer can be -1") + stop("Max one element length (forward) on each side ", + "of the primer can be -1") } } if 
(length(PposRev) == 1 && PposRev != -1) { - if ((PposRev != 1 && sum(elementLengthsReverse[seq_len(PposRev - 1)] == -1) > 1) || + if ((PposRev != 1 && + sum(elementLengthsReverse[seq_len(PposRev - 1)] == -1) > 1) || (PposRev != nchar(elementsReverse) && sum(elementLengthsReverse[(PposRev + 1):nchar(elementsReverse)] == -1) > 1)) { - stop("Max one element length (reverse) on each side of the primer can be -1") + stop("Max one element length (reverse) on each side ", + "of the primer can be -1") } } if (!is.character(primerForward) || length(primerForward) < 1 || - !all(grepl("^[AaCcGgTt]*$", primerForward)) || !is.character(primerReverse) || - length(primerReverse) < 1 || !all(grepl("^[AaCcGgTt]*$", primerReverse))) { - stop("Primers must be character vectors, only containing valid DNA characters") + !all(grepl("^[AaCcGgTt]*$", primerForward)) || + !is.character(primerReverse) || + length(primerReverse) < 1 || + !all(grepl("^[AaCcGgTt]*$", primerReverse))) { + stop("Primers must be character vectors, ", + "only containing valid DNA characters") } else { primerForward <- vapply(primerForward, toupper, "") primerReverse <- vapply(primerReverse, toupper, "") @@ -579,18 +619,22 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, } ## wild type sequences must be given in named vectors - if (any(is.null(names(wildTypeForward))) || any(names(wildTypeForward) == "") || - any(is.null(names(wildTypeReverse))) || any(names(wildTypeReverse) == "")) { + if (any(is.null(names(wildTypeForward))) || + any(names(wildTypeForward) == "") || + any(is.null(names(wildTypeReverse))) || + any(names(wildTypeReverse) == "")) { stop('wild type sequences must be given in named vectors') } ## wild type sequences must be strings, valid DNA characters if (!all(vapply(wildTypeForward, is.character, FALSE)) || !all(vapply(wildTypeForward, length, 0) == 1) || - !all(vapply(wildTypeForward, function(w) grepl("^[AaCcGgTt]*$", w), FALSE)) || + !all(vapply(wildTypeForward, + function(w) 
grepl("^[AaCcGgTt]*$", w), FALSE)) || !all(vapply(wildTypeReverse, is.character, FALSE)) || !all(vapply(wildTypeReverse, length, 0) == 1) || - !all(vapply(wildTypeReverse, function(w) grepl("^[AaCcGgTt]*$", w), FALSE))) { + !all(vapply(wildTypeReverse, + function(w) grepl("^[AaCcGgTt]*$", w), FALSE))) { stop("wild type sequences must be character strings, ", "only containing valid DNA characters") } else { @@ -606,7 +650,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## cis experiment - should not have wildTypeReverse if (mergeForwardReverse && any(vapply(wildTypeReverse, nchar, 1) > 0)) { - warning("Ignoring 'wildTypeReverse' when forward and reverse reads are merged") + warning("Ignoring 'wildTypeReverse' when forward and ", + "reverse reads are merged") wildTypeReverse <- c(r = "") } @@ -623,8 +668,10 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, constantReverse <- toupper(constantReverse) } - if (!all(vapply(constantForward, nchar, 0L) == nchar(constantForward[1])) || - !all(vapply(constantReverse, nchar, 0L) == nchar(constantReverse[1]))) { + if (!all(vapply(constantForward, nchar, 0L) == + nchar(constantForward[1])) || + !all(vapply(constantReverse, nchar, 0L) == + nchar(constantReverse[1]))) { stop("All constant sequences must be of the same length") } @@ -634,7 +681,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, sum(elementLengthsForward[CposFwd]) != nchar(constantForward[1])) { stop("The sum of the constant sequence lengths in elementsForward (", sum(elementLengthsForward[CposFwd]), - ") does not correspond to the length of the given 'constantForward' (", + ") does not correspond to the length of the given ", + "'constantForward' (", nchar(constantForward[1]), ")") } CposRev <- gregexpr(pattern = "C", elementsReverse)[[1]] @@ -643,15 +691,18 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, sum(elementLengthsReverse[CposRev]) != nchar(constantReverse[1])) { stop("The sum of the constant 
sequence lengths in elementsReverse (", sum(elementLengthsReverse[CposRev]), - ") does not correspond to the length of the given 'constantReverse' (", + ") does not correspond to the length of the given ", + "'constantReverse' (", nchar(constantReverse[1]), ")") } if (!all(is.character(forbiddenMutatedCodonsForward)) || - !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", toupper(forbiddenMutatedCodonsForward)) | + !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", + toupper(forbiddenMutatedCodonsForward)) | forbiddenMutatedCodonsForward == "") || !all(is.character(forbiddenMutatedCodonsReverse)) || - !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", toupper(forbiddenMutatedCodonsReverse)) | + !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", + toupper(forbiddenMutatedCodonsReverse)) | forbiddenMutatedCodonsReverse == "")) { stop("All elements of 'forbiddenMutatedCodonsForward' and 'forbiddenMutatedCodonsReverse' must be ", "character strings consisting of three valid IUPAC letters.") @@ -671,96 +722,109 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, stop("'collapseToWTReverse' must be a logical scalar.") } - ## mutNameDelimiter must be a single character, and can not appear in any of the WT sequence names + ## mutNameDelimiter must be a single character, and can not + ## appear in any of the WT sequence names if (!is.character(mutNameDelimiter) || length(mutNameDelimiter) != 1 || nchar(mutNameDelimiter) != 1 || mutNameDelimiter == "_") { - stop("'mutNameDelimiter' must be a single-letter character scalar, not equal to '_'") - } - if (any(grepl(mutNameDelimiter, c(names(wildTypeForward), names(wildTypeReverse)), fixed = TRUE))) { - stop("'mutNameDelimiter' can not appear in the name of any of the provided wild type sequences.") - } - - if (!is.character(filteredReadsFastqForward) || !is.character(filteredReadsFastqReverse) || - length(filteredReadsFastqForward) != 1 || length(filteredReadsFastqReverse) != 1 || - (filteredReadsFastqForward != "" && !grepl("\\.gz$", filteredReadsFastqForward)) || - 
(filteredReadsFastqReverse != "" && !grepl("\\.gz$", filteredReadsFastqReverse))) { - stop("'filteredReadsFastqForward' and 'filteredReadsFastqReverse' must be character ", - "scalars ending with .gz.") + stop("'mutNameDelimiter' must be a single-letter character scalar, ", + "not equal to '_'") + } + if (any(grepl(mutNameDelimiter, + c(names(wildTypeForward), names(wildTypeReverse)), + fixed = TRUE))) { + stop("'mutNameDelimiter' can not appear in the name of any of ", + "the provided wild type sequences.") + } + + if (!is.character(filteredReadsFastqForward) || + !is.character(filteredReadsFastqReverse) || + length(filteredReadsFastqForward) != 1 || + length(filteredReadsFastqReverse) != 1 || + (filteredReadsFastqForward != "" && + !grepl("\\.gz$", filteredReadsFastqForward)) || + (filteredReadsFastqReverse != "" && + !grepl("\\.gz$", filteredReadsFastqReverse))) { + stop("'filteredReadsFastqForward' and 'filteredReadsFastqReverse' ", + "must be character scalars ending with .gz.") } ## If path is "", it will still be "" - filteredReadsFastqForward <- normalizePath(filteredReadsFastqForward, mustWork = FALSE) - filteredReadsFastqReverse <- normalizePath(filteredReadsFastqReverse, mustWork = FALSE) + filteredReadsFastqForward <- + normalizePath(filteredReadsFastqForward, mustWork = FALSE) + filteredReadsFastqReverse <- + normalizePath(filteredReadsFastqReverse, mustWork = FALSE) if ((any(fastqReverse == "") && filteredReadsFastqReverse != "") || (all(fastqReverse != "") && filteredReadsFastqForward != "" && filteredReadsFastqReverse == "") || (all(fastqForward != "") && filteredReadsFastqForward == "" && filteredReadsFastqReverse != "")) { - stop("The pairing of the output FASTQ files must be compatible with that of the input files.") + stop("The pairing of the output FASTQ files must be compatible ", + "with that of the input files.") } .assertScalar(x = verbose, type = "logical") - ## call digestFastqsCpp ------------------------------------------------------ - 
## Represent the wildtype sequences as pairs of vectors (one with sequences, - ## one with names), to make things faster on the C++ side + ## call digestFastqsCpp --------------------------------------------------- + ## Represent the wildtype sequences as pairs of vectors (one with + ## sequences, one with names), to make things faster on the C++ side wildTypeForwardNames <- names(wildTypeForward) wildTypeForward <- unname(wildTypeForward) wildTypeReverseNames <- names(wildTypeReverse) wildTypeReverse <- unname(wildTypeReverse) - res <- digestFastqsCpp(fastqForwardVect = fastqForward, - fastqReverseVect = fastqReverse, - mergeForwardReverse = mergeForwardReverse, - minOverlap = minOverlap, - maxOverlap = maxOverlap, - minMergedLength = minMergedLength, - maxMergedLength = maxMergedLength, - maxFracMismatchOverlap = maxFracMismatchOverlap, - greedyOverlap = greedyOverlap, - revComplForward = revComplForward, - revComplReverse = revComplReverse, - elementsForward = elementsForward, - elementLengthsForward = as.numeric(elementLengthsForward), - elementsReverse = elementsReverse, - elementLengthsReverse = as.numeric(elementLengthsReverse), - adapterForward = adapterForward, - adapterReverse = adapterReverse, - primerForward = primerForward, - primerReverse = primerReverse, - wildTypeForward = wildTypeForward, - wildTypeForwardNames = wildTypeForwardNames, - wildTypeReverse = wildTypeReverse, - wildTypeReverseNames = wildTypeReverseNames, - constantForward = constantForward, - constantReverse = constantReverse, - avePhredMinForward = avePhredMinForward, - avePhredMinReverse = avePhredMinReverse, - variableNMaxForward = variableNMaxForward, - variableNMaxReverse = variableNMaxReverse, - umiNMax = umiNMax, - nbrMutatedCodonsMaxForward = nbrMutatedCodonsMaxForward, - nbrMutatedCodonsMaxReverse = nbrMutatedCodonsMaxReverse, - nbrMutatedBasesMaxForward = nbrMutatedBasesMaxForward, - nbrMutatedBasesMaxReverse = nbrMutatedBasesMaxReverse, - forbiddenMutatedCodonsForward = 
forbiddenMutatedCodonsForward, - forbiddenMutatedCodonsReverse = forbiddenMutatedCodonsReverse, - useTreeWTmatch = useTreeWTmatch, - collapseToWTForward = collapseToWTForward, - collapseToWTReverse = collapseToWTReverse, - mutatedPhredMinForward = mutatedPhredMinForward, - mutatedPhredMinReverse = mutatedPhredMinReverse, - mutNameDelimiter = mutNameDelimiter, - constantMaxDistForward = constantMaxDistForward, - constantMaxDistReverse = constantMaxDistReverse, - umiCollapseMaxDist = umiCollapseMaxDist, - filteredReadsFastqForward = filteredReadsFastqForward, - filteredReadsFastqReverse = filteredReadsFastqReverse, - maxNReads = maxNReads, - verbose = verbose, - nThreads = as.integer(nThreads), - chunkSize = as.integer(chunkSize), - maxReadLength = maxReadLength) - - ## Add package version and processing date ----------------------------------- + res <- digestFastqsCpp( + fastqForwardVect = fastqForward, + fastqReverseVect = fastqReverse, + mergeForwardReverse = mergeForwardReverse, + minOverlap = minOverlap, + maxOverlap = maxOverlap, + minMergedLength = minMergedLength, + maxMergedLength = maxMergedLength, + maxFracMismatchOverlap = maxFracMismatchOverlap, + greedyOverlap = greedyOverlap, + revComplForward = revComplForward, + revComplReverse = revComplReverse, + elementsForward = elementsForward, + elementLengthsForward = as.numeric(elementLengthsForward), + elementsReverse = elementsReverse, + elementLengthsReverse = as.numeric(elementLengthsReverse), + adapterForward = adapterForward, + adapterReverse = adapterReverse, + primerForward = primerForward, + primerReverse = primerReverse, + wildTypeForward = wildTypeForward, + wildTypeForwardNames = wildTypeForwardNames, + wildTypeReverse = wildTypeReverse, + wildTypeReverseNames = wildTypeReverseNames, + constantForward = constantForward, + constantReverse = constantReverse, + avePhredMinForward = avePhredMinForward, + avePhredMinReverse = avePhredMinReverse, + variableNMaxForward = variableNMaxForward, + 
variableNMaxReverse = variableNMaxReverse, + umiNMax = umiNMax, + nbrMutatedCodonsMaxForward = nbrMutatedCodonsMaxForward, + nbrMutatedCodonsMaxReverse = nbrMutatedCodonsMaxReverse, + nbrMutatedBasesMaxForward = nbrMutatedBasesMaxForward, + nbrMutatedBasesMaxReverse = nbrMutatedBasesMaxReverse, + forbiddenMutatedCodonsForward = forbiddenMutatedCodonsForward, + forbiddenMutatedCodonsReverse = forbiddenMutatedCodonsReverse, + useTreeWTmatch = useTreeWTmatch, + collapseToWTForward = collapseToWTForward, + collapseToWTReverse = collapseToWTReverse, + mutatedPhredMinForward = mutatedPhredMinForward, + mutatedPhredMinReverse = mutatedPhredMinReverse, + mutNameDelimiter = mutNameDelimiter, + constantMaxDistForward = constantMaxDistForward, + constantMaxDistReverse = constantMaxDistReverse, + umiCollapseMaxDist = umiCollapseMaxDist, + filteredReadsFastqForward = filteredReadsFastqForward, + filteredReadsFastqReverse = filteredReadsFastqReverse, + maxNReads = maxNReads, + verbose = verbose, + nThreads = as.integer(nThreads), + chunkSize = as.integer(chunkSize), + maxReadLength = maxReadLength) + + ## Add package version and processing date -------------------------------- res$parameters$processingInfo <- paste0( "Processed by mutscan v", utils::packageVersion("mutscan"), " on ", Sys.time() diff --git a/man/digestFastqs.Rd b/man/digestFastqs.Rd index 38052a3..216133c 100644 --- a/man/digestFastqs.Rd +++ b/man/digestFastqs.Rd @@ -110,8 +110,8 @@ but there can be at most one occurrence of P. If a given letter is included multiple times, the corresponding sequences will be concatenated in the output.} -\item{elementLengthsForward, elementLengthsReverse}{Numeric vectors containing -the lengths of each read component from +\item{elementLengthsForward, elementLengthsReverse}{Numeric vectors +containing the lengths of each read component from \code{elementsForward}/\code{elementsReverse}, respectively. 
If the length of one element is set to -1, it will be inferred from the other lengths (as the remainder of the read). At most one number (or one @@ -119,11 +119,11 @@ number on each side of the primer P) can be set to -1. The indicated length of the primer is not used (instead it's inferred from the provided primer sequence) and can also be set to -1.} -\item{primerForward, primerReverse}{Character vectors, representing the primer -sequence(s) for forward/reverse reads, respectively. Only read pairs that -contain perfect matches to both the forward and reverse primers (if -given) will be retained. Multiple primers can be specified - they will be -considered in order and the first match will be used.} +\item{primerForward, primerReverse}{Character vectors, representing the +primer sequence(s) for forward/reverse reads, respectively. Only read +pairs that contain perfect matches to both the forward and reverse +primers (if given) will be retained. Multiple primers can be specified - +they will be considered in order and the first match will be used.} \item{wildTypeForward, wildTypeReverse}{Character scalars or named character vectors, the wild type sequence for the forward and reverse variable @@ -181,13 +181,14 @@ any mutated base has a Phred score lower than \code{mutatedPhredMin}, the read (pair) will be discarded.} \item{mutNameDelimiter}{Character scalar, the delimiter used in the naming -of mutants. Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where XX -is the closest provided reference sequence, YY is the mutated base or +of mutants. Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where +XX is the closest provided reference sequence, YY is the mutated base or codon number (depending on whether \code{nbrMutatedBases*} or \code{nbrMutatedCodons*} is specified), and NNN is the -mutated base or codon. Here, \{.\} is the provided \code{mutNameDelimiter}. 
-The delimiter must be a single character (not "_"), and can not appear -in any of the provided reference sequence names.} +mutated base or codon. Here, \{.\} is the provided +\code{mutNameDelimiter}. The delimiter must be a single character +(not "_"), and can not appear in any of the provided reference sequence +names.} \item{constantMaxDistForward, constantMaxDistReverse}{Numeric scalars, the maximum allowed Hamming distance between the extracted and expected @@ -204,8 +205,8 @@ the tolerances for collapsing similar UMI sequences. If the value is in [0, 1), it defines the maximal Hamming distance in terms of a fraction of sequence length: (\code{round(umiCollapseMaxDist * nchar(umiSeq))}). -A value greater or equal to 1 is rounded and directly used as the maximum -allowed Hamming distance.} +A value greater or equal to 1 is rounded and directly used as the +maximum allowed Hamming distance.} \item{filteredReadsFastqForward, filteredReadsFastqReverse}{Character scalars, the names of a (pair of) FASTQ file(s) where filtered-out reads @@ -233,20 +234,21 @@ the \code{chunkSize} to be decreased.} A list with four entries: \describe{ \item{summaryTable}{A \code{data.frame} that contains, for each observed -mutation combination, the corresponding variable region sequences (or pair of -sequences), the number of observed such sequences, and the number of unique -UMIs observed for the sequence. It also has additional columns: 'maxNbrReads' -contains the number of reads for the most frequent observed sequence -represented by the feature (only relevant if similar variable regions are -collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give the number of -mutated bases, codons or amino acids in each variant. Alternative variant -names based on base, codon or amino acid sequence are provided in columns -'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. 
In addition, -'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino acid-based -names following the HGVS nomenclature (https://varnomen.hgvs.org/). Please -note that the provided reference sequence names are used for the HGVS -sequence identifiers. It is up to the user to use appropriately named -reference sequences in order to obtain valid HGVS variant names.} +mutation combination, the corresponding variable region sequences (or pair +of sequences), the number of observed such sequences, and the number of +unique UMIs observed for the sequence. It also has additional columns: +'maxNbrReads' contains the number of reads for the most frequent observed +sequence represented by the feature (only relevant if similar variable +regions are collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give +the number of mutated bases, codons or amino acids in each variant. +Alternative variant names based on base, codon or amino acid sequence are +provided in columns 'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In +addition, 'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino +acid-based names following the HGVS nomenclature +(https://varnomen.hgvs.org/). Please note that the provided reference +sequence names are used for the HGVS sequence identifiers. It is up to the +user to use appropriately named reference sequences in order to obtain +valid HGVS variant names.} \item{filterSummary}{A \code{data.frame} that contains the number of input reads, the number of reads filtered out in the processing, and the number of retained reads. The filters are named according to the convention @@ -255,19 +257,19 @@ applied, and "filter" indicates the type of filter. 
Note that filters are applied successively, and the reads filtered out in one step are not considered for successive filtering steps.} \item{errorStatistics}{A \code{data.frame} that contains, for each Phred -quality score between 0 and 99, the number of bases in the extracted constant -sequences with that quality score that match/mismatch with the provided -reference constant sequence.} -\item{parameters}{A \code{list} with all parameter settings that were used in -the processing. Also contains the version of the package and the time of +quality score between 0 and 99, the number of bases in the extracted +constant sequences with that quality score that match/mismatch with the +provided reference constant sequence.} +\item{parameters}{A \code{list} with all parameter settings that were used +in the processing. Also contains the version of the package and the time of processing.} } } \description{ Read sequences for one or a pair of fastq files and digest them (extract umis, constant and variable parts, filter, extract mismatch information from -constant and count the observed unique variable parts). Alternatively, primer -sequences could be specified, in which case the sequence immediately +constant and count the observed unique variable parts). Alternatively, +primer sequences could be specified, in which case the sequence immediately following the primer will be considered the variable sequence. } \details{ @@ -288,8 +290,8 @@ The processing of a read pair goes as follows: either the forward or reverse read (or the merged read). \item Filter out the read (pair) if the number of Ns in the variable region exceeds \code{variableNMaxForward}/\code{variableNMaxReverse}. 
- \item Filter out the read (pair) if the number of Ns in the combined forward - and reverse UMI sequence exceeds \code{umiNMax} + \item Filter out the read (pair) if the number of Ns in the combined + forward and reverse UMI sequence exceeds \code{umiNMax} \item If one or more wild type sequences (for the variable region) are provided, find the mismatches between the (forward/reverse) variable region and the provided wild type sequence (if more than one wild type sequence is @@ -302,9 +304,9 @@ The processing of a read pair goes as follows: the codons encoded by \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. \item Assign a 'mutation name' to the read (pair). This name is a - combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of the - most similar reference sequence, YY is the mutated codon number, and NNN is - the mutated codon. \{.\} is a delimiter, specified via + combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of + the most similar reference sequence, YY is the mutated codon number, and + NNN is the mutated codon. \{.\} is a delimiter, specified via \code{mutNameDelimiter}. If no wildtype sequences are provided, the variable sequence will be used as the mutation name'. 
} @@ -317,7 +319,7 @@ number of reads, and the number of unique UMIs, for each variable sequence ## See the vignette for complete worked-out examples for different types of ## data sets -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, assume that the full read represents ## the variable region out <- digestFastqs( @@ -330,7 +332,7 @@ head(out$summaryTable) ## Filter summary out$filterSummary -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, specify the read as a combination of ## UMI, constant region and variable region (skip the first base) out <- digestFastqs( @@ -346,7 +348,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, specify the read as a combination of ## UMI, constant region and variable region (skip the first base), provide ## the wild type sequence to compare the variable region to and limit the @@ -356,8 +358,9 @@ out <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Table with read counts and mutant information @@ -367,7 +370,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## 
----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a paired-end data set where both the forward and reverse reads ## contain the same variable region and thus should be merged to generate ## the final variable sequence, specify the reads as a combination of @@ -379,13 +382,15 @@ out <- digestFastqs( package = "mutscan"), fastqReverse = system.file("extdata", "cisInput_2.fastq.gz", package = "mutscan"), - mergeForwardReverse = TRUE, revComplForward = FALSE, revComplReverse = TRUE, + mergeForwardReverse = TRUE, + revComplForward = FALSE, revComplReverse = TRUE, elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), elementsReverse = "SUCVS", elementLengthsReverse = c(1, 7, 17, 96, -1), constantForward = "AACCGGAGGAGGGAGCTG", constantReverse = "GAGTTCATCCTGGCAGC", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Table with read counts and mutant information @@ -395,7 +400,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a paired-end data set where the forward and reverse reads ## contain variable regions corresponding to different proteins, and thus ## should not be merged, specify the reads as a combination of @@ -412,10 +417,12 @@ out <- digestFastqs( elementsReverse = "SUCV", elementLengthsReverse = c(1, 8, 20, 96), constantForward = "AACCGGAGGAGGGAGCTG", constantReverse = "GAAAAAGGAAGCTGGAGAGA", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - 
"TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), - wildTypeReverse = c(JUN = paste0("ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", - "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeReverse = c(JUN = paste0( + "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", + "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), nbrMutatedCodonsMaxForward = 1, nbrMutatedCodonsMaxReverse = 1 ) From 76d024b8beeb763fe030564f8cfe9ca1b40937a1 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:05:20 +0200 Subject: [PATCH 18/44] Cleanup, linkMultipleVariants.R --- R/linkMultipleVariants.R | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index 3a04c1d..06aee7e 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -113,9 +113,10 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { paramsSeparate <- lapply(paramsSeparate, function(parm) { c(parm, as.list(defaults[!(names(defaults) %in% names(parm))])) }) - combinedDigestParams <- c(combinedDigestParams, - as.list(defaults[!(names(defaults) %in% - names(combinedDigestParams))])) + combinedDigestParams <- + c(combinedDigestParams, + as.list(defaults[!(names(defaults) %in% + names(combinedDigestParams))])) ## --------------------------------------------------------------------- ## ## Checks @@ -161,27 +162,27 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ ## Get count matrix with "raw" (uncorrected) sequences countCombined <- outCombined$summaryTable %>% dplyr::select("sequence", "nbrReads", "varLengths") %>% - dplyr::mutate(idx = paste0("I", seq_along(.data$sequence))) + mutate(idx = paste0("I", seq_along(.data$sequence))) ## If applicable, separate into forward and reverse sequences if (any(grepl("_", countCombined$sequence))) { countCombined <- countCombined %>% - tidyr::separate(.data$sequence, into = c("sequenceForward", + separate(.data$sequence, into = c("sequenceForward", "sequenceReverse"), sep = "_") %>% - tidyr::separate(.data$varLengths, into = c("varLengthsForward", + separate(.data$varLengths, into = c("varLengthsForward", "varLengthsReverse"), sep = "_") %>% - dplyr::mutate( + mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0), nCompReverse = vapply(strsplit(.data$varLengthsReverse, ","), length, 0)) } else { countCombined <- countCombined %>% - dplyr::rename(sequenceForward = "sequence", + rename(sequenceForward = "sequence", varLengthsForward = "varLengths") %>% - dplyr::mutate( + mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0)) } @@ -197,7 +198,7 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsForward[1], ",")[[1]]) df <- df %>% - tidyr::separate( + separate( .data$sequenceForward, into = names(paramsSeparate)[seq_along(w)], # into = paste0("V", seq_along(w)), @@ -212,7 +213,7 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsReverse[1], ",")[[1]]) df <- df %>% - tidyr::separate( + separate( .data$sequenceReverse, into = names(paramsSeparate)[offsetForward + seq_along(w)], # into = paste0("V", offsetForward + seq_along(w)), @@ -226,10 +227,12 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { outSeparate <- lapply(paramsSeparate, function(ps) { tmp <- do.call(digestFastqs, ps[!names(ps) %in% collapseParams]) if (any(collapseParams %in% names(ps))) { - ## Collapse this variable region -> need to call groupSimilarSequences + ## Collapse this variable region -> need to call + ## groupSimilarSequences tbl <- do.call(groupSimilarSequences, c(list(seqs = tmp$summaryTable$sequence, - scores = tmp$summaryTable$nbrReads, verbose = FALSE), + scores = tmp$summaryTable$nbrReads, + verbose = FALSE), ps[names(ps) %in% collapseParams])) colnames(tbl)[colnames(tbl) == "representative"] <- "mutantName" tmp$summaryTable <- tbl @@ -244,7 +247,7 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { convSeparate <- lapply(outSeparate, function(out) { out$summaryTable %>% dplyr::select("mutantName", "sequence") %>% - tidyr::separate_rows("sequence", sep = ",") + separate_rows("sequence", sep = ",") }) ## --------------------------------------------------------------------- ## @@ -262,8 +265,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ ## Aggregate counts countAggregated <- countCombined %>% - dplyr::group_by(dplyr::across(names(paramsSeparate))) %>% - dplyr::summarize(nbrReads = sum(.data$nbrReads), .groups = "drop") + group_by(across(names(paramsSeparate))) %>% + summarize(nbrReads = sum(.data$nbrReads), .groups = "drop") list(countAggregated = countAggregated, convSeparate = convSeparate, outCombined = outCombined, filtSeparate = filtSeparate) From 1c3a48ae337b3b549e18603f3ea60ad33c921bdd Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:05:31 +0200 Subject: [PATCH 19/44] Add import --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index ea80d3d..225a205 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -120,6 +120,7 @@ importFrom(tidyr,separate_rows) importFrom(tidyselect,matches) importFrom(tools,file_ext) importFrom(utils,globalVariables) +importFrom(utils,packageVersion) importFrom(utils,relist) importFrom(xfun,Rscript_call) useDynLib(mutscan, .registration = TRUE) From 1255ed028f8b6c496ffee6cf07f75bf614d171f3 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:10:26 +0200 Subject: [PATCH 20/44] Cleanup, plotDistributions.R --- R/plotDistributions.R | 115 ++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/R/plotDistributions.R b/R/plotDistributions.R index a92fe24..7a66ceb 100644 --- a/R/plotDistributions.R +++ b/R/plotDistributions.R @@ -21,7 +21,7 @@ #' @importFrom tibble rownames_to_column #' @importFrom tidyr gather #' @importFrom dplyr group_by arrange mutate desc ungroup left_join -#' @importFrom SummarizedExperiment colData assay +#' @importFrom SummarizedExperiment colData assay assayNames #' @importFrom ggplot2 ggplot scale_x_log10 scale_y_log10 labs geom_line #' facet_wrap geom_density geom_histogram theme_minimal theme #' element_text aes @@ -37,35 +37,36 @@ plotDistributions <- function(se, selAssay = "counts", facet = FALSE, pseudocount = 0) { 
.assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) if (!is.null(groupBy)) { - .assertScalar(x = groupBy, type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) + .assertScalar( + x = groupBy, type = "character", + validValues = colnames(colData(se))) } .assertScalar(x = plotType, type = "character", validValues = c("density", "knee", "histogram")) .assertScalar(x = facet, type = "logical") .assertScalar(x = pseudocount, type = "numeric", rngIncl = c(0, Inf)) - + ## Define a common theme to use for the plots commonTheme <- list( - ggplot2::theme_minimal(), - ggplot2::theme(axis.text = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14)) + theme_minimal(), + theme(axis.text = element_text(size = 12), + axis.title = element_text(size = 14)) ) df <- as.data.frame(as.matrix( - SummarizedExperiment::assay(se, selAssay, withDimnames = TRUE) + assay(se, selAssay, withDimnames = TRUE) )) %>% - tibble::rownames_to_column("feature") %>% - tidyr::gather(key = "Name", value = "value", -"feature") %>% - dplyr::group_by(.data$Name) %>% - dplyr::arrange(dplyr::desc(.data$value)) %>% - dplyr::mutate(idx = seq_along(.data$value), - value = .data$value + pseudocount) %>% - dplyr::ungroup() %>% - dplyr::left_join(as.data.frame(SummarizedExperiment::colData(se)), - by = "Name") + rownames_to_column("feature") %>% + gather(key = "Name", value = "value", -"feature") %>% + group_by(.data$Name) %>% + arrange(desc(.data$value)) %>% + mutate(idx = seq_along(.data$value), + value = .data$value + pseudocount) %>% + ungroup() %>% + left_join(as.data.frame(colData(se)), + by = "Name") ## If the user doesn't explicitly group by any variable, impose grouping ## by the sample ID. 
In that case, don't color by sample ID if facetting @@ -80,67 +81,73 @@ plotDistributions <- function(se, selAssay = "counts", ## Specify plot depending on desired type if (plotType == "knee") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$idx, y = .data$value)) + - ggplot2::scale_x_log10() + ggplot2::scale_y_log10() + - ggplot2::labs(x = "Feature (sorted)", - y = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount)))) + gg <- ggplot(df, aes(x = .data$idx, + y = .data$value)) + + scale_x_log10() + scale_y_log10() + + labs(x = "Feature (sorted)", + y = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount)))) if (facet) { if (colorFacetByName) { - gg <- gg + ggplot2::geom_line(ggplot2::aes(color = .data$Name)) + gg <- gg + geom_line(aes(color = .data$Name)) } else { - gg <- gg + ggplot2::geom_line(ggplot2::aes(group = .data$Name)) + gg <- gg + geom_line(aes(group = .data$Name)) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ .data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_line(ggplot2::aes(group = .data$Name, - color = .data[[groupBy]])) + gg <- gg + + geom_line(aes(group = .data$Name, + color = .data[[groupBy]])) } } else if (plotType == "density") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$value)) + - ggplot2::scale_x_log10() + - ggplot2::labs(x = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount))), - y = "Density") + gg <- ggplot(df, aes(x = .data$value)) + + scale_x_log10() + + labs(x = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount))), + y = "Density") if (facet) { if (colorFacetByName) { - gg <- gg + ggplot2::geom_density(ggplot2::aes(color = .data$Name)) + gg <- gg + + geom_density(aes(color = .data$Name)) } else { - gg <- gg + ggplot2::geom_density(ggplot2::aes(group = .data$Name)) + gg <- gg + + geom_density(aes(group = .data$Name)) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ 
.data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_density(ggplot2::aes(group = .data$Name, - color = .data[[groupBy]])) + gg <- gg + + geom_density(aes(group = .data$Name, + color = .data[[groupBy]])) } } else if (plotType == "histogram") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$value)) + - ggplot2::scale_x_log10() + - ggplot2::labs(x = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount))), - y = "Count") + gg <- ggplot(df, aes(x = .data$value)) + + scale_x_log10() + + labs(x = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount))), + y = "Count") if (facet) { if (colorFacetByName) { gg <- gg + - ggplot2::geom_histogram(ggplot2::aes(fill = .data$Name), - bins = 50) + geom_histogram(aes(fill = .data$Name), + bins = 50) } else { gg <- gg + - ggplot2::geom_histogram(ggplot2::aes(group = .data$Name), - bins = 50) + geom_histogram(aes(group = .data$Name), + bins = 50) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ .data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_histogram(ggplot2::aes(group = .data$Name, - fill = .data[[groupBy]])) + gg <- gg + + geom_histogram(aes(group = .data$Name, + fill = .data[[groupBy]])) } } gg + commonTheme -} \ No newline at end of file +} From c5a9cb6039db0d94793b912f9f4548e1236df244 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:14:19 +0200 Subject: [PATCH 21/44] Cleanup, plotFiltering.R --- R/plotFiltering.R | 110 +++++++++++++++++++++++-------------------- man/plotFiltering.Rd | 4 +- 2 files changed, 62 insertions(+), 52 deletions(-) diff --git a/R/plotFiltering.R b/R/plotFiltering.R index 8dfad4c..fbf8e24 100644 --- a/R/plotFiltering.R +++ b/R/plotFiltering.R @@ -20,8 +20,8 @@ #' plot the number of reads, or the fraction of the total number of reads, #' that are retained after/filtered out in each filtering step. 
#' @param onlyActiveFilters Logical scalar, whether to only include the -#' active filters (i.e., where any read was filtered out in any of the samples). -#' Defaults to \code{TRUE}. +#' active filters (i.e., where any read was filtered out in any of the +#' samples). Defaults to \code{TRUE}. #' @param displayNumbers Logical scalar, indicating whether to display the #' number (or fraction) of reads retained at every filtering step. #' @param numberSize Numeric scalar, indicating the size of the displayed @@ -57,13 +57,14 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, validValues = c("remaining", "filtered")) .assertScalar(x = facetBy, type = "character", validValues = c("sample", "step")) - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Extract relevant columns from colData(se) - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## cd <- as.data.frame(colData(se)) %>% dplyr::select("nbrTotal":"nbrRetained") %>% - dplyr::select(c("nbrTotal", tidyselect::matches("^f.*_"), "nbrRetained")) + dplyr::select(c("nbrTotal", matches("^f.*_"), + "nbrRetained")) ## Remove inactive filters if desired if (onlyActiveFilters) { @@ -71,7 +72,8 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, } ## Check that filtering columns are in the right order - nbrs <- gsub("^f", "", vapply(strsplit(colnames(cd), "_"), .subset, FUN.VALUE = "", 1)) + nbrs <- gsub("^f", "", vapply(strsplit(colnames(cd), "_"), .subset, + FUN.VALUE = "", 1)) nbrs <- nbrs[-c(1, length(nbrs))] nbrs <- gsub("a$|b$", "", nbrs) stopifnot(all(diff(as.numeric(nbrs)) >= 0)) @@ -79,46 +81,51 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, ## Check that all columns are numeric stopifnot(all(apply(cd, 2, 
is.numeric))) - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Reshape into long format - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## cd <- cd %>% - tibble::rownames_to_column("sample") %>% - tidyr::gather(key = "step", value = "nbrReads", -"sample") %>% - dplyr::mutate(step = factor(.data$step, levels = colnames(cd))) + rownames_to_column("sample") %>% + gather(key = "step", value = "nbrReads", -"sample") %>% + mutate(step = factor(.data$step, levels = colnames(cd))) ## Check that numbers add up (total - all filters = retained) - stopifnot(all(cd %>% - dplyr::group_by(sample) %>% - dplyr::summarize(remdiff = .data$nbrReads[.data$step == "nbrTotal"] - - sum(.data$nbrReads[grepl("^f", .data$step)]), - remlist = .data$nbrReads[.data$step == "nbrRetained"]) %>% - dplyr::mutate(obsdiff = .data$remdiff - .data$remlist) %>% - dplyr::pull(.data$obsdiff) == 0)) + stopifnot(all( + cd %>% + group_by(sample) %>% + summarize( + remdiff = .data$nbrReads[.data$step == "nbrTotal"] - + sum(.data$nbrReads[grepl("^f", .data$step)]), + remlist = .data$nbrReads[.data$step == "nbrRetained"]) %>% + mutate(obsdiff = .data$remdiff - .data$remlist) %>% + pull(.data$obsdiff) == 0)) - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Calculate number of remaining reads at each step - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## cd <- cd %>% - dplyr::group_by(sample) %>% - dplyr::mutate(fracReads = signif(.data$nbrReads / - .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% - dplyr::mutate(cumulsum = vapply(seq_along(.data$step), function(i) { - 
sum(.data$nbrReads[as.numeric(.data$step) <= as.numeric(.data$step[i]) & + group_by(sample) %>% + mutate(fracReads = signif( + .data$nbrReads / + .data$nbrReads[.data$step == "nbrTotal"], + digits = 3)) %>% + mutate(cumulsum = vapply(seq_along(.data$step), function(i) { + sum(.data$nbrReads[as.numeric(.data$step) <= + as.numeric(.data$step[i]) & .data$step != "nbrTotal"]) }, NA_real_)) %>% - dplyr::mutate(nbrRemaining = .data$nbrReads[.data$step == "nbrTotal"] - - .data$cumulsum) %>% - dplyr::mutate(fracRemaining = signif(.data$nbrRemaining / - .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% - dplyr::filter(.data$step != "nbrRetained") %>% - dplyr::ungroup() + mutate(nbrRemaining = .data$nbrReads[.data$step == "nbrTotal"] - + .data$cumulsum) %>% + mutate(fracRemaining = signif( + .data$nbrRemaining / + .data$nbrReads[.data$step == "nbrTotal"], + digits = 3)) %>% + filter(.data$step != "nbrRetained") %>% + ungroup() - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Create plot - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## yvar <- switch( paste0(valueType, "_", plotType), @@ -139,23 +146,26 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, ) if (plotType == "filtered") { cd <- cd %>% - dplyr::filter(.data$step != "nbrTotal") + filter(.data$step != "nbrTotal") } - gg <- ggplot2::ggplot(cd, ggplot2::aes(x = .data[[setdiff(c("step", "sample"), facetBy)]], - y = .data[[yvar]], - label = .data[[yvar]])) + - ggplot2::geom_bar(stat = "identity") + - ggplot2::facet_wrap(~ .data[[facetBy]], ncol = 1, scales = "free_y") + - ggplot2::theme_bw() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, - vjust = 0.5, size = 12), - axis.text.y = ggplot2::element_text(size = 12), - axis.title = 
ggplot2::element_text(size = 14)) + - ggplot2::labs(x = "", y = ylab1, - title = paste0(ylab1, ylab2, " each filtering step")) + gg <- ggplot(cd, aes( + x = .data[[setdiff(c("step", "sample"), facetBy)]], + y = .data[[yvar]], + label = .data[[yvar]])) + + geom_bar(stat = "identity") + + facet_wrap(~ .data[[facetBy]], ncol = 1, scales = "free_y") + + theme_bw() + + theme( + axis.text.x = element_text(angle = 90, hjust = 1, + vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), + axis.title = element_text(size = 14)) + + labs(x = "", y = ylab1, + title = paste0(ylab1, ylab2, " each filtering step")) if (displayNumbers) { - gg <- gg + ggplot2::geom_text(vjust = 1.5, color = "white", size = numberSize) + gg <- gg + geom_text(vjust = 1.5, color = "white", + size = numberSize) } gg } \ No newline at end of file diff --git a/man/plotFiltering.Rd b/man/plotFiltering.Rd index 688ebd1..cf758eb 100644 --- a/man/plotFiltering.Rd +++ b/man/plotFiltering.Rd @@ -23,8 +23,8 @@ plot the number of reads, or the fraction of the total number of reads, that are retained after/filtered out in each filtering step.} \item{onlyActiveFilters}{Logical scalar, whether to only include the -active filters (i.e., where any read was filtered out in any of the samples). -Defaults to \code{TRUE}.} +active filters (i.e., where any read was filtered out in any of the +samples). 
Defaults to \code{TRUE}.} \item{displayNumbers}{Logical scalar, indicating whether to display the number (or fraction) of reads retained at every filtering step.} From d6c14533d15a64b894418b1c06d15c0ca4e4c8aa Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:19:59 +0200 Subject: [PATCH 22/44] Cleanup, plotPairs.R --- NAMESPACE | 2 +- R/plotPairs.R | 162 +++++++++++++++++++++++++++----------------------- 2 files changed, 89 insertions(+), 75 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 225a205..597dead 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,7 +24,6 @@ importFrom(DT,datatable) importFrom(DelayedArray,rowsum) importFrom(GGally,eval_data_col) importFrom(GGally,ggpairs) -importFrom(GGally,wrap) importFrom(IRanges,IntegerList) importFrom(Matrix,colSums) importFrom(Matrix,rowMeans) @@ -71,6 +70,7 @@ importFrom(ggplot2,after_stat) importFrom(ggplot2,annotate) importFrom(ggplot2,coord_cartesian) importFrom(ggplot2,element_blank) +importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) diff --git a/R/plotPairs.R b/R/plotPairs.R index c80d896..579bdc5 100644 --- a/R/plotPairs.R +++ b/R/plotPairs.R @@ -41,11 +41,11 @@ #' @param addIdentityLine Logical scalar, indicating whether the identity line #' should be added (only used if \code{pointsType = "points"}). 
#' -#' @importFrom GGally eval_data_col ggpairs wrap +#' @importFrom GGally eval_data_col ggpairs #' @importFrom ggplot2 ggplot annotate theme_void ylim stat_density2d #' scale_fill_continuous geom_point theme_bw theme element_blank aes #' geom_histogram scale_x_continuous scale_y_continuous geom_abline -#' after_stat +#' after_stat element_rect #' @importFrom stats cor #' @importFrom SummarizedExperiment assayNames assay #' @importFrom grDevices hcl.colors rgb colorRamp @@ -62,10 +62,10 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, corSizeAdd = 2, pointSize = 0.1, pointAlpha = 0.3, colorByCorrelation = TRUE, corrColorRange = NULL, addIdentityLine = FALSE) { - + .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) .assertScalar(x = doLog, type = "logical") .assertScalar(x = pseudocount, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = corMethod, type = "character", @@ -88,89 +88,97 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, if (pointsType %in% c("scattermore", "scattermost")) { .assertPackagesAvailable("scattermore") } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Define shared theme elements - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ggtheme <- list( theme_bw(), - theme(panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank()) + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank()) ) - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Correlations - ## 
----------------------------------------------------------------------- ## - ## Define function to calculate and display correlations (for use with ggpairs) + ## --------------------------------------------------------------------- ## + ## Define function to calculate and display correlations + ## (for use with ggpairs) cor_fcn <- function(data, mapping, ...) { ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) - + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) + ## Calculate correlation - mainCor <- stats::cor(xData, yData, method = corMethod, - use = "pairwise.complete.obs") - transfCor <- (abs(mainCor) - min(corRange))/(max(corRange) - min(corRange)) + mainCor <- cor(xData, yData, method = corMethod, + use = "pairwise.complete.obs") + transfCor <- (abs(mainCor) - min(corRange)) / + (max(corRange) - min(corRange)) ## Protect against numerical imprecision leading to values outside ## the allowed range transfCor <- min(1.0, transfCor) transfCor <- max(0.0, transfCor) - + ## Determine the color if (colorByCorrelation) { if (mainCor >= 0) { - cols <- grDevices::hcl.colors(n = 11, palette = "RdBu")[6:2] - col <- grDevices::rgb(grDevices::colorRamp( + cols <- hcl.colors(n = 11, palette = "RdBu")[6:2] + col <- rgb(colorRamp( cols)(transfCor), maxColorValue = 255) } else { - cols <- grDevices::hcl.colors(n = 11, palette = "RdBu")[6:10] - col <- grDevices::rgb(grDevices::colorRamp( + cols <- hcl.colors(n = 11, palette = "RdBu")[6:10] + col <- rgb(colorRamp( cols)(transfCor), maxColorValue = 255) } } else { col <- "white" } - + ## Construct plot - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::annotate(x = 0.5, y = 0.5, label = round(mainCor, digits = 3), - geom = "text", - size = abs(mainCor) * corSizeMult + corSizeAdd) + - ggtheme + ggplot2::ylim(c(0, 1)) + - ggplot2::theme(panel.background = ggplot2::element_rect(fill = col)) + ggplot(data = data, mapping = 
mapping) + + annotate(x = 0.5, y = 0.5, + label = round(mainCor, digits = 3), + geom = "text", + size = abs(mainCor) * corSizeMult + corSizeAdd) + + ggtheme + ylim(c(0, 1)) + + theme( + panel.background = element_rect(fill = col) + ) } - + ## ----------------------------------------------------------------------- ## ## Scatter plots ## ----------------------------------------------------------------------- ## if (pointsType == "smoothscatter") { - ## Define function to create smoothscatter-like plot (for use with ggpairs) + ## Define function to create smoothscatter-like plot + ## (for use with ggpairs) smoothscat <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::stat_density2d( - ggplot2::aes(fill = ggplot2::after_stat(.data$density)^0.25), geom = "tile", + ggplot(data = data, mapping = mapping) + + stat_density2d( + aes(fill = after_stat( + .data$density)^0.25), geom = "tile", contour = FALSE, n = 200) + - ggplot2::scale_fill_continuous(low = "white", high = "darkgreen") + + scale_fill_continuous(low = "white", + high = "darkgreen") + ggtheme + - ggplot2::scale_x_continuous(expand = c(0, 0)) + - ggplot2::scale_y_continuous(expand = c(0, 0)) + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) } } else if (pointsType == "points") { ## Define function to create scatter plot (for use with ggpairs) if (addIdentityLine) { plotpoints <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + - ggplot2::geom_point(alpha = pointAlpha, size = pointSize) + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + + geom_point(alpha = pointAlpha, size = pointSize) + ggtheme } } else { plotpoints <- function(data, mapping, ...) 
{ - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_point(alpha = pointAlpha, size = pointSize) + + ggplot(data = data, mapping = mapping) + + geom_point(alpha = pointAlpha, size = pointSize) + ggtheme } } @@ -178,16 +186,16 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, ## Define function to create scattermore plot (for use with ggpairs) if (addIdentityLine) { plotscattermore <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + scattermore::geom_scattermore(alpha = pointAlpha, pointsize = pointSize) + ggtheme } } else { plotscattermore <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + + ggplot(data = data, mapping = mapping) + scattermore::geom_scattermore(alpha = pointAlpha, pointsize = pointSize) + ggtheme @@ -198,12 +206,12 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, if (addIdentityLine) { plotscattermost <- function(data, mapping, ...) { ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + scattermore::geom_scattermost(xy = cbind(xData, yData), pointsize = pointSize) + ggtheme @@ -211,10 +219,10 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } else { plotscattermost <- function(data, mapping, ...) 
{ ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) - ggplot2::ggplot(data = data, mapping = mapping) + + ggplot(data = data, mapping = mapping) + scattermore::geom_scattermost(xy = cbind(xData, yData), pointsize = pointSize) + ggtheme @@ -232,28 +240,32 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } else if (pointsType == "scattermost") { lower <- list(continuous = plotscattermost) } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Histogram - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## diaghist <- function(data, mapping, ...) { - gg <- ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_histogram(fill = "#F5C710", color = "grey50", bins = histBreaks) + + gg <- ggplot(data = data, mapping = mapping) + + geom_histogram(fill = "#F5C710", color = "grey50", + bins = histBreaks) + ggtheme if (pointsType == "smoothscatter") { - gg <- gg + ggplot2::scale_x_continuous(expand = c(0, 0)) + gg <- gg + scale_x_continuous(expand = c(0, 0)) } gg } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Combined plot - ## ----------------------------------------------------------------------- ## - ## Prepare the data and plot title, depending on whether to log-transform or not + ## --------------------------------------------------------------------- ## + ## Prepare the data and plot title, depending on whether to log-transform + ## or not if (doLog) { - mat <- log10(as.matrix(SummarizedExperiment::assay(se, selAssay)) + pseudocount) + mat <- 
log10(as.matrix(assay(se, selAssay)) + + pseudocount) title <- paste0("log10(", selAssay, - ifelse(pseudocount > 0, paste0(" + ", pseudocount), ""), + ifelse(pseudocount > 0, + paste0(" + ", pseudocount), ""), ")") } else { mat <- as.matrix(assay(se, selAssay)) @@ -262,21 +274,23 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, mat <- as.data.frame(mat) title <- paste0(title, ", ", toupper(substr(corMethod, 1, 1)), substr(corMethod, 2, nchar(corMethod)), " correlation") - + ## Calculate correlations and get range - allCors <- stats::cor(mat, method = corMethod, use = "pairwise.complete.obs") + allCors <- cor(mat, method = corMethod, + use = "pairwise.complete.obs") if (!is.null(corrColorRange)) { corRange <- corrColorRange } else { corRange <- range(abs(allCors[upper.tri(allCors)])) } if (length(unique(corRange)) == 1) { - ## Having a range with width 0 leads to problems in the rescaling of the correlations + ## Having a range with width 0 leads to problems in the rescaling of + ## the correlations corRange <- c(0, 1) } - + ## Construct the pairs plot - GGally::ggpairs( + ggpairs( data = mat, mapping = NULL, columns = seq_len(ncol(mat)), From e71468f2652d770075011c542c1c8953b38d35ab Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:23:54 +0200 Subject: [PATCH 23/44] Swap %>% for |> --- NAMESPACE | 1 - R/collapseMutantsByAA.R | 8 ++--- R/linkMultipleVariants.R | 26 +++++++------- R/plotDistributions.R | 14 ++++---- R/plotFiltering.R | 36 +++++++++---------- .../scripts/generateValuesForUnitTestsTrans.R | 2 +- tests/testthat/test_linkMultipleVariants.R | 32 ++++++++--------- 7 files changed, 59 insertions(+), 60 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 597dead..2a67cd4 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,7 +39,6 @@ importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) importFrom(csaw,normOffsets) -importFrom(dplyr,"%>%") 
importFrom(dplyr,across) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) diff --git a/R/collapseMutantsByAA.R b/R/collapseMutantsByAA.R index 78120bd..e9da27b 100644 --- a/R/collapseMutantsByAA.R +++ b/R/collapseMutantsByAA.R @@ -184,7 +184,7 @@ collapseMutants <- function(se, nameCol) { ## Collapse rowData - simple columns rd <- mergeValues(rowData(se)[[nameCol]], - rowData(se)$sequence) %>% + rowData(se)$sequence) |> setNames(c(nameCol, "sequence")) for (v in setdiff( intersect(c("mutantName", "mutantNameBase", "mutantNameBaseHGVS", @@ -197,8 +197,8 @@ collapseMutants <- function(se, nameCol) { rowData(se)[[v]]) rd[[v]] <- tmp$valueColl[match(rd[[nameCol]], tmp$mutantNameColl)] } - rd0 <- as.data.frame(rowData(se)) %>% - group_by(.data[[nameCol]]) %>% + rd0 <- as.data.frame(rowData(se)) |> + group_by(.data[[nameCol]]) |> summarize( across(c("minNbrMutBases", "minNbrMutCodons", "minNbrMutAAs"), @@ -206,7 +206,7 @@ collapseMutants <- function(se, nameCol) { across(c("maxNbrMutBases", "maxNbrMutCodons", "maxNbrMutAAs"), function(x) max(x))) - rd <- rd %>% + rd <- rd |> full_join(rd0, by = nameCol) rd <- DataFrame(rd) diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index 06aee7e..cb5e3a9 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -160,28 +160,28 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ outCombined <- do.call(digestFastqs, combinedDigestParams) ## Get count matrix with "raw" (uncorrected) sequences - countCombined <- outCombined$summaryTable %>% - dplyr::select("sequence", "nbrReads", "varLengths") %>% + countCombined <- outCombined$summaryTable |> + dplyr::select("sequence", "nbrReads", "varLengths") |> mutate(idx = paste0("I", seq_along(.data$sequence))) ## If applicable, separate into forward and reverse sequences if (any(grepl("_", countCombined$sequence))) { - countCombined <- countCombined %>% + countCombined <- countCombined |> separate(.data$sequence, into = c("sequenceForward", "sequenceReverse"), - sep = "_") %>% + sep = "_") |> separate(.data$varLengths, into = c("varLengthsForward", "varLengthsReverse"), - sep = "_") %>% + sep = "_") |> mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0), nCompReverse = vapply(strsplit(.data$varLengthsReverse, ","), length, 0)) } else { - countCombined <- countCombined %>% + countCombined <- countCombined |> rename(sequenceForward = "sequence", - varLengthsForward = "varLengths") %>% + varLengthsForward = "varLengths") |> mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0)) @@ -197,7 +197,7 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { tmp <- split(countCombined, countCombined$varLengthsForward) countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsForward[1], ",")[[1]]) - df <- df %>% + df <- df |> separate( .data$sequenceForward, into = names(paramsSeparate)[seq_along(w)], @@ -212,7 +212,7 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ tmp <- split(countCombined, countCombined$varLengthsReverse) countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsReverse[1], ",")[[1]]) - df <- df %>% + df <- df |> separate( .data$sequenceReverse, into = names(paramsSeparate)[offsetForward + seq_along(w)], @@ -245,8 +245,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { ## Conversion tables convSeparate <- lapply(outSeparate, function(out) { - out$summaryTable %>% - dplyr::select("mutantName", "sequence") %>% + out$summaryTable |> + dplyr::select("mutantName", "sequence") |> separate_rows("sequence", sep = ",") }) @@ -264,8 +264,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { drop = FALSE] ## Aggregate counts - countAggregated <- countCombined %>% - group_by(across(names(paramsSeparate))) %>% + countAggregated <- countCombined |> + group_by(across(names(paramsSeparate))) |> summarize(nbrReads = sum(.data$nbrReads), .groups = "drop") list(countAggregated = countAggregated, convSeparate = convSeparate, diff --git a/R/plotDistributions.R b/R/plotDistributions.R index 7a66ceb..3690fe5 100644 --- a/R/plotDistributions.R +++ b/R/plotDistributions.R @@ -57,14 +57,14 @@ plotDistributions <- function(se, selAssay = "counts", df <- as.data.frame(as.matrix( assay(se, selAssay, withDimnames = TRUE) - )) %>% - rownames_to_column("feature") %>% - gather(key = "Name", value = "value", -"feature") %>% - group_by(.data$Name) %>% - arrange(desc(.data$value)) %>% + )) |> + rownames_to_column("feature") |> + gather(key = "Name", value = "value", -"feature") |> + group_by(.data$Name) |> + arrange(desc(.data$value)) |> mutate(idx = seq_along(.data$value), - value = .data$value + pseudocount) %>% - ungroup() %>% + value = .data$value + pseudocount) |> + ungroup() |> left_join(as.data.frame(colData(se)), by = "Name") diff --git a/R/plotFiltering.R b/R/plotFiltering.R index fbf8e24..2eebfe6 100644 --- a/R/plotFiltering.R +++ b/R/plotFiltering.R 
@@ -32,7 +32,7 @@ #' should be facetted. Either \code{"sample"} or \code{"step"}. #' #' @importFrom tidyselect matches -#' @importFrom dplyr select %>% mutate group_by summarize pull ungroup filter +#' @importFrom dplyr select mutate group_by summarize pull ungroup filter #' @importFrom tibble rownames_to_column #' @importFrom tidyr gather #' @importFrom ggplot2 ggplot aes geom_bar facet_wrap theme theme_bw labs @@ -61,8 +61,8 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, ## --------------------------------------------------------------------- ## ## Extract relevant columns from colData(se) ## --------------------------------------------------------------------- ## - cd <- as.data.frame(colData(se)) %>% - dplyr::select("nbrTotal":"nbrRetained") %>% + cd <- as.data.frame(colData(se)) |> + dplyr::select("nbrTotal":"nbrRetained") |> dplyr::select(c("nbrTotal", matches("^f.*_"), "nbrRetained")) @@ -84,43 +84,43 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, ## --------------------------------------------------------------------- ## ## Reshape into long format ## --------------------------------------------------------------------- ## - cd <- cd %>% - rownames_to_column("sample") %>% - gather(key = "step", value = "nbrReads", -"sample") %>% + cd <- cd |> + rownames_to_column("sample") |> + gather(key = "step", value = "nbrReads", -"sample") |> mutate(step = factor(.data$step, levels = colnames(cd))) ## Check that numbers add up (total - all filters = retained) stopifnot(all( - cd %>% - group_by(sample) %>% + cd |> + group_by(sample) |> summarize( remdiff = .data$nbrReads[.data$step == "nbrTotal"] - sum(.data$nbrReads[grepl("^f", .data$step)]), - remlist = .data$nbrReads[.data$step == "nbrRetained"]) %>% - mutate(obsdiff = .data$remdiff - .data$remlist) %>% + remlist = .data$nbrReads[.data$step == "nbrRetained"]) |> + mutate(obsdiff = .data$remdiff - .data$remlist) |> pull(.data$obsdiff) == 0)) ## 
--------------------------------------------------------------------- ## ## Calculate number of remaining reads at each step ## --------------------------------------------------------------------- ## - cd <- cd %>% - group_by(sample) %>% + cd <- cd |> + group_by(sample) |> mutate(fracReads = signif( .data$nbrReads / .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% + digits = 3)) |> mutate(cumulsum = vapply(seq_along(.data$step), function(i) { sum(.data$nbrReads[as.numeric(.data$step) <= as.numeric(.data$step[i]) & .data$step != "nbrTotal"]) - }, NA_real_)) %>% + }, NA_real_)) |> mutate(nbrRemaining = .data$nbrReads[.data$step == "nbrTotal"] - - .data$cumulsum) %>% + .data$cumulsum) |> mutate(fracRemaining = signif( .data$nbrRemaining / .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% - filter(.data$step != "nbrRetained") %>% + digits = 3)) |> + filter(.data$step != "nbrRetained") |> ungroup() ## --------------------------------------------------------------------- ## @@ -145,7 +145,7 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, filtered = " filtered out in" ) if (plotType == "filtered") { - cd <- cd %>% + cd <- cd |> filter(.data$step != "nbrTotal") } gg <- ggplot(cd, aes( diff --git a/inst/scripts/generateValuesForUnitTestsTrans.R b/inst/scripts/generateValuesForUnitTestsTrans.R index e860b53..47043be 100644 --- a/inst/scripts/generateValuesForUnitTestsTrans.R +++ b/inst/scripts/generateValuesForUnitTestsTrans.R @@ -479,7 +479,7 @@ processReadsTrans <- function(Ldef) { } else { fqseq <- as.character(varForward) } - tbl <- as.data.frame(table(fqseq)) %>% dplyr::arrange(desc(Freq), fqseq) %>% + tbl <- as.data.frame(table(fqseq)) |> dplyr::arrange(desc(Freq), fqseq) |> dplyr::mutate(fqseq = as.character(fqseq)) tbl$umis <- "" for (i in seq_len(nrow(tbl))) { diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R index f075f29..f797755 100644 --- 
a/tests/testthat/test_linkMultipleVariants.R +++ b/tests/testthat/test_linkMultipleVariants.R @@ -93,12 +93,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -196,12 +196,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("C.*mut|N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -302,14 +302,14 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% + correct <- truth$truth[keep, ] |> dplyr::mutate(trueV2 = gsub("\\..*", "", trueV2), - trueV3 = gsub("\\..*", "", trueV3)) %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + trueV3 = gsub("\\..*", "", trueV3)) |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) 
+ obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -384,12 +384,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -482,12 +482,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("C.*mut|N|del|V2|V3", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(obsBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(obsBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), obsBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$obsBarcode, obs$barcode) From 182bebca18950dd66bfa756d4acb5131a9111fb2 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:34:51 +0200 Subject: [PATCH 24/44] Import rowData<- --- NAMESPACE | 1 + R/collapseMutantsByAA.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 2a67cd4..2a001db 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,6 +32,7 @@ importFrom(Rcpp,sourceCpp) importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,metadata) importFrom(S4Vectors,unstrsplit) 
+importFrom(SummarizedExperiment,"rowData<-") importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,assayNames) diff --git a/R/collapseMutantsByAA.R b/R/collapseMutantsByAA.R index e9da27b..86166c3 100644 --- a/R/collapseMutantsByAA.R +++ b/R/collapseMutantsByAA.R @@ -157,6 +157,7 @@ collapseMutantsByAA <- function(se) { #' @importFrom DelayedArray rowsum #' @importFrom S4Vectors metadata DataFrame #' @importFrom SummarizedExperiment assays rowData SummarizedExperiment colData +#' rowData<- #' @importFrom dplyr across group_by summarize full_join #' @importFrom stats setNames #' From f7950c2227970baf6e091e5bf71593e6f8b357b8 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 2 Jun 2025 20:35:04 +0200 Subject: [PATCH 25/44] Bump version, update NEWS --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1d88953..936fb4b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mutscan Title: Preprocessing and Analysis of Deep Mutational Scanning Data -Version: 0.3.4 +Version: 0.99.0 Authors@R: c(person(given = "Charlotte", family = "Soneson", diff --git a/NEWS.md b/NEWS.md index 6dfec25..eaf7062 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# mutscan 0.99.0 + +* Prepare for Bioconductor submission + # mutscan 0.3.4 * Allow use of scattermore/scattermost in plotPairs From 1560e1e342a41af5910188198afefd4abdd6e72e Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Fri, 18 Jul 2025 14:26:14 +0200 Subject: [PATCH 26/44] Use expect_true(is_ggplot(.)) instead of expect_s3_class(., "ggplot") --- tests/testthat/test_plotDistributions.R | 147 ++++++++++++++---------- tests/testthat/test_plotFiltering.R | 99 +++++++++------- tests/testthat/test_plotResults.R | 50 ++++---- tests/testthat/test_plotTotals.R | 8 +- 4 files changed, 170 insertions(+), 134 deletions(-) diff --git 
a/tests/testthat/test_plotDistributions.R b/tests/testthat/test_plotDistributions.R index db5d386..bcb85d6 100644 --- a/tests/testthat/test_plotDistributions.R +++ b/tests/testthat/test_plotDistributions.R @@ -32,78 +32,99 @@ test_that("plotDistributions fails with incorrect arguments", { test_that("plotDistributions works as expected", { ## Defaults - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "density", + facet = FALSE, pseudocount = 0))) ## Change plot type - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "histogram", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "knee", + facet = FALSE, pseudocount = 0))) ## groupBy Name - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "density", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = 
se, selAssay = "counts", + groupBy = "Name", plotType = "histogram", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "knee", + facet = FALSE, pseudocount = 0))) ## groupBy Condition - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = FALSE, pseudocount = 0))) ## Facet - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "histogram", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "histogram", - facet = TRUE, 
pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "knee", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "knee", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = 
se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = TRUE, pseudocount = 0))) ## Increase pseudocount - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = FALSE, pseudocount = 2), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = FALSE, pseudocount = 3), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = FALSE, pseudocount = 4), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = FALSE, pseudocount = 2))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = FALSE, pseudocount = 3))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = FALSE, pseudocount = 4))) }) \ No newline at end of file diff --git a/tests/testthat/test_plotFiltering.R b/tests/testthat/test_plotFiltering.R index 4c4b676..d542ff4 100644 --- a/tests/testthat/test_plotFiltering.R +++ b/tests/testthat/test_plotFiltering.R @@ -36,69 +36,80 @@ test_that("plotFiltering fails with incorrect arguments", { test_that("plotFiltering works with correct arguments", { ## Defaults - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Fractions - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = 
TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Only active filters - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = TRUE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = TRUE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Don't display numbers - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = FALSE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = FALSE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Change size of displayed numbers - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = FALSE, numberSize = 2, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = FALSE, numberSize = 2, + plotType = "remaining", facetBy = "sample"))) ## Reads + Filtered + Sample - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "sample"))) ## Reads + Filtered + Step - 
expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "step"))) ## Reads + Remaining + Step - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "step"))) ## Fractions + Filtered + Sample - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "sample"))) ## Fractions + Filtered + Step - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "step"))) ## Fractions + Remaining + Step - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, 
numberSize = 4, + plotType = "remaining", facetBy = "step"))) }) diff --git a/tests/testthat/test_plotResults.R b/tests/testthat/test_plotResults.R index 1eb32fc..f7f9f7f 100644 --- a/tests/testthat/test_plotResults.R +++ b/tests/testthat/test_plotResults.R @@ -113,29 +113,33 @@ test_that("plotResults functions work as expected", { pseudocount = 1, method = "limma", normMethod = "sum") - expect_s3_class(plotMeanDiff(res1), "ggplot") - expect_s3_class(plotMeanDiff(res2), "ggplot") - expect_s3_class(plotVolcano(res1), "ggplot") - expect_s3_class(plotVolcano(res2), "ggplot") - - expect_s3_class(plotMeanDiff(res1, meanCol = "logCPM", logFCCol = "logFC", - padjCol = "FDR", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotMeanDiff(res2, meanCol = "AveExpr", logFCCol = "logFC", - padjCol = "adj.P.Val", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotVolcano(res1, logFCCol = "logFC", pvalCol = "PValue", - padjCol = "FDR", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotVolcano(res2, logFCCol = "logFC", pvalCol = "P.Value", - padjCol = "adj.P.Val", padjThreshold = 0.1), "ggplot") - - expect_s3_class(plotMeanDiff(res1, nTopToLabel = 5), "ggplot") - expect_s3_class(plotMeanDiff(res2, nTopToLabel = 5), "ggplot") - expect_s3_class(plotVolcano(res1, nTopToLabel = 5), "ggplot") - expect_s3_class(plotVolcano(res2, nTopToLabel = 5), "ggplot") - - expect_s3_class(plotMeanDiff(res1, pointSize = "large"), "ggplot") - expect_s3_class(plotMeanDiff(res2, pointSize = "large"), "ggplot") - expect_s3_class(plotVolcano(res1, pointSize = "large"), "ggplot") - expect_s3_class(plotVolcano(res2, pointSize = "large"), "ggplot") + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2))) + + expect_true(ggplot2::is_ggplot( + plotMeanDiff(res1, meanCol = "logCPM", logFCCol = "logFC", + padjCol = "FDR", padjThreshold = 0.1))) + 
expect_true(ggplot2::is_ggplot( + plotMeanDiff(res2, meanCol = "AveExpr", logFCCol = "logFC", + padjCol = "adj.P.Val", padjThreshold = 0.1))) + expect_true(ggplot2::is_ggplot( + plotVolcano(res1, logFCCol = "logFC", pvalCol = "PValue", + padjCol = "FDR", padjThreshold = 0.1))) + expect_true(ggplot2::is_ggplot( + plotVolcano(res2, logFCCol = "logFC", pvalCol = "P.Value", + padjCol = "adj.P.Val", padjThreshold = 0.1))) + + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2, nTopToLabel = 5))) + + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2, pointSize = "large"))) ## These tests won't run if the X11 display connection can not be opened # skip_if_not_installed("plotly") diff --git a/tests/testthat/test_plotTotals.R b/tests/testthat/test_plotTotals.R index 19dd8d6..ff3b89b 100644 --- a/tests/testthat/test_plotTotals.R +++ b/tests/testthat/test_plotTotals.R @@ -19,11 +19,11 @@ test_that("plotTotals fails with incorrect arguments", { test_that("plotTotals works as expected", { ## Defaults - expect_s3_class(plotTotals(se = se, selAssay = "counts", - groupBy = NULL), "ggplot") + expect_true(ggplot2::is_ggplot(plotTotals(se = se, selAssay = "counts", + groupBy = NULL))) ## Group by column - expect_s3_class(plotTotals(se = se, selAssay = "counts", - groupBy = "categ"), "ggplot") + expect_true(ggplot2::is_ggplot(plotTotals(se = se, selAssay = "counts", + groupBy = "categ"))) }) \ No newline at end of file From 2d72661d1d876da544a155a83e7c5101e414b0f1 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sat, 26 Jul 2025 16:47:32 +0200 Subject: 
[PATCH 27/44] Add chunk labels to vignette --- vignettes/mutscan.Rmd | 56 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/vignettes/mutscan.Rmd b/vignettes/mutscan.Rmd index 0d1dec7..526fc8b 100644 --- a/vignettes/mutscan.Rmd +++ b/vignettes/mutscan.Rmd @@ -14,7 +14,7 @@ editor_options: chunk_output_type: console --- -```{r, include = FALSE} +```{r chunk-opts, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -48,7 +48,7 @@ knitr::include_graphics("mutscan-overview.001.png") The `mutscan` package contains several small example FASTQ files representing different types of experiments: -```{r readTrans} +```{r list-files} datadir <- system.file("extdata", package = "mutscan") dir(datadir) ``` @@ -171,7 +171,7 @@ sequences of _FOS_ and _JUN_ (forward and reverse reads, respectively). This will additionally name the final mutants according to the positions and sequences of the mutated codons. -```{r} +```{r trans-digest} transInput <- digestFastqs( fastqForward = file.path(datadir, "transInput_1.fastq.gz"), fastqReverse = file.path(datadir, "transInput_2.fastq.gz"), @@ -208,7 +208,7 @@ The `digestFastqs()` function returns a list with four elements. The `parameters` list records all parameter values used during the processing, as well as the `mutscan` version and time of processing. -```{r} +```{r trans-pars} transInput$parameters ``` @@ -217,7 +217,7 @@ filtered out in each step. Note that for any given filtering step, only the reads retained by the previous filters are considered. The numbers in the filter column names indicate the order of the filters. -```{r} +```{r trans-filter} transInput$filterSummary ``` @@ -235,7 +235,7 @@ observed sequence for the mutated codon or nucleotide. A sequence without mutations is named `WTS.0.WT`, where, again, `WTS` is the name of the matching wild type sequence. 
-```{r} +```{r trans-summary} head(transInput$summaryTable) ``` @@ -243,13 +243,13 @@ Finally, the `errorStatistics` data.frame lists the number of matching and mismatching bases in the constant sequences, stratified by the phred quality score (from 0 to 99). -```{r} +```{r trans-error} transInput$errorStatistics[rowSums(transInput$errorStatistics[, -1]) != 0, ] ``` The `errorStatistics` output can be used to estimate the sequencing error rate: -```{r errStats} +```{r trans-error-stats} (propErrorsConstantF <- sum(transInput$errorStatistics$nbrMismatchForward) / (nchar(transInput$parameters$constantForward) * transInput$filterSummary$nbrRetained)) @@ -273,7 +273,7 @@ forward and reverse variable sequences are merged into one variable sequence, only one wild type sequence is provided (the `wildTypeReverse` argument will be ignored if specified). -```{r} +```{r cis-digest} cisInput <- digestFastqs( fastqForward = file.path(datadir, "cisInput_1.fastq.gz"), fastqReverse = file.path(datadir, "cisInput_2.fastq.gz"), @@ -312,7 +312,7 @@ cisInput <- digestFastqs( ) ``` -```{r} +```{r cis-out} cisInput$parameters cisInput$filterSummary cisInput$errorStatistics[rowSums(cisInput$errorStatistics[, -1]) != 0, ] @@ -322,7 +322,7 @@ The `summaryTable` now provides the number of reads and unique UMIs observed for each variable sequence, and all values in the `mutNames` column will start with `FOS`. -```{r} +```{r cis-summary} head(cisInput$summaryTable) ``` @@ -335,7 +335,7 @@ sequences for the forward read. If multiple wild type sequences are provided like here, `mutscan` will match each read against all of them and find the most similar one for each read. 
-```{r} +```{r def-primers} leu <- c( ATF2 = "GATCCTGATGAAAAAAGGAGAAAGTTTTTAGAGCGAAATAGAGCAGCAGCTTCAAGATGCCGACAAAAAAGGAAAGTCTGGGTTCAGTCTTTAGAGAAGAAAGCTGAAGACTTGAGTTCATTAAATGGTCAGCTGCAGAGTGAAGTCACCCTGCTGAGAAATGAAGTGGCACAGCTGAAACAGCTTCTTCTGGCT", ATF7 = "GATCCAGATGAGCGACGGCAGCGCTTTCTGGAGCGCAACCGGGCTGCAGCCTCCCGCTGCCGCCAAAAGCGAAAGCTGTGGGTGTCCTCCCTAGAGAAGAAGGCCGAAGAACTCACTTCTCAGAACATTCAGCTGAGTAATGAAGTCACATTACTACGCAATGAGGTGGCCCAGTTGAAACAGCTACTGTTAGCT", @@ -393,7 +393,7 @@ specify any UMI/constant sequence lengths. For the forward read, the variable region is taken to be the remainder of the read (after the primer), whereas for the reverse read, we specify the variable sequence length to 96. -```{r} +```{r primers-digest} leujunt0 <- digestFastqs( fastqForward = file.path(datadir, "leujunt0_1.fastq.gz"), fastqReverse = file.path(datadir, "leujunt0_2.fastq.gz"), @@ -428,12 +428,12 @@ leujunt0 <- digestFastqs( ) ``` -```{r} +```{r primers-out} leujunt0$parameters leujunt0$filterSummary ``` -```{r} +```{r primers-summary} head(leujunt0$summaryTable) ``` @@ -449,7 +449,7 @@ process also the output sample for the TRANS experiment for which we processed the input sample above, and feed both outputs to `summarizeExperiment()`. -```{r combine} +```{r combine-digest} transOutput <- digestFastqs( fastqForward = file.path(datadir, "transOutput_1.fastq.gz"), fastqReverse = file.path(datadir, "transOutput_2.fastq.gz"), @@ -513,7 +513,7 @@ by `summarizeExperiment()` contains one row for each observed variant the corresponding amino acid, and aggregating the counts corresponding to the same mutated amino acid (combination). -```{r} +```{r collapse-aa} se_collapsed <- collapseMutantsByAA(se) head(assay(se_collapsed, "counts")) Matrix::colSums(assay(se_collapsed, "counts")) @@ -528,7 +528,7 @@ full CIS data set from [@Diss2018], which has been processed using `digestFastqs()` as illustrated above, and summarized in a `SummarizedExperiment` object provided with the package. 
-```{r} +```{r read-se} se <- readRDS(file.path(datadir, "GSE102901_cis_se.rds")) ``` @@ -536,12 +536,12 @@ First, we can plot a summary of the filtering process, indicating the number of reads that were filtered out by (or retained after) each step of the `mutscan` filtering. -```{r, fig.height=10, fig.width=8} +```{r plot-filter-reads, fig.height=10, fig.width=8} plotFiltering(se, valueType = "reads", onlyActiveFilters = TRUE, plotType = "remaining", facetBy = "sample", numberSize = 3) ``` -```{r, fig.height=10, fig.width=8} +```{r plot-filter-frac, fig.height=10, fig.width=8} plotFiltering(se, valueType = "fractions", onlyActiveFilters = TRUE, plotType = "filtered", facetBy = "step", numberSize = 3) ``` @@ -549,14 +549,14 @@ plotFiltering(se, valueType = "fractions", onlyActiveFilters = TRUE, We can also generate a pairs plot displaying the correlation among the samples in the data set. -```{r, fig.height=7} +```{r plot-pairs, fig.height=7} plotPairs(se, selAssay = "counts") ``` Additional plot functions can be used to visualize the total count per sample, across all variants, or the distribution of variant counts per sample. -```{r} +```{r plot-more} plotTotals(se, selAssay = "counts") plotDistributions(se, selAssay = "counts", plotType = "density", pseudocount = 1) @@ -564,7 +564,7 @@ plotDistributions(se, selAssay = "counts", plotType = "density", Finally, we can create a full QC report as follows: -```{r, eval=FALSE} +```{r make-qc-report, eval=FALSE} generateQCReport(se, outFile = tempfile(fileext = ".html")) ``` @@ -585,7 +585,7 @@ Here, we illustrate the application of `calculateFitnessScore()` on the SummarizedExperiment object containing all the three CIS replicates from [@Diss2018]. -```{r} +```{r calc-ppi} se_collapsed <- collapseMutantsByAA(se) ppis <- calculateFitnessScore( se = se_collapsed, pairingCol = "Replicate", @@ -614,7 +614,7 @@ information about how to set up and interpret design matrices in `edgeR` or `limma`, see e.g. 
[Law et al (2020)](https://f1000research.com/articles/9-1444), or [Soneson et al (2020)](https://f1000research.com/articles/9-512). -```{r} +```{r def-mm} model.matrix(~ Replicate + Condition, data = colData(se_collapsed)) ``` @@ -622,7 +622,7 @@ model.matrix(~ Replicate + Condition, Next, we apply either `edgeR` or `limma` to extract the logFCs of the mutants, compared to the wildtype sequence. -```{r} +```{r calc-rel-fc} ## edgeR edger_scores <- calculateRelativeFC( se = se_collapsed, @@ -650,7 +650,7 @@ limma_scores["f.0.WT", , drop = FALSE] testing - in particular, MA (mean-difference) plots and volcano plots can be easily generated. -```{r} +```{r plot-rel-fc} plotMeanDiff(edger_scores, pointSize = "large") plotVolcano(edger_scores, pointSize = "large") ``` @@ -668,7 +668,7 @@ the forward and reverse FASTQ files in the same order. This vignette was compiled on the following system: -```{r} +```{r session-info} sessionInfo() ``` From 3b1928b08f75fbfc7559fc1b428dc7bfceff207c Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Sat, 26 Jul 2025 16:51:13 +0200 Subject: [PATCH 28/44] Fix a couple of long lines --- R/digestFastqs.R | 11 +++++++---- R/plotPairs.R | 4 ++-- vignettes/mutscan.Rmd | 3 ++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/R/digestFastqs.R b/R/digestFastqs.R index f3220e5..be5acb1 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -382,7 +382,7 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, maxNReads = -1, verbose = FALSE, nThreads = 1, chunkSize = 100000, maxReadLength = 1024) { - ## pre-flight checks --------------------------------------------------------- + ## pre-flight checks ------------------------------------------------------ ## deprecated arguments deprecMessageColl <- paste0( "Starting from mutscan v0.3.0, collapsing of variable sequences is ", @@ -704,7 +704,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", 
toupper(forbiddenMutatedCodonsReverse)) | forbiddenMutatedCodonsReverse == "")) { - stop("All elements of 'forbiddenMutatedCodonsForward' and 'forbiddenMutatedCodonsReverse' must be ", + stop("All elements of 'forbiddenMutatedCodonsForward' and ", + "'forbiddenMutatedCodonsReverse' must be ", "character strings consisting of three valid IUPAC letters.") } else { forbiddenMutatedCodonsForward <- toupper(forbiddenMutatedCodonsForward) @@ -754,8 +755,10 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, normalizePath(filteredReadsFastqReverse, mustWork = FALSE) if ((any(fastqReverse == "") && filteredReadsFastqReverse != "") || - (all(fastqReverse != "") && filteredReadsFastqForward != "" && filteredReadsFastqReverse == "") || - (all(fastqForward != "") && filteredReadsFastqForward == "" && filteredReadsFastqReverse != "")) { + (all(fastqReverse != "") && filteredReadsFastqForward != "" && + filteredReadsFastqReverse == "") || + (all(fastqForward != "") && filteredReadsFastqForward == "" && + filteredReadsFastqReverse != "")) { stop("The pairing of the output FASTQ files must be compatible ", "with that of the input files.") } diff --git a/R/plotPairs.R b/R/plotPairs.R index 579bdc5..167d707 100644 --- a/R/plotPairs.R +++ b/R/plotPairs.R @@ -147,9 +147,9 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, ) } - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Scatter plots - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## if (pointsType == "smoothscatter") { ## Define function to create smoothscatter-like plot ## (for use with ggpairs) diff --git a/vignettes/mutscan.Rmd b/vignettes/mutscan.Rmd index 526fc8b..a988eeb 100644 --- a/vignettes/mutscan.Rmd +++ b/vignettes/mutscan.Rmd @@ -611,7 +611,8 @@ that for this, 
at least two replicates are required. We start by looking at the design matrix, in order to determine which of the coefficients to specify for the testing in `calculateRelativeFC()`. For more information about how to set up and interpret design matrices in `edgeR` or -`limma`, see e.g. [Law et al (2020)](https://f1000research.com/articles/9-1444), +`limma`, see e.g. +[Law et al (2020)](https://f1000research.com/articles/9-1444), or [Soneson et al (2020)](https://f1000research.com/articles/9-512). ```{r def-mm} From 2975358d296b95e425c2703bb2b65e279236eb87 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 31 Jul 2025 19:30:35 +0200 Subject: [PATCH 29/44] Remove deprecated arguments to digestFastqs --- DESCRIPTION | 1 - NAMESPACE | 3 -- NEWS.md | 1 + R/digestFastqs.R | 48 ------------------- man/digestFastqs.Rd | 7 --- src/digestFastqs.cpp | 2 +- .../test_digestFastqs_trans_collapse.R | 40 ++-------------- 7 files changed, 5 insertions(+), 97 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 936fb4b..3d8989c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,6 @@ Imports: IRanges, utils, DelayedArray, - lifecycle, tools Suggests: testthat (>= 3.0.0), diff --git a/NAMESPACE b/NAMESPACE index 2a001db..215b433 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,9 +98,6 @@ importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRamp) importFrom(grDevices,hcl.colors) importFrom(grDevices,rgb) -importFrom(lifecycle,deprecate_warn) -importFrom(lifecycle,deprecated) -importFrom(lifecycle,is_present) importFrom(limma,contrasts.fit) importFrom(limma,eBayes) importFrom(limma,lmFit) diff --git a/NEWS.md b/NEWS.md index eaf7062..366297d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ # mutscan 0.99.0 * Prepare for Bioconductor submission +* Remove deprecated arguments variableCollapseMaxDist, variableCollapseMinReads and variableCollapseMinRatio from digestFastqs (deprecated in mutscan 0.3.0) # mutscan 0.3.4 diff --git a/R/digestFastqs.R 
b/R/digestFastqs.R index be5acb1..9790779 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -159,10 +159,6 @@ #' constant sequence. If multiple constant sequences are provided, the most #' similar one is used. Reads with a larger distance to the expected #' constant sequence are discarded. If set to -1, no filtering is done. -#' @param variableCollapseMaxDist,variableCollapseMinReads,variableCollapseMinRatio -#' Deprecated. Collapsing of variable sequences is no longer performed in -#' \code{digestFastqs}. Please use \code{collapseMutantsBySimilarity} -#' instead. #' @param umiCollapseMaxDist Numeric scalar defining #' the tolerances for collapsing similar UMI sequences. If the value is #' in [0, 1), it defines the maximal Hamming distance in terms of a @@ -340,7 +336,6 @@ #' out$errorStatistics #' #' @export -#' @importFrom lifecycle deprecated is_present deprecate_warn #' @importFrom utils packageVersion digestFastqs <- function(fastqForward, fastqReverse = NULL, mergeForwardReverse = FALSE, @@ -373,9 +368,6 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, mutNameDelimiter = ".", constantMaxDistForward = -1, constantMaxDistReverse = -1, - variableCollapseMaxDist = deprecated(), - variableCollapseMinReads = deprecated(), - variableCollapseMinRatio = deprecated(), umiCollapseMaxDist = 0.0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", @@ -383,46 +375,6 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, nThreads = 1, chunkSize = 100000, maxReadLength = 1024) { ## pre-flight checks ------------------------------------------------------ - ## deprecated arguments - deprecMessageColl <- paste0( - "Starting from mutscan v0.3.0, collapsing of variable sequences is ", - "no longer supported by digestFastqs(), and arguments ", - "variableCollapseMaxDist, variableCollapseMinReads ", - "and variableCollapseMinRatio will be ignored. 
Please run ", - "summarizeExperiment() to generate a SummarizedExperiment object, ", - "and then call collapseMutantsBySimilarity() to collapse variable ", - "sequences in a consistent way across all samples." - ) - if (lifecycle::is_present(variableCollapseMaxDist)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMaxDist = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - if (lifecycle::is_present(variableCollapseMinReads)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMinReads = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - if (lifecycle::is_present(variableCollapseMinRatio)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMinRatio = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - - - ## fastq files exist if (length(fastqForward) < 1 || !all(file.exists(fastqForward)) || (!is.null(fastqReverse) && diff --git a/man/digestFastqs.Rd b/man/digestFastqs.Rd index 216133c..9bb3bfd 100644 --- a/man/digestFastqs.Rd +++ b/man/digestFastqs.Rd @@ -47,9 +47,6 @@ digestFastqs( mutNameDelimiter = ".", constantMaxDistForward = -1, constantMaxDistReverse = -1, - variableCollapseMaxDist = deprecated(), - variableCollapseMinReads = deprecated(), - variableCollapseMinRatio = deprecated(), umiCollapseMaxDist = 0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", @@ -196,10 +193,6 @@ constant sequence. If multiple constant sequences are provided, the most similar one is used. Reads with a larger distance to the expected constant sequence are discarded. 
If set to -1, no filtering is done.} -\item{variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio}{Deprecated. Collapsing of variable sequences is no longer performed in -\code{digestFastqs}. Please use \code{collapseMutantsBySimilarity} -instead.} - \item{umiCollapseMaxDist}{Numeric scalar defining the tolerances for collapsing similar UMI sequences. If the value is in [0, 1), it defines the maximal Hamming distance in terms of a diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 1659b37..5b41a7e 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -1366,7 +1366,7 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, size_t start_size = (double)tree.size; while (tree.size > 0) { querySeq = tree.first(); - // check in seqsScores if score for querySeq is < variableCollapseMinReads + // check in seqsScores if score for querySeq is < collapseMinScore seqsScoresIt = seqsScores.find(querySeq); if (collapseMinScore > 0 && seqsScoresIt != seqsScores.end() && diff --git a/tests/testthat/test_digestFastqs_trans_collapse.R b/tests/testthat/test_digestFastqs_trans_collapse.R index 44751c0..2481834 100644 --- a/tests/testthat/test_digestFastqs_trans_collapse.R +++ b/tests/testthat/test_digestFastqs_trans_collapse.R @@ -42,25 +42,6 @@ test_that("digestFastqs works as expected for trans experiments, when similar se res <- do.call(digestFastqs, Ldef) - ## First check that we get deprecation warnings if deprecated arguments are - ## specified. 
Results should be the same as if these arguments are not used - lifecycle::expect_deprecated({ - resdep1 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMaxDist = 2)))}) - lifecycle::expect_deprecated({ - resdep2 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMinReads = 2)))}) - lifecycle::expect_deprecated({ - resdep3 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMinRatio = 2)))}) - - expect_equal(res$filterSummary, resdep1$filterSummary) - expect_equal(res$summaryTable, resdep1$summaryTable) - expect_equal(res$errorStatistics, resdep1$errorStatistics) - expect_equal(res$filterSummary, resdep2$filterSummary) - expect_equal(res$summaryTable, resdep2$summaryTable) - expect_equal(res$errorStatistics, resdep2$errorStatistics) - expect_equal(res$filterSummary, resdep3$filterSummary) - expect_equal(res$summaryTable, resdep3$summaryTable) - expect_equal(res$errorStatistics, resdep3$errorStatistics) - ## Summarize single sample and collapse se <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), countType = "reads") @@ -382,12 +363,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "nThreads", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), @@ -480,12 +456,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_equal(res$filterSummary$nbrRetained, 679L) for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", 
"verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), @@ -573,12 +544,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_equal(res$filterSummary$nbrRetained, 679L) for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), From 5579e3d7ad3edf971a5998244692cffc115cc90e Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 31 Jul 2025 19:57:38 +0200 Subject: [PATCH 30/44] Set default values of collapse parameters --- R/RcppExports.R | 6 +++--- man/groupSimilarSequences.Rd | 12 ++++++------ src/digestFastqs.cpp | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 24c8c47..4779be8 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -70,11 +70,11 @@ findClosestRefSeqEarlyStop <- function(varSeq, wtSeq, upperBoundMismatch, sim) { #' (\code{round(collapseMaxDist * nchar(sequence))}). #' A value greater or equal to 1 is rounded and directly used as the maximum #' allowed Hamming distance. 
Note that sequences can only be -#' collapsed if they are all of the same length. +#' collapsed if they are all of the same length. The default value is 0. #' @param collapseMinScore Numeric scalar, indicating the minimum score #' required for a sequence to be considered as a representative for a #' group of similar sequences (i.e., to allow other sequences to be -#' collapsed into it). +#' collapsed into it). The default value is 0. #' @param collapseMinRatio Numeric scalar. During collapsing of #' similar sequences, a low-frequency sequence will be collapsed #' with a higher-frequency sequence only if the ratio between the @@ -95,7 +95,7 @@ findClosestRefSeqEarlyStop <- function(varSeq, wtSeq, upperBoundMismatch, sim) { #' #' @export #' @author Michael Stadler, Charlotte Soneson -groupSimilarSequences <- function(seqs, scores, collapseMaxDist, collapseMinScore, collapseMinRatio, verbose) { +groupSimilarSequences <- function(seqs, scores, collapseMaxDist = 0.0, collapseMinScore = 0.0, collapseMinRatio = 0.0, verbose = FALSE) { .Call(`_mutscan_groupSimilarSequences`, seqs, scores, collapseMaxDist, collapseMinScore, collapseMinRatio, verbose) } diff --git a/man/groupSimilarSequences.Rd b/man/groupSimilarSequences.Rd index 476ba46..07a3694 100644 --- a/man/groupSimilarSequences.Rd +++ b/man/groupSimilarSequences.Rd @@ -7,10 +7,10 @@ groupSimilarSequences( seqs, scores, - collapseMaxDist, - collapseMinScore, - collapseMinRatio, - verbose + collapseMaxDist = 0, + collapseMinScore = 0, + collapseMinRatio = 0, + verbose = FALSE ) } \arguments{ @@ -29,12 +29,12 @@ Hamming distance in terms of a fraction of sequence length: (\code{round(collapseMaxDist * nchar(sequence))}). A value greater or equal to 1 is rounded and directly used as the maximum allowed Hamming distance. Note that sequences can only be -collapsed if they are all of the same length.} +collapsed if they are all of the same length. 
The default value is 0.} \item{collapseMinScore}{Numeric scalar, indicating the minimum score required for a sequence to be considered as a representative for a group of similar sequences (i.e., to allow other sequences to be -collapsed into it).} +collapsed into it). The default value is 0.} \item{collapseMinRatio}{Numeric scalar. During collapsing of similar sequences, a low-frequency sequence will be collapsed diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 5b41a7e..035ad88 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -1272,11 +1272,11 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma //' (\code{round(collapseMaxDist * nchar(sequence))}). //' A value greater or equal to 1 is rounded and directly used as the maximum //' allowed Hamming distance. Note that sequences can only be -//' collapsed if they are all of the same length. +//' collapsed if they are all of the same length. The default value is 0. //' @param collapseMinScore Numeric scalar, indicating the minimum score //' required for a sequence to be considered as a representative for a //' group of similar sequences (i.e., to allow other sequences to be -//' collapsed into it). +//' collapsed into it). The default value is 0. //' @param collapseMinRatio Numeric scalar. 
During collapsing of //' similar sequences, a low-frequency sequence will be collapsed //' with a higher-frequency sequence only if the ratio between the @@ -1300,10 +1300,10 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma // [[Rcpp::export]] Rcpp::DataFrame groupSimilarSequences(std::vector seqs, std::vector scores, - double collapseMaxDist, - double collapseMinScore, - double collapseMinRatio, - bool verbose) { + double collapseMaxDist=0.0, + double collapseMinScore=0.0, + double collapseMinRatio=0.0, + bool verbose=false) { // combine seqs and scores std::map seqsScores; From 1a1cef314b9e0b54532fc218d90088331dac83ea Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 31 Jul 2025 19:58:02 +0200 Subject: [PATCH 31/44] Add example to linkMultipleVariants --- R/linkMultipleVariants.R | 20 ++++++++++++++++++++ man/linkMultipleVariants.Rd | 21 +++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index cb5e3a9..aec04dd 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -80,6 +80,26 @@ #' \item outCombined - the \code{digestFastqs} output for the combined run. 
#' } #' +#' @examples +#' fqFile <- system.file("extdata", "cisInput_1.fastq.gz", +#' package = "mutscan") +#' out <- linkMultipleVariants( +#' combinedDigestParams = list(fastqForward = fqFile, +#' elementsForward = "SVCV", +#' elementLengthsForward = c(1, 10, 18, 96)), +#' # the first variable sequence is the UMI +#' umi = list(fastqForward = fqFile, elementsForward = "SVCS", +#' elementLengthsForward = c(1, 10, 18, 96)), +#' # the second variable sequence is the amplicon variant +#' var = list(fastqForward = fqFile, elementsForward = "SSCV", +#' elementLengthsForward = c(1, 10, 18, 96), +#' collapseMaxDist = 3, collapseMinScore = 1) +#' ) +#' # conversion tables +#' lapply(out$convSeparate, head) +#' # aggregated count table +#' head(out$countAggregated) +#' #' @importFrom dplyr select rename group_by summarize across matches mutate #' @importFrom tidyr separate separate_rows #' @importFrom rlang .data diff --git a/man/linkMultipleVariants.Rd b/man/linkMultipleVariants.Rd index ccaf163..e24c43e 100644 --- a/man/linkMultipleVariants.Rd +++ b/man/linkMultipleVariants.Rd @@ -86,6 +86,27 @@ consistent (since this can affect e.g. filtering criteria). Note that to process individual variable sequences in the reverse read, you also need to swap the 'forward' and 'reverse' specifications (since \code{digestFastqs} requires a forward read). 
+} +\examples{ +fqFile <- system.file("extdata", "cisInput_1.fastq.gz", + package = "mutscan") +out <- linkMultipleVariants( + combinedDigestParams = list(fastqForward = fqFile, + elementsForward = "SVCV", + elementLengthsForward = c(1, 10, 18, 96)), + # the first variable sequence is the UMI + umi = list(fastqForward = fqFile, elementsForward = "SVCS", + elementLengthsForward = c(1, 10, 18, 96)), + # the second variable sequence is the amplicon variant + var = list(fastqForward = fqFile, elementsForward = "SSCV", + elementLengthsForward = c(1, 10, 18, 96), + collapseMaxDist = 3, collapseMinScore = 1) +) +# conversion tables +lapply(out$convSeparate, head) +# aggregated count table +head(out$countAggregated) + } \author{ Charlotte Soneson, Michael Stadler From 96ec0b34a7805990afe237e55526abf87a0c176d Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 4 Aug 2025 12:33:50 +0200 Subject: [PATCH 32/44] Harmonize findClosestRefSeq with findClosestRefSeqEarlyStop (only compare until the end of the shortest of the variable and wildtype sequences) --- src/digestFastqs.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 035ad88..7344878 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -986,10 +986,12 @@ int findClosestRefSeq(std::string &varSeq, std::vector &wtSeq, int maxsim = 0; int nbrbesthits = 0; int currsim; + size_t minl; for (size_t i = 0; i < wtSeq.size(); i++) { currsim = 0; std::string currSeq = wtSeq[i]; - for (size_t j = 0; j < currSeq.length(); j++) { + minl = std::min(varSeq.size(), currSeq.size()); + for (size_t j = 0; j < minl; j++) { currsim += (currSeq[j] == varSeq[j]); } if (((int)varSeq.size() - currsim <= (int)upperBoundMismatch)) { @@ -997,7 +999,7 @@ int findClosestRefSeq(std::string &varSeq, std::vector &wtSeq, nbrbesthits++; } else if (currsim > maxsim) { nbrbesthits = 1; - idx = i; + idx = (int)i; maxsim = currsim; } } From 
80c6260e6dfd91f0ca7b9fe1a7876c5b5af53b5c Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 14:14:55 +0200 Subject: [PATCH 33/44] ignore test coverage for write failures --- src/FastqBuffer_utils.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/FastqBuffer_utils.h b/src/FastqBuffer_utils.h index f134c80..af3d41b 100644 --- a/src/FastqBuffer_utils.h +++ b/src/FastqBuffer_utils.h @@ -16,7 +16,7 @@ class FastqBuffer { // (space for seq2 and qual2 is only allocated for paired=true) public: char *seq1, *qual1, *seq2, *qual2; - + // constructor FastqBuffer(size_t n, size_t b, bool p = true) { nentries = n; @@ -33,7 +33,7 @@ class FastqBuffer { qual2 = NULL; } } - + // copy constructor FastqBuffer(const FastqBuffer& fqb) { paired = fqb.paired; @@ -51,12 +51,12 @@ class FastqBuffer { qual2 = NULL; } } - + // destructor ~FastqBuffer() { delete[] buffer; } - + // Write one (pair of) filtered reads to output fastq file(s) bool write_seq(size_t i, gzFile file1, gzFile file2, const int n, const char* label) { bool success = (i < nentries); @@ -67,32 +67,32 @@ class FastqBuffer { if (success && file1 != NULL) { std::string read_id = "@S" + std::to_string(n) + "_" + label + " 1\n"; if (success && gzputs(file1, read_id.c_str()) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file1, seq1 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file1, "+\n") == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file1, qual1 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } } - + if (success && file2 != NULL) { std::string read_id = "@S" + std::to_string(n) + "_" + label + " 2\n"; if (success && gzputs(file2, read_id.c_str()) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file2, seq2 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + 
success = false; // # nocov } if (success && gzputs(file2, "+\n") == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file2, qual2 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } } } From ec1a439f62dcfaa7e98698f4652b8eef16cdf77f Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 15:01:59 +0200 Subject: [PATCH 34/44] export and test C `complement` function --- R/RcppExports.R | 40 ++++++++++++---------- src/RcppExports.cpp | 12 +++++++ src/digestFastqs.cpp | 1 + tests/testthat/test_digestFastqs_helpers.R | 24 +++++++++++-- 4 files changed, 56 insertions(+), 21 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 4779be8..03933da 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -24,6 +24,10 @@ calcNearestStringDist <- function(x, metric = "hamming", nThreads = 1L) { .Call(`_mutscan_calcNearestStringDist`, x, metric, nThreads) } +complement <- function(n) { + .Call(`_mutscan_complement`, n) +} + compareCodonPositions <- function(a, b, mutNameDelimiter) { .Call(`_mutscan_compareCodonPositions`, a, b, mutNameDelimiter) } @@ -57,42 +61,42 @@ findClosestRefSeqEarlyStop <- function(varSeq, wtSeq, upperBoundMismatch, sim) { } #' Create a conversion table for collapsing similar sequences -#' @param seqs Character vector with nucleotide sequences (or pairs of -#' sequences concatenated with "_") to be collapsed. The sequences must +#' @param seqs Character vector with nucleotide sequences (or pairs of +#' sequences concatenated with "_") to be collapsed. The sequences must #' all be of the same length. #' @param scores Numeric vector of "scores" for the sequences. Typically -#' the total read/UMI count. A higher score will be preferred when -#' deciding which sequence to use as the representative for a group of +#' the total read/UMI count. A higher score will be preferred when +#' deciding which sequence to use as the representative for a group of #' collapsed sequences. 
-#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing -#' similar sequences. If the value is in [0, 1), it defines the maximal +#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing +#' similar sequences. If the value is in [0, 1), it defines the maximal #' Hamming distance in terms of a fraction of sequence length: #' (\code{round(collapseMaxDist * nchar(sequence))}). #' A value greater or equal to 1 is rounded and directly used as the maximum #' allowed Hamming distance. Note that sequences can only be #' collapsed if they are all of the same length. The default value is 0. -#' @param collapseMinScore Numeric scalar, indicating the minimum score -#' required for a sequence to be considered as a representative for a -#' group of similar sequences (i.e., to allow other sequences to be +#' @param collapseMinScore Numeric scalar, indicating the minimum score +#' required for a sequence to be considered as a representative for a +#' group of similar sequences (i.e., to allow other sequences to be #' collapsed into it). The default value is 0. #' @param collapseMinRatio Numeric scalar. During collapsing of -#' similar sequences, a low-frequency sequence will be collapsed -#' with a higher-frequency sequence only if the ratio between the -#' high-frequency and the low-frequency scores is at least this +#' similar sequences, a low-frequency sequence will be collapsed +#' with a higher-frequency sequence only if the ratio between the +#' high-frequency and the low-frequency scores is at least this #' high. A value of 0 indicates that no such check is performed. #' @param verbose Logical scalar, whether to print progress messages. -#' -#' @return A data.frame with two columns, containing the input sequences +#' +#' @return A data.frame with two columns, containing the input sequences #' and the representatives for the groups resulting from grouping similar #' sequences, respectively. 
-#' +#' #' @examples #' seqs <- c("AACGTAGCA", "ACCGTAGCA", "AACGGAGCA", "ATCGGAGCA", "TGAGGCATA") #' scores <- c(5, 1, 3, 1, 8) -#' groupSimilarSequences(seqs = seqs, scores = scores, -#' collapseMaxDist = 1, collapseMinScore = 0, +#' groupSimilarSequences(seqs = seqs, scores = scores, +#' collapseMaxDist = 1, collapseMinScore = 0, #' collapseMinRatio = 0, verbose = FALSE) -#' +#' #' @export #' @author Michael Stadler, Charlotte Soneson groupSimilarSequences <- function(seqs, scores, collapseMaxDist = 0.0, collapseMinScore = 0.0, collapseMinRatio = 0.0, verbose = FALSE) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2ff18f4..efffbf7 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -23,6 +23,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// complement +char complement(char n); +RcppExport SEXP _mutscan_complement(SEXP nSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< char >::type n(nSEXP); + rcpp_result_gen = Rcpp::wrap(complement(n)); + return rcpp_result_gen; +END_RCPP +} // compareCodonPositions bool compareCodonPositions(std::string a, std::string b, const char mutNameDelimiter); RcppExport SEXP _mutscan_compareCodonPositions(SEXP aSEXP, SEXP bSEXP, SEXP mutNameDelimiterSEXP) { @@ -282,6 +293,7 @@ RcppExport SEXP _rcpp_module_boot_mod_BKtree(); static const R_CallMethodDef CallEntries[] = { {"_mutscan_calcNearestStringDist", (DL_FUNC) &_mutscan_calcNearestStringDist, 3}, + {"_mutscan_complement", (DL_FUNC) &_mutscan_complement, 1}, {"_mutscan_compareCodonPositions", (DL_FUNC) &_mutscan_compareCodonPositions, 3}, {"_mutscan_translateString", (DL_FUNC) &_mutscan_translateString, 1}, {"_mutscan_makeBaseHGVS", (DL_FUNC) &_mutscan_makeBaseHGVS, 4}, diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 7344878..488a667 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -80,6 +80,7 @@ bool get_next_seq(gzFile file, char *seq, char *qual, 
size_t BUFFER_SIZE) { } // create the complement of a base +// [[Rcpp::export]] char complement(char n) { switch(n) { case 'A': diff --git a/tests/testthat/test_digestFastqs_helpers.R b/tests/testthat/test_digestFastqs_helpers.R index 031946e..b0dde45 100644 --- a/tests/testthat/test_digestFastqs_helpers.R +++ b/tests/testthat/test_digestFastqs_helpers.R @@ -1,3 +1,21 @@ +## ---------------------------------------------------------------------------- +## complement +## ---------------------------------------------------------------------------- +test_that("complement works", { + expect_identical(complement("a"), "T") + expect_identical(complement("A"), "T") + expect_identical(complement("c"), "G") + expect_identical(complement("C"), "G") + expect_identical(complement("g"), "C") + expect_identical(complement("G"), "C") + expect_identical(complement("t"), "A") + expect_identical(complement("T"), "A") + expect_identical(complement("n"), "N") + expect_identical(complement("N"), "N") + expect_error(complement("B"), "Invalid DNA base") + expect_error(complement("Q"), "Invalid DNA base") +}) + ## ---------------------------------------------------------------------------- ## compareCodonPositions ## ---------------------------------------------------------------------------- @@ -365,7 +383,7 @@ test_that("mergeReadPairsPartial works", { expect_identical(res2e$mergedSeq, "TTACAACACACA") expect_identical(res2e$mergedQual, rep(c(10L, 40L), c(5, 7))) expect_identical(res2e$mergedLengths, 12L) - + ## invalid overlaps specified/overlaps modified internally ## minOverlap > lenF -> no valid overlap res0a <- mutscan:::test_mergeReadPairPartial(sF2, qF2, sR2, qR2, lF2, lR2, 8, 7, 9, 9, 1, FALSE) @@ -397,7 +415,7 @@ test_that("mergeReadPairsPartial works", { expect_identical(res0d$mergedQual, qF2) expect_identical(res0d$mergedLengths, 7L) expect_true(res0d$return) - + ## minMergedLength > lenF + lenR -> no valid overlap res0e <- mutscan:::test_mergeReadPairPartial(sF2, qF2, sR2, 
qR2, lF2, lR2, 1, 7, 15, 9, 1, FALSE) expect_type(res0e, "list") @@ -412,7 +430,7 @@ test_that("mergeReadPairsPartial works", { expect_identical(res0f$mergedQual, rep(c(10L, 40L), c(2, 5))) expect_identical(res0f$mergedLengths, 7L) expect_false(res0f$return) - + ## padded reads for (i in 1:10) { sF <- paste(rep(c("C","A"), c(i, 6)), collapse = "") From fb60010789fa48707682cf42b6aca08693531037 Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 15:03:13 +0200 Subject: [PATCH 35/44] ignore unit test coverage of hard/impossible to trigger failures in reached_end_of_file and get_next_seq --- src/digestFastqs.cpp | 204 +++++++++++++++++++++---------------------- 1 file changed, 102 insertions(+), 102 deletions(-) diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 488a667..291d1cb 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -37,14 +37,14 @@ bool reached_end_of_file(gzFile file, char *ret) { if (gzeof(file)) { return true; } else { - const char *error_string; + const char *error_string; // # nocov start error_string = gzerror(file, &err); if (err) { stop(error_string); - } + } // # nocov end } } - // Check if we have read until a newline character. Otherwise, the read is + // Check if we have read until a newline character. 
Otherwise, the read is // too long -> break if (std::string(ret).back() != '\n') { stop("Encountered a read exceeding the maximal allowed length"); @@ -65,15 +65,15 @@ bool get_next_seq(gzFile file, char *seq, char *qual, size_t BUFFER_SIZE) { } // sequence if (reached_end_of_file(file, gzgets(file, seq, BUFFER_SIZE))) { - return true; + return true; // # nocov } // quality identifier if (reached_end_of_file(file, gzgets(file, qual, BUFFER_SIZE))) { - return true; + return true; // # nocov } // quality if (reached_end_of_file(file, gzgets(file, qual, BUFFER_SIZE))) { - return true; + return true; // # nocov } return false; @@ -177,7 +177,7 @@ std::string translateString(const std::string& s) { size_t i = 0, j = 0; const int pow4[] = {1, 4, 16}; int codonVal = 0; - + for (i = 0; i < s.size(); i++) { if (s[i] == '_') { j = 0; @@ -188,29 +188,29 @@ std::string translateString(const std::string& s) { case 'a': // codonVal += 0 * pow4[j]; break; - + case 'C': case 'c': codonVal += 1 * pow4[j]; break; - + case 'G': case 'g': codonVal += 2 * pow4[j]; break; - + case 'T': case 't': case 'u': case 'U': codonVal += 3 * pow4[j]; break; - + default: codonVal += 64; break; } - + if (j == 2) { if (codonVal > 63) { aa.push_back('X'); @@ -222,9 +222,9 @@ std::string translateString(const std::string& s) { } else { j++; } - + } - + } return aa; } @@ -239,33 +239,33 @@ bool hasEnding (std::string const &fullString, std::string const &ending) { } } -// Helper function to create the ID for a single mutation group (delins or +// Helper function to create the ID for a single mutation group (delins or // single mutation) -std::string makeSingleBaseHGVSid(int posMin, int posMax, const std::string wtSeq, +std::string makeSingleBaseHGVSid(int posMin, int posMax, const std::string wtSeq, const std::string varSeq) { std::string id = ""; if (posMax > posMin) { // delins - id += std::to_string(posMin) + "_" + std::to_string(posMax) + - "delins" + + id += std::to_string(posMin) + "_" + 
std::to_string(posMax) + + "delins" + varSeq.substr((size_t)(posMin - 1), (size_t)(posMax - posMin + 1)) + ";"; } else { // single base mutation - id += std::to_string(posMin) + wtSeq[posMin - 1] + + id += std::to_string(posMin) + wtSeq[posMin - 1] + ">" + varSeq[posMin - 1] + ";"; } return id; } // [[Rcpp::export]] -std::string makeBaseHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, +std::string makeBaseHGVS(const std::vector mutationsSorted, + const std::string mutNameDelimiter, const std::string wtSeq, const std::string varSeq) { int prevPosMin = -5, prevPosMax = -5; // something far enough from the first mutation position to not cause a delins bool moreThanOne = false; int pos; std::string mutId = ""; - + if (mutationsSorted.size() == 0) { // no mutations - return only _ (all variants should end with this character, // as it will be stripped off later) @@ -307,15 +307,15 @@ std::string makeBaseHGVS(const std::vector mutationsSorted, return mutId; } -std::string makeAAHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, - const std::string wtSeq, +std::string makeAAHGVS(const std::vector mutationsSorted, + const std::string mutNameDelimiter, + const std::string wtSeq, std::map &threeAA) { int pos; std::string mutAA; char wtAA; std::string mutId = ""; - + if (mutationsSorted.size() == 0) { // no mutations - return only _ (all variants should end with this character, // as it will be stripped off later) @@ -343,13 +343,13 @@ std::string makeAAHGVS(const std::vector mutationsSorted, } // [[Rcpp::export]] -std::string test_makeAAHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, +std::string test_makeAAHGVS(const std::vector mutationsSorted, + const std::string mutNameDelimiter, const std::string wtSeq) { std::map threeAA = initializeThreeAA(); return makeAAHGVS(mutationsSorted, mutNameDelimiter, wtSeq, threeAA); -} - +} + // compare read to wildtype sequence, // identify mutated 
bases/codons, filter, update counters // and add to the name (mutantName* may be empty if no mutations are found, @@ -375,11 +375,11 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, const int nbrMutatedCodonsMax, const std::set &forbiddenCodons, const std::string codonPrefix, const int nbrMutatedBasesMax, int &nMutQualTooLow, int &nTooManyMutCodons, int &nForbiddenCodons, - int &nTooManyMutBases, std::string &mutantName, + int &nTooManyMutBases, std::string &mutantName, std::string &mutantNameBase, std::string &mutantNameCodon, std::string &mutantNameBaseHGVS, std::string &mutantNameAA, std::string &mutantNameAAHGVS, int &nMutBases, - int &nMutCodons, int &nMutAAs, std::set &mutationTypes, + int &nMutCodons, int &nMutAAs, std::set &mutationTypes, const std::string mutNameDelimiter, const bool collapseToWT, std::map &threeAA) { @@ -402,7 +402,7 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, aaPrefixHGVS.pop_back(); aaPrefixHGVS = aaPrefixHGVS + "p"; } - + // filter if there are too many mutated codons // mutatedCodons.clear(); hasLowQualMutation = false; @@ -434,9 +434,9 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, wtAA = translateString(wtCodon); if (varAA != wtAA) { // add to mutatedAA - mutatedAAs.insert(codonPrefix + mutNameDelimiter + + mutatedAAs.insert(codonPrefix + mutNameDelimiter + std::to_string((int)(i / 3) + 1) + mutNameDelimiter + - varAA + + varAA + std::string("_")); // nonsynonymous or stop if (varAA == "*") { @@ -510,19 +510,19 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, mutantNameAAHGVS += aaPrefixHGVS + "_"; } else { std::vector mutatedBasesSorted, mutatedCodonsSorted, mutatedAAsSorted; - + mutatedBasesSorted.assign(mutatedBases.begin(), mutatedBases.end()); std::sort(mutatedBasesSorted.begin(), mutatedBasesSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + 
mutatedCodonsSorted.assign(mutatedCodons.begin(), mutatedCodons.end()); std::sort(mutatedCodonsSorted.begin(), mutatedCodonsSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + mutatedAAsSorted.assign(mutatedAAs.begin(), mutatedAAs.end()); std::sort(mutatedAAsSorted.begin(), mutatedAAsSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + for (size_t i = 0; i < mutatedBasesSorted.size(); i++) { mutantNameBase += mutatedBasesSorted[i]; if (nbrMutatedCodonsMax == (-1)) { @@ -545,15 +545,15 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, mutantName += codonPrefix + mutNameDelimiter + "0" + mutNameDelimiter + "WT_"; mutantNameBaseHGVS += basePrefixHGVS + "_"; } else { - mutantNameBaseHGVS += basePrefixHGVS + "." + + mutantNameBaseHGVS += basePrefixHGVS + "." + makeBaseHGVS(mutatedBasesSorted, mutNameDelimiter, wtSeq, varSeq); } if (mutatedAAsSorted.size() == 0) { mutantNameAA += codonPrefix + mutNameDelimiter + "0" + mutNameDelimiter + "WT_"; mutantNameAAHGVS += aaPrefixHGVS + "_"; } else { - mutantNameAAHGVS += aaPrefixHGVS + "." + - makeAAHGVS(mutatedAAsSorted, mutNameDelimiter, + mutantNameAAHGVS += aaPrefixHGVS + "." 
+ + makeAAHGVS(mutatedAAsSorted, mutNameDelimiter, translateString(wtSeq), threeAA); } } @@ -1088,11 +1088,11 @@ int findClosestRefSeqTree(std::string &varSeq, BKtree &wtTree, // Convert mutant summary to DataFrame DataFrame mutantSummaryToDataFrame(std::map mutantSummary) { std::map::iterator mutantSummaryIt; - + size_t dfLen = mutantSummary.size(); std::vector dfSeq(dfLen, ""), dfName(dfLen, ""); std::vector dfReads(dfLen, 0), dfUmis(dfLen, 0), dfMaxReads(dfLen, 0); - std::vector dfMutBases(dfLen, ""), dfMutCodons(dfLen, ""); + std::vector dfMutBases(dfLen, ""), dfMutCodons(dfLen, ""); std::vector dfMutAAs(dfLen, ""), dfVarLengths(dfLen, ""); std::vector dfMutantNameBase(dfLen, ""), dfMutantNameCodon(dfLen, ""); std::vector dfMutantNameBaseHGVS(dfLen, ""), dfMutantNameAA(dfLen, ""); @@ -1110,7 +1110,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedSequence.empty()) { collapsedSequence.pop_back(); // remove final "," } - + // mutantNameBase std::vector mutantNameBaseVector((*mutantSummaryIt).second.mutantNameBase.begin(), (*mutantSummaryIt).second.mutantNameBase.end()); @@ -1121,7 +1121,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameBase.empty()) { collapsedMutantNameBase.pop_back(); // remove final "," } - + // mutantNameCodon std::vector mutantNameCodonVector((*mutantSummaryIt).second.mutantNameCodon.begin(), (*mutantSummaryIt).second.mutantNameCodon.end()); @@ -1132,7 +1132,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameCodon.empty()) { collapsedMutantNameCodon.pop_back(); // remove final "," } - + // mutantNameBaseHGVS std::vector mutantNameBaseHGVSVector((*mutantSummaryIt).second.mutantNameBaseHGVS.begin(), (*mutantSummaryIt).second.mutantNameBaseHGVS.end()); @@ -1143,7 +1143,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameBaseHGVS.empty()) { collapsedMutantNameBaseHGVS.pop_back(); // remove final "," } - + // mutantNameAA 
std::vector mutantNameAAVector((*mutantSummaryIt).second.mutantNameAA.begin(), (*mutantSummaryIt).second.mutantNameAA.end()); @@ -1154,7 +1154,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameAA.empty()) { collapsedMutantNameAA.pop_back(); // remove final "," } - + // mutantNameAAHGVS std::vector mutantNameAAHGVSVector((*mutantSummaryIt).second.mutantNameAAHGVS.begin(), (*mutantSummaryIt).second.mutantNameAAHGVS.end()); @@ -1165,7 +1165,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameAAHGVS.empty()) { collapsedMutantNameAAHGVS.pop_back(); // remove final "," } - + // mutationTypes std::vector mutationTypesVector((*mutantSummaryIt).second.mutationTypes.begin(), (*mutantSummaryIt).second.mutationTypes.end()); @@ -1176,7 +1176,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutationTypes.empty()) { collapsedMutationTypes.pop_back(); // remove final "," } - + // collapse all aa sequences associated with the mutant std::vector sequenceAAVector((*mutantSummaryIt).second.sequenceAA.begin(), (*mutantSummaryIt).second.sequenceAA.end()); @@ -1187,7 +1187,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedSequenceAA.empty()) { collapsedSequenceAA.pop_back(); // remove final "," } - + // collapse all observed nMutBases/nMutCodons std::vector nMutBasesVector((*mutantSummaryIt).second.nMutBases.begin(), (*mutantSummaryIt).second.nMutBases.end()); @@ -1198,7 +1198,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutBases.empty()) { collapsedNMutBases.pop_back(); } - + // codons std::vector nMutCodonsVector((*mutantSummaryIt).second.nMutCodons.begin(), (*mutantSummaryIt).second.nMutCodons.end()); @@ -1209,7 +1209,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutCodons.empty()) { collapsedNMutCodons.pop_back(); } - + // AAs std::vector nMutAAsVector((*mutantSummaryIt).second.nMutAAs.begin(), 
(*mutantSummaryIt).second.nMutAAs.end()); @@ -1220,7 +1220,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutAAs.empty()) { collapsedNMutAAs.pop_back(); } - + dfName[j] = (*mutantSummaryIt).first; dfSeq[j] = collapsedSequence; dfReads[j] = (*mutantSummaryIt).second.nReads; @@ -1239,7 +1239,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma dfVarLengths[j] = (*mutantSummaryIt).second.varLengths; j++; } - + DataFrame df = DataFrame::create(Named("mutantName") = dfName, Named("sequence") = dfSeq, Named("nbrReads") = dfReads, @@ -1257,64 +1257,64 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma Named("mutationTypes") = dfMutationTypes, Named("sequenceAA") = dfSeqAA, Named("stringsAsFactors") = false); - + return df; } //' Create a conversion table for collapsing similar sequences -//' @param seqs Character vector with nucleotide sequences (or pairs of -//' sequences concatenated with "_") to be collapsed. The sequences must +//' @param seqs Character vector with nucleotide sequences (or pairs of +//' sequences concatenated with "_") to be collapsed. The sequences must //' all be of the same length. //' @param scores Numeric vector of "scores" for the sequences. Typically -//' the total read/UMI count. A higher score will be preferred when -//' deciding which sequence to use as the representative for a group of +//' the total read/UMI count. A higher score will be preferred when +//' deciding which sequence to use as the representative for a group of //' collapsed sequences. -//' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing -//' similar sequences. If the value is in [0, 1), it defines the maximal +//' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing +//' similar sequences. If the value is in [0, 1), it defines the maximal //' Hamming distance in terms of a fraction of sequence length: //' (\code{round(collapseMaxDist * nchar(sequence))}). 
//' A value greater or equal to 1 is rounded and directly used as the maximum //' allowed Hamming distance. Note that sequences can only be //' collapsed if they are all of the same length. The default value is 0. -//' @param collapseMinScore Numeric scalar, indicating the minimum score -//' required for a sequence to be considered as a representative for a -//' group of similar sequences (i.e., to allow other sequences to be +//' @param collapseMinScore Numeric scalar, indicating the minimum score +//' required for a sequence to be considered as a representative for a +//' group of similar sequences (i.e., to allow other sequences to be //' collapsed into it). The default value is 0. //' @param collapseMinRatio Numeric scalar. During collapsing of -//' similar sequences, a low-frequency sequence will be collapsed -//' with a higher-frequency sequence only if the ratio between the -//' high-frequency and the low-frequency scores is at least this +//' similar sequences, a low-frequency sequence will be collapsed +//' with a higher-frequency sequence only if the ratio between the +//' high-frequency and the low-frequency scores is at least this //' high. A value of 0 indicates that no such check is performed. //' @param verbose Logical scalar, whether to print progress messages. -//' -//' @return A data.frame with two columns, containing the input sequences +//' +//' @return A data.frame with two columns, containing the input sequences //' and the representatives for the groups resulting from grouping similar //' sequences, respectively. 
-//' +//' //' @examples //' seqs <- c("AACGTAGCA", "ACCGTAGCA", "AACGGAGCA", "ATCGGAGCA", "TGAGGCATA") //' scores <- c(5, 1, 3, 1, 8) -//' groupSimilarSequences(seqs = seqs, scores = scores, -//' collapseMaxDist = 1, collapseMinScore = 0, +//' groupSimilarSequences(seqs = seqs, scores = scores, +//' collapseMaxDist = 1, collapseMinScore = 0, //' collapseMinRatio = 0, verbose = FALSE) -//' +//' //' @export //' @author Michael Stadler, Charlotte Soneson // [[Rcpp::export]] Rcpp::DataFrame groupSimilarSequences(std::vector seqs, - std::vector scores, - double collapseMaxDist=0.0, + std::vector scores, + double collapseMaxDist=0.0, double collapseMinScore=0.0, double collapseMinRatio=0.0, bool verbose=false) { - + // combine seqs and scores std::map seqsScores; std::map::iterator seqsScoresIt, seqsScoresSimIt; for (size_t i = 0; i < seqs.size(); i++) { seqsScores.insert(std::pair(seqs[i], scores[i])); } - + // get sequence length size_t seqlen = seqs[0].length(); @@ -1326,11 +1326,11 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, tol = (collapseMaxDist * (seqs[0].find("_") != std::string::npos ? seqlen-1 : seqlen)); } - + if (verbose) { Rcout << "start collapsing sequences (tolerance: " << tol << ")..."; } - + // sort seqsScores decreasingly by scores // ... 
create an empty intermediate vector std::vector> vec; @@ -1347,7 +1347,7 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, return l.second > r.second; return l.first < r.first; }); - + // store sequences in BK tree BKtree tree; for (vecIt = vec.begin(); vecIt != vec.end(); vecIt++) { @@ -1360,11 +1360,11 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, } } vec.clear(); // remove temporary vector - + std::string querySeq, representativeSeq; std::vector simSeqs; std::map single2representative; - + // start querying in the order of tree.items (ordered decreasingly by scores) size_t start_size = (double)tree.size; while (tree.size > 0) { @@ -1399,7 +1399,7 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, tree.remove(simSeqs[i]); } } - + // check for user interruption and print progress if ((start_size - tree.size) % 2000 == 0) { // every 2,000 queries (every ~1-2s) Rcpp::checkUserInterrupt(); // ... check for user interrupt @@ -1412,7 +1412,7 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, } } } - + // return grouping results as a DataFrame std::vector repseqs(seqs.size()); std::unordered_set uniqueRepseqs; @@ -1834,8 +1834,8 @@ List digestFastqsCpp(std::vector fastqForwardVect, } std::string wtForward = wildTypeForward[idxForward]; std::string wtNameForward = wildTypeForwardNames[idxForward]; - - // if the variable region is longer than the (best matching) WT sequence, + + // if the variable region is longer than the (best matching) WT sequence, // skip the read (consider it to have the wrong length) if (varSeqForward.length() > wtForward.length()) { #ifdef _OPENMP @@ -1843,16 +1843,16 @@ List digestFastqsCpp(std::vector fastqForwardVect, #endif nReadWrongLength++; chunkBuffer->write_seq(ci, outfile1, outfile2, nTot-(int)iChunk+(int)ci, "noPrimer_readWrongLength"); - continue; + continue; } - + if (compareToWildtype(varSeqForward, wtForward, varIntQualForward, mutatedPhredMinForward, nbrMutatedCodonsMaxForward, 
forbiddenCodonsForward, wtNameForward, nbrMutatedBasesMaxForward, nMutQualTooLow, - nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, + nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, mutantName, mutantNameBase, mutantNameCodon, mutantNameBaseHGVS, mutantNameAA, mutantNameAAHGVS, - nMutBases, nMutCodons, + nMutBases, nMutCodons, nMutAAs, mutationTypes, mutNameDelimiter, collapseToWTForward, threeAA)) { // read is to be filtered out @@ -1911,8 +1911,8 @@ List digestFastqsCpp(std::vector fastqForwardVect, } std::string wtReverse = wildTypeReverse[idxReverse]; std::string wtNameReverse = wildTypeReverseNames[idxReverse]; - - // if the variable region is longer than the (best matching) WT sequence, + + // if the variable region is longer than the (best matching) WT sequence, // skip the read (consider it to have the wrong length) if (varSeqReverse.length() > wtReverse.length()) { #ifdef _OPENMP @@ -1920,16 +1920,16 @@ List digestFastqsCpp(std::vector fastqForwardVect, #endif nReadWrongLength++; chunkBuffer->write_seq(ci, outfile1, outfile2, nTot-(int)iChunk+(int)ci, "noPrimer_readWrongLength"); - continue; + continue; } - + if (compareToWildtype(varSeqReverse, wtReverse, varIntQualReverse, mutatedPhredMinReverse, nbrMutatedCodonsMaxReverse, forbiddenCodonsReverse, wtNameReverse, nbrMutatedBasesMaxReverse, nMutQualTooLow, - nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, + nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, mutantName, mutantNameBase, mutantNameCodon, mutantNameBaseHGVS, mutantNameAA, mutantNameAAHGVS, - nMutBases, nMutCodons, + nMutBases, nMutCodons, nMutAAs, mutationTypes, mutNameDelimiter, collapseToWTReverse, threeAA)) { // read is to be filtered out @@ -2238,7 +2238,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, Rcpp::StringVector wildTypeReverseRcpp(wildTypeReverse.size()); wildTypeReverseRcpp = wildTypeReverse; wildTypeReverseRcpp.attr("names") = wildTypeReverseNames; - + std::vector 
forbiddenCodonsUsedForward(forbiddenCodonsForward.begin(), forbiddenCodonsForward.end()); std::vector forbiddenCodonsUsedReverse(forbiddenCodonsReverse.begin(), forbiddenCodonsReverse.end()); List param; @@ -2298,7 +2298,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, Named("nbrMismatchForward") = nPhredMismatchForward, Named("nbrMatchReverse") = nPhredCorrectReverse, Named("nbrMismatchReverse") = nPhredMismatchReverse); - + // ... filter statistics DataFrame filt = DataFrame::create(Named("nbrTotal") = nTot, Named("f1_nbrAdapter") = nAdapter, @@ -2316,10 +2316,10 @@ List digestFastqsCpp(std::vector fastqForwardVect, Named("f12_nbrTooManyMutConstant") = nTooManyMutConstant, Named("f13_nbrTooManyBestConstantHits") = nTooManyBestConstantHits, Named("nbrRetained") = nRetain); - + // ... main data frame DataFrame df = mutantSummaryToDataFrame(mutantSummary); - + // ... pack into list List L = List::create(Named("parameters") = param, Named("filterSummary") = filt, From 03e225c66996f6046f4b5189ba6e6c249964b83a Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 15:10:46 +0200 Subject: [PATCH 36/44] ignore hard to trigger file open error for test coverage in openFastq --- src/digestFastqs.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 291d1cb..e9444d0 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -608,12 +608,12 @@ struct mutantInfo { gzFile openFastq(std::string filename, const char* mode = "rb") { gzFile file = gzopen(filename.c_str(), mode); if (!file) { - if (errno) { + if (errno) { // # nocov start stop("Failed to open file '", filename, "': ", strerror(errno), " (errno=", errno, ")"); } else { stop("Failed to open file '", filename, "': zlib out of memory"); - } + } // # nocov end } return file; } From 20b42b369ee7fab5f70c103804ffcebfc18df65d Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 7 Aug 2025 16:46:06 +0200 Subject: [PATCH 37/44] Add 
some more tests, ignore lines that are not recognized by covr --- R/plotPairs.R | 10 +++- R/plotResults.R | 4 +- tests/testthat/test_linkMultipleVariants.R | 54 ++++++++++++++++++++++ 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/R/plotPairs.R b/R/plotPairs.R index 167d707..b1736b4 100644 --- a/R/plotPairs.R +++ b/R/plotPairs.R @@ -126,13 +126,15 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, cols)(transfCor), maxColorValue = 255) } else { + # nocov start cols <- hcl.colors(n = 11, palette = "RdBu")[6:10] col <- rgb(colorRamp( cols)(transfCor), maxColorValue = 255) + # nocov end } } else { - col <- "white" + col <- "white" #nocov } ## Construct plot @@ -150,6 +152,9 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, ## --------------------------------------------------------------------- ## ## Scatter plots ## --------------------------------------------------------------------- ## + # All the point plotting functions are tested, but for some reason covr + # doesn't pick it up - ignore these definitions in the coverage tests + # nocov start if (pointsType == "smoothscatter") { ## Define function to create smoothscatter-like plot ## (for use with ggpairs) @@ -229,6 +234,7 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } } } + # nocov end ## Define the function to use for the plots if (pointsType == "smoothscatter") { @@ -244,6 +250,7 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, ## --------------------------------------------------------------------- ## ## Histogram ## --------------------------------------------------------------------- ## + # nocov start diaghist <- function(data, mapping, ...) 
{ gg <- ggplot(data = data, mapping = mapping) + geom_histogram(fill = "#F5C710", color = "grey50", @@ -254,6 +261,7 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } gg } + # nocov end ## --------------------------------------------------------------------- ## ## Combined plot diff --git a/R/plotResults.R b/R/plotResults.R index 6470054..e61d758 100644 --- a/R/plotResults.R +++ b/R/plotResults.R @@ -62,7 +62,7 @@ .assertScalar(x = interactivePlot, type = "logical") if (interactivePlot && !requireNamespace("plotly", quietly = TRUE)) { - stop("The 'plotly' package is required to make interactive plots.") + stop("The 'plotly' package is required to make interactive plots.") # nocov } xr <- range(res[[xCol]], na.rm = TRUE) yr <- range(res[[yCol]], na.rm = TRUE) @@ -113,7 +113,7 @@ } } if (interactivePlot) { - plotly::ggplotly(gg) + plotly::ggplotly(gg) # nocov } else { if (!is.null(labelCol) && length(labelValues) > 0) { .assertVector(x = labelValues, type = "character", diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R index f797755..6d9e58b 100644 --- a/tests/testthat/test_linkMultipleVariants.R +++ b/tests/testthat/test_linkMultipleVariants.R @@ -535,6 +535,60 @@ test_that("linkMultipleVariants works", { expect_equal(res$filtSeparate$V3[, "nbrRetained"], sum(!grepl("C.*mut|del|V3\\.", truth$truth$status))) + # -------------------------------------------------------------------------- + # Get warning when summing UMIs + expect_warning({ + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "UVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, 
minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, + collapseMaxDist = 1, collapseMinScore = 1, collapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", 
collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE) + )}, "Aggregating UMI counts") }) From 8219b7441fc3983fc24e2c1163c2ae730499190e Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 18:48:06 +0200 Subject: [PATCH 38/44] add test_compareToWildtype and corresponding unit tests --- R/RcppExports.R | 4 ++ src/RcppExports.cpp | 21 ++++++++ src/digestFastqs.cpp | 56 +++++++++++++++++++ tests/testthat/test_digestFastqs_helpers.R | 62 ++++++++++++++++++++++ 4 files changed, 143 insertions(+) diff --git a/R/RcppExports.R b/R/RcppExports.R index 03933da..19b3356 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -44,6 +44,10 @@ test_makeAAHGVS <- function(mutationsSorted, mutNameDelimiter, wtSeq) { .Call(`_mutscan_test_makeAAHGVS`, mutationsSorted, mutNameDelimiter, wtSeq) } +test_compareToWildtype <- function(varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin = 0.0, nbrMutatedCodonsMax = -1L, codonPrefix = "c", nbrMutatedBasesMax = -1L, mutNameDelimiter = ".", collapseToWT = FALSE) { + .Call(`_mutscan_test_compareToWildtype`, varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin, nbrMutatedCodonsMax, codonPrefix, nbrMutatedBasesMax, mutNameDelimiter, collapseToWT) +} + test_decomposeRead <- function(sseq, squal, elements, elementLengths, primerSeqs, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength) { .Call(`_mutscan_test_decomposeRead`, sseq, squal, elements, elementLengths, primerSeqs, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index efffbf7..27a64cd 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -85,6 +85,26 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// test_compareToWildtype +Rcpp::List test_compareToWildtype(const std::string varSeq, const std::string wtSeq, const std::vector varIntQual, const std::vector forbiddenCodons_vect, const double 
mutatedPhredMin, const int nbrMutatedCodonsMax, const std::string codonPrefix, const int nbrMutatedBasesMax, const std::string mutNameDelimiter, const bool collapseToWT); +RcppExport SEXP _mutscan_test_compareToWildtype(SEXP varSeqSEXP, SEXP wtSeqSEXP, SEXP varIntQualSEXP, SEXP forbiddenCodons_vectSEXP, SEXP mutatedPhredMinSEXP, SEXP nbrMutatedCodonsMaxSEXP, SEXP codonPrefixSEXP, SEXP nbrMutatedBasesMaxSEXP, SEXP mutNameDelimiterSEXP, SEXP collapseToWTSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string >::type varSeq(varSeqSEXP); + Rcpp::traits::input_parameter< const std::string >::type wtSeq(wtSeqSEXP); + Rcpp::traits::input_parameter< const std::vector >::type varIntQual(varIntQualSEXP); + Rcpp::traits::input_parameter< const std::vector >::type forbiddenCodons_vect(forbiddenCodons_vectSEXP); + Rcpp::traits::input_parameter< const double >::type mutatedPhredMin(mutatedPhredMinSEXP); + Rcpp::traits::input_parameter< const int >::type nbrMutatedCodonsMax(nbrMutatedCodonsMaxSEXP); + Rcpp::traits::input_parameter< const std::string >::type codonPrefix(codonPrefixSEXP); + Rcpp::traits::input_parameter< const int >::type nbrMutatedBasesMax(nbrMutatedBasesMaxSEXP); + Rcpp::traits::input_parameter< const std::string >::type mutNameDelimiter(mutNameDelimiterSEXP); + Rcpp::traits::input_parameter< const bool >::type collapseToWT(collapseToWTSEXP); + rcpp_result_gen = Rcpp::wrap(test_compareToWildtype(varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin, nbrMutatedCodonsMax, codonPrefix, nbrMutatedBasesMax, mutNameDelimiter, collapseToWT)); + return rcpp_result_gen; +END_RCPP +} // test_decomposeRead List test_decomposeRead(const std::string sseq, const std::string squal, const std::string elements, const std::vector elementLengths, const std::vector primerSeqs, std::string umiSeq, std::string varSeq, std::string varQual, std::vector varLengths, std::string constSeq, 
std::string constQual, int nNoPrimer, int nReadWrongLength); RcppExport SEXP _mutscan_test_decomposeRead(SEXP sseqSEXP, SEXP squalSEXP, SEXP elementsSEXP, SEXP elementLengthsSEXP, SEXP primerSeqsSEXP, SEXP umiSeqSEXP, SEXP varSeqSEXP, SEXP varQualSEXP, SEXP varLengthsSEXP, SEXP constSeqSEXP, SEXP constQualSEXP, SEXP nNoPrimerSEXP, SEXP nReadWrongLengthSEXP) { @@ -298,6 +318,7 @@ static const R_CallMethodDef CallEntries[] = { {"_mutscan_translateString", (DL_FUNC) &_mutscan_translateString, 1}, {"_mutscan_makeBaseHGVS", (DL_FUNC) &_mutscan_makeBaseHGVS, 4}, {"_mutscan_test_makeAAHGVS", (DL_FUNC) &_mutscan_test_makeAAHGVS, 3}, + {"_mutscan_test_compareToWildtype", (DL_FUNC) &_mutscan_test_compareToWildtype, 10}, {"_mutscan_test_decomposeRead", (DL_FUNC) &_mutscan_test_decomposeRead, 13}, {"_mutscan_test_mergeReadPairPartial", (DL_FUNC) &_mutscan_test_mergeReadPairPartial, 12}, {"_mutscan_findClosestRefSeq", (DL_FUNC) &_mutscan_findClosestRefSeq, 4}, diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index e9444d0..31f6154 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -561,6 +561,62 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, return false; } +// [[Rcpp::export]] +Rcpp::List test_compareToWildtype( + const std::string varSeq, + const std::string wtSeq, + const std::vector varIntQual, + const std::vector forbiddenCodons_vect, + const double mutatedPhredMin = 0.0, + const int nbrMutatedCodonsMax = -1, + const std::string codonPrefix = "c", + const int nbrMutatedBasesMax = -1, + const std::string mutNameDelimiter = ".", + const bool collapseToWT = false) { + // allocate C++ only variables + std::map threeAA = initializeThreeAA(); + std::set forbiddenCodons(forbiddenCodons_vect.begin(), + forbiddenCodons_vect.end()); + int nMutQualTooLow = 0, nTooManyMutCodons = 0, nForbiddenCodons = 0; + int nTooManyMutBases = 0, nMutBases = 0, nMutCodons = 0, nMutAAs = 0; + std::string mutantName(""), mutantNameBase(""); + 
std::string mutantNameCodon(""), mutantNameBaseHGVS(""); + std::string mutantNameAA(""), mutantNameAAHGVS(""); + std::set mutationTypes; + + // call compareToWildtype + if (compareToWildtype(varSeq, wtSeq, varIntQual, mutatedPhredMin, + nbrMutatedCodonsMax, forbiddenCodons, + codonPrefix, nbrMutatedBasesMax, nMutQualTooLow, + nTooManyMutCodons, nForbiddenCodons, + nTooManyMutBases, mutantName, + mutantNameBase, mutantNameCodon, + mutantNameBaseHGVS, mutantNameAA, + mutantNameAAHGVS, nMutBases, + nMutCodons, nMutAAs, mutationTypes, + mutNameDelimiter, collapseToWT, threeAA)) { + // read should be filtered out -> return empty list + return Rcpp::List::create(); + } else { + std::vector mutationTypes_vect(mutationTypes.begin(), + mutationTypes.end()); + return Rcpp::List::create(Rcpp::_["nMutQualTooLow"] = nMutQualTooLow, + Rcpp::_["nTooManyMutCodons"] = nTooManyMutCodons, + Rcpp::_["nForbiddenCodons"] = nForbiddenCodons, + Rcpp::_["nTooManyMutBases"] = nTooManyMutBases, + Rcpp::_["nMutBases"] = nMutBases, + Rcpp::_["nMutCodons"] = nMutCodons, + Rcpp::_["nMutAAs"] = nMutAAs, + Rcpp::_["mutantName"] = mutantName, + Rcpp::_["mutantNameBase"] = mutantNameBase, + Rcpp::_["mutantNameCodon"] = mutantNameCodon, + Rcpp::_["mutantNameBaseHGVS"] = mutantNameBaseHGVS, + Rcpp::_["mutantNameAA"] = mutantNameAA, + Rcpp::_["mutantNameAAHGVS"] = mutantNameAAHGVS, + Rcpp::_["mutationTypes"] = mutationTypes_vect); + } +} + // compare constSeq to constant sequence and update // counter by match/mismatch and by base Phred quality // return true diff --git a/tests/testthat/test_digestFastqs_helpers.R b/tests/testthat/test_digestFastqs_helpers.R index b0dde45..f492227 100644 --- a/tests/testthat/test_digestFastqs_helpers.R +++ b/tests/testthat/test_digestFastqs_helpers.R @@ -524,4 +524,66 @@ test_that("makeAAHGVS works", { "[(Thr1Leu);(Asp2Met)]_") expect_equal(test_makeAAHGVS(c("f.1.L", "f.2.M", "f.7.M"), ".", "TDTLQAETDQLEDEKSALQTEIANLLKEKEKL"), "[(Thr1Leu);(Asp2Met);(Glu7Met)]_") + +## 
---------------------------------------------------------------------------- +## compareToWildtype +## ---------------------------------------------------------------------------- +test_that("compareToWildtype works", { + # read is kept + expect_identical( + test_compareToWildtype(varSeq = "AAAGGACGA", wtSeq = "AATGGACGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list(nMutQualTooLow = 0L, nTooManyMutCodons = 0L, nForbiddenCodons = 0L, + nTooManyMutBases = 0L, nMutBases = 2L, nMutCodons = 2L, + nMutAAs = 1L, mutantName = "xyz_", mutantNameBase = "xyz_", + mutantNameCodon = "xyz_", mutantNameBaseHGVS = "xyz:c_", + mutantNameAA = "xyz_", mutantNameAAHGVS = "xyz:p_", + mutationTypes = c("nonsynonymous", "silent"))) + expect_identical( + test_compareToWildtype(varSeq = "AAAGGACGA", wtSeq = "AATGGACGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz:c", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list(nMutQualTooLow = 0L, nTooManyMutCodons = 0L, nForbiddenCodons = 0L, + nTooManyMutBases = 0L, nMutBases = 2L, nMutCodons = 2L, + nMutAAs = 1L, mutantName = "xyz:c_", mutantNameBase = "xyz:c_", + mutantNameCodon = "xyz:c_", mutantNameBaseHGVS = "xyz:c_", + mutantNameAA = "xyz:c_", mutantNameAAHGVS = "xyz:p_", + mutationTypes = c("nonsynonymous", "silent"))) + # read is filtered out + expect_identical( + test_compareToWildtype(varSeq = "CGTCGTCGA", wtSeq = "CGTCGTCGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 33.0, # <--- + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list()) + expect_identical( + test_compareToWildtype(varSeq = "CGACGACGA", 
wtSeq = "CGTCGTCGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = -1L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 2L, # <--- + mutNameDelimiter = ".", + collapseToWT = TRUE), + list()) }) +}) \ No newline at end of file From 544a5a593a75872e3ca39fb6e9e6178464ca61ef Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 18:48:40 +0200 Subject: [PATCH 39/44] ignore coverage of lines that are unreachable through the R API --- src/BKtree_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/BKtree_utils.h b/src/BKtree_utils.h index c42bc45..1f0f669 100644 --- a/src/BKtree_utils.h +++ b/src/BKtree_utils.h @@ -177,7 +177,7 @@ class BKtree { void print() { Rcout << size << "\n"; if (size > 0) { - _print(root, root, 0); // recursive node printing starting from root at zero identation + _print(root, root, 0); // recursive node printing starting from root at zero indentation } } @@ -398,10 +398,10 @@ class BKtree { // and the recurse on all children of n void _print(node* r = nullptr, node* n = nullptr, int depth = 0) { if (n == nullptr) { - n = root; + n = root; // # nocov } if (r == nullptr) { - r = root; + r = root; // # nocov } for (int i = 0; i < depth; ++i) { From d9c470425ec77bb341a71d29317198d879e37b2f Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 18:49:21 +0200 Subject: [PATCH 40/44] ignore coverage for lines that are hard to trigger in tests in various helper functions --- src/digestFastqs.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 31f6154..2f46daf 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -751,7 +751,7 @@ bool decomposeRead(const std::string sseq, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength); if (!post) { - return false; + return false; // # nocov } break; } @@ -1028,7 +1028,7 @@ 
void removeEOL(std::string &seq) { seq.pop_back(); } if (seq.back() == '\r') { - seq.pop_back(); + seq.pop_back(); // # nocov } } @@ -1458,12 +1458,12 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, // check for user interruption and print progress if ((start_size - tree.size) % 2000 == 0) { // every 2,000 queries (every ~1-2s) - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... check for user interrupt # nocov start // ... and give an update if (verbose && (start_size - tree.size) % 2000 == 0) { Rcout << " " << std::setprecision(4) << (100.0 * ((double)(start_size - tree.size) / (double)start_size)) << - "% done" << std::endl; + "% done" << std::endl; // # nocov end } } } @@ -1648,7 +1648,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, // if maxNReads has been reached in the previous file, break bool done; if (maxNReads != (-1) && nTot >= maxNReads) { - done = true; + done = true; // # nocov } else { done = false; } @@ -1689,7 +1689,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, nTot++; } if (nTot % 200000 == 0) { // every 200,000 reads (every ~1.6 seconds) - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... check for user interrupt # nocov } // if maxNReads has been reached, break @@ -2276,7 +2276,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, mutCounter++; // check for user interruption and print progress if (mutCounter % 200 == 0) { // every 200 queries - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... 
check for user interrupt # nocov } } From a1525ce8eaef701f731998a548cdef70914f295d Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 18:49:49 +0200 Subject: [PATCH 41/44] add test coverage of digestFastqsCpp --- tests/testthat/test_digestFastqs_cis.R | 62 +++++++++++++++++++++----- 1 file changed, 51 insertions(+), 11 deletions(-) diff --git a/tests/testthat/test_digestFastqs_cis.R b/tests/testthat/test_digestFastqs_cis.R index 5f0c328..10aaef2 100644 --- a/tests/testthat/test_digestFastqs_cis.R +++ b/tests/testthat/test_digestFastqs_cis.R @@ -36,10 +36,50 @@ test_that("digestFastqs works as expected for cis experiments", { filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 1024 ) + ## hit maxNReads + Ldef0 <- Ldef + Ldef0$maxNReads <- 10 + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$nbrTotal, 10L) + + ## hit nTooManyNinUMI + Ldef0 <- Ldef + Ldef0$elementsForward <- "UV" + Ldef0$elementsReverse <- "UVS" + Ldef0$elementLengthsForward <- c(115, 10) + Ldef0$elementLengthsReverse = c(111, 10, -1) + Ldef0$adapterForward <- Ldef0$adapterReverse <- "" + Ldef0$primerForward <- Ldef0$primerReverse <- "" + Ldef0$wildTypeForward <- "GGAAAAACTA" + Ldef0$constantForward <- Ldef0$constantReverse <- "" + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$f7_nbrTooManyNinUMI, 497L) + + ## hit nTooManyMutBases + Ldef0 <- Ldef + Ldef0$nbrMutatedBasesMaxForward <- 0 + Ldef0$nbrMutatedBasesMaxReverse <- 0 + Ldef0$nbrMutatedCodonsMaxForward <- -1 + Ldef0$nbrMutatedCodonsMaxReverse <- -1 + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$f10b_nbrTooManyMutBases, 753L) + + ## hit not all WT sequences are of the same length + Ldef0 <- Ldef + Ldef0$maxNReads <- 100 + Ldef0$useTreeWTmatch <- TRUE + Ldef0$mergeForwardReverse <- FALSE + Ldef0$wildTypeForward <- "" + 
Ldef0$wildTypeReverse <- c(w1 = "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTCTGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA", + w2 = "GACAGACCAACTAGAAGTTACATGAGAAGTCTGCTTTGCAGACCGAGATTGCCA") + res0 <- do.call(digestFastqs, Ldef0) + expect_true(any(grepl("_w1", res0$summaryTable$mutantNameBase))) + + ## now run with the unmodified arguments res <- do.call(digestFastqs, Ldef) ## Specify reverse WT - check that it's ignored @@ -72,10 +112,10 @@ test_that("digestFastqs works as expected for cis experiments", { expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { - expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), ignore_attr = TRUE) } - + expect_equal(sum(res$summaryTable$nbrReads), res$filterSummary$nbrRetained) expect_equal(sum(res$summaryTable$nbrReads == 2), 11L) expect_equal(sort(res$summaryTable$mutantName[res$summaryTable$nbrReads == 2]), @@ -116,11 +156,11 @@ test_that("digestFastqs works as expected for cis experiments", { "f:c.11T>G") expect_equal(res$summaryTable$mutantNameAAHGVS[res$summaryTable$sequence == example_seq], "f:p.(Leu4Arg)") - - expect_equal(res$summaryTable$sequenceAA, + + expect_equal(res$summaryTable$sequenceAA, as.character(Biostrings::translate( Biostrings::DNAStringSet(res$summaryTable$sequence)))) - expect_equal(as.numeric(res$summaryTable$varLengths), + expect_equal(as.numeric(res$summaryTable$varLengths), nchar(res$summaryTable$sequence)) expect_equal(sum(grepl("WT", res$summaryTable$mutantNameAA) & res$summaryTable$nbrMutBases > 0), 13) expect_true(all(grepl("silent", res$summaryTable$mutationTypes[grepl("WT", res$summaryTable$mutantNameAA) & res$summaryTable$nbrMutBases > 0]))) @@ -198,7 +238,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, 
verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 125 ) @@ -233,7 +273,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { - expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), ignore_attr = TRUE) } @@ -270,7 +310,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas "GCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA") expect_equal(res$summaryTable$mutantName[res$summaryTable$sequence == example_seq], "f.11.G") - expect_equal(res$summaryTable$mutantNameAA[res$summaryTable$sequence == example_seq], + expect_equal(res$summaryTable$mutantNameAA[res$summaryTable$sequence == example_seq], "f.4.R") expect_equal(res$summaryTable$mutantNameBase[res$summaryTable$sequence == example_seq], "f.11.G") @@ -280,7 +320,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas "f:c.11T>G") expect_equal(res$summaryTable$mutantNameAAHGVS[res$summaryTable$sequence == example_seq], "f:p.(Leu4Arg)") - + expect_equal(sum(res$errorStatistics$nbrMatchForward + res$errorStatistics$nbrMismatchForward), nchar(Ldef$constantForward[1]) * res$filterSummary$nbrRetained) expect_equal(sum(res$errorStatistics$nbrMatchReverse + res$errorStatistics$nbrMismatchReverse), @@ -348,7 +388,7 @@ test_that("digestFastqs gives the same results regardless of how WT matching is filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 1024 ) From 22fd02f23e5d4ecdba206f1812316eaed5fa17b7 Mon Sep 17 00:00:00 2001 From: mbstadler Date: Thu, 7 Aug 2025 18:50:28 +0200 Subject: [PATCH 42/44] add unit test coverage for makeBaseHGVS, 
makeAAHGVS and groupSimilarSequences --- tests/testthat/test_digestFastqs_helpers.R | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/testthat/test_digestFastqs_helpers.R b/tests/testthat/test_digestFastqs_helpers.R index f492227..cef832f 100644 --- a/tests/testthat/test_digestFastqs_helpers.R +++ b/tests/testthat/test_digestFastqs_helpers.R @@ -513,6 +513,10 @@ test_that("makeBaseHGVS works", { expect_equal(makeBaseHGVS(c("f.1.A", "r.4.A", "f.6.C"), ".", "TAGTGTAGTCCGT", "AAGAGCAGTCCGT"), "[1T>A;4_6delinsAGC]_") expect_equal(makeBaseHGVS(c("r.4.A"), ".", "TAGTGTAGTCCGT", "TAGAGTAGTCCGT"), "4T>A_") + expect_equal(makeBaseHGVS(character(0), ".", "AAA", "AAA"), "_") + expect_equal(makeBaseHGVS(c("f.1.A", "r.4.A", "r.8.A"), ".", + "TACTTTAT", "AAGAAAAA"), + "[1T>A;4T>A;8T>A]_") }) test_that("makeAAHGVS works", { @@ -524,6 +528,8 @@ test_that("makeAAHGVS works", { "[(Thr1Leu);(Asp2Met)]_") expect_equal(test_makeAAHGVS(c("f.1.L", "f.2.M", "f.7.M"), ".", "TDTLQAETDQLEDEKSALQTEIANLLKEKEKL"), "[(Thr1Leu);(Asp2Met);(Glu7Met)]_") + expect_equal(test_makeAAHGVS(character(0), ".", "TDTL"), "_") +}) ## ---------------------------------------------------------------------------- ## compareToWildtype @@ -586,4 +592,22 @@ test_that("compareToWildtype works", { collapseToWT = TRUE), list()) }) + +## ---------------------------------------------------------------------------- +## groupSimilarSequences +## ---------------------------------------------------------------------------- +test_that("groupSimilarSequences works", { + expect_identical( + groupSimilarSequences(c("AA", "AT", "AC", "TT"), 1:4, collapseMaxDist = 0), + data.frame(sequence = c("AA", "AT", "AC", "TT"), + representative = c("AA", "AT", "AC", "TT")) + ) + expect_identical( + groupSimilarSequences(c("AA", "AT", "AC", "TT"), 1:4, collapseMaxDist = 1), + data.frame(sequence = c("AA", "AT", "AC", "TT"), + representative = c("AC", "TT", "AC", "TT")) + ) + 
expect_warning(groupSimilarSequences(c("AA", "AT", "AC", "TTT"), 1:4, + collapseMaxDist = 1), + "not all of the same length") }) \ No newline at end of file From ccb6a0c2baff6ec97ebef5e583993819b966d111 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 7 Aug 2025 21:38:24 +0200 Subject: [PATCH 43/44] Skip likely unreachable sections --- src/digestFastqs.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 2f46daf..e8f9631 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -2125,13 +2125,13 @@ List digestFastqsCpp(std::vector fastqForwardVect, mutantNameBase.pop_back(); // remove '_' at the end mutantNameCodon.pop_back(); // remove '_' at the end } else { - // will we ever go in here? + // will we ever go in here? // # nocov start if (wildTypeForward[0].compare("") != 0 || (!noReverse && wildTypeReverse[0].compare("") != 0)) { mutantName = "WT"; mutantNameBase = "WT"; mutantNameCodon = "WT"; - } + } // # nocov end } if (mutantNameBaseHGVS.length() > 0) { // we have a (closest) wildtype name mutantNameBaseHGVS.pop_back(); // remove '_' at the end @@ -2140,11 +2140,11 @@ List digestFastqsCpp(std::vector fastqForwardVect, if (mutantNameAA.length() > 0) { // we have a least one mutation, or sequence-based name mutantNameAA.pop_back(); // remove '_' at the end } else { - // will we ever go in here? + // will we ever go in here? 
// # nocov start if (wildTypeForward[0].compare("") != 0 || (!noReverse && wildTypeReverse[0].compare("") != 0)) { mutantNameAA = "WT"; - } + } // # nocov end } if (!noReverse) { // "trans" experiment varSeqForward += (std::string("_") + varSeqReverse); From cc199f83e15134f69419f0e1001338d1ce7f7e7f Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Thu, 7 Aug 2025 21:40:27 +0200 Subject: [PATCH 44/44] Add unit tests --- tests/testthat/test_digestFastqs_primers.R | 147 +++++++++++++++++ tests/testthat/test_digestFastqs_trans.R | 150 ++++++++++++++++++ .../test_digestFastqs_trans_collapse.R | 111 +++++++++++++ 3 files changed, 408 insertions(+) diff --git a/tests/testthat/test_digestFastqs_primers.R b/tests/testthat/test_digestFastqs_primers.R index 0360baf..d138ca6 100644 --- a/tests/testthat/test_digestFastqs_primers.R +++ b/tests/testthat/test_digestFastqs_primers.R @@ -228,4 +228,151 @@ test_that("digestFastqs works as expected for primer experiments", { expect_equal(sum(res1$summaryTable[-idx, "nbrReads"]), res2$summaryTable[res2$summaryTable$mutantName == paste0(names(leu2)[1], ".0.WT_r.0.WT"), "nbrReads"]) + + ## --------------------------------------------------------------------------- + ## Swap forward and reverse reads, specify max nbr of mutated bases + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt2, fastqReverse = fqt1, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPSS", elementsReverse = "SPV", + elementLengthsForward = c(-1, -1, 96, -1), + elementLengthsReverse = c(-1, -1, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GAAAAAGGAAGCTGGAGAGA", + primerReverse = "GTCAGGTGGAGGCGGATCC", + wildTypeForward = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + wildTypeReverse 
= leu, + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = -1, + nbrMutatedCodonsMaxReverse = -1, + nbrMutatedBasesMaxForward = 0, + nbrMutatedBasesMaxReverse = 0, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + leu2 <- leu + names(leu2)[1:5] <- "ATF2" + Ldef$wildTypeReverse <- leu2 + res2 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary, res2$filterSummary) + expect_identical(res1$errorStatistics, res2$errorStatistics) + expect_equal(nrow(res1$summaryTable), nrow(res2$summaryTable) + 4) + idx <- which(!res1$summaryTable$mutantName %in% paste0("f.0.WT_", names(leu)[1:5], ".0.WT")) + expect_identical(res1$summaryTable[idx, ], + res2$summaryTable[match(res1$summaryTable$mutantName[idx], + res2$summaryTable$mutantName), ], + ignore_attr = TRUE) + expect_equal(sum(res1$summaryTable[-idx, "nbrReads"]), + res2$summaryTable[res2$summaryTable$mutantName == + paste0("f.0.WT_", names(leu2)[1], ".0.WT"), "nbrReads"]) + + ## --------------------------------------------------------------------------- + ## Too many WT hits, reverse reads + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPV", elementsReverse 
= "SPSS", + elementLengthsForward = c(-1, -1, -1), + elementLengthsReverse = c(-1, -1, 96, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GTCAGGTGGAGGCGGATCC", + primerReverse = "GAAAAAGGAAGCTGGAGAGA", + wildTypeForward = leu, + wildTypeReverse = c(A = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", B = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTTA"), + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 0, + nbrMutatedCodonsMaxReverse = 0, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary$f8_nbrTooManyBestWTHits, 809L) + + ## --------------------------------------------------------------------------- + ## Too short WT hit, reverse reads + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPV", elementsReverse = "SPVS", + elementLengthsForward = c(-1, -1, -1), + elementLengthsReverse = c(-1, -1, 96, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GTCAGGTGGAGGCGGATCC", + 
primerReverse = "GAAAAAGGAAGCTGGAGAGA", + wildTypeForward = leu, + wildTypeReverse = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCT", + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 0, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary$f3_nbrReadWrongLength, 439L) + expect_identical(res1$filterSummary$nbrRetained, 0L) }) diff --git a/tests/testthat/test_digestFastqs_trans.R b/tests/testthat/test_digestFastqs_trans.R index a21b61e..73ae347 100644 --- a/tests/testthat/test_digestFastqs_trans.R +++ b/tests/testthat/test_digestFastqs_trans.R @@ -1261,3 +1261,153 @@ test_that("digestFastqs works as expected for trans experiments, if collapsing t expect_equal(res$summaryTable$varLengths, "80,16_16,80") }) +test_that("reads are filtered out if there are too many constant hits", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SUCVV", 
elementsReverse = "SUCVV", + elementLengthsForward = c(1, 10, 18, 80, 16), + elementLengthsReverse = c(1, 8, 20, 16, 80), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTCTGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA", + wildTypeReverse = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + constantForward = "AACCGGAGGAGGGAGCTG", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + collapseToWTForward = TRUE, + collapseToWTReverse = TRUE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + ## Forward + Ldef1 <- Ldef; Ldef1$constantForward <- c(c1 = "AACCGGAGGAGGGAGCTA", c2 = "AACCGGAGGAGGGAGCTC") + res1 <- do.call(digestFastqs, Ldef1) + + expect_equal(res1$filterSummary$nbrTotal, 1000L) + expect_equal(res1$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res1$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res1$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res1$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res1$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res1$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res1$filterSummary$f7_nbrTooManyNinUMI, 0L) + 
expect_equal(res1$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res1$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res1$filterSummary$f10a_nbrTooManyMutCodons, 192L + 95L + 68L + 37L) + expect_equal(res1$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res1$filterSummary$f11_nbrForbiddenCodons, 6L + 2L) + expect_equal(res1$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res1$filterSummary$f13_nbrTooManyBestConstantHits, 279L) + expect_equal(res1$filterSummary$nbrRetained, 0L) + + ## Reverse + Ldef1 <- Ldef; Ldef1$constantReverse <- c(c1 = "GAAAAAGGAAGCTGGAGAGG", c2 = "GAAAAAGGAAGCTGGAGAGT") + res1 <- do.call(digestFastqs, Ldef1) + + expect_equal(res1$filterSummary$nbrTotal, 1000L) + expect_equal(res1$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res1$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res1$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res1$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res1$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res1$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res1$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res1$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res1$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res1$filterSummary$f10a_nbrTooManyMutCodons, 192L + 95L + 68L + 37L) + expect_equal(res1$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res1$filterSummary$f11_nbrForbiddenCodons, 6L + 2L) + expect_equal(res1$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res1$filterSummary$f13_nbrTooManyBestConstantHits, 279L) + expect_equal(res1$filterSummary$nbrRetained, 0L) +}) + +test_that("digestFastqs runs with revComplForward = TRUE", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, 
+ minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = TRUE, revComplReverse = FALSE, + elementsForward = "SUCVV", elementsReverse = "SUCVV", + elementLengthsForward = c(1, 10, 18, 80, 16), + elementLengthsReverse = c(1, 8, 20, 16, 80), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "TAGTTTTTCCTTCTCCTTCAGCAGGTTGGCAATCTCGGTCTGCAAAGCAGACTTCTCATCTTCTAGTTGGTCTGTCTCCGCTTGGAGTGTATCAGT", + wildTypeReverse = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + constantForward = "CAGCTCCCTCCTCCGGTT", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "", + useTreeWTmatch = FALSE, + collapseToWTForward = TRUE, + collapseToWTReverse = TRUE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + res <- do.call(digestFastqs, Ldef) + + expect_equal(res$filterSummary$nbrTotal, 1000L) + expect_equal(res$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res$filterSummary$f7_nbrTooManyNinUMI, 0L) + 
expect_equal(res$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res$filterSummary$f10a_nbrTooManyMutCodons, 394L) + expect_equal(res$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res$filterSummary$f11_nbrForbiddenCodons, 0L) + expect_equal(res$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res$filterSummary$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(res$filterSummary$nbrRetained, 285L) +}) diff --git a/tests/testthat/test_digestFastqs_trans_collapse.R b/tests/testthat/test_digestFastqs_trans_collapse.R index 2481834..146a354 100644 --- a/tests/testthat/test_digestFastqs_trans_collapse.R +++ b/tests/testthat/test_digestFastqs_trans_collapse.R @@ -109,6 +109,117 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_true(all(res$summaryTable$varLengths == "96_96")) }) +test_that("digestFastqs works as expected for trans experiments, when similar sequences are collapsed (relative UMI distance)", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SUCV", elementsReverse = "SUCV", + elementLengthsForward = c(1, 10, 18, 96), + elementLengthsReverse = c(1, 8, 20, 96), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "", + wildTypeReverse = "", + constantForward = "AACCGGAGGAGGGAGCTG", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, 
+ nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0.23, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = TRUE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + res <- do.call(digestFastqs, Ldef) + + ## Summarize single sample and collapse + se <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), + countType = "reads") + secoll <- collapseMutantsBySimilarity(se, assayName = "counts", + collapseMaxDist = 6, + collapseMinScore = 0, collapseMinRatio = 0, + verbose = FALSE) + seumi <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), + countType = "umis") + secollumi <- collapseMutantsBySimilarity(seumi, assayName = "counts", + scoreMethod = "rowMean", + collapseMaxDist = 6, + collapseMinScore = 0, collapseMinRatio = 0, + verbose = TRUE) + + expect_equal(res$filterSummary$nbrTotal, 1000L) + expect_equal(res$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res$filterSummary$f10a_nbrTooManyMutCodons, 0L) + expect_equal(res$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res$filterSummary$f11_nbrForbiddenCodons, 0L) + 
expect_equal(res$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res$filterSummary$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(res$filterSummary$nbrRetained, 679L) + + expect_equal(SummarizedExperiment::colData(secoll)$nbrTotal, 1000L) + expect_equal(SummarizedExperiment::colData(secoll)$f1_nbrAdapter, 314L) + expect_equal(SummarizedExperiment::colData(secoll)$f2_nbrNoPrimer, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f3_nbrReadWrongLength, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f4_nbrNoValidOverlap, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(SummarizedExperiment::colData(secoll)$f6_nbrTooManyNinVar, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f7_nbrTooManyNinUMI, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f8_nbrTooManyBestWTHits, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f9_nbrMutQualTooLow, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f10a_nbrTooManyMutCodons, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f10b_nbrTooManyMutBases, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f11_nbrForbiddenCodons, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f12_nbrTooManyMutConstant, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$nbrRetained, 679L) + + for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) + } + for (nm in c("fastqForward", "fastqReverse")) { + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + ignore_attr = TRUE) + } + + expect_equal(sum(res$summaryTable$nbrReads), res$filterSummary$nbrRetained) + expect_equal(sum(SummarizedExperiment::assay(secoll, "counts")), + 
res$filterSummary$nbrRetained) + expect_equal(nrow(res$summaryTable), 677L) + expect_equal(sum(res$summaryTable$nbrUmis), 679L) + expect_equal(nrow(secoll), 294L) + expect_equal(nrow(secollumi), 294L) + expect_true(all(res$summaryTable$varLengths == "96_96")) +}) + test_that("digestFastqs works as expected for trans experiments, when similar sequences are collapsed - extreme case", { fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan")