diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 021392e..db676a0 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,9 +18,9 @@ jobs: fail-fast: false matrix: config: - - { os: macOS-latest, bioc: 'release', curlConfigPath: '/usr/bin/'} - - { os: windows-latest, bioc: 'release'} - - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:RELEASE_3_20", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} + - { os: macOS-latest, bioc: 'devel', curlConfigPath: '/usr/bin/'} + - { os: windows-latest, bioc: 'devel'} + - { os: ubuntu-latest, image: "bioconductor/bioconductor_docker:devel", cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true diff --git a/DESCRIPTION b/DESCRIPTION index 790ad41..3d8989c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mutscan Title: Preprocessing and Analysis of Deep Mutational Scanning Data -Version: 0.3.4 +Version: 0.99.0 Authors@R: c(person(given = "Charlotte", family = "Soneson", @@ -20,7 +20,7 @@ Description: Provides functionality for processing and statistical analysis files to publication-ready visualizations. A broad range of library designs can be processed with a single, unified interface. 
Depends: - R (>= 3.5) + R (>= 4.5.0) Imports: BiocGenerics, S4Vectors, @@ -45,10 +45,9 @@ Imports: DT, ggrepel, IRanges, - zlibbioc, utils, DelayedArray, - lifecycle + tools Suggests: testthat (>= 3.0.0), BiocStyle, @@ -56,7 +55,8 @@ Suggests: Biostrings, pwalign, plotly, - scattermore + scattermore, + BiocManager SystemRequirements: GNU make biocViews: GeneticVariability, GenomicVariation, Preprocessing License: MIT + file LICENSE @@ -66,3 +66,5 @@ VignetteBuilder: knitr LinkingTo: Rcpp Config/testthat/edition: 3 +URL: https://github.com/fmicompbio/mutscan +BugReports: https://github.com/fmicompbio/mutscan/issues diff --git a/NAMESPACE b/NAMESPACE index 230375d..215b433 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,14 +18,12 @@ export(plotTotals) export(plotVolcano) export(relabelMutPositions) export(summarizeExperiment) -import(zlibbioc) -importFrom(BiocGenerics,paste) +importFrom(BiocGenerics,"rownames<-") importFrom(BiocGenerics,rownames) importFrom(DT,datatable) importFrom(DelayedArray,rowsum) importFrom(GGally,eval_data_col) importFrom(GGally,ggpairs) -importFrom(GGally,wrap) importFrom(IRanges,IntegerList) importFrom(Matrix,colSums) importFrom(Matrix,rowMeans) @@ -34,6 +32,7 @@ importFrom(Rcpp,sourceCpp) importFrom(S4Vectors,DataFrame) importFrom(S4Vectors,metadata) importFrom(S4Vectors,unstrsplit) +importFrom(SummarizedExperiment,"rowData<-") importFrom(SummarizedExperiment,SummarizedExperiment) importFrom(SummarizedExperiment,assay) importFrom(SummarizedExperiment,assayNames) @@ -41,7 +40,6 @@ importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) importFrom(csaw,normOffsets) -importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,arrange) importFrom(dplyr,bind_rows) @@ -72,6 +70,7 @@ importFrom(ggplot2,after_stat) importFrom(ggplot2,annotate) importFrom(ggplot2,coord_cartesian) importFrom(ggplot2,element_blank) +importFrom(ggplot2,element_rect) importFrom(ggplot2,element_text) 
importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_abline) @@ -99,9 +98,6 @@ importFrom(ggrepel,geom_text_repel) importFrom(grDevices,colorRamp) importFrom(grDevices,hcl.colors) importFrom(grDevices,rgb) -importFrom(lifecycle,deprecate_warn) -importFrom(lifecycle,deprecated) -importFrom(lifecycle,is_present) importFrom(limma,contrasts.fit) importFrom(limma,eBayes) importFrom(limma,lmFit) @@ -118,9 +114,10 @@ importFrom(tibble,rownames_to_column) importFrom(tidyr,gather) importFrom(tidyr,separate) importFrom(tidyr,separate_rows) -importFrom(tidyr,unite) importFrom(tidyselect,matches) +importFrom(tools,file_ext) importFrom(utils,globalVariables) +importFrom(utils,packageVersion) importFrom(utils,relist) importFrom(xfun,Rscript_call) useDynLib(mutscan, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 6dfec25..366297d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# mutscan 0.99.0 + +* Prepare for Bioconductor submission +* Remove deprecated arguments variableCollapseMaxDist, variableCollapseMinReads and variableCollapseMinRatio from digestFastqs (deprecated in mutscan 0.3.0) + # mutscan 0.3.4 * Allow use of scattermore/scattermost in plotPairs diff --git a/R/RcppExports.R b/R/RcppExports.R index 24c8c47..19b3356 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -24,6 +24,10 @@ calcNearestStringDist <- function(x, metric = "hamming", nThreads = 1L) { .Call(`_mutscan_calcNearestStringDist`, x, metric, nThreads) } +complement <- function(n) { + .Call(`_mutscan_complement`, n) +} + compareCodonPositions <- function(a, b, mutNameDelimiter) { .Call(`_mutscan_compareCodonPositions`, a, b, mutNameDelimiter) } @@ -40,6 +44,10 @@ test_makeAAHGVS <- function(mutationsSorted, mutNameDelimiter, wtSeq) { .Call(`_mutscan_test_makeAAHGVS`, mutationsSorted, mutNameDelimiter, wtSeq) } +test_compareToWildtype <- function(varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin = 0.0, nbrMutatedCodonsMax = -1L, codonPrefix = "c", nbrMutatedBasesMax = -1L, 
mutNameDelimiter = ".", collapseToWT = FALSE) { + .Call(`_mutscan_test_compareToWildtype`, varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin, nbrMutatedCodonsMax, codonPrefix, nbrMutatedBasesMax, mutNameDelimiter, collapseToWT) +} + test_decomposeRead <- function(sseq, squal, elements, elementLengths, primerSeqs, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength) { .Call(`_mutscan_test_decomposeRead`, sseq, squal, elements, elementLengths, primerSeqs, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength) } @@ -57,45 +65,45 @@ findClosestRefSeqEarlyStop <- function(varSeq, wtSeq, upperBoundMismatch, sim) { } #' Create a conversion table for collapsing similar sequences -#' @param seqs Character vector with nucleotide sequences (or pairs of -#' sequences concatenated with "_") to be collapsed. The sequences must +#' @param seqs Character vector with nucleotide sequences (or pairs of +#' sequences concatenated with "_") to be collapsed. The sequences must #' all be of the same length. #' @param scores Numeric vector of "scores" for the sequences. Typically -#' the total read/UMI count. A higher score will be preferred when -#' deciding which sequence to use as the representative for a group of +#' the total read/UMI count. A higher score will be preferred when +#' deciding which sequence to use as the representative for a group of #' collapsed sequences. -#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing -#' similar sequences. If the value is in [0, 1), it defines the maximal +#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing +#' similar sequences. If the value is in [0, 1), it defines the maximal #' Hamming distance in terms of a fraction of sequence length: #' (\code{round(collapseMaxDist * nchar(sequence))}). #' A value greater or equal to 1 is rounded and directly used as the maximum #' allowed Hamming distance. 
Note that sequences can only be -#' collapsed if they are all of the same length. -#' @param collapseMinScore Numeric scalar, indicating the minimum score -#' required for a sequence to be considered as a representative for a -#' group of similar sequences (i.e., to allow other sequences to be -#' collapsed into it). +#' collapsed if they are all of the same length. The default value is 0. +#' @param collapseMinScore Numeric scalar, indicating the minimum score +#' required for a sequence to be considered as a representative for a +#' group of similar sequences (i.e., to allow other sequences to be +#' collapsed into it). The default value is 0. #' @param collapseMinRatio Numeric scalar. During collapsing of -#' similar sequences, a low-frequency sequence will be collapsed -#' with a higher-frequency sequence only if the ratio between the -#' high-frequency and the low-frequency scores is at least this +#' similar sequences, a low-frequency sequence will be collapsed +#' with a higher-frequency sequence only if the ratio between the +#' high-frequency and the low-frequency scores is at least this #' high. A value of 0 indicates that no such check is performed. #' @param verbose Logical scalar, whether to print progress messages. -#' -#' @return A data.frame with two columns, containing the input sequences +#' +#' @return A data.frame with two columns, containing the input sequences #' and the representatives for the groups resulting from grouping similar #' sequences, respectively. 
-#' +#' #' @examples #' seqs <- c("AACGTAGCA", "ACCGTAGCA", "AACGGAGCA", "ATCGGAGCA", "TGAGGCATA") #' scores <- c(5, 1, 3, 1, 8) -#' groupSimilarSequences(seqs = seqs, scores = scores, -#' collapseMaxDist = 1, collapseMinScore = 0, +#' groupSimilarSequences(seqs = seqs, scores = scores, +#' collapseMaxDist = 1, collapseMinScore = 0, #' collapseMinRatio = 0, verbose = FALSE) -#' +#' #' @export #' @author Michael Stadler, Charlotte Soneson -groupSimilarSequences <- function(seqs, scores, collapseMaxDist, collapseMinScore, collapseMinRatio, verbose) { +groupSimilarSequences <- function(seqs, scores, collapseMaxDist = 0.0, collapseMinScore = 0.0, collapseMinRatio = 0.0, verbose = FALSE) { .Call(`_mutscan_groupSimilarSequences`, seqs, scores, collapseMaxDist, collapseMinScore, collapseMinRatio, verbose) } diff --git a/R/calculateFitnessScore.R b/R/calculateFitnessScore.R index ae0e8d8..a5c54d6 100644 --- a/R/calculateFitnessScore.R +++ b/R/calculateFitnessScore.R @@ -59,22 +59,23 @@ calculateFitnessScore <- function(se, pairingCol, ODCols, comparison, WTrows, ## pairingCol is in colData(se) .assertScalar(x = pairingCol, type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) + validValues = colnames(colData(se))) ## ODCols are all in colData(se) and contain numeric values .assertVector(x = ODCols, type = "character", rngLen = c(1, Inf), - validValues = colnames(SummarizedExperiment::colData(se))) + validValues = colnames(colData(se))) for (odc in ODCols) { - .assertVector(x = SummarizedExperiment::colData(se)[[odc]], + .assertVector(x = colData(se)[[odc]], type = "numeric") } ## comparison is length(3)-character with column and values in colData(se) .assertVector(x = comparison, type = "character", len = 3) .assertScalar(x = comparison[1], type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) - .assertVector(x = comparison[2:3], type = "character", - validValues = SummarizedExperiment::colData(se)[[comparison[1]]]) + 
validValues = colnames(colData(se))) + .assertVector( + x = comparison[2:3], type = "character", + validValues = colData(se)[[comparison[1]]]) ## there is exactly one observation per pairing and condition if (any(table(colData(se)[colData(se)[, comparison[1]] %in% @@ -100,25 +101,29 @@ calculateFitnessScore <- function(se, pairingCol, ODCols, comparison, WTrows, colData(se_denominator)[, pairingCol]) se_numerator <- se_numerator[, match(shared_repl, colData(se_numerator)[, pairingCol])] - se_denominator <- se_denominator[, match(shared_repl, - colData(se_denominator)[, pairingCol])] + se_denominator <- + se_denominator[, match(shared_repl, + colData(se_denominator)[, pairingCol])] ## ------------------------------------------------------------------------ ## calculate normalized counts (n_i) ## ------------------------------------------------------------------------ norm_counts_numerator <- sweep( as.matrix(assay(se_numerator, selAssay)), MARGIN = 2, - STATS = apply(colData(se_numerator)[, ODCols, drop = FALSE], MARGIN = 1, prod) / - Matrix::colSums(assay(se_numerator, selAssay)), + STATS = apply(colData(se_numerator)[, ODCols, drop = FALSE], + MARGIN = 1, prod) / + colSums(assay(se_numerator, selAssay)), FUN = "*") norm_counts_denominator <- sweep( as.matrix(assay(se_denominator, selAssay)), MARGIN = 2, - STATS = apply(colData(se_denominator)[, ODCols, drop = FALSE], MARGIN = 1, prod) / - Matrix::colSums(assay(se_denominator, selAssay)), + STATS = apply(colData(se_denominator)[, ODCols, drop = FALSE], + MARGIN = 1, prod) / + colSums(assay(se_denominator, selAssay)), FUN = "*") n <- log2(norm_counts_numerator/norm_counts_denominator) n[!is.finite(n)] <- NA - colnames(n) <- paste0(comparison[2], "_vs_", comparison[3], "_repl", shared_repl) + colnames(n) <- paste0(comparison[2], "_vs_", comparison[3], + "_repl", shared_repl) ## ------------------------------------------------------------------------ @@ -131,4 +136,4 @@ calculateFitnessScore <- function(se, 
pairingCol, ODCols, comparison, WTrows, } fitness <- sweep(n, MARGIN = 2, STATS = nWT, FUN = "/") return(fitness) -} \ No newline at end of file +} diff --git a/R/calculateRelativeFC.R b/R/calculateRelativeFC.R index 3c3f8e1..63c7554 100644 --- a/R/calculateRelativeFC.R +++ b/R/calculateRelativeFC.R @@ -10,10 +10,10 @@ #' @param coef Coefficient(s) to test with edgeR or limma. #' @param contrast Numeric contrast to test with edgeR or limma. #' @param WTrows Vector of row names that will be used as the reference when -#' calculating logFCs and statistics. If more than one value is provided, the -#' sum of the corresponding counts is used to generate offsets. If NULL, -#' offsets will be defined as the effective library sizes (using TMM -#' normalization factors). +#' calculating logFCs and statistics. If more than one value is provided, +#' the sum of the corresponding counts is used to generate offsets. If +#' \code{NULL}, offsets will be defined as the effective library sizes +#' (using TMM normalization factors). #' @param selAssay Assay to select from \code{se} for the analysis. #' @param pseudocount Pseudocount to add when calculating log-fold changes. #' @param method Either 'edgeR' or 'limma'. If set to 'limma', voom is used to @@ -21,9 +21,9 @@ #' limma. In this case, the results also contain the standard errors of the #' logFCs. #' @param normMethod Character scalar indicating which normalization method -#' should be used to calculate size factors. Should be either \code{"TMM"} or -#' \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} or -#' \code{"sum"} when \code{WTrows} is provided. +#' should be used to calculate size factors. Should be either \code{"TMM"} +#' or \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} +#' or \code{"sum"} when \code{WTrows} is provided. 
#' #' @author Charlotte Soneson, Michael Stadler #' @@ -34,15 +34,16 @@ #' #' @importFrom edgeR DGEList scaleOffset estimateDisp glmQLFit glmQLFTest #' topTags predFC topTags calcNormFactors getNormLibSizes -#' @importFrom SummarizedExperiment colData assay assayNames +#' @importFrom SummarizedExperiment colData assay assayNames assays #' @importFrom limma voom eBayes topTable lmFit contrasts.fit #' @importFrom csaw normOffsets #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] -#' design <- stats::model.matrix(~ Replicate + Condition, -#' data = SummarizedExperiment::colData(se)) +#' design <- model.matrix(~ Replicate + Condition, +#' data = colData(se)) #' #' ## Calculate "absolute" log-fold changes with edgeR #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output", @@ -71,9 +72,9 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, "TMM", "sum")) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character") - if (!(selAssay %in% SummarizedExperiment::assayNames(se))) { - if (is.null(SummarizedExperiment::assayNames(se)) && - length(SummarizedExperiment::assays(se)) == 1) { + if (!(selAssay %in% assayNames(se))) { + if (is.null(assayNames(se)) && + length(assays(se)) == 1) { warning("No assayNames provided in 'se', but only one ", "assay present - using that.") selAssay <- 1L @@ -81,83 +82,86 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, stop("The provided 'selAssay' not present in 'se'.") } } - + if (!is.null(WTrows)) { - .assertVector(x = WTrows, type = "character", validValues = rownames(se)) + .assertVector(x = WTrows, type = "character", + validValues = rownames(se)) } - + if (nrow(design) != ncol(se)) { stop("The number of rows in 'design' (", nrow(design), ") is not equal to the number", " of columns in 'se' (", ncol(se), ").") } - + .assertScalar(x = 
pseudocount, type = "numeric", rngIncl = c(0, Inf)) - + .assertScalar(x = method, type = "character", validValues = c("edgeR", "limma")) - + if (normMethod %in% c("csaw", "TMM") && !is.null(WTrows)) { stop("normMethod = '", normMethod, "' can only be used when WTrows is NULL.") } - + if (normMethod %in% c("sum", "geomean") && is.null(WTrows)) { stop("normMethod = '", normMethod, "' can only be used when WTrows is not NULL.") } - + if (normMethod == "csaw" && method == "limma") { stop("normMethod = 'csaw' can only be used with method = 'edgeR'.") } - + .assertScalar(x = normMethod, type = "character", validValues = c("csaw", "TMM", "geomean", "sum")) - + if (is.null(coef) && is.null(contrast)) { stop("'coef' and 'contrast' can not both be NULL.") } - + if (!is.null(contrast) && !is.null(dim(contrast))) { stop("'contrast' must be a vector.") } - + ## Create DGEList from SummarizedExperiment - dge <- edgeR::DGEList(counts = as.matrix(SummarizedExperiment::assay(se, selAssay)), - samples = SummarizedExperiment::colData(se)) + dge <- DGEList( + counts = as.matrix(assay(se, selAssay)), + samples = colData(se)) if (normMethod == "csaw") { ## csaw normalization - also calculate normalization factors since ## aveLogCPM does not use provided offsets ## In this case, we know that WTrows is NULL, so all features ## will be used for the normalization - dge <- edgeR::calcNormFactors(dge) - dge <- csaw::normOffsets(dge) + dge <- calcNormFactors(dge) + dge <- normOffsets(dge) } else if (normMethod == "TMM") { ## TMM normalization, with all features - dge <- edgeR::calcNormFactors(dge) + dge <- calcNormFactors(dge) } else if (normMethod == "geomean") { ## Use size factors (offsets) derived from the geometric mean ## of the WT rows tmp0 <- dge$counts[WTrows, , drop = FALSE] tmp0 <- tmp0[apply(tmp0, 1, min) > 0, , drop = FALSE] logoffsets <- apply(tmp0, 2, function(s) mean(log(s))) - dge <- edgeR::scaleOffset(dge, logoffsets) + dge <- scaleOffset(dge, logoffsets) } else if 
(normMethod == "sum") { ## Use size factors (offsets) derived from the sum of the ## WT rows tmp0 <- dge$counts[WTrows, , drop = FALSE] - dge <- edgeR::scaleOffset(dge, log(colSums(tmp0))) + dge <- scaleOffset(dge, log(colSums(tmp0))) } - + ## Fit model and perform test if (method == "edgeR") { - dge <- edgeR::estimateDisp(dge, design = design) - fit <- edgeR::glmQLFit(dge, design = design) - qlf <- edgeR::glmQLFTest(fit, coef = coef, contrast = contrast) - tt <- edgeR::topTags(qlf, n = Inf, sort.by = "none")$table + dge <- estimateDisp(dge, design = design) + fit <- glmQLFit(dge, design = design) + qlf <- glmQLFTest(fit, coef = coef, contrast = contrast) + tt <- topTags(qlf, n = Inf, sort.by = "none")$table ## Calculate shrunken fold changes. Only when testing ## a single coefficient or contrast - predfc <- edgeR::predFC(dge, design = design, prior.count = pseudocount) + predfc <- predFC(dge, design = design, + prior.count = pseudocount) if (length(coef) == 1 && is.null(contrast)) { tt$logFC_shrunk <- predfc[, coef] } else if (!is.null(contrast)) { @@ -168,19 +172,19 @@ calculateRelativeFC <- function(se, design, coef = NULL, contrast = NULL, tt$df.test <- qlf$df.test } else if (method == "limma") { if (!is.null(dge$offset)) { - vm <- limma::voom(dge, design = design, lib.size = exp(dge$offset)) + vm <- voom(dge, design = design, lib.size = exp(dge$offset)) } else { - vm <- limma::voom(dge, design = design, - lib.size = edgeR::getNormLibSizes(dge)) + vm <- voom(dge, design = design, + lib.size = getNormLibSizes(dge)) } - fit <- limma::lmFit(vm, design = design) + fit <- lmFit(vm, design = design) if (!is.null(contrast)) { - fit <- limma::contrasts.fit(fit, contrasts = contrast) + fit <- contrasts.fit(fit, contrasts = contrast) coef <- 1 } - fit <- limma::eBayes(fit) - tt <- limma::topTable(fit, coef = coef, - confint = TRUE, number = Inf, sort.by = "none") + fit <- eBayes(fit) + tt <- topTable(fit, coef = coef, + confint = TRUE, number = Inf, sort.by = "none") if 
(length(coef) == 1) { tt$se.logFC <- sqrt(fit$s2.post) * fit$stdev.unscaled[, coef] } diff --git a/R/collapseMutantsByAA.R b/R/collapseMutantsByAA.R index c0892c7..86166c3 100644 --- a/R/collapseMutantsByAA.R +++ b/R/collapseMutantsByAA.R @@ -1,14 +1,15 @@ #' Collapse mutants by similarity #' #' These functions can be used to collapse variants, either by similarity or -#' according to a pre-defined grouping. The functions \code{collapseMutants} and -#' \code{collapseMutantsByAA} assume that a grouping variable is available as a -#' column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience -#' function for the case when this column is "mutantNameAA", and is provided -#' for backwards compatibility). The \code{collapseMutantsBySimilarity} will -#' generate the grouping variable based on user-provided thresholds on the -#' sequence similarity (defined by the Hamming distance), and subsequently -#' collapse based on the derived grouping. +#' according to a pre-defined grouping. The functions \code{collapseMutants} +#' and \code{collapseMutantsByAA} assume that a grouping variable is available +#' as a column in \code{rowData(se)} (\code{collapseMutantsByAA} is a +#' convenience function for the case when this column is "mutantNameAA", and +#' is provided for backwards compatibility). The +#' \code{collapseMutantsBySimilarity} will generate the grouping variable +#' based on user-provided thresholds on the sequence similarity (defined by +#' the Hamming distance), and subsequently collapse based on the derived +#' grouping. #' #' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}} #' generated by \code{\link{summarizeExperiment}} @@ -25,8 +26,8 @@ #' similar sequences. If the value is in [0, 1), it defines the maximal #' Hamming distance in terms of a fraction of sequence length: #' (\code{round(collapseMaxDist * nchar(sequence))}). 
-#' A value greater or equal to 1 is rounded and directly used as the maximum -#' allowed Hamming distance. Note that sequences can only be +#' A value greater or equal to 1 is rounded and directly used as the +#' maximum allowed Hamming distance. Note that sequences can only be #' collapsed if they are all of the same length. #' @param collapseMinScore Numeric scalar, indicating the minimum score for the #' sequence to be considered for collapsing with similar sequences. @@ -45,6 +46,7 @@ #' @importFrom Matrix rowSums rowMeans #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' ## The rows of this object correspond to individual codon variants @@ -58,7 +60,7 @@ #' head(rownames(sec)) #' ## The mutantName column contains the individual codon variants that were #' ## collapsed -#' head(SummarizedExperiment::rowData(sec)) +#' head(rowData(sec)) #' #' ## Collapse similar sequences #' sec2 <- collapseMutantsBySimilarity( @@ -67,9 +69,9 @@ #' collapseMinScore = 0, collapseMinRatio = 0) #' dim(sec2) #' head(rownames(sec2)) -#' head(SummarizedExperiment::rowData(sec2)) +#' head(rowData(sec2)) #' ## collapsed count matrix -#' SummarizedExperiment::assay(sec2, "counts") +#' assay(sec2, "counts") #' collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", sequenceCol = "sequence", @@ -80,18 +82,18 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ## Check input arguments .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = assayName, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) .assertScalar(x = scoreMethod, type = "character", validValues = c("rowSum", "rowMean")) .assertScalar(x = sequenceCol, type = "character", - validValues = colnames(SummarizedExperiment::rowData(se))) + validValues = colnames(rowData(se))) .assertScalar(x = collapseMaxDist, 
type = "numeric") .assertScalar(x = collapseMinScore, type = "numeric") .assertScalar(x = collapseMinRatio, type = "numeric") .assertScalar(x = verbose, type = "logical") ## All sequences must be the same length, and can only consist of ACGT_ - seqs <- SummarizedExperiment::rowData(se)[, sequenceCol] + seqs <- rowData(se)[, sequenceCol] if (length(unique(nchar(seqs))) != 1) { stop("All entries in the sequence column must have the same length") } @@ -102,9 +104,9 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ## Get matching table if (scoreMethod == "rowSum") { - scores <- Matrix::rowSums(SummarizedExperiment::assay(se, assayName)) + scores <- rowSums(assay(se, assayName)) } else if (scoreMethod == "rowMean") { - scores <- Matrix::rowMeans(SummarizedExperiment::assay(se, assayName)) + scores <- rowMeans(assay(se, assayName)) } matchTable <- groupSimilarSequences( seqs = seqs, @@ -116,12 +118,13 @@ collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum", ) matchTable$representative <- rownames(se)[ match(matchTable$representative, - SummarizedExperiment::rowData(se)[, sequenceCol])] - + rowData(se)[, sequenceCol])] + ## Add grouping variable - SummarizedExperiment::rowData(se)$collapseCol <- - matchTable$representative[match(SummarizedExperiment::rowData(se)[, sequenceCol], - matchTable$sequence)] + rowData(se)$collapseCol <- + matchTable$representative[ + match(rowData(se)[, sequenceCol], + matchTable$sequence)] ## Collapse se <- collapseMutants(se, nameCol = "collapseCol") @@ -152,9 +155,10 @@ collapseMutantsByAA <- function(se) { #' @export #' #' @importFrom DelayedArray rowsum -#' @importFrom S4Vectors metadata +#' @importFrom S4Vectors metadata DataFrame #' @importFrom SummarizedExperiment assays rowData SummarizedExperiment colData -#' @importFrom dplyr across group_by summarize +#' rowData<- +#' @importFrom dplyr across group_by summarize full_join #' @importFrom stats setNames #' #' @rdname 
collapseMutantsBySimilarity @@ -162,51 +166,51 @@ collapseMutantsByAA <- function(se) { collapseMutants <- function(se, nameCol) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = nameCol, type = "character", - validValues = colnames(SummarizedExperiment::rowData(se))) + validValues = colnames(rowData(se))) ## Collapse assays ## Matrix.utils has been removed from CRAN - # aList <- lapply(SummarizedExperiment::assays(se), function(a) { + # aList <- lapply(assays(se), function(a) { # Matrix.utils::aggregate.Matrix( # x = a, - # groupings = factor(SummarizedExperiment::rowData(se)[[nameCol]]), + # groupings = factor(rowData(se)[[nameCol]]), # fun = "colSums") # }) - aList <- lapply(SummarizedExperiment::assays(se), function(a) { - DelayedArray::rowsum( + aList <- lapply(assays(se), function(a) { + rowsum( x = a, - group = factor(SummarizedExperiment::rowData(se)[[nameCol]]) + group = factor(rowData(se)[[nameCol]]) ) }) ## Collapse rowData - simple columns - rd <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]], - SummarizedExperiment::rowData(se)$sequence) %>% - stats::setNames(c(nameCol, "sequence")) + rd <- mergeValues(rowData(se)[[nameCol]], + rowData(se)$sequence) |> + setNames(c(nameCol, "sequence")) for (v in setdiff( intersect(c("mutantName", "mutantNameBase", "mutantNameBaseHGVS", "mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS", "sequenceAA", "mutationTypes", "nbrMutBases", "nbrMutCodons", "nbrMutAAs", "varLengths"), - colnames(SummarizedExperiment::rowData(se))), + colnames(rowData(se))), nameCol)) { - tmp <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]], - SummarizedExperiment::rowData(se)[[v]]) + tmp <- mergeValues(rowData(se)[[nameCol]], + rowData(se)[[v]]) rd[[v]] <- tmp$valueColl[match(rd[[nameCol]], tmp$mutantNameColl)] } - rd0 <- as.data.frame(SummarizedExperiment::rowData(se)) %>% - dplyr::group_by(.data[[nameCol]]) %>% - dplyr::summarize( - dplyr::across(c("minNbrMutBases", "minNbrMutCodons", - 
"minNbrMutAAs"), - function(x) min(x)), - dplyr::across(c("maxNbrMutBases", "maxNbrMutCodons", - "maxNbrMutAAs"), - function(x) max(x))) - rd <- rd %>% - dplyr::full_join(rd0, by = nameCol) + rd0 <- as.data.frame(rowData(se)) |> + group_by(.data[[nameCol]]) |> + summarize( + across(c("minNbrMutBases", "minNbrMutCodons", + "minNbrMutAAs"), + function(x) min(x)), + across(c("maxNbrMutBases", "maxNbrMutCodons", + "maxNbrMutAAs"), + function(x) max(x))) + rd <- rd |> + full_join(rd0, by = nameCol) - rd <- S4Vectors::DataFrame(rd) + rd <- DataFrame(rd) rownames(rd) <- rd[[nameCol]] ## Create SummarizedExperiment @@ -216,11 +220,11 @@ collapseMutants <- function(se, nameCol) { stopifnot(all(rownames(aList[[i]]) == rn1)) stopifnot(all(colnames(aList[[i]]) == cn1)) } - stopifnot(cn1 == rownames(SummarizedExperiment::colData(se))) - SummarizedExperiment::SummarizedExperiment( + stopifnot(cn1 == rownames(colData(se))) + SummarizedExperiment( assays = aList, - colData = SummarizedExperiment::colData(se), - metadata = S4Vectors::metadata(se), + colData = colData(se), + metadata = metadata(se), rowData = rd[rn1, ] ) } diff --git a/R/digestFastqs.R b/R/digestFastqs.R index fe43ec0..9790779 100644 --- a/R/digestFastqs.R +++ b/R/digestFastqs.R @@ -2,8 +2,8 @@ #' #' Read sequences for one or a pair of fastq files and digest them (extract #' umis, constant and variable parts, filter, extract mismatch information from -#' constant and count the observed unique variable parts). Alternatively, primer -#' sequences could be specified, in which case the sequence immediately +#' constant and count the observed unique variable parts). Alternatively, +#' primer sequences could be specified, in which case the sequence immediately #' following the primer will be considered the variable sequence. #' #' The processing of a read pair goes as follows: @@ -23,8 +23,8 @@ #' either the forward or reverse read (or the merged read). 
#' \item Filter out the read (pair) if the number of Ns in the variable region #' exceeds \code{variableNMaxForward}/\code{variableNMaxReverse}. -#' \item Filter out the read (pair) if the number of Ns in the combined forward -#' and reverse UMI sequence exceeds \code{umiNMax} +#' \item Filter out the read (pair) if the number of Ns in the combined +#' forward and reverse UMI sequence exceeds \code{umiNMax} #' \item If one or more wild type sequences (for the variable region) are #' provided, find the mismatches between the (forward/reverse) variable region #' and the provided wild type sequence (if more than one wild type sequence is @@ -37,9 +37,9 @@ #' the codons encoded by #' \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. #' \item Assign a 'mutation name' to the read (pair). This name is a -#' combination of parts of the form XX{.}YY{.}NNN, where XX is the name of the -#' most similar reference sequence, YY is the mutated codon number, and NNN is -#' the mutated codon. {.} is a delimiter, specified via +#' combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of +#' the most similar reference sequence, YY is the mutated codon number, and +#' NNN is the mutated codon. \{.\} is a delimiter, specified via #' \code{mutNameDelimiter}. If no wildtype sequences are provided, the #' variable sequence will be used as the mutation name'. #'} @@ -83,8 +83,8 @@ #' but there can be at most one occurrence of P. If a given letter is #' included multiple times, the corresponding sequences will be #' concatenated in the output. -#' @param elementLengthsForward,elementLengthsReverse Numeric vectors containing -#' the lengths of each read component from +#' @param elementLengthsForward,elementLengthsReverse Numeric vectors +#' containing the lengths of each read component from #' \code{elementsForward}/\code{elementsReverse}, respectively. 
If the #' length of one element is set to -1, it will be inferred from the other #' lengths (as the remainder of the read). At most one number (or one @@ -96,11 +96,11 @@ #' contains the corresponding adapter sequence, the sequence pair will be #' filtered out. If set to \code{NULL}, no adapter filtering is performed. #' The number of filtered read pairs are reported in the return value. -#' @param primerForward,primerReverse Character vectors, representing the primer -#' sequence(s) for forward/reverse reads, respectively. Only read pairs that -#' contain perfect matches to both the forward and reverse primers (if -#' given) will be retained. Multiple primers can be specified - they will be -#' considered in order and the first match will be used. +#' @param primerForward,primerReverse Character vectors, representing the +#' primer sequence(s) for forward/reverse reads, respectively. Only read +#' pairs that contain perfect matches to both the forward and reverse +#' primers (if given) will be retained. Multiple primers can be specified - +#' they will be considered in order and the first match will be used. #' @param wildTypeForward,wildTypeReverse Character scalars or named character #' vectors, the wild type sequence for the forward and reverse variable #' region. If given as a single string, the reference sequence will be @@ -146,29 +146,26 @@ #' any mutated base has a Phred score lower than \code{mutatedPhredMin}, #' the read (pair) will be discarded. #' @param mutNameDelimiter Character scalar, the delimiter used in the naming -#' of mutants. Generally, mutants will be named as XX{.}YY{.}NNN, where XX -#' is the closest provided reference sequence, YY is the mutated base or +#' of mutants. 
Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where +#' XX is the closest provided reference sequence, YY is the mutated base or #' codon number (depending on whether \code{nbrMutatedBases*} or #' \code{nbrMutatedCodons*} is specified), and NNN is the -#' mutated base or codon. Here, {.} is the provided \code{mutNameDelimiter}. -#' The delimiter must be a single character (not "_"), and can not appear -#' in any of the provided reference sequence names. +#' mutated base or codon. Here, \{.\} is the provided +#' \code{mutNameDelimiter}. The delimiter must be a single character +#' (not "_"), and can not appear in any of the provided reference sequence +#' names. #' @param constantMaxDistForward,constantMaxDistReverse Numeric scalars, the #' maximum allowed Hamming distance between the extracted and expected #' constant sequence. If multiple constant sequences are provided, the most #' similar one is used. Reads with a larger distance to the expected #' constant sequence are discarded. If set to -1, no filtering is done. -#' @param variableCollapseMaxDist,variableCollapseMinReads,variableCollapseMinRatio -#' Deprecated. Collapsing of variable sequences is no longer performed in -#' \code{digestFastqs}. Please use \code{collapseMutantsBySimilarity} -#' instead. #' @param umiCollapseMaxDist Numeric scalar defining #' the tolerances for collapsing similar UMI sequences. If the value is #' in [0, 1), it defines the maximal Hamming distance in terms of a #' fraction of sequence length: #' (\code{round(umiCollapseMaxDist * nchar(umiSeq))}). -#' A value greater or equal to 1 is rounded and directly used as the maximum -#' allowed Hamming distance. +#' A value greater or equal to 1 is rounded and directly used as the +#' maximum allowed Hamming distance. #' @param filteredReadsFastqForward,filteredReadsFastqReverse Character #' scalars, the names of a (pair of) FASTQ file(s) where filtered-out reads #' will be written. 
The name(s) should end in .gz (the output will always @@ -189,20 +186,21 @@ #' @return A list with four entries: #' \describe{ #' \item{summaryTable}{A \code{data.frame} that contains, for each observed -#' mutation combination, the corresponding variable region sequences (or pair of -#' sequences), the number of observed such sequences, and the number of unique -#' UMIs observed for the sequence. It also has additional columns: 'maxNbrReads' -#' contains the number of reads for the most frequent observed sequence -#' represented by the feature (only relevant if similar variable regions are -#' collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give the number of -#' mutated bases, codons or amino acids in each variant. Alternative variant -#' names based on base, codon or amino acid sequence are provided in columns -#' 'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In addition, -#' 'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino acid-based -#' names following the HGVS nomenclature (https://varnomen.hgvs.org/). Please -#' note that the provided reference sequence names are used for the HGVS -#' sequence identifiers. It is up to the user to use appropriately named -#' reference sequences in order to obtain valid HGVS variant names.} +#' mutation combination, the corresponding variable region sequences (or pair +#' of sequences), the number of observed such sequences, and the number of +#' unique UMIs observed for the sequence. It also has additional columns: +#' 'maxNbrReads' contains the number of reads for the most frequent observed +#' sequence represented by the feature (only relevant if similar variable +#' regions are collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give +#' the number of mutated bases, codons or amino acids in each variant. +#' Alternative variant names based on base, codon or amino acid sequence are +#' provided in columns 'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. 
In +#' addition, 'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino +#' acid-based names following the HGVS nomenclature +#' (https://varnomen.hgvs.org/). Please note that the provided reference +#' sequence names are used for the HGVS sequence identifiers. It is up to the +#' user to use appropriately named reference sequences in order to obtain +#' valid HGVS variant names.} #' \item{filterSummary}{A \code{data.frame} that contains the number of input #' reads, the number of reads filtered out in the processing, and the number of #' retained reads. The filters are named according to the convention @@ -211,11 +209,11 @@ #' applied successively, and the reads filtered out in one step are not #' considered for successive filtering steps.} #' \item{errorStatistics}{A \code{data.frame} that contains, for each Phred -#' quality score between 0 and 99, the number of bases in the extracted constant -#' sequences with that quality score that match/mismatch with the provided -#' reference constant sequence.} -#' \item{parameters}{A \code{list} with all parameter settings that were used in -#' the processing. Also contains the version of the package and the time of +#' quality score between 0 and 99, the number of bases in the extracted +#' constant sequences with that quality score that match/mismatch with the +#' provided reference constant sequence.} +#' \item{parameters}{A \code{list} with all parameter settings that were used +#' in the processing. 
Also contains the version of the package and the time of #' processing.} #' } #' @@ -223,7 +221,7 @@ #' ## See the vignette for complete worked-out examples for different types of #' ## data sets #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, assume that the full read represents #' ## the variable region #' out <- digestFastqs( @@ -236,7 +234,7 @@ #' ## Filter summary #' out$filterSummary #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, specify the read as a combination of #' ## UMI, constant region and variable region (skip the first base) #' out <- digestFastqs( @@ -252,7 +250,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a single-end data set, specify the read as a combination of #' ## UMI, constant region and variable region (skip the first base), provide #' ## the wild type sequence to compare the variable region to and limit the @@ -262,8 +260,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Table with read counts and mutant information @@ -273,7 +272,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## 
----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a paired-end data set where both the forward and reverse reads #' ## contain the same variable region and thus should be merged to generate #' ## the final variable sequence, specify the reads as a combination of @@ -285,13 +284,15 @@ #' package = "mutscan"), #' fastqReverse = system.file("extdata", "cisInput_2.fastq.gz", #' package = "mutscan"), -#' mergeForwardReverse = TRUE, revComplForward = FALSE, revComplReverse = TRUE, +#' mergeForwardReverse = TRUE, +#' revComplForward = FALSE, revComplReverse = TRUE, #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' elementsReverse = "SUCVS", elementLengthsReverse = c(1, 7, 17, 96, -1), #' constantForward = "AACCGGAGGAGGGAGCTG", #' constantReverse = "GAGTTCATCCTGGCAGC", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Table with read counts and mutant information @@ -301,7 +302,7 @@ #' ## Error statistics #' out$errorStatistics #' -#' ## ----------------------------------------------------------------------- ## +#' ## ---------------------------------------------------------------------- ## #' ## Process a paired-end data set where the forward and reverse reads #' ## contain variable regions corresponding to different proteins, and thus #' ## should not be merged, specify the reads as a combination of @@ -318,10 +319,12 @@ #' elementsReverse = "SUCV", elementLengthsReverse = c(1, 8, 20, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", #' constantReverse = "GAAAAAGGAAGCTGGAGAGA", -#' wildTypeForward = c(FOS = 
paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), -#' wildTypeReverse = c(JUN = paste0("ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", -#' "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeReverse = c(JUN = paste0( +#' "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", +#' "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), #' nbrMutatedCodonsMaxForward = 1, #' nbrMutatedCodonsMaxReverse = 1 #' ) @@ -333,10 +336,10 @@ #' out$errorStatistics #' #' @export -#' @import zlibbioc -#' @importFrom lifecycle deprecated is_present deprecate_warn +#' @importFrom utils packageVersion digestFastqs <- function(fastqForward, fastqReverse = NULL, - mergeForwardReverse = FALSE, minOverlap = 0, maxOverlap = 0, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, minMergedLength = 0, maxMergedLength = 0, maxFracMismatchOverlap = 1, greedyOverlap = TRUE, revComplForward = FALSE, revComplReverse = FALSE, @@ -365,66 +368,26 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, mutNameDelimiter = ".", constantMaxDistForward = -1, constantMaxDistReverse = -1, - variableCollapseMaxDist = deprecated(), - variableCollapseMinReads = deprecated(), - variableCollapseMinRatio = deprecated(), umiCollapseMaxDist = 0.0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, nThreads = 1, chunkSize = 100000, maxReadLength = 1024) { - ## pre-flight checks --------------------------------------------------------- - ## deprecated arguments - deprecMessageColl <- paste0( - "Starting from mutscan v0.3.0, collapsing of variable sequences is no ", - "longer supported by digestFastqs(), and arguments ", - "variableCollapseMaxDist, variableCollapseMinReads ", - "and variableCollapseMinRatio will be ignored. 
Please run ", - "summarizeExperiment() to generate a SummarizedExperiment object, and ", - "then call collapseMutantsBySimilarity() to collapse variable ", - "sequences in a consistent way across all samples." - ) - if (lifecycle::is_present(variableCollapseMaxDist)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMaxDist = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - if (lifecycle::is_present(variableCollapseMinReads)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMinReads = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - if (lifecycle::is_present(variableCollapseMinRatio)) { - # Signal the deprecation to the user - lifecycle::deprecate_warn( - "0.3.0", "mutscan::digestFastqs(variableCollapseMinRatio = )", - "mutscan::collapseMutantsBySimilarity()", - id = "digestFastqs_variableCollapse", - details = deprecMessageColl, - always = TRUE) - } - - - + ## pre-flight checks ------------------------------------------------------ ## fastq files exist if (length(fastqForward) < 1 || !all(file.exists(fastqForward)) || - (!is.null(fastqReverse) && (length(fastqReverse) != length(fastqForward) || - !all(file.exists(fastqReverse))))) { - stop("'fastqForward' and 'fastqReverse' must point to one or several matching existing files"); + (!is.null(fastqReverse) && + (length(fastqReverse) != length(fastqForward) || + !all(file.exists(fastqReverse))))) { + stop("'fastqForward' and 'fastqReverse' must point to one or ", + "several matching existing files") } if (is.null(fastqReverse)) { fastqReverse <- rep("", length(fastqForward)) } - ## Convert file paths to canonical form, expand ~ and complete relative paths + ## Convert file paths to canonical 
form, expand ~ and complete + ## relative paths fastqForward <- normalizePath(fastqForward, mustWork = FALSE) fastqReverse <- normalizePath(fastqReverse, mustWork = FALSE) @@ -433,8 +396,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, .assertScalar(x = revComplForward, type = "logical") .assertScalar(x = revComplReverse, type = "logical") if (any(fastqReverse == "") && mergeForwardReverse) { - stop("Both forward and reverse FASTQ files must be given in order to merge ", - "forward and reverse reads") + stop("Both forward and reverse FASTQ files must be given in order to ", + "merge forward and reverse reads") } ## check numeric inputs @@ -446,10 +409,14 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, .assertScalar(x = maxOverlap, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = minMergedLength, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = maxMergedLength, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = avePhredMinForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = avePhredMinReverse, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = variableNMaxForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = variableNMaxReverse, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = avePhredMinForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = avePhredMinReverse, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = variableNMaxForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = variableNMaxReverse, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = umiNMax, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = nbrMutatedCodonsMaxForward, type = "numeric", rngIncl = c(0, Inf), validValues = -1) @@ -459,38 +426,51 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, rngIncl = c(0, Inf), validValues = -1) .assertScalar(x = nbrMutatedBasesMaxReverse, type = "numeric", rngIncl = c(0, Inf), validValues = -1) - 
.assertScalar(x = mutatedPhredMinForward, type = "numeric", rngIncl = c(0, Inf)) - .assertScalar(x = mutatedPhredMinReverse, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = mutatedPhredMinForward, type = "numeric", + rngIncl = c(0, Inf)) + .assertScalar(x = mutatedPhredMinReverse, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = constantMaxDistForward, type = "numeric", rngIncl = c(0, Inf), validValues = -1) .assertScalar(x = constantMaxDistReverse, type = "numeric", rngIncl = c(0, Inf), validValues = -1) - .assertScalar(x = umiCollapseMaxDist, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = umiCollapseMaxDist, type = "numeric", + rngIncl = c(0, Inf)) .assertScalar(x = maxNReads, type = "numeric", rngIncl = c(0, Inf), validValues = -1) .assertScalar(x = nThreads, type = "numeric", rngExcl = c(0, Inf)) .assertScalar(x = chunkSize, type = "numeric", rngExcl = c(0, Inf)) .assertScalar(x = maxReadLength, type = "numeric", rngExcl = c(0, Inf)) - ## If a wildtype sequence is provided, it must be unambiguous how to identify and name mutants + ## If a wildtype sequence is provided, + ## it must be unambiguous how to identify and name mutants if (any(wildTypeForward != "")) { - if ((nbrMutatedCodonsMaxForward == (-1) && nbrMutatedBasesMaxForward == (-1)) || - (nbrMutatedCodonsMaxForward != (-1) && nbrMutatedBasesMaxForward != (-1))) { - stop("Exactly one of 'nbrMutatedCodonsMaxForward' and 'nbrMutatedBasesMaxForward' must be -1") + if ((nbrMutatedCodonsMaxForward == (-1) && + nbrMutatedBasesMaxForward == (-1)) || + (nbrMutatedCodonsMaxForward != (-1) && + nbrMutatedBasesMaxForward != (-1))) { + stop("Exactly one of 'nbrMutatedCodonsMaxForward' and ", + "'nbrMutatedBasesMaxForward' must be -1") } } if (any(wildTypeReverse != "")) { - if ((nbrMutatedCodonsMaxReverse == (-1) && nbrMutatedBasesMaxReverse == (-1)) || - (nbrMutatedCodonsMaxReverse != (-1) && nbrMutatedBasesMaxReverse != (-1))) { - stop("Exactly one of 'nbrMutatedCodonsMaxReverse' 
and 'nbrMutatedBasesMaxReverse' must be -1") + if ((nbrMutatedCodonsMaxReverse == (-1) && + nbrMutatedBasesMaxReverse == (-1)) || + (nbrMutatedCodonsMaxReverse != (-1) && + nbrMutatedBasesMaxReverse != (-1))) { + stop("Exactly one of 'nbrMutatedCodonsMaxReverse' and ", + "'nbrMutatedBasesMaxReverse' must be -1") } } ## adapters must be strings, valid DNA characters if (length(adapterForward) != 1 || !is.character(adapterForward) || - !grepl("^[AaCcGgTt]*$", adapterForward) || length(adapterReverse) != 1 || - !is.character(adapterReverse) || !grepl("^[AaCcGgTt]*$", adapterReverse)) { - stop("Adapters must be character strings, only containing valid DNA characters") + !grepl("^[AaCcGgTt]*$", adapterForward) || + length(adapterReverse) != 1 || + !is.character(adapterReverse) || + !grepl("^[AaCcGgTt]*$", adapterReverse)) { + stop("Adapters must be character strings, only containing ", + "valid DNA characters") } else { adapterForward <- toupper(adapterForward) adapterReverse <- toupper(adapterReverse) @@ -511,18 +491,22 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, stop("'elementsReverse' must be a non-empty character scalar") } - if (!all(strsplit(elementsForward, "")[[1]] %in% c("C", "U", "S", "V", "P"))) { + if (!all(strsplit(elementsForward, "")[[1]] %in% + c("C", "U", "S", "V", "P"))) { stop("'elementsForward' can only contain letters 'CUSVP'") } - if (!all(strsplit(elementsReverse, "")[[1]] %in% c("C", "U", "S", "V", "P"))) { + if (!all(strsplit(elementsReverse, "")[[1]] %in% + c("C", "U", "S", "V", "P"))) { stop("'elementsReverse' can only contain letters 'CUSVP'") } if (nchar(elementsForward) != length(elementLengthsForward)) { - stop("'elementsForward' and 'elementsLengthsForward' must have the same length") + stop("'elementsForward' and 'elementsLengthsForward' must ", + "have the same length") } if (nchar(elementsReverse) != length(elementLengthsReverse)) { - stop("'elementsReverse' and 'elementsLengthsReverse' must have the same length") + 
stop("'elementsReverse' and 'elementsLengthsReverse' must ", + "have the same length") } ## Max one 'P' @@ -548,24 +532,31 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## If a 'P', max one -1 length on each side if (length(PposFwd) == 1 && PposFwd != -1) { - if ((PposFwd != 1 && sum(elementLengthsForward[seq_len(PposFwd - 1)] == -1) > 1) || + if ((PposFwd != 1 && + sum(elementLengthsForward[seq_len(PposFwd - 1)] == -1) > 1) || (PposFwd != nchar(elementsForward) && sum(elementLengthsForward[(PposFwd + 1):nchar(elementsForward)] == -1) > 1)) { - stop("Max one element length (forward) on each side of the primer can be -1") + stop("Max one element length (forward) on each side ", + "of the primer can be -1") } } if (length(PposRev) == 1 && PposRev != -1) { - if ((PposRev != 1 && sum(elementLengthsReverse[seq_len(PposRev - 1)] == -1) > 1) || + if ((PposRev != 1 && + sum(elementLengthsReverse[seq_len(PposRev - 1)] == -1) > 1) || (PposRev != nchar(elementsReverse) && sum(elementLengthsReverse[(PposRev + 1):nchar(elementsReverse)] == -1) > 1)) { - stop("Max one element length (reverse) on each side of the primer can be -1") + stop("Max one element length (reverse) on each side ", + "of the primer can be -1") } } if (!is.character(primerForward) || length(primerForward) < 1 || - !all(grepl("^[AaCcGgTt]*$", primerForward)) || !is.character(primerReverse) || - length(primerReverse) < 1 || !all(grepl("^[AaCcGgTt]*$", primerReverse))) { - stop("Primers must be character vectors, only containing valid DNA characters") + !all(grepl("^[AaCcGgTt]*$", primerForward)) || + !is.character(primerReverse) || + length(primerReverse) < 1 || + !all(grepl("^[AaCcGgTt]*$", primerReverse))) { + stop("Primers must be character vectors, ", + "only containing valid DNA characters") } else { primerForward <- vapply(primerForward, toupper, "") primerReverse <- vapply(primerReverse, toupper, "") @@ -580,18 +571,22 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, } ## 
wild type sequences must be given in named vectors - if (any(is.null(names(wildTypeForward))) || any(names(wildTypeForward) == "") || - any(is.null(names(wildTypeReverse))) || any(names(wildTypeReverse) == "")) { + if (any(is.null(names(wildTypeForward))) || + any(names(wildTypeForward) == "") || + any(is.null(names(wildTypeReverse))) || + any(names(wildTypeReverse) == "")) { stop('wild type sequences must be given in named vectors') } ## wild type sequences must be strings, valid DNA characters if (!all(vapply(wildTypeForward, is.character, FALSE)) || !all(vapply(wildTypeForward, length, 0) == 1) || - !all(vapply(wildTypeForward, function(w) grepl("^[AaCcGgTt]*$", w), FALSE)) || + !all(vapply(wildTypeForward, + function(w) grepl("^[AaCcGgTt]*$", w), FALSE)) || !all(vapply(wildTypeReverse, is.character, FALSE)) || !all(vapply(wildTypeReverse, length, 0) == 1) || - !all(vapply(wildTypeReverse, function(w) grepl("^[AaCcGgTt]*$", w), FALSE))) { + !all(vapply(wildTypeReverse, + function(w) grepl("^[AaCcGgTt]*$", w), FALSE))) { stop("wild type sequences must be character strings, ", "only containing valid DNA characters") } else { @@ -607,7 +602,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, ## cis experiment - should not have wildTypeReverse if (mergeForwardReverse && any(vapply(wildTypeReverse, nchar, 1) > 0)) { - warning("Ignoring 'wildTypeReverse' when forward and reverse reads are merged") + warning("Ignoring 'wildTypeReverse' when forward and ", + "reverse reads are merged") wildTypeReverse <- c(r = "") } @@ -624,8 +620,10 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, constantReverse <- toupper(constantReverse) } - if (!all(vapply(constantForward, nchar, 0L) == nchar(constantForward[1])) || - !all(vapply(constantReverse, nchar, 0L) == nchar(constantReverse[1]))) { + if (!all(vapply(constantForward, nchar, 0L) == + nchar(constantForward[1])) || + !all(vapply(constantReverse, nchar, 0L) == + nchar(constantReverse[1]))) { stop("All 
constant sequences must be of the same length") } @@ -635,7 +633,8 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, sum(elementLengthsForward[CposFwd]) != nchar(constantForward[1])) { stop("The sum of the constant sequence lengths in elementsForward (", sum(elementLengthsForward[CposFwd]), - ") does not correspond to the length of the given 'constantForward' (", + ") does not correspond to the length of the given ", + "'constantForward' (", nchar(constantForward[1]), ")") } CposRev <- gregexpr(pattern = "C", elementsReverse)[[1]] @@ -644,17 +643,21 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, sum(elementLengthsReverse[CposRev]) != nchar(constantReverse[1])) { stop("The sum of the constant sequence lengths in elementsReverse (", sum(elementLengthsReverse[CposRev]), - ") does not correspond to the length of the given 'constantReverse' (", + ") does not correspond to the length of the given ", + "'constantReverse' (", nchar(constantReverse[1]), ")") } if (!all(is.character(forbiddenMutatedCodonsForward)) || - !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", toupper(forbiddenMutatedCodonsForward)) | + !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", + toupper(forbiddenMutatedCodonsForward)) | forbiddenMutatedCodonsForward == "") || !all(is.character(forbiddenMutatedCodonsReverse)) || - !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", toupper(forbiddenMutatedCodonsReverse)) | + !all(grepl("^[ACGTMRWSYKVHDBN]{3}$", + toupper(forbiddenMutatedCodonsReverse)) | forbiddenMutatedCodonsReverse == "")) { - stop("All elements of 'forbiddenMutatedCodonsForward' and 'forbiddenMutatedCodonsReverse' must be ", + stop("All elements of 'forbiddenMutatedCodonsForward' and ", + "'forbiddenMutatedCodonsReverse' must be ", "character strings consisting of three valid IUPAC letters.") } else { forbiddenMutatedCodonsForward <- toupper(forbiddenMutatedCodonsForward) @@ -672,96 +675,111 @@ digestFastqs <- function(fastqForward, fastqReverse = NULL, stop("'collapseToWTReverse' must be a logical 
scalar.") } - ## mutNameDelimiter must be a single character, and can not appear in any of the WT sequence names + ## mutNameDelimiter must be a single character, and can not + ## appear in any of the WT sequence names if (!is.character(mutNameDelimiter) || length(mutNameDelimiter) != 1 || nchar(mutNameDelimiter) != 1 || mutNameDelimiter == "_") { - stop("'mutNameDelimiter' must be a single-letter character scalar, not equal to '_'") - } - if (any(grepl(mutNameDelimiter, c(names(wildTypeForward), names(wildTypeReverse)), fixed = TRUE))) { - stop("'mutNameDelimiter' can not appear in the name of any of the provided wild type sequences.") - } - - if (!is.character(filteredReadsFastqForward) || !is.character(filteredReadsFastqReverse) || - length(filteredReadsFastqForward) != 1 || length(filteredReadsFastqReverse) != 1 || - (filteredReadsFastqForward != "" && !grepl("\\.gz$", filteredReadsFastqForward)) || - (filteredReadsFastqReverse != "" && !grepl("\\.gz$", filteredReadsFastqReverse))) { - stop("'filteredReadsFastqForward' and 'filteredReadsFastqReverse' must be character ", - "scalars ending with .gz.") + stop("'mutNameDelimiter' must be a single-letter character scalar, ", + "not equal to '_'") + } + if (any(grepl(mutNameDelimiter, + c(names(wildTypeForward), names(wildTypeReverse)), + fixed = TRUE))) { + stop("'mutNameDelimiter' can not appear in the name of any of ", + "the provided wild type sequences.") + } + + if (!is.character(filteredReadsFastqForward) || + !is.character(filteredReadsFastqReverse) || + length(filteredReadsFastqForward) != 1 || + length(filteredReadsFastqReverse) != 1 || + (filteredReadsFastqForward != "" && + !grepl("\\.gz$", filteredReadsFastqForward)) || + (filteredReadsFastqReverse != "" && + !grepl("\\.gz$", filteredReadsFastqReverse))) { + stop("'filteredReadsFastqForward' and 'filteredReadsFastqReverse' ", + "must be character scalars ending with .gz.") } ## If path is "", it will still be "" - filteredReadsFastqForward <- 
normalizePath(filteredReadsFastqForward, mustWork = FALSE) - filteredReadsFastqReverse <- normalizePath(filteredReadsFastqReverse, mustWork = FALSE) + filteredReadsFastqForward <- + normalizePath(filteredReadsFastqForward, mustWork = FALSE) + filteredReadsFastqReverse <- + normalizePath(filteredReadsFastqReverse, mustWork = FALSE) if ((any(fastqReverse == "") && filteredReadsFastqReverse != "") || - (all(fastqReverse != "") && filteredReadsFastqForward != "" && filteredReadsFastqReverse == "") || - (all(fastqForward != "") && filteredReadsFastqForward == "" && filteredReadsFastqReverse != "")) { - stop("The pairing of the output FASTQ files must be compatible with that of the input files.") + (all(fastqReverse != "") && filteredReadsFastqForward != "" && + filteredReadsFastqReverse == "") || + (all(fastqForward != "") && filteredReadsFastqForward == "" && + filteredReadsFastqReverse != "")) { + stop("The pairing of the output FASTQ files must be compatible ", + "with that of the input files.") } .assertScalar(x = verbose, type = "logical") - ## call digestFastqsCpp ------------------------------------------------------ - ## Represent the wildtype sequences as pairs of vectors (one with sequences, - ## one with names), to make things faster on the C++ side + ## call digestFastqsCpp --------------------------------------------------- + ## Represent the wildtype sequences as pairs of vectors (one with + ## sequences, one with names), to make things faster on the C++ side wildTypeForwardNames <- names(wildTypeForward) wildTypeForward <- unname(wildTypeForward) wildTypeReverseNames <- names(wildTypeReverse) wildTypeReverse <- unname(wildTypeReverse) - res <- digestFastqsCpp(fastqForwardVect = fastqForward, - fastqReverseVect = fastqReverse, - mergeForwardReverse = mergeForwardReverse, - minOverlap = minOverlap, - maxOverlap = maxOverlap, - minMergedLength = minMergedLength, - maxMergedLength = maxMergedLength, - maxFracMismatchOverlap = maxFracMismatchOverlap, - 
greedyOverlap = greedyOverlap, - revComplForward = revComplForward, - revComplReverse = revComplReverse, - elementsForward = elementsForward, - elementLengthsForward = as.numeric(elementLengthsForward), - elementsReverse = elementsReverse, - elementLengthsReverse = as.numeric(elementLengthsReverse), - adapterForward = adapterForward, - adapterReverse = adapterReverse, - primerForward = primerForward, - primerReverse = primerReverse, - wildTypeForward = wildTypeForward, - wildTypeForwardNames = wildTypeForwardNames, - wildTypeReverse = wildTypeReverse, - wildTypeReverseNames = wildTypeReverseNames, - constantForward = constantForward, - constantReverse = constantReverse, - avePhredMinForward = avePhredMinForward, - avePhredMinReverse = avePhredMinReverse, - variableNMaxForward = variableNMaxForward, - variableNMaxReverse = variableNMaxReverse, - umiNMax = umiNMax, - nbrMutatedCodonsMaxForward = nbrMutatedCodonsMaxForward, - nbrMutatedCodonsMaxReverse = nbrMutatedCodonsMaxReverse, - nbrMutatedBasesMaxForward = nbrMutatedBasesMaxForward, - nbrMutatedBasesMaxReverse = nbrMutatedBasesMaxReverse, - forbiddenMutatedCodonsForward = forbiddenMutatedCodonsForward, - forbiddenMutatedCodonsReverse = forbiddenMutatedCodonsReverse, - useTreeWTmatch = useTreeWTmatch, - collapseToWTForward = collapseToWTForward, - collapseToWTReverse = collapseToWTReverse, - mutatedPhredMinForward = mutatedPhredMinForward, - mutatedPhredMinReverse = mutatedPhredMinReverse, - mutNameDelimiter = mutNameDelimiter, - constantMaxDistForward = constantMaxDistForward, - constantMaxDistReverse = constantMaxDistReverse, - umiCollapseMaxDist = umiCollapseMaxDist, - filteredReadsFastqForward = filteredReadsFastqForward, - filteredReadsFastqReverse = filteredReadsFastqReverse, - maxNReads = maxNReads, - verbose = verbose, - nThreads = as.integer(nThreads), - chunkSize = as.integer(chunkSize), - maxReadLength = maxReadLength) - - ## Add package version and processing date ----------------------------------- + 
res <- digestFastqsCpp( + fastqForwardVect = fastqForward, + fastqReverseVect = fastqReverse, + mergeForwardReverse = mergeForwardReverse, + minOverlap = minOverlap, + maxOverlap = maxOverlap, + minMergedLength = minMergedLength, + maxMergedLength = maxMergedLength, + maxFracMismatchOverlap = maxFracMismatchOverlap, + greedyOverlap = greedyOverlap, + revComplForward = revComplForward, + revComplReverse = revComplReverse, + elementsForward = elementsForward, + elementLengthsForward = as.numeric(elementLengthsForward), + elementsReverse = elementsReverse, + elementLengthsReverse = as.numeric(elementLengthsReverse), + adapterForward = adapterForward, + adapterReverse = adapterReverse, + primerForward = primerForward, + primerReverse = primerReverse, + wildTypeForward = wildTypeForward, + wildTypeForwardNames = wildTypeForwardNames, + wildTypeReverse = wildTypeReverse, + wildTypeReverseNames = wildTypeReverseNames, + constantForward = constantForward, + constantReverse = constantReverse, + avePhredMinForward = avePhredMinForward, + avePhredMinReverse = avePhredMinReverse, + variableNMaxForward = variableNMaxForward, + variableNMaxReverse = variableNMaxReverse, + umiNMax = umiNMax, + nbrMutatedCodonsMaxForward = nbrMutatedCodonsMaxForward, + nbrMutatedCodonsMaxReverse = nbrMutatedCodonsMaxReverse, + nbrMutatedBasesMaxForward = nbrMutatedBasesMaxForward, + nbrMutatedBasesMaxReverse = nbrMutatedBasesMaxReverse, + forbiddenMutatedCodonsForward = forbiddenMutatedCodonsForward, + forbiddenMutatedCodonsReverse = forbiddenMutatedCodonsReverse, + useTreeWTmatch = useTreeWTmatch, + collapseToWTForward = collapseToWTForward, + collapseToWTReverse = collapseToWTReverse, + mutatedPhredMinForward = mutatedPhredMinForward, + mutatedPhredMinReverse = mutatedPhredMinReverse, + mutNameDelimiter = mutNameDelimiter, + constantMaxDistForward = constantMaxDistForward, + constantMaxDistReverse = constantMaxDistReverse, + umiCollapseMaxDist = umiCollapseMaxDist, + filteredReadsFastqForward = 
filteredReadsFastqForward, + filteredReadsFastqReverse = filteredReadsFastqReverse, + maxNReads = maxNReads, + verbose = verbose, + nThreads = as.integer(nThreads), + chunkSize = as.integer(chunkSize), + maxReadLength = maxReadLength) + + ## Add package version and processing date -------------------------------- res$parameters$processingInfo <- paste0( "Processed by mutscan v", utils::packageVersion("mutscan"), " on ", Sys.time() diff --git a/R/generateQCReport.R b/R/generateQCReport.R index e0d973f..6338977 100644 --- a/R/generateQCReport.R +++ b/R/generateQCReport.R @@ -23,6 +23,7 @@ #' @importFrom SummarizedExperiment colData #' @importFrom dplyr bind_rows full_join #' @importFrom S4Vectors metadata +#' @importFrom tools file_ext #' #' @examples #' ## Load SummarizedExperiment object @@ -43,7 +44,7 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan QC report", .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = outFile, type = "character") .assertScalar(x = forceOverwrite, type = "logical") - if (tools::file_ext(outFile) != "html") { + if (file_ext(outFile) != "html") { stop("'outFile' must have the file extension '.html'.") } outDir <- dirname(outFile) @@ -52,9 +53,11 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan QC report", } if (file.exists(outFile)) { if (!forceOverwrite) { - stop(outFile, " already exists and forceOverwrite = FALSE, stopping.") + stop(outFile, " already exists and forceOverwrite = FALSE, ", + "stopping.") } else { - message(outFile, " already exists but forceOverwrite = TRUE, overwriting.") + message(outFile, " already exists but forceOverwrite = TRUE, ", + "overwriting.") } } @@ -77,9 +80,8 @@ generateQCReport <- function(se, outFile, reportTitle = "mutscan QC report", } args <- c(args, list(...)[setdiff(...names(), names(args))]) - outputReport <- xfun::Rscript_call( - rmarkdown::render, - args + outputReport <- Rscript_call( + render, args ) ## 
--------------------------------------------------------------------- ## diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index 3a04c1d..aec04dd 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -80,6 +80,26 @@ #' \item outCombined - the \code{digestFastqs} output for the combined run. #' } #' +#' @examples +#' fqFile <- system.file("extdata", "cisInput_1.fastq.gz", +#' package = "mutscan") +#' out <- linkMultipleVariants( +#' combinedDigestParams = list(fastqForward = fqFile, +#' elementsForward = "SVCV", +#' elementLengthsForward = c(1, 10, 18, 96)), +#' # the first variable sequence is the UMI +#' umi = list(fastqForward = fqFile, elementsForward = "SVCS", +#' elementLengthsForward = c(1, 10, 18, 96)), +#' # the second variable sequence is the amplicon variant +#' var = list(fastqForward = fqFile, elementsForward = "SSCV", +#' elementLengthsForward = c(1, 10, 18, 96), +#' collapseMaxDist = 3, collapseMinScore = 1) +#' ) +#' # conversion tables +#' lapply(out$convSeparate, head) +#' # aggregated count table +#' head(out$countAggregated) +#' #' @importFrom dplyr select rename group_by summarize across matches mutate #' @importFrom tidyr separate separate_rows #' @importFrom rlang .data @@ -113,9 +133,10 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { paramsSeparate <- lapply(paramsSeparate, function(parm) { c(parm, as.list(defaults[!(names(defaults) %in% names(parm))])) }) - combinedDigestParams <- c(combinedDigestParams, - as.list(defaults[!(names(defaults) %in% - names(combinedDigestParams))])) + combinedDigestParams <- + c(combinedDigestParams, + as.list(defaults[!(names(defaults) %in% + names(combinedDigestParams))])) ## --------------------------------------------------------------------- ## ## Checks @@ -159,29 +180,29 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ outCombined <- do.call(digestFastqs, combinedDigestParams) ## Get count matrix with "raw" (uncorrected) sequences - countCombined <- outCombined$summaryTable %>% - dplyr::select("sequence", "nbrReads", "varLengths") %>% - dplyr::mutate(idx = paste0("I", seq_along(.data$sequence))) + countCombined <- outCombined$summaryTable |> + dplyr::select("sequence", "nbrReads", "varLengths") |> + mutate(idx = paste0("I", seq_along(.data$sequence))) ## If applicable, separate into forward and reverse sequences if (any(grepl("_", countCombined$sequence))) { - countCombined <- countCombined %>% - tidyr::separate(.data$sequence, into = c("sequenceForward", + countCombined <- countCombined |> + separate(.data$sequence, into = c("sequenceForward", "sequenceReverse"), - sep = "_") %>% - tidyr::separate(.data$varLengths, into = c("varLengthsForward", + sep = "_") |> + separate(.data$varLengths, into = c("varLengthsForward", "varLengthsReverse"), - sep = "_") %>% - dplyr::mutate( + sep = "_") |> + mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0), nCompReverse = vapply(strsplit(.data$varLengthsReverse, ","), length, 0)) } else { - countCombined <- countCombined %>% - dplyr::rename(sequenceForward = "sequence", - varLengthsForward = "varLengths") %>% - dplyr::mutate( + countCombined <- countCombined |> + rename(sequenceForward = "sequence", + varLengthsForward = "varLengths") |> + mutate( nCompForward = vapply(strsplit(.data$varLengthsForward, ","), length, 0)) } @@ -196,8 +217,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ tmp <- split(countCombined, countCombined$varLengthsForward) countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsForward[1], ",")[[1]]) - df <- df %>% - tidyr::separate( + df <- df |> + separate( .data$sequenceForward, into = names(paramsSeparate)[seq_along(w)], # into = paste0("V", seq_along(w)), @@ -211,8 +232,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { tmp <- split(countCombined, countCombined$varLengthsReverse) countCombined <- unsplit(lapply(tmp, function(df) { w <- as.numeric(strsplit(df$varLengthsReverse[1], ",")[[1]]) - df <- df %>% - tidyr::separate( + df <- df |> + separate( .data$sequenceReverse, into = names(paramsSeparate)[offsetForward + seq_along(w)], # into = paste0("V", offsetForward + seq_along(w)), @@ -226,10 +247,12 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { outSeparate <- lapply(paramsSeparate, function(ps) { tmp <- do.call(digestFastqs, ps[!names(ps) %in% collapseParams]) if (any(collapseParams %in% names(ps))) { - ## Collapse this variable region -> need to call groupSimilarSequences + ## Collapse this variable region -> need to call + ## groupSimilarSequences tbl <- do.call(groupSimilarSequences, c(list(seqs = tmp$summaryTable$sequence, - scores = tmp$summaryTable$nbrReads, verbose = FALSE), + scores = tmp$summaryTable$nbrReads, + verbose = FALSE), ps[names(ps) %in% collapseParams])) colnames(tbl)[colnames(tbl) == "representative"] <- "mutantName" tmp$summaryTable <- tbl @@ -242,9 +265,9 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) 
{ ## Conversion tables convSeparate <- lapply(outSeparate, function(out) { - out$summaryTable %>% - dplyr::select("mutantName", "sequence") %>% - tidyr::separate_rows("sequence", sep = ",") + out$summaryTable |> + dplyr::select("mutantName", "sequence") |> + separate_rows("sequence", sep = ",") }) ## --------------------------------------------------------------------- ## @@ -261,9 +284,9 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { drop = FALSE] ## Aggregate counts - countAggregated <- countCombined %>% - dplyr::group_by(dplyr::across(names(paramsSeparate))) %>% - dplyr::summarize(nbrReads = sum(.data$nbrReads), .groups = "drop") + countAggregated <- countCombined |> + group_by(across(names(paramsSeparate))) |> + summarize(nbrReads = sum(.data$nbrReads), .groups = "drop") list(countAggregated = countAggregated, convSeparate = convSeparate, outCombined = outCombined, filtSeparate = filtSeparate) diff --git a/R/plotDistributions.R b/R/plotDistributions.R index a92fe24..3690fe5 100644 --- a/R/plotDistributions.R +++ b/R/plotDistributions.R @@ -21,7 +21,7 @@ #' @importFrom tibble rownames_to_column #' @importFrom tidyr gather #' @importFrom dplyr group_by arrange mutate desc ungroup left_join -#' @importFrom SummarizedExperiment colData assay +#' @importFrom SummarizedExperiment colData assay assayNames #' @importFrom ggplot2 ggplot scale_x_log10 scale_y_log10 labs geom_line #' facet_wrap geom_density geom_histogram theme_minimal theme #' element_text aes @@ -37,35 +37,36 @@ plotDistributions <- function(se, selAssay = "counts", facet = FALSE, pseudocount = 0) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) if (!is.null(groupBy)) { - .assertScalar(x = groupBy, type = "character", - validValues = colnames(SummarizedExperiment::colData(se))) + .assertScalar( + x = groupBy, type = "character", + 
validValues = colnames(colData(se))) } .assertScalar(x = plotType, type = "character", validValues = c("density", "knee", "histogram")) .assertScalar(x = facet, type = "logical") .assertScalar(x = pseudocount, type = "numeric", rngIncl = c(0, Inf)) - + ## Define a common theme to use for the plots commonTheme <- list( - ggplot2::theme_minimal(), - ggplot2::theme(axis.text = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14)) + theme_minimal(), + theme(axis.text = element_text(size = 12), + axis.title = element_text(size = 14)) ) df <- as.data.frame(as.matrix( - SummarizedExperiment::assay(se, selAssay, withDimnames = TRUE) - )) %>% - tibble::rownames_to_column("feature") %>% - tidyr::gather(key = "Name", value = "value", -"feature") %>% - dplyr::group_by(.data$Name) %>% - dplyr::arrange(dplyr::desc(.data$value)) %>% - dplyr::mutate(idx = seq_along(.data$value), - value = .data$value + pseudocount) %>% - dplyr::ungroup() %>% - dplyr::left_join(as.data.frame(SummarizedExperiment::colData(se)), - by = "Name") + assay(se, selAssay, withDimnames = TRUE) + )) |> + rownames_to_column("feature") |> + gather(key = "Name", value = "value", -"feature") |> + group_by(.data$Name) |> + arrange(desc(.data$value)) |> + mutate(idx = seq_along(.data$value), + value = .data$value + pseudocount) |> + ungroup() |> + left_join(as.data.frame(colData(se)), + by = "Name") ## If the user doesn't explicitly group by any variable, impose grouping ## by the sample ID. 
In that case, don't color by sample ID if facetting @@ -80,67 +81,73 @@ plotDistributions <- function(se, selAssay = "counts", ## Specify plot depending on desired type if (plotType == "knee") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$idx, y = .data$value)) + - ggplot2::scale_x_log10() + ggplot2::scale_y_log10() + - ggplot2::labs(x = "Feature (sorted)", - y = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount)))) + gg <- ggplot(df, aes(x = .data$idx, + y = .data$value)) + + scale_x_log10() + scale_y_log10() + + labs(x = "Feature (sorted)", + y = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount)))) if (facet) { if (colorFacetByName) { - gg <- gg + ggplot2::geom_line(ggplot2::aes(color = .data$Name)) + gg <- gg + geom_line(aes(color = .data$Name)) } else { - gg <- gg + ggplot2::geom_line(ggplot2::aes(group = .data$Name)) + gg <- gg + geom_line(aes(group = .data$Name)) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ .data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_line(ggplot2::aes(group = .data$Name, - color = .data[[groupBy]])) + gg <- gg + + geom_line(aes(group = .data$Name, + color = .data[[groupBy]])) } } else if (plotType == "density") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$value)) + - ggplot2::scale_x_log10() + - ggplot2::labs(x = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount))), - y = "Density") + gg <- ggplot(df, aes(x = .data$value)) + + scale_x_log10() + + labs(x = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount))), + y = "Density") if (facet) { if (colorFacetByName) { - gg <- gg + ggplot2::geom_density(ggplot2::aes(color = .data$Name)) + gg <- gg + + geom_density(aes(color = .data$Name)) } else { - gg <- gg + ggplot2::geom_density(ggplot2::aes(group = .data$Name)) + gg <- gg + + geom_density(aes(group = .data$Name)) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ 
.data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_density(ggplot2::aes(group = .data$Name, - color = .data[[groupBy]])) + gg <- gg + + geom_density(aes(group = .data$Name, + color = .data[[groupBy]])) } } else if (plotType == "histogram") { - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$value)) + - ggplot2::scale_x_log10() + - ggplot2::labs(x = paste0(selAssay, - ifelse(pseudocount == 0, - "", paste0(" + ", pseudocount))), - y = "Count") + gg <- ggplot(df, aes(x = .data$value)) + + scale_x_log10() + + labs(x = paste0(selAssay, + ifelse(pseudocount == 0, + "", paste0(" + ", pseudocount))), + y = "Count") if (facet) { if (colorFacetByName) { gg <- gg + - ggplot2::geom_histogram(ggplot2::aes(fill = .data$Name), - bins = 50) + geom_histogram(aes(fill = .data$Name), + bins = 50) } else { gg <- gg + - ggplot2::geom_histogram(ggplot2::aes(group = .data$Name), - bins = 50) + geom_histogram(aes(group = .data$Name), + bins = 50) } gg <- gg + - ggplot2::facet_wrap(~ .data[[groupBy]]) + facet_wrap(~ .data[[groupBy]]) } else { - gg <- gg + ggplot2::geom_histogram(ggplot2::aes(group = .data$Name, - fill = .data[[groupBy]])) + gg <- gg + + geom_histogram(aes(group = .data$Name, + fill = .data[[groupBy]])) } } gg + commonTheme -} \ No newline at end of file +} diff --git a/R/plotFiltering.R b/R/plotFiltering.R index 8dfad4c..2eebfe6 100644 --- a/R/plotFiltering.R +++ b/R/plotFiltering.R @@ -20,8 +20,8 @@ #' plot the number of reads, or the fraction of the total number of reads, #' that are retained after/filtered out in each filtering step. #' @param onlyActiveFilters Logical scalar, whether to only include the -#' active filters (i.e., where any read was filtered out in any of the samples). -#' Defaults to \code{TRUE}. +#' active filters (i.e., where any read was filtered out in any of the +#' samples). Defaults to \code{TRUE}. #' @param displayNumbers Logical scalar, indicating whether to display the #' number (or fraction) of reads retained at every filtering step. 
#' @param numberSize Numeric scalar, indicating the size of the displayed @@ -32,7 +32,7 @@ #' should be facetted. Either \code{"sample"} or \code{"step"}. #' #' @importFrom tidyselect matches -#' @importFrom dplyr select %>% mutate group_by summarize pull ungroup filter +#' @importFrom dplyr select mutate group_by summarize pull ungroup filter #' @importFrom tibble rownames_to_column #' @importFrom tidyr gather #' @importFrom ggplot2 ggplot aes geom_bar facet_wrap theme theme_bw labs @@ -57,13 +57,14 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, validValues = c("remaining", "filtered")) .assertScalar(x = facetBy, type = "character", validValues = c("sample", "step")) - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Extract relevant columns from colData(se) - ## ----------------------------------------------------------------------- ## - cd <- as.data.frame(colData(se)) %>% - dplyr::select("nbrTotal":"nbrRetained") %>% - dplyr::select(c("nbrTotal", tidyselect::matches("^f.*_"), "nbrRetained")) + ## --------------------------------------------------------------------- ## + cd <- as.data.frame(colData(se)) |> + dplyr::select("nbrTotal":"nbrRetained") |> + dplyr::select(c("nbrTotal", matches("^f.*_"), + "nbrRetained")) ## Remove inactive filters if desired if (onlyActiveFilters) { @@ -71,7 +72,8 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, } ## Check that filtering columns are in the right order - nbrs <- gsub("^f", "", vapply(strsplit(colnames(cd), "_"), .subset, FUN.VALUE = "", 1)) + nbrs <- gsub("^f", "", vapply(strsplit(colnames(cd), "_"), .subset, + FUN.VALUE = "", 1)) nbrs <- nbrs[-c(1, length(nbrs))] nbrs <- gsub("a$|b$", "", nbrs) stopifnot(all(diff(as.numeric(nbrs)) >= 0)) @@ -79,46 +81,51 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, ## Check 
that all columns are numeric stopifnot(all(apply(cd, 2, is.numeric))) - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Reshape into long format - ## ----------------------------------------------------------------------- ## - cd <- cd %>% - tibble::rownames_to_column("sample") %>% - tidyr::gather(key = "step", value = "nbrReads", -"sample") %>% - dplyr::mutate(step = factor(.data$step, levels = colnames(cd))) + ## --------------------------------------------------------------------- ## + cd <- cd |> + rownames_to_column("sample") |> + gather(key = "step", value = "nbrReads", -"sample") |> + mutate(step = factor(.data$step, levels = colnames(cd))) ## Check that numbers add up (total - all filters = retained) - stopifnot(all(cd %>% - dplyr::group_by(sample) %>% - dplyr::summarize(remdiff = .data$nbrReads[.data$step == "nbrTotal"] - - sum(.data$nbrReads[grepl("^f", .data$step)]), - remlist = .data$nbrReads[.data$step == "nbrRetained"]) %>% - dplyr::mutate(obsdiff = .data$remdiff - .data$remlist) %>% - dplyr::pull(.data$obsdiff) == 0)) + stopifnot(all( + cd |> + group_by(sample) |> + summarize( + remdiff = .data$nbrReads[.data$step == "nbrTotal"] - + sum(.data$nbrReads[grepl("^f", .data$step)]), + remlist = .data$nbrReads[.data$step == "nbrRetained"]) |> + mutate(obsdiff = .data$remdiff - .data$remlist) |> + pull(.data$obsdiff) == 0)) - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Calculate number of remaining reads at each step - ## ----------------------------------------------------------------------- ## - cd <- cd %>% - dplyr::group_by(sample) %>% - dplyr::mutate(fracReads = signif(.data$nbrReads / - .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% - dplyr::mutate(cumulsum = vapply(seq_along(.data$step), function(i) { - 
sum(.data$nbrReads[as.numeric(.data$step) <= as.numeric(.data$step[i]) & + ## --------------------------------------------------------------------- ## + cd <- cd |> + group_by(sample) |> + mutate(fracReads = signif( + .data$nbrReads / + .data$nbrReads[.data$step == "nbrTotal"], + digits = 3)) |> + mutate(cumulsum = vapply(seq_along(.data$step), function(i) { + sum(.data$nbrReads[as.numeric(.data$step) <= + as.numeric(.data$step[i]) & .data$step != "nbrTotal"]) - }, NA_real_)) %>% - dplyr::mutate(nbrRemaining = .data$nbrReads[.data$step == "nbrTotal"] - - .data$cumulsum) %>% - dplyr::mutate(fracRemaining = signif(.data$nbrRemaining / - .data$nbrReads[.data$step == "nbrTotal"], - digits = 3)) %>% - dplyr::filter(.data$step != "nbrRetained") %>% - dplyr::ungroup() + }, NA_real_)) |> + mutate(nbrRemaining = .data$nbrReads[.data$step == "nbrTotal"] - + .data$cumulsum) |> + mutate(fracRemaining = signif( + .data$nbrRemaining / + .data$nbrReads[.data$step == "nbrTotal"], + digits = 3)) |> + filter(.data$step != "nbrRetained") |> + ungroup() - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ## Create plot - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## yvar <- switch( paste0(valueType, "_", plotType), @@ -138,24 +145,27 @@ plotFiltering <- function(se, valueType = "reads", onlyActiveFilters = TRUE, filtered = " filtered out in" ) if (plotType == "filtered") { - cd <- cd %>% - dplyr::filter(.data$step != "nbrTotal") + cd <- cd |> + filter(.data$step != "nbrTotal") } - gg <- ggplot2::ggplot(cd, ggplot2::aes(x = .data[[setdiff(c("step", "sample"), facetBy)]], - y = .data[[yvar]], - label = .data[[yvar]])) + - ggplot2::geom_bar(stat = "identity") + - ggplot2::facet_wrap(~ .data[[facetBy]], ncol = 1, scales = "free_y") + - ggplot2::theme_bw() + - 
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, - vjust = 0.5, size = 12), - axis.text.y = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14)) + - ggplot2::labs(x = "", y = ylab1, - title = paste0(ylab1, ylab2, " each filtering step")) + gg <- ggplot(cd, aes( + x = .data[[setdiff(c("step", "sample"), facetBy)]], + y = .data[[yvar]], + label = .data[[yvar]])) + + geom_bar(stat = "identity") + + facet_wrap(~ .data[[facetBy]], ncol = 1, scales = "free_y") + + theme_bw() + + theme( + axis.text.x = element_text(angle = 90, hjust = 1, + vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), + axis.title = element_text(size = 14)) + + labs(x = "", y = ylab1, + title = paste0(ylab1, ylab2, " each filtering step")) if (displayNumbers) { - gg <- gg + ggplot2::geom_text(vjust = 1.5, color = "white", size = numberSize) + gg <- gg + geom_text(vjust = 1.5, color = "white", + size = numberSize) } gg } \ No newline at end of file diff --git a/R/plotPairs.R b/R/plotPairs.R index c80d896..b1736b4 100644 --- a/R/plotPairs.R +++ b/R/plotPairs.R @@ -41,11 +41,11 @@ #' @param addIdentityLine Logical scalar, indicating whether the identity line #' should be added (only used if \code{pointsType = "points"}). 
#' -#' @importFrom GGally eval_data_col ggpairs wrap +#' @importFrom GGally eval_data_col ggpairs #' @importFrom ggplot2 ggplot annotate theme_void ylim stat_density2d #' scale_fill_continuous geom_point theme_bw theme element_blank aes #' geom_histogram scale_x_continuous scale_y_continuous geom_abline -#' after_stat +#' after_stat element_rect #' @importFrom stats cor #' @importFrom SummarizedExperiment assayNames assay #' @importFrom grDevices hcl.colors rgb colorRamp @@ -62,10 +62,10 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, corSizeAdd = 2, pointSize = 0.1, pointAlpha = 0.3, colorByCorrelation = TRUE, corrColorRange = NULL, addIdentityLine = FALSE) { - + .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) .assertScalar(x = doLog, type = "logical") .assertScalar(x = pseudocount, type = "numeric", rngIncl = c(0, Inf)) .assertScalar(x = corMethod, type = "character", @@ -88,89 +88,102 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, if (pointsType %in% c("scattermore", "scattermost")) { .assertPackagesAvailable("scattermore") } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Define shared theme elements - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## ggtheme <- list( theme_bw(), - theme(panel.grid.major = ggplot2::element_blank(), - panel.grid.minor = ggplot2::element_blank()) + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank()) ) - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Correlations - ## 
----------------------------------------------------------------------- ## - ## Define function to calculate and display correlations (for use with ggpairs) + ## --------------------------------------------------------------------- ## + ## Define function to calculate and display correlations + ## (for use with ggpairs) cor_fcn <- function(data, mapping, ...) { ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) - + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) + ## Calculate correlation - mainCor <- stats::cor(xData, yData, method = corMethod, - use = "pairwise.complete.obs") - transfCor <- (abs(mainCor) - min(corRange))/(max(corRange) - min(corRange)) + mainCor <- cor(xData, yData, method = corMethod, + use = "pairwise.complete.obs") + transfCor <- (abs(mainCor) - min(corRange)) / + (max(corRange) - min(corRange)) ## Protect against numerical imprecision leading to values outside ## the allowed range transfCor <- min(1.0, transfCor) transfCor <- max(0.0, transfCor) - + ## Determine the color if (colorByCorrelation) { if (mainCor >= 0) { - cols <- grDevices::hcl.colors(n = 11, palette = "RdBu")[6:2] - col <- grDevices::rgb(grDevices::colorRamp( + cols <- hcl.colors(n = 11, palette = "RdBu")[6:2] + col <- rgb(colorRamp( cols)(transfCor), maxColorValue = 255) } else { - cols <- grDevices::hcl.colors(n = 11, palette = "RdBu")[6:10] - col <- grDevices::rgb(grDevices::colorRamp( + # nocov start + cols <- hcl.colors(n = 11, palette = "RdBu")[6:10] + col <- rgb(colorRamp( cols)(transfCor), maxColorValue = 255) + # nocov end } } else { - col <- "white" + col <- "white" #nocov } - + ## Construct plot - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::annotate(x = 0.5, y = 0.5, label = round(mainCor, digits = 3), - geom = "text", - size = abs(mainCor) * corSizeMult + corSizeAdd) + - ggtheme + ggplot2::ylim(c(0, 1)) + - ggplot2::theme(panel.background = 
ggplot2::element_rect(fill = col)) + ggplot(data = data, mapping = mapping) + + annotate(x = 0.5, y = 0.5, + label = round(mainCor, digits = 3), + geom = "text", + size = abs(mainCor) * corSizeMult + corSizeAdd) + + ggtheme + ylim(c(0, 1)) + + theme( + panel.background = element_rect(fill = col) + ) } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Scatter plots - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## + # All the point plotting functions are tested, but for some reason covr + # doesn't pick it up - ignore these definitions in the coverage tests + # nocov start if (pointsType == "smoothscatter") { - ## Define function to create smoothscatter-like plot (for use with ggpairs) + ## Define function to create smoothscatter-like plot + ## (for use with ggpairs) smoothscat <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::stat_density2d( - ggplot2::aes(fill = ggplot2::after_stat(.data$density)^0.25), geom = "tile", + ggplot(data = data, mapping = mapping) + + stat_density2d( + aes(fill = after_stat( + .data$density)^0.25), geom = "tile", contour = FALSE, n = 200) + - ggplot2::scale_fill_continuous(low = "white", high = "darkgreen") + + scale_fill_continuous(low = "white", + high = "darkgreen") + ggtheme + - ggplot2::scale_x_continuous(expand = c(0, 0)) + - ggplot2::scale_y_continuous(expand = c(0, 0)) + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) } } else if (pointsType == "points") { ## Define function to create scatter plot (for use with ggpairs) if (addIdentityLine) { plotpoints <- function(data, mapping, ...) 
{ - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + - ggplot2::geom_point(alpha = pointAlpha, size = pointSize) + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + + geom_point(alpha = pointAlpha, size = pointSize) + ggtheme } } else { plotpoints <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_point(alpha = pointAlpha, size = pointSize) + + ggplot(data = data, mapping = mapping) + + geom_point(alpha = pointAlpha, size = pointSize) + ggtheme } } @@ -178,16 +191,16 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, ## Define function to create scattermore plot (for use with ggpairs) if (addIdentityLine) { plotscattermore <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + scattermore::geom_scattermore(alpha = pointAlpha, pointsize = pointSize) + ggtheme } } else { plotscattermore <- function(data, mapping, ...) { - ggplot2::ggplot(data = data, mapping = mapping) + + ggplot(data = data, mapping = mapping) + scattermore::geom_scattermore(alpha = pointAlpha, pointsize = pointSize) + ggtheme @@ -198,12 +211,12 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, if (addIdentityLine) { plotscattermost <- function(data, mapping, ...) 
{ ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) - ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_abline(slope = 1, intercept = 0, - linetype = "dashed", color = "grey") + + ggplot(data = data, mapping = mapping) + + geom_abline(slope = 1, intercept = 0, + linetype = "dashed", color = "grey") + scattermore::geom_scattermost(xy = cbind(xData, yData), pointsize = pointSize) + ggtheme @@ -211,16 +224,17 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } else { plotscattermost <- function(data, mapping, ...) { ## Get data - xData <- GGally::eval_data_col(data, mapping$x) - yData <- GGally::eval_data_col(data, mapping$y) + xData <- eval_data_col(data, mapping$x) + yData <- eval_data_col(data, mapping$y) - ggplot2::ggplot(data = data, mapping = mapping) + + ggplot(data = data, mapping = mapping) + scattermore::geom_scattermost(xy = cbind(xData, yData), pointsize = pointSize) + ggtheme } } } + # nocov end ## Define the function to use for the plots if (pointsType == "smoothscatter") { @@ -232,28 +246,34 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, } else if (pointsType == "scattermost") { lower <- list(continuous = plotscattermost) } - - ## ----------------------------------------------------------------------- ## + + ## --------------------------------------------------------------------- ## ## Histogram - ## ----------------------------------------------------------------------- ## + ## --------------------------------------------------------------------- ## + # nocov start diaghist <- function(data, mapping, ...) 
{ - gg <- ggplot2::ggplot(data = data, mapping = mapping) + - ggplot2::geom_histogram(fill = "#F5C710", color = "grey50", bins = histBreaks) + + gg <- ggplot(data = data, mapping = mapping) + + geom_histogram(fill = "#F5C710", color = "grey50", + bins = histBreaks) + ggtheme if (pointsType == "smoothscatter") { - gg <- gg + ggplot2::scale_x_continuous(expand = c(0, 0)) + gg <- gg + scale_x_continuous(expand = c(0, 0)) } gg } - - ## ----------------------------------------------------------------------- ## + # nocov end + + ## --------------------------------------------------------------------- ## ## Combined plot - ## ----------------------------------------------------------------------- ## - ## Prepare the data and plot title, depending on whether to log-transform or not + ## --------------------------------------------------------------------- ## + ## Prepare the data and plot title, depending on whether to log-transform + ## or not if (doLog) { - mat <- log10(as.matrix(SummarizedExperiment::assay(se, selAssay)) + pseudocount) + mat <- log10(as.matrix(assay(se, selAssay)) + + pseudocount) title <- paste0("log10(", selAssay, - ifelse(pseudocount > 0, paste0(" + ", pseudocount), ""), + ifelse(pseudocount > 0, + paste0(" + ", pseudocount), ""), ")") } else { mat <- as.matrix(assay(se, selAssay)) @@ -262,21 +282,23 @@ plotPairs <- function(se, selAssay = "counts", doLog = TRUE, pseudocount = 1, mat <- as.data.frame(mat) title <- paste0(title, ", ", toupper(substr(corMethod, 1, 1)), substr(corMethod, 2, nchar(corMethod)), " correlation") - + ## Calculate correlations and get range - allCors <- stats::cor(mat, method = corMethod, use = "pairwise.complete.obs") + allCors <- cor(mat, method = corMethod, + use = "pairwise.complete.obs") if (!is.null(corrColorRange)) { corRange <- corrColorRange } else { corRange <- range(abs(allCors[upper.tri(allCors)])) } if (length(unique(corRange)) == 1) { - ## Having a range with width 0 leads to problems in the rescaling of the 
correlations + ## Having a range with width 0 leads to problems in the rescaling of + ## the correlations corRange <- c(0, 1) } - + ## Construct the pairs plot - GGally::ggpairs( + ggpairs( data = mat, mapping = NULL, columns = seq_len(ncol(mat)), diff --git a/R/plotResults.R b/R/plotResults.R index aa67536..e61d758 100644 --- a/R/plotResults.R +++ b/R/plotResults.R @@ -60,9 +60,9 @@ .assertScalar(x = pointSize, type = "character", validValues = c("small", "large")) .assertScalar(x = interactivePlot, type = "logical") - + if (interactivePlot && !requireNamespace("plotly", quietly = TRUE)) { - stop("The 'plotly' package is required to make interactive plots.") + stop("The 'plotly' package is required to make interactive plots.") # nocov } xr <- range(res[[xCol]], na.rm = TRUE) yr <- range(res[[yCol]], na.rm = TRUE) @@ -72,49 +72,56 @@ if ("y" %in% centerAxis) { yr <- c(-max(abs(yr), na.rm = TRUE), max(abs(yr), na.rm = TRUE)) } - gg <- ggplot2::ggplot(res, ggplot2::aes(x = .data[[xCol]], y = .data[[yCol]])) + - ggplot2::theme_minimal() + ggplot2::coord_cartesian(xlim = xr, ylim = yr) + - ggplot2::theme(axis.text = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14), - title = ggplot2::element_text(size = 14)) + - ggplot2::labs(x = xLabel, y = yLabel) + gg <- ggplot(res, aes(x = .data[[xCol]], + y = .data[[yCol]])) + + theme_minimal() + + coord_cartesian(xlim = xr, ylim = yr) + + theme(axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + title = element_text(size = 14)) + + labs(x = xLabel, y = yLabel) if (!is.null(labelCol)) { gg <- gg + - ggplot2::aes(label = .data[[labelCol]]) + aes(label = .data[[labelCol]]) } if (pointSize == "large") { gg <- gg + - ggplot2::geom_point(fill = "lightgrey", color = "grey", pch = 21, size = 1.5) + geom_point(fill = "lightgrey", color = "grey", + pch = 21, size = 1.5) } else { gg <- gg + - ggplot2::geom_point(color = "black", size = 0.25, alpha = 0.5) + geom_point(color = 
"black", size = 0.25, alpha = 0.5) } if (any(res[[colorCol]] < colorThreshold)) { gg <- gg + - ggplot2::labs(subtitle = paste0("Points are indicated in red if ", - colorCol, "<", colorThreshold)) + labs(subtitle = paste0("Points are indicated in red if ", + colorCol, "<", colorThreshold)) if (pointSize == "large") { gg <- gg + - ggplot2::geom_point( - data = res[res[[colorCol]] < colorThreshold, , drop = FALSE], + geom_point( + data = res[res[[colorCol]] < colorThreshold, , + drop = FALSE], fill = "red", color = "grey", pch = 21, size = 1.5 ) } else { gg <- gg + - ggplot2::geom_point( - data = res[res[[colorCol]] < colorThreshold, , drop = FALSE], + geom_point( + data = res[res[[colorCol]] < colorThreshold, , + drop = FALSE], color = "red", size = 0.25 ) } } if (interactivePlot) { - plotly::ggplotly(gg) + plotly::ggplotly(gg) # nocov } else { if (!is.null(labelCol) && length(labelValues) > 0) { - .assertVector(x = labelValues, type = "character", validValues = res[[labelCol]]) + .assertVector(x = labelValues, type = "character", + validValues = res[[labelCol]]) gg <- gg + - ggrepel::geom_text_repel( - data = res[res[[labelCol]] %in% labelValues, , drop = FALSE], + geom_text_repel( + data = res[res[[labelCol]] %in% labelValues, , + drop = FALSE], max.overlaps = Inf, size = 4, min.segment.length = 0.1) } gg @@ -124,7 +131,7 @@ .getColName <- function(res, validValues, aspect) { .assertVector(x = res, type = "data.frame") colName <- grep(paste(paste0("^", validValues, "$"), collapse = "|"), - colnames(res), value = TRUE) + colnames(res), value = TRUE) if (length(colName) == 0) { stop("No suitable column found for ", aspect, ", one of the ", "following expected: ", paste(validValues, collapse = ", ")) @@ -175,15 +182,17 @@ #' @export #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' design <- model.matrix(~ Replicate + Condition, -#' data = 
SummarizedExperiment::colData(se)) +#' data = colData(se)) #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") #' plotMeanDiff(res, pointSize = "large", nTopToLabel = 3) #' plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, - padjCol = NULL, padjThreshold = 0.05, pointSize = "small", + padjCol = NULL, padjThreshold = 0.05, + pointSize = "small", interactivePlot = FALSE, nTopToLabel = 0) { .assertScalar(x = nTopToLabel, type = "numeric", rngIncl = c(0, Inf)) .assertVector(x = res, type = "data.frame") @@ -202,16 +211,17 @@ plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, } if (is.null(padjCol)) { padjCol <- .getColName(res, validValues = c("FDR", "adj.P.Val"), - aspect = "highlighting") + aspect = "highlighting") } .assertScalar(pvalCol, type = "character", validValues = colnames(res)) res$feature <- rownames(res) if (nTopToLabel > 0) { - labelValues <- res[order(res[[pvalCol]]), "feature"][seq_len(nTopToLabel)] + labelValues <- res[order(res[[pvalCol]]), + "feature"][seq_len(nTopToLabel)] } else { labelValues <- c() } - + .plotScatter(res = res, xCol = meanCol, yCol = logFCCol, xLabel = meanCol, yLabel = logFCCol, colorCol = padjCol, @@ -256,10 +266,11 @@ plotMeanDiff <- function(res, meanCol = NULL, logFCCol = NULL, pvalCol = NULL, #' @export #' #' @examples +#' library(SummarizedExperiment) #' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", #' package = "mutscan"))[1:200, ] #' design <- model.matrix(~ Replicate + Condition, -#' data = SummarizedExperiment::colData(se)) +#' data = colData(se)) #' res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") #' plotVolcano(res, pointSize = "large", nTopToLabel = 3) #' @@ -275,7 +286,7 @@ plotVolcano <- function(res, logFCCol = NULL, pvalCol = NULL, padjCol = NULL, } if (is.null(pvalCol)) { pvalCol <- .getColName(res, validValues = c("PValue", "P.Value"), - aspect = "y-axis") + aspect = "y-axis") } if (is.null(padjCol)) 
{ padjCol <- .getColName(res, validValues = c("FDR", "adj.P.Val"), @@ -285,11 +296,12 @@ plotVolcano <- function(res, logFCCol = NULL, pvalCol = NULL, padjCol = NULL, res$negLog10P <- -log10(res[[pvalCol]]) res$feature <- rownames(res) if (nTopToLabel > 0) { - labelValues <- res[order(res[[pvalCol]]), "feature"][seq_len(nTopToLabel)] + labelValues <- res[order(res[[pvalCol]]), + "feature"][seq_len(nTopToLabel)] } else { labelValues <- c() } - + .plotScatter(res = res, xCol = logFCCol, yCol = "negLog10P", xLabel = logFCCol, yLabel = paste0("-log10(", pvalCol, ")"), colorCol = padjCol, diff --git a/R/plotTotals.R b/R/plotTotals.R index 7bcf732..30649d9 100644 --- a/R/plotTotals.R +++ b/R/plotTotals.R @@ -15,7 +15,7 @@ #' #' @importFrom ggplot2 ggplot theme_minimal theme element_text labs #' geom_bar scale_fill_discrete aes -#' @importFrom SummarizedExperiment assay rowData +#' @importFrom SummarizedExperiment assay rowData assayNames #' @importFrom rlang .data #' #' @examples @@ -26,23 +26,25 @@ plotTotals <- function(se, selAssay = "counts", groupBy = NULL) { .assertVector(x = se, type = "SummarizedExperiment") .assertScalar(x = selAssay, type = "character", - validValues = SummarizedExperiment::assayNames(se)) + validValues = assayNames(se)) if (!is.null(groupBy)) { - .assertScalar(x = groupBy, type = "character", - validValues = colnames(SummarizedExperiment::rowData(se))) + .assertScalar( + x = groupBy, type = "character", + validValues = colnames(rowData(se))) } - + ## Extract the assay matrix and reformat relevant parts of it for plotting - selAssayMat <- SummarizedExperiment::assay(se, selAssay) + selAssayMat <- assay(se, selAssay) if (!is.null(groupBy)) { df <- do.call( rbind, - lapply(unique(SummarizedExperiment::rowData(se)[[groupBy]]), + lapply(unique(rowData(se)[[groupBy]]), function(rn) { data.frame( names = colnames(se), category = rn, - total = colSums(selAssayMat[SummarizedExperiment::rowData(se)[[groupBy]] == rn, ]) + total = colSums( + 
selAssayMat[rowData(se)[[groupBy]] == rn, ]) ) }) ) @@ -52,21 +54,22 @@ plotTotals <- function(se, selAssay = "counts", groupBy = NULL) { total = colSums(selAssayMat)) } - gg <- ggplot2::ggplot(df, ggplot2::aes(x = .data$names, y = .data$total)) + - ggplot2::theme_minimal() + - ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, hjust = 1, - vjust = 0.5, size = 12), - axis.text.y = ggplot2::element_text(size = 12), - axis.title = ggplot2::element_text(size = 14)) + - ggplot2::labs(x = "", y = paste0("Total (", selAssay, ")")) + gg <- ggplot(df, aes(x = .data$names, y = .data$total)) + + theme_minimal() + + theme( + axis.text.x = element_text(angle = 90, hjust = 1, + vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), + axis.title = element_text(size = 14)) + + labs(x = "", y = paste0("Total (", selAssay, ")")) if (!is.null(groupBy)) { gg <- gg + - ggplot2::geom_bar(stat = "identity", position = "stack", - ggplot2::aes(fill = .data$category)) + - ggplot2::scale_fill_discrete(name = groupBy) + geom_bar(stat = "identity", position = "stack", + aes(fill = .data$category)) + + scale_fill_discrete(name = groupBy) } else { gg <- gg + - ggplot2::geom_bar(stat = "identity", position = "stack") + geom_bar(stat = "identity", position = "stack") } gg diff --git a/R/relabelMutPositions.R b/R/relabelMutPositions.R index 2a14d6f..a96107d 100644 --- a/R/relabelMutPositions.R +++ b/R/relabelMutPositions.R @@ -1,9 +1,9 @@ #' Relabel the positions of mutations in the designated ID #' #' @param se SummarizedExperiment object, with row names of the form -#' XX{.}AA{.}NNN, where XX is the name of the reference sequence, AA is the -#' position of the mutated codon, and NNN is the mutated codon or amino -#' acid. {.} is the delimiter, to be specified in the +#' XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, +#' AA is the position of the mutated codon, and NNN is the mutated codon +#' or amino acid. 
\{.\} is the delimiter, to be specified in the #' \code{mutNameDelimiter} argument. For rows corresponding to sequences #' with multiple mutated codons, the row names contain multiple names of #' the form above in a single string, separated by "_". @@ -15,7 +15,8 @@ #' \item name The new name for the codon (will replace AA in the mutation #' name, if the reference sequence matches seqname) #' } -#' @param mutNameDelimiter The delimiter used in the mutation name ({.} above). +#' @param mutNameDelimiter The delimiter used in the mutation name +#' (\{.\} above). #' #' @author Charlotte Soneson #' @export @@ -24,7 +25,7 @@ #' #' @importFrom S4Vectors unstrsplit #' @importFrom utils relist -#' @importFrom BiocGenerics rownames +#' @importFrom BiocGenerics rownames rownames<- #' #' @examples #' x <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", @@ -38,12 +39,13 @@ relabelMutPositions <- function(se, conversionTable, mutNameDelimiter = ".") { .assertVector(x = se, type = "SummarizedExperiment") .assertVector(x = colnames(conversionTable), type = "character") .assertScalar(x = mutNameDelimiter, type = "character") - stopifnot(all(c("position", "name", "seqname") %in% colnames(conversionTable))) + stopifnot(all(c("position", "name", "seqname") %in% + colnames(conversionTable))) conversionTable$position <- as.character(conversionTable$position) - spl <- base::strsplit(rownames(se), "_") - unl <- base::unlist(spl) - unl <- lapply(base::strsplit(unl, mutNameDelimiter, fixed = TRUE), + spl <- strsplit(rownames(se), "_") + unl <- unlist(spl) + unl <- lapply(strsplit(unl, mutNameDelimiter, fixed = TRUE), function(w) { idx <- which(conversionTable$seqname == w[1] & conversionTable$position == w[2]) @@ -52,9 +54,9 @@ relabelMutPositions <- function(se, conversionTable, mutNameDelimiter = ".") { } w }) - unl <- S4Vectors::unstrsplit(unl, sep = mutNameDelimiter) - spl <- utils::relist(unl, skeleton = spl) - spl <- S4Vectors::unstrsplit(spl, "_") - BiocGenerics::rownames(se) 
<- spl + unl <- unstrsplit(unl, sep = mutNameDelimiter) + spl <- relist(unl, skeleton = spl) + spl <- unstrsplit(spl, "_") + rownames(se) <- spl se } diff --git a/R/summarizeExperiment.R b/R/summarizeExperiment.R index 125ae48..cc8a7a8 100644 --- a/R/summarizeExperiment.R +++ b/R/summarizeExperiment.R @@ -18,8 +18,8 @@ #' \code{coldata}. #' @param coldata A \code{data.frame} with at least one column "Name", which #' will be used to link to objects in \code{x}. A potentially subset and -#' reordered version of \code{coldata} is stored in the \code{colData} of the -#' returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}. +#' reordered version of \code{coldata} is stored in the \code{colData} of +#' the returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}. #' @param countType Either "reads" or "umis". If "reads", the "count" assay of #' the returned object will contain the observed number of reads for each #' sequence (pair). If "umis", the "count" assay will contain the number of @@ -28,26 +28,26 @@ #' @return A \code{\link[SummarizedExperiment]{SummarizedExperiment}} \code{x} #' with #' \describe{ -#' \item{assays(x)$counts}{containing the observed number of sequences or -#' sequence pairs (if \code{countType} = "reads"), or the observed number of -#' unique UMIs for each sequence or sequence pair (if \code{countType} = -#' "umis").} -#' \item{rowData(x)}{containing the unique sequences or sequence pairs.} -#' \item{colData(x)}{containing the metadata provided by \code{coldata}.} +#' \item{assays(x)$counts}{containing the observed number of sequences +#' or sequence pairs (if \code{countType} = "reads"), or the observed +#' number of unique UMIs for each sequence or sequence pair (if +#' \code{countType} = "umis").} +#' \item{rowData(x)}{containing the unique sequences or sequence +#' pairs.} +#' \item{colData(x)}{containing the metadata provided by +#' \code{coldata}.} #' } #' #' @author Michael Stadler, Charlotte Soneson #' #' @export #' 
-#' @importFrom SummarizedExperiment SummarizedExperiment colData -#' @importFrom BiocGenerics paste +#' @importFrom SummarizedExperiment SummarizedExperiment #' @importFrom S4Vectors DataFrame #' @importFrom IRanges IntegerList #' @importFrom methods is new as -#' @importFrom dplyr bind_rows distinct left_join %>% mutate filter group_by +#' @importFrom dplyr bind_rows distinct left_join mutate filter group_by #' summarize -#' @importFrom tidyr unite separate_rows #' @importFrom rlang .data #' @importFrom stats setNames #' @@ -58,8 +58,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Output sample @@ -68,8 +69,9 @@ #' package = "mutscan"), #' elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), #' constantForward = "AACCGGAGGAGGGAGCTG", -#' wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", -#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), +#' wildTypeForward = c(FOS = paste0( +#' "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", +#' "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), #' nbrMutatedCodonsMaxForward = 1 #' ) #' ## Combine @@ -83,14 +85,15 @@ #' se #' summarizeExperiment <- function(x, coldata, countType = "umis") { - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ ## Pre-flight checks - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ if (!is(x, 
"list") || is.null(names(x))) { stop("'x' must be a named list") } if (any(duplicated(names(x)))) { - stop("duplicated names in 'x' (e.g. technical replicated to be merged) is not supported yet") + stop("duplicated names in 'x' (e.g. technical replicated to be ", + "merged) is not supported yet") } if (!is(coldata, "data.frame") || !("Name" %in% colnames(coldata))) { stop("'coldata' must be a data.frame with at least one column, named ", @@ -98,25 +101,29 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { } .assertScalar(x = countType, type = "character", validValues = c("umis", "reads")) - ## If no UMI sequences were given, then countType = "umis" should not be allowed + ## If no UMI sequences were given, then countType = "umis" should not be + ## allowed if (countType == "umis" && - !(all(vapply(x, function(w) .hasReadComponent(w$parameters$elementsForward, "U") || - .hasReadComponent(w$parameters$elementsReverse, "U"), FALSE)))) { + !(all(vapply(x, function(w) { + .hasReadComponent(w$parameters$elementsForward, "U") || + .hasReadComponent(w$parameters$elementsReverse, "U") + }, FALSE)))) { stop("'countType' is set to 'umis', but no UMI sequences ", "were provided when quantifying. 
", "Set 'countType' to 'reads' instead.") } ## Get the mutNameDelimiters, and make sure that they are the same - mutnamedel <- unique(vapply(x, function(w) w$parameters$mutNameDelimiter, "")) + mutnamedel <- unique(vapply(x, + function(w) w$parameters$mutNameDelimiter, "")) if (length(mutnamedel) > 1) { stop("All samples must have the same 'mutNameDelimiter'") } - + coldata$Name <- as.character(coldata$Name) - - ## -------------------------------------------------------------------------- + + ## ------------------------------------------------------------------------ ## Link elements in x with coldata - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ nms <- intersect(names(x), coldata$Name) if (length(nms) == 0) { stop("names in 'x' do not match 'coldata$Name'") @@ -128,7 +135,7 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { } x <- x[nms] coldata <- coldata[match(nms, coldata$Name), , drop = FALSE] - + ## All sample names allSamples <- names(x) names(allSamples) <- allSamples @@ -138,11 +145,11 @@ summarizeExperiment <- function(x, coldata, countType = "umis") { ## For a given sample, the same mutant name can correspond to multiple ## sequences, separated by , ## ------------------------------------------------------------------------ - tmpdf <- do.call(dplyr::bind_rows, lapply(x, function(w) w$summaryTable)) + tmpdf <- do.call(bind_rows, lapply(x, function(w) w$summaryTable)) - allSequences <- mergeValues(tmpdf$mutantName, tmpdf$sequence) %>% - stats::setNames(c("mutantName", "sequence")) - allSequences <- S4Vectors::DataFrame(allSequences) + allSequences <- mergeValues(tmpdf$mutantName, tmpdf$sequence) |> + setNames(c("mutantName", "sequence")) + allSequences <- DataFrame(allSequences) ## ------------------------------------------------------------------------ ## Add info about nbr mutated bases/codons/AAs, @@ -152,64 +159,69 @@ 
summarizeExperiment <- function(x, coldata, countType = "umis") { ## ------------------------------------------------------------------------ for (v in intersect(c("nbrMutBases", "nbrMutCodons", "nbrMutAAs", "mutantNameBase", "mutantNameBaseHGVS", - "mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS", - "sequenceAA", "mutationTypes", + "mutantNameCodon", "mutantNameAA", + "mutantNameAAHGVS", "sequenceAA", "mutationTypes", "varLengths"), colnames(tmpdf))) { - tmp <- mergeValues(tmpdf$mutantName, tmpdf[[v]]) %>% - stats::setNames(c("mutantName", v)) + tmp <- mergeValues(tmpdf$mutantName, tmpdf[[v]]) |> + setNames(c("mutantName", v)) allSequences[[v]] <- tmp[[v]][match(allSequences$mutantName, tmp$mutantName)] if (v %in% c("nbrMutBases", "nbrMutCodons", "nbrMutAAs")) { - tmpList <- methods::as( - lapply(strsplit(allSequences[[v]], ","), function(w) sort(as.integer(w))), + tmpList <- as( + lapply(strsplit(allSequences[[v]], ","), + function(w) sort(as.integer(w))), "IntegerList") allSequences[[paste0("min", sub("^n", "N", v))]] <- min(tmpList) allSequences[[paste0("max", sub("^n", "N", v))]] <- max(tmpList) } } - - ## -------------------------------------------------------------------------- + + ## ------------------------------------------------------------------------ ## Create a sparse matrix - ## -------------------------------------------------------------------------- + ## ------------------------------------------------------------------------ countCol <- ifelse(countType == "umis", "nbrUmis", "nbrReads") - tmp <- do.call(dplyr::bind_rows, lapply(allSamples, function(s) { + tmp <- do.call(bind_rows, lapply(allSamples, function(s) { st <- x[[s]]$summaryTable data.frame(i = match(st$mutantName, allSequences$mutantName), j = match(s, allSamples), x = as.numeric(st[, countCol])) })) - countMat <- methods::new("dgTMatrix", i = tmp$i - 1L, j = tmp$j - 1L, - x = tmp$x, Dim = c(nrow(allSequences), length(allSamples))) + countMat <- new("dgTMatrix", i = tmp$i - 1L, j = 
tmp$j - 1L, + x = tmp$x, Dim = c(nrow(allSequences), + length(allSamples))) ## Convert to dgCMatrix for easier processing downstream - countMat <- methods::as(countMat, "CsparseMatrix") - - ## -------------------------------------------------------------------------- + countMat <- as(countMat, "CsparseMatrix") + + ## ------------------------------------------------------------------------ ## Create the colData - ## -------------------------------------------------------------------------- - addMeta <- do.call(dplyr::bind_rows, lapply(allSamples, function(s) { - x[[s]]$filterSummary %>% dplyr::mutate(Name = s) + ## ------------------------------------------------------------------------ + addMeta <- do.call(bind_rows, lapply(allSamples, function(s) { + x[[s]]$filterSummary |> mutate(Name = s) })) - coldata <- dplyr::left_join(coldata, addMeta, by = "Name") - - ## -------------------------------------------------------------------------- + coldata <- left_join(coldata, addMeta, by = "Name") + + ## ------------------------------------------------------------------------ ## Create SummarizedExperiment object - ## -------------------------------------------------------------------------- - se <- SummarizedExperiment::SummarizedExperiment( + ## ------------------------------------------------------------------------ + se <- SummarizedExperiment( assays = list(counts = countMat), colData = coldata[match(allSamples, coldata$Name), , drop = FALSE], rowData = allSequences, - metadata = list(parameters = lapply(allSamples, function(w) x[[w]]$parameters), - errorStatistics = lapply(allSamples, function(w) x[[w]]$errorStatistics), - countType = countType, - mutNameDelimiter = mutnamedel) + metadata = list( + parameters = lapply(allSamples, + function(w) x[[w]]$parameters), + errorStatistics = lapply(allSamples, + function(w) x[[w]]$errorStatistics), + countType = countType, + mutNameDelimiter = mutnamedel) ) - + if (!any(allSequences$mutantName == "")) { rownames(se) <- 
allSequences$mutantName } colnames(se) <- allSamples - + return(se) -} \ No newline at end of file +} diff --git a/R/utils.R b/R/utils.R index 38852e2..7e40be8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -74,11 +74,13 @@ } args <- lapply(mycall, as.character)[-1] xname <- if ("x" %in% names(args)) args$x else "argument" - + ## Check arguments stopifnot(is.null(type) || (length(type) == 1L && is.character(type))) - stopifnot(is.null(rngIncl) || (length(rngIncl) == 2L && is.numeric(rngIncl))) - stopifnot(is.null(rngExcl) || (length(rngExcl) == 2L && is.numeric(rngExcl))) + stopifnot(is.null(rngIncl) || (length(rngIncl) == 2L && + is.numeric(rngIncl))) + stopifnot(is.null(rngExcl) || (length(rngExcl) == 2L && + is.numeric(rngExcl))) stopifnot(is.null(len) || (length(len) == 1L && is.numeric(len))) stopifnot(is.null(rngLen) || (length(rngLen) == 2L && is.numeric(rngLen))) stopifnot(is.logical(allowNULL) && length(allowNULL) == 1L) @@ -107,13 +109,14 @@ type <- "numeric" } - if (!is.null(type) && !methods::is(x, type)) { + if (!is.null(type) && !is(x, type)) { stop("'", xname, "' must be of class '", type, "'", call. = FALSE) } if (!is.null(rngIncl)) { if (!is.null(validValues)) { - if (any((x < rngIncl[1] | x > rngIncl[2]) & !(x %in% validValues))) { + if (any((x < rngIncl[1] | x > rngIncl[2]) & + !(x %in% validValues))) { stop("'", xname, "' must be within [", rngIncl[1], ",", rngIncl[2], "] (inclusive), or one of: ", vvPrint, call. = FALSE) @@ -126,7 +129,8 @@ } } else if (!is.null(rngExcl)) { if (!is.null(validValues)) { - if (any((x <= rngExcl[1] | x >= rngExcl[2]) & !(x %in% validValues))) { + if (any((x <= rngExcl[1] | x >= rngExcl[2]) & + !(x %in% validValues))) { stop("'", xname, "' must be within (", rngExcl[1], ",", rngExcl[2], ") (exclusive), or one of: ", vvPrint, call. = FALSE) @@ -144,11 +148,11 @@ } } - + if (!is.null(len) && length(x) != len) { stop("'", xname, "' must have length ", len, call. 
= FALSE) } - + if (!is.null(rngLen) && (length(x) < rngLen[1] || length(x) > rngLen[2])) { stop("length of '", xname, "' must be within [", rngLen[1], ",", rngLen[2], "] (inclusive)", call. = FALSE) @@ -190,20 +194,22 @@ caller <- deparse(sys.calls()[[sys.nframe() - 1]]) callerfunc <- sub("\\(.+$", "", caller) haveBioc <- requireNamespace("BiocManager", quietly = TRUE) - msg <- paste0("The package", ifelse(sum(!avail) > 1, "s '", " '"), - paste(sub("^[^/]+/", "", pkgs[!avail]), collapse = "', '"), - "' ", - ifelse(sum(!avail) > 1, "are", "is"), " required for ", - callerfunc, "(), but not installed.\n") + msg <- paste0( + "The package", ifelse(sum(!avail) > 1, "s '", " '"), + paste(sub("^[^/]+/", "", pkgs[!avail]), collapse = "', '"), + "' ", + ifelse(sum(!avail) > 1, "are", "is"), " required for ", + callerfunc, "(), but not installed.\n") if (suggestInstallation) { - msg <- paste0(msg, - "Install ", ifelse(sum(!avail) > 1, "them", "it"), " using:\n", - ifelse(haveBioc, "", "install.packages(\"BiocManager\")\n"), - "BiocManager::install(c(\"", - paste(pkgs[!avail], collapse = "\", \""), "\"))") + msg <- paste0( + msg, + "Install ", ifelse(sum(!avail) > 1, "them", "it"), " using:\n", + ifelse(haveBioc, "", "install.packages(\"BiocManager\")\n"), + "BiocManager::install(c(\"", + paste(pkgs[!avail], collapse = "\", \""), "\"))") } stop(msg, call. 
= FALSE) } invisible(TRUE) -} \ No newline at end of file +} diff --git a/inst/scripts/generateValuesForUnitTestsTrans.R b/inst/scripts/generateValuesForUnitTestsTrans.R index e860b53..47043be 100644 --- a/inst/scripts/generateValuesForUnitTestsTrans.R +++ b/inst/scripts/generateValuesForUnitTestsTrans.R @@ -479,7 +479,7 @@ processReadsTrans <- function(Ldef) { } else { fqseq <- as.character(varForward) } - tbl <- as.data.frame(table(fqseq)) %>% dplyr::arrange(desc(Freq), fqseq) %>% + tbl <- as.data.frame(table(fqseq)) |> dplyr::arrange(desc(Freq), fqseq) |> dplyr::mutate(fqseq = as.character(fqseq)) tbl$umis <- "" for (i in seq_len(nrow(tbl))) { diff --git a/inst/templates/qc_report_template.Rmd b/inst/templates/qc_report_template.Rmd index 1279198..6ebe3d8 100644 --- a/inst/templates/qc_report_template.Rmd +++ b/inst/templates/qc_report_template.Rmd @@ -53,14 +53,15 @@ extradf <- do.call( function(x) paste(x, collapse = "; "))) }) ) -DT::datatable(dplyr::full_join(tmpdf, extradf, by = "Name"), - extensions = "FixedColumns", - rownames = FALSE, - options = list( - paging = TRUE, searching = TRUE, info = FALSE, - pageLength = 20, - sort = TRUE, scrollX = TRUE, fixedColumns = list(leftColumns = 1) - )) %>% +DT::datatable( + dplyr::full_join(tmpdf, extradf, by = "Name"), + extensions = "FixedColumns", + rownames = FALSE, + options = list( + paging = TRUE, searching = TRUE, info = FALSE, + pageLength = 20, + sort = TRUE, scrollX = TRUE, fixedColumns = list(leftColumns = 1) + )) %>% formatStyle("Name", "vertical-align" = "center") ``` @@ -69,15 +70,17 @@ reads remaining after each filtering step performed by `mutscan`, as well as the fraction of reads that are filtered out by each step. 
```{r, fig.height = 1.5 * ncol(params$se), fig.width = 10} -tryCatch({mutscan::plotFiltering(params$se, valueType = "reads", onlyActiveFilters = TRUE, - displayNumbers = TRUE, plotType = "remaining", facetBy = "sample")}, - error = function(e) message("Couldn't generate filtering plots.")) +tryCatch({mutscan::plotFiltering( + params$se, valueType = "reads", onlyActiveFilters = TRUE, + displayNumbers = TRUE, plotType = "remaining", facetBy = "sample")}, + error = function(e) message("Couldn't generate filtering plots.")) ``` ```{r, fig.height = 1.5 * ncol(params$se), fig.width = 10} -tryCatch({mutscan::plotFiltering(params$se, valueType = "fractions", onlyActiveFilters = TRUE, - displayNumbers = TRUE, plotType = "filtered", facetBy = "step")}, - error = function(e) message("Couldn't generate filtering plots.")) +tryCatch({mutscan::plotFiltering( + params$se, valueType = "fractions", onlyActiveFilters = TRUE, + displayNumbers = TRUE, plotType = "filtered", facetBy = "step")}, + error = function(e) message("Couldn't generate filtering plots.")) ``` # Pairs plot @@ -109,15 +112,17 @@ arranged in decreasing order, and as a density plot. Note that both plots display the values on a log scale; hence any zero values are not shown. 
```{r, warning = FALSE} -tryCatch({mutscan::plotDistributions(params$se, selAssay = "counts", pseudocount = 1, - groupBy = NULL, plotType = "knee", facet = FALSE)}, - error = function(e) message("Couldn't generate distribution plot.")) +tryCatch({mutscan::plotDistributions( + params$se, selAssay = "counts", pseudocount = 1, + groupBy = NULL, plotType = "knee", facet = FALSE)}, + error = function(e) message("Couldn't generate distribution plot.")) ``` ```{r, warning = FALSE} -tryCatch({mutscan::plotDistributions(params$se, selAssay = "counts", pseudocount = 1, - groupBy = NULL, plotType = "density", facet = FALSE)}, - error = function(e) message("Couldn't generate distribution plot.")) +tryCatch({mutscan::plotDistributions( + params$se, selAssay = "counts", pseudocount = 1, + groupBy = NULL, plotType = "density", facet = FALSE)}, + error = function(e) message("Couldn't generate distribution plot.")) ``` # Session info diff --git a/man/calculateRelativeFC.Rd b/man/calculateRelativeFC.Rd index a65684a..7d07a2d 100644 --- a/man/calculateRelativeFC.Rd +++ b/man/calculateRelativeFC.Rd @@ -27,10 +27,10 @@ same order as the columns in \code{se}.} \item{contrast}{Numeric contrast to test with edgeR or limma.} \item{WTrows}{Vector of row names that will be used as the reference when -calculating logFCs and statistics. If more than one value is provided, the -sum of the corresponding counts is used to generate offsets. If NULL, -offsets will be defined as the effective library sizes (using TMM -normalization factors).} +calculating logFCs and statistics. If more than one value is provided, +the sum of the corresponding counts is used to generate offsets. If +\code{NULL}, offsets will be defined as the effective library sizes +(using TMM normalization factors).} \item{selAssay}{Assay to select from \code{se} for the analysis.} @@ -42,9 +42,9 @@ limma. 
In this case, the results also contain the standard errors of the logFCs.} \item{normMethod}{Character scalar indicating which normalization method -should be used to calculate size factors. Should be either \code{"TMM"} or -\code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} or -\code{"sum"} when \code{WTrows} is provided.} +should be used to calculate size factors. Should be either \code{"TMM"} +or \code{"csaw"} when \code{WTrows} is \code{NULL}, and \code{"geomean"} +or \code{"sum"} when \code{WTrows} is provided.} } \value{ A \code{data.frame} with output from the statistical testing @@ -56,10 +56,11 @@ either limma or the Negative Binomial quasi-likelihood framework of edgeR. The observed counts for the WT variants can be used as offsets in the model. } \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] -design <- stats::model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) +design <- model.matrix(~ Replicate + Condition, + data = colData(se)) ## Calculate "absolute" log-fold changes with edgeR res <- calculateRelativeFC(se, design, coef = "Conditioncis_output", diff --git a/man/collapseMutantsBySimilarity.Rd b/man/collapseMutantsBySimilarity.Rd index 527877b..d2b57bf 100644 --- a/man/collapseMutantsBySimilarity.Rd +++ b/man/collapseMutantsBySimilarity.Rd @@ -41,8 +41,8 @@ variants.} similar sequences. If the value is in [0, 1), it defines the maximal Hamming distance in terms of a fraction of sequence length: (\code{round(collapseMaxDist * nchar(sequence))}). -A value greater or equal to 1 is rounded and directly used as the maximum -allowed Hamming distance. Note that sequences can only be +A value greater or equal to 1 is rounded and directly used as the +maximum allowed Hamming distance. 
Note that sequences can only be collapsed if they are all of the same length.} \item{collapseMinScore}{Numeric scalar, indicating the minimum score for the @@ -66,16 +66,18 @@ A \code{\link[SummarizedExperiment]{SummarizedExperiment}} where } \description{ These functions can be used to collapse variants, either by similarity or -according to a pre-defined grouping. The functions \code{collapseMutants} and -\code{collapseMutantsByAA} assume that a grouping variable is available as a -column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience -function for the case when this column is "mutantNameAA", and is provided -for backwards compatibility). The \code{collapseMutantsBySimilarity} will -generate the grouping variable based on user-provided thresholds on the -sequence similarity (defined by the Hamming distance), and subsequently -collapse based on the derived grouping. +according to a pre-defined grouping. The functions \code{collapseMutants} +and \code{collapseMutantsByAA} assume that a grouping variable is available +as a column in \code{rowData(se)} (\code{collapseMutantsByAA} is a +convenience function for the case when this column is "mutantNameAA", and +is provided for backwards compatibility). The +\code{collapseMutantsBySimilarity} will generate the grouping variable +based on user-provided thresholds on the sequence similarity (defined by +the Hamming distance), and subsequently collapse based on the derived +grouping. 
} \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] ## The rows of this object correspond to individual codon variants @@ -89,7 +91,7 @@ dim(sec) head(rownames(sec)) ## The mutantName column contains the individual codon variants that were ## collapsed -head(SummarizedExperiment::rowData(sec)) +head(rowData(sec)) ## Collapse similar sequences sec2 <- collapseMutantsBySimilarity( @@ -98,9 +100,9 @@ sec2 <- collapseMutantsBySimilarity( collapseMinScore = 0, collapseMinRatio = 0) dim(sec2) head(rownames(sec2)) -head(SummarizedExperiment::rowData(sec2)) +head(rowData(sec2)) ## collapsed count matrix -SummarizedExperiment::assay(sec2, "counts") +assay(sec2, "counts") } \author{ diff --git a/man/digestFastqs.Rd b/man/digestFastqs.Rd index a871108..9bb3bfd 100644 --- a/man/digestFastqs.Rd +++ b/man/digestFastqs.Rd @@ -47,9 +47,6 @@ digestFastqs( mutNameDelimiter = ".", constantMaxDistForward = -1, constantMaxDistReverse = -1, - variableCollapseMaxDist = deprecated(), - variableCollapseMinReads = deprecated(), - variableCollapseMinRatio = deprecated(), umiCollapseMaxDist = 0, filteredReadsFastqForward = "", filteredReadsFastqReverse = "", @@ -110,8 +107,8 @@ but there can be at most one occurrence of P. If a given letter is included multiple times, the corresponding sequences will be concatenated in the output.} -\item{elementLengthsForward, elementLengthsReverse}{Numeric vectors containing -the lengths of each read component from +\item{elementLengthsForward, elementLengthsReverse}{Numeric vectors +containing the lengths of each read component from \code{elementsForward}/\code{elementsReverse}, respectively. If the length of one element is set to -1, it will be inferred from the other lengths (as the remainder of the read). At most one number (or one @@ -119,11 +116,11 @@ number on each side of the primer P) can be set to -1. 
The indicated length of the primer is not used (instead it's inferred from the provided primer sequence) and can also be set to -1.} -\item{primerForward, primerReverse}{Character vectors, representing the primer -sequence(s) for forward/reverse reads, respectively. Only read pairs that -contain perfect matches to both the forward and reverse primers (if -given) will be retained. Multiple primers can be specified - they will be -considered in order and the first match will be used.} +\item{primerForward, primerReverse}{Character vectors, representing the +primer sequence(s) for forward/reverse reads, respectively. Only read +pairs that contain perfect matches to both the forward and reverse +primers (if given) will be retained. Multiple primers can be specified - +they will be considered in order and the first match will be used.} \item{wildTypeForward, wildTypeReverse}{Character scalars or named character vectors, the wild type sequence for the forward and reverse variable @@ -181,13 +178,14 @@ any mutated base has a Phred score lower than \code{mutatedPhredMin}, the read (pair) will be discarded.} \item{mutNameDelimiter}{Character scalar, the delimiter used in the naming -of mutants. Generally, mutants will be named as XX{.}YY{.}NNN, where XX -is the closest provided reference sequence, YY is the mutated base or +of mutants. Generally, mutants will be named as XX\{.\}YY\{.\}NNN, where +XX is the closest provided reference sequence, YY is the mutated base or codon number (depending on whether \code{nbrMutatedBases*} or \code{nbrMutatedCodons*} is specified), and NNN is the -mutated base or codon. Here, {.} is the provided \code{mutNameDelimiter}. -The delimiter must be a single character (not "_"), and can not appear -in any of the provided reference sequence names.} +mutated base or codon. Here, \{.\} is the provided +\code{mutNameDelimiter}. 
The delimiter must be a single character +(not "_"), and can not appear in any of the provided reference sequence +names.} \item{constantMaxDistForward, constantMaxDistReverse}{Numeric scalars, the maximum allowed Hamming distance between the extracted and expected @@ -195,17 +193,13 @@ constant sequence. If multiple constant sequences are provided, the most similar one is used. Reads with a larger distance to the expected constant sequence are discarded. If set to -1, no filtering is done.} -\item{variableCollapseMaxDist, variableCollapseMinReads, variableCollapseMinRatio}{Deprecated. Collapsing of variable sequences is no longer performed in -\code{digestFastqs}. Please use \code{collapseMutantsBySimilarity} -instead.} - \item{umiCollapseMaxDist}{Numeric scalar defining the tolerances for collapsing similar UMI sequences. If the value is in [0, 1), it defines the maximal Hamming distance in terms of a fraction of sequence length: (\code{round(umiCollapseMaxDist * nchar(umiSeq))}). -A value greater or equal to 1 is rounded and directly used as the maximum -allowed Hamming distance.} +A value greater or equal to 1 is rounded and directly used as the +maximum allowed Hamming distance.} \item{filteredReadsFastqForward, filteredReadsFastqReverse}{Character scalars, the names of a (pair of) FASTQ file(s) where filtered-out reads @@ -233,20 +227,21 @@ the \code{chunkSize} to be decreased.} A list with four entries: \describe{ \item{summaryTable}{A \code{data.frame} that contains, for each observed -mutation combination, the corresponding variable region sequences (or pair of -sequences), the number of observed such sequences, and the number of unique -UMIs observed for the sequence. It also has additional columns: 'maxNbrReads' -contains the number of reads for the most frequent observed sequence -represented by the feature (only relevant if similar variable regions are -collapsed). 
'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give the number of -mutated bases, codons or amino acids in each variant. Alternative variant -names based on base, codon or amino acid sequence are provided in columns -'mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In addition, -'mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino acid-based -names following the HGVS nomenclature (https://varnomen.hgvs.org/). Please -note that the provided reference sequence names are used for the HGVS -sequence identifiers. It is up to the user to use appropriately named -reference sequences in order to obtain valid HGVS variant names.} +mutation combination, the corresponding variable region sequences (or pair +of sequences), the number of observed such sequences, and the number of +unique UMIs observed for the sequence. It also has additional columns: +'maxNbrReads' contains the number of reads for the most frequent observed +sequence represented by the feature (only relevant if similar variable +regions are collapsed). 'nbrMutBases', 'nbrMutCodons' and 'nbrMutAAs' give +the number of mutated bases, codons or amino acids in each variant. +Alternative variant names based on base, codon or amino acid sequence are +provided in columns mutantNameBase', 'mutantNameCodon', 'mutantNameAA'. In +addition, mutantNameBaseHGVS' and 'mutantNameAAHGVS' give base- and amino +acid-based names following the HGVS nomenclature +(https://varnomen.hgvs.org/). Please note that the provided reference +sequence names are used for the HGVS sequence identifiers. It is up to the +user to use appropriately named reference sequences in order to obtain +valid HGVS variant names.} \item{filterSummary}{A \code{data.frame} that contains the number of input reads, the number of reads filtered out in the processing, and the number of retained reads. The filters are named according to the convention @@ -255,19 +250,19 @@ applied, and "filter" indicates the type of filter. 
Note that filters are applied successively, and the reads filtered out in one step are not considered for successive filtering steps.} \item{errorStatistics}{A \code{data.frame} that contains, for each Phred -quality score between 0 and 99, the number of bases in the extracted constant -sequences with that quality score that match/mismatch with the provided -reference constant sequence.} -\item{parameters}{A \code{list} with all parameter settings that were used in -the processing. Also contains the version of the package and the time of +quality score between 0 and 99, the number of bases in the extracted +constant sequences with that quality score that match/mismatch with the +provided reference constant sequence.} +\item{parameters}{A \code{list} with all parameter settings that were used +in the processing. Also contains the version of the package and the time of processing.} } } \description{ Read sequences for one or a pair of fastq files and digest them (extract umis, constant and variable parts, filter, extract mismatch information from -constant and count the observed unique variable parts). Alternatively, primer -sequences could be specified, in which case the sequence immediately +constant and count the observed unique variable parts). Alternatively, +primer sequences could be specified, in which case the sequence immediately following the primer will be considered the variable sequence. } \details{ @@ -288,8 +283,8 @@ The processing of a read pair goes as follows: either the forward or reverse read (or the merged read). \item Filter out the read (pair) if the number of Ns in the variable region exceeds \code{variableNMaxForward}/\code{variableNMaxReverse}. 
- \item Filter out the read (pair) if the number of Ns in the combined forward - and reverse UMI sequence exceeds \code{umiNMax} + \item Filter out the read (pair) if the number of Ns in the combined + forward and reverse UMI sequence exceeds \code{umiNMax} \item If one or more wild type sequences (for the variable region) are provided, find the mismatches between the (forward/reverse) variable region and the provided wild type sequence (if more than one wild type sequence is @@ -302,9 +297,9 @@ The processing of a read pair goes as follows: the codons encoded by \code{forbiddenMutatedCodonsForward}/\code{forbiddenMutatedCodonsReverse}. \item Assign a 'mutation name' to the read (pair). This name is a - combination of parts of the form XX{.}YY{.}NNN, where XX is the name of the - most similar reference sequence, YY is the mutated codon number, and NNN is - the mutated codon. {.} is a delimiter, specified via + combination of parts of the form XX\{.\}YY\{.\}NNN, where XX is the name of + the most similar reference sequence, YY is the mutated codon number, and + NNN is the mutated codon. \{.\} is a delimiter, specified via \code{mutNameDelimiter}. If no wildtype sequences are provided, the variable sequence will be used as the mutation name'. 
} @@ -317,7 +312,7 @@ number of reads, and the number of unique UMIs, for each variable sequence ## See the vignette for complete worked-out examples for different types of ## data sets -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, assume that the full read represents ## the variable region out <- digestFastqs( @@ -330,7 +325,7 @@ head(out$summaryTable) ## Filter summary out$filterSummary -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, specify the read as a combination of ## UMI, constant region and variable region (skip the first base) out <- digestFastqs( @@ -346,7 +341,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a single-end data set, specify the read as a combination of ## UMI, constant region and variable region (skip the first base), provide ## the wild type sequence to compare the variable region to and limit the @@ -356,8 +351,9 @@ out <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Table with read counts and mutant information @@ -367,7 +363,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## 
----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a paired-end data set where both the forward and reverse reads ## contain the same variable region and thus should be merged to generate ## the final variable sequence, specify the reads as a combination of @@ -379,13 +375,15 @@ out <- digestFastqs( package = "mutscan"), fastqReverse = system.file("extdata", "cisInput_2.fastq.gz", package = "mutscan"), - mergeForwardReverse = TRUE, revComplForward = FALSE, revComplReverse = TRUE, + mergeForwardReverse = TRUE, + revComplForward = FALSE, revComplReverse = TRUE, elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), elementsReverse = "SUCVS", elementLengthsReverse = c(1, 7, 17, 96, -1), constantForward = "AACCGGAGGAGGGAGCTG", constantReverse = "GAGTTCATCCTGGCAGC", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Table with read counts and mutant information @@ -395,7 +393,7 @@ out$filterSummary ## Error statistics out$errorStatistics -## ----------------------------------------------------------------------- ## +## ---------------------------------------------------------------------- ## ## Process a paired-end data set where the forward and reverse reads ## contain variable regions corresponding to different proteins, and thus ## should not be merged, specify the reads as a combination of @@ -412,10 +410,12 @@ out <- digestFastqs( elementsReverse = "SUCV", elementLengthsReverse = c(1, 8, 20, 96), constantForward = "AACCGGAGGAGGGAGCTG", constantReverse = "GAAAAAGGAAGCTGGAGAGA", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - 
"TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), - wildTypeReverse = c(JUN = paste0("ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", - "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeReverse = c(JUN = paste0( + "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTC", + "GGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT")), nbrMutatedCodonsMaxForward = 1, nbrMutatedCodonsMaxReverse = 1 ) diff --git a/man/groupSimilarSequences.Rd b/man/groupSimilarSequences.Rd index 476ba46..07a3694 100644 --- a/man/groupSimilarSequences.Rd +++ b/man/groupSimilarSequences.Rd @@ -7,10 +7,10 @@ groupSimilarSequences( seqs, scores, - collapseMaxDist, - collapseMinScore, - collapseMinRatio, - verbose + collapseMaxDist = 0, + collapseMinScore = 0, + collapseMinRatio = 0, + verbose = FALSE ) } \arguments{ @@ -29,12 +29,12 @@ Hamming distance in terms of a fraction of sequence length: (\code{round(collapseMaxDist * nchar(sequence))}). A value greater or equal to 1 is rounded and directly used as the maximum allowed Hamming distance. Note that sequences can only be -collapsed if they are all of the same length.} +collapsed if they are all of the same length. The default value is 0.} \item{collapseMinScore}{Numeric scalar, indicating the minimum score required for a sequence to be considered as a representative for a group of similar sequences (i.e., to allow other sequences to be -collapsed into it).} +collapsed into it). The default value is 0.} \item{collapseMinRatio}{Numeric scalar. During collapsing of similar sequences, a low-frequency sequence will be collapsed diff --git a/man/linkMultipleVariants.Rd b/man/linkMultipleVariants.Rd index ccaf163..e24c43e 100644 --- a/man/linkMultipleVariants.Rd +++ b/man/linkMultipleVariants.Rd @@ -86,6 +86,27 @@ consistent (since this can affect e.g. filtering criteria). 
Note that to process individual variable sequences in the reverse read, you also need to swap the 'forward' and 'reverse' specifications (since \code{digestFastqs} requires a forward read). +} +\examples{ +fqFile <- system.file("extdata", "cisInput_1.fastq.gz", + package = "mutscan") +out <- linkMultipleVariants( + combinedDigestParams = list(fastqForward = fqFile, + elementsForward = "SVCV", + elementLengthsForward = c(1, 10, 18, 96)), + # the first variable sequence is the UMI + umi = list(fastqForward = fqFile, elementsForward = "SVCS", + elementLengthsForward = c(1, 10, 18, 96)), + # the second variable sequence is the amplicon variant + var = list(fastqForward = fqFile, elementsForward = "SSCV", + elementLengthsForward = c(1, 10, 18, 96), + collapseMaxDist = 3, collapseMinScore = 1) +) +# conversion tables +lapply(out$convSeparate, head) +# aggregated count table +head(out$countAggregated) + } \author{ Charlotte Soneson, Michael Stadler diff --git a/man/plotFiltering.Rd b/man/plotFiltering.Rd index 688ebd1..cf758eb 100644 --- a/man/plotFiltering.Rd +++ b/man/plotFiltering.Rd @@ -23,8 +23,8 @@ plot the number of reads, or the fraction of the total number of reads, that are retained after/filtered out in each filtering step.} \item{onlyActiveFilters}{Logical scalar, whether to only include the -active filters (i.e., where any read was filtered out in any of the samples). -Defaults to \code{TRUE}.} +active filters (i.e., where any read was filtered out in any of the +samples). 
Defaults to \code{TRUE}.} \item{displayNumbers}{Logical scalar, indicating whether to display the number (or fraction) of reads retained at every filtering step.} diff --git a/man/plotMeanDiff.Rd b/man/plotMeanDiff.Rd index 57f417f..2e83f2b 100644 --- a/man/plotMeanDiff.Rd +++ b/man/plotMeanDiff.Rd @@ -59,10 +59,11 @@ If \code{interactivePlot} is \code{TRUE}, a \code{plotly} Construct an MA (mean-difference) plot } \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] design <- model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) + data = colData(se)) res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") plotMeanDiff(res, pointSize = "large", nTopToLabel = 3) diff --git a/man/plotVolcano.Rd b/man/plotVolcano.Rd index f0a47dc..0b94d83 100644 --- a/man/plotVolcano.Rd +++ b/man/plotVolcano.Rd @@ -55,10 +55,11 @@ If \code{interactivePlot} is \code{TRUE}, a \code{plotly} Construct a volcano plot } \examples{ +library(SummarizedExperiment) se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds", package = "mutscan"))[1:200, ] design <- model.matrix(~ Replicate + Condition, - data = SummarizedExperiment::colData(se)) + data = colData(se)) res <- calculateRelativeFC(se, design, coef = "Conditioncis_output") plotVolcano(res, pointSize = "large", nTopToLabel = 3) diff --git a/man/relabelMutPositions.Rd b/man/relabelMutPositions.Rd index f52db8b..bbc5468 100644 --- a/man/relabelMutPositions.Rd +++ b/man/relabelMutPositions.Rd @@ -8,9 +8,9 @@ relabelMutPositions(se, conversionTable, mutNameDelimiter = ".") } \arguments{ \item{se}{SummarizedExperiment object, with row names of the form -XX{.}AA{.}NNN, where XX is the name of the reference sequence, AA is the -position of the mutated codon, and NNN is the mutated codon or amino -acid. 
{.} is the delimiter, to be specified in the +XX\{.\}AA\{.\}NNN, where XX is the name of the reference sequence, +AA is the position of the mutated codon, and NNN is the mutated codon +or amino acid. \{.\} is the delimiter, to be specified in the \code{mutNameDelimiter} argument. For rows corresponding to sequences with multiple mutated codons, the row names contain multiple names of the form above in a single string, separated by "_".} @@ -24,7 +24,8 @@ mutation name) name, if the reference sequence matches seqname) }} -\item{mutNameDelimiter}{The delimiter used in the mutation name ({.} above).} +\item{mutNameDelimiter}{The delimiter used in the mutation name +(\{.\} above).} } \value{ A SummarizedExperiment object with modified row names. diff --git a/man/summarizeExperiment.Rd b/man/summarizeExperiment.Rd index 4ee4cdc..141fa5e 100644 --- a/man/summarizeExperiment.Rd +++ b/man/summarizeExperiment.Rd @@ -13,8 +13,8 @@ Names are used to link the objects to the metadata provided in \item{coldata}{A \code{data.frame} with at least one column "Name", which will be used to link to objects in \code{x}. A potentially subset and -reordered version of \code{coldata} is stored in the \code{colData} of the -returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}.} +reordered version of \code{coldata} is stored in the \code{colData} of +the returned \code{\link[SummarizedExperiment]{SummarizedExperiment}}.} \item{countType}{Either "reads" or "umis". 
If "reads", the "count" assay of the returned object will contain the observed number of reads for each @@ -25,12 +25,14 @@ unique UMIs observed for each sequence (pair).} A \code{\link[SummarizedExperiment]{SummarizedExperiment}} \code{x} with \describe{ - \item{assays(x)$counts}{containing the observed number of sequences or - sequence pairs (if \code{countType} = "reads"), or the observed number of - unique UMIs for each sequence or sequence pair (if \code{countType} = - "umis").} - \item{rowData(x)}{containing the unique sequences or sequence pairs.} - \item{colData(x)}{containing the metadata provided by \code{coldata}.} + \item{assays(x)$counts}{containing the observed number of sequences + or sequence pairs (if \code{countType} = "reads"), or the observed + number of unique UMIs for each sequence or sequence pair (if + \code{countType} = "umis").} + \item{rowData(x)}{containing the unique sequences or sequence + pairs.} + \item{colData(x)}{containing the metadata provided by + \code{coldata}.} } } \description{ @@ -45,8 +47,9 @@ inp <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Output sample @@ -55,8 +58,9 @@ outp <- digestFastqs( package = "mutscan"), elementsForward = "SUCV", elementLengthsForward = c(1, 10, 18, 96), constantForward = "AACCGGAGGAGGGAGCTG", - wildTypeForward = c(FOS = paste0("ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", - "TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), + wildTypeForward = c(FOS = paste0( + "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTC", + 
"TGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA")), nbrMutatedCodonsMaxForward = 1 ) ## Combine diff --git a/src/BKtree_utils.h b/src/BKtree_utils.h index c42bc45..1f0f669 100644 --- a/src/BKtree_utils.h +++ b/src/BKtree_utils.h @@ -177,7 +177,7 @@ class BKtree { void print() { Rcout << size << "\n"; if (size > 0) { - _print(root, root, 0); // recursive node printing starting from root at zero identation + _print(root, root, 0); // recursive node printing starting from root at zero indentation } } @@ -398,10 +398,10 @@ class BKtree { // and the recurse on all children of n void _print(node* r = nullptr, node* n = nullptr, int depth = 0) { if (n == nullptr) { - n = root; + n = root; // # nocov } if (r == nullptr) { - r = root; + r = root; // # nocov } for (int i = 0; i < depth; ++i) { diff --git a/src/FastqBuffer_utils.h b/src/FastqBuffer_utils.h index f134c80..af3d41b 100644 --- a/src/FastqBuffer_utils.h +++ b/src/FastqBuffer_utils.h @@ -16,7 +16,7 @@ class FastqBuffer { // (space for seq2 and qual2 is only allocated for paired=true) public: char *seq1, *qual1, *seq2, *qual2; - + // constructor FastqBuffer(size_t n, size_t b, bool p = true) { nentries = n; @@ -33,7 +33,7 @@ class FastqBuffer { qual2 = NULL; } } - + // copy constructor FastqBuffer(const FastqBuffer& fqb) { paired = fqb.paired; @@ -51,12 +51,12 @@ class FastqBuffer { qual2 = NULL; } } - + // destructor ~FastqBuffer() { delete[] buffer; } - + // Write one (pair of) filtered reads to output fastq file(s) bool write_seq(size_t i, gzFile file1, gzFile file2, const int n, const char* label) { bool success = (i < nentries); @@ -67,32 +67,32 @@ class FastqBuffer { if (success && file1 != NULL) { std::string read_id = "@S" + std::to_string(n) + "_" + label + " 1\n"; if (success && gzputs(file1, read_id.c_str()) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file1, seq1 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } if (success && 
gzputs(file1, "+\n") == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file1, qual1 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } } - + if (success && file2 != NULL) { std::string read_id = "@S" + std::to_string(n) + "_" + label + " 2\n"; if (success && gzputs(file2, read_id.c_str()) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file2, seq2 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file2, "+\n") == (-1)) { - success = false; + success = false; // # nocov } if (success && gzputs(file2, qual2 + (i * BUFFER_SIZE)) == (-1)) { - success = false; + success = false; // # nocov } } } diff --git a/src/Makevars b/src/Makevars index a4b6954..8b30577 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,3 +1,4 @@ +PKG_LIBS+=-lz CXX_STD=CXX11 CXX1X=$(shell ${R_HOME}/bin/R CMD config CXX11) ifeq ($(shell $(CXX1X) -fopenmp -E -xc++ - 2>&1 >/dev/null && echo 'true'), true) diff --git a/src/Makevars.win b/src/Makevars.win index 350dcb9..9f7bdc6 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,3 +1,4 @@ +PKG_LIBS+=-lz CXX_STD=CXX11 CXX1X=$(shell ${R_HOME}/bin${R_ARCH_BIN}/R CMD config CXX11) PKG_LIBS+=$(shell $(R_HOME)/bin/Rscript.exe -e "Rcpp:::LdFlags()") diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 2ff18f4..27a64cd 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -23,6 +23,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// complement +char complement(char n); +RcppExport SEXP _mutscan_complement(SEXP nSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< char >::type n(nSEXP); + rcpp_result_gen = Rcpp::wrap(complement(n)); + return rcpp_result_gen; +END_RCPP +} // compareCodonPositions bool compareCodonPositions(std::string a, std::string b, const char mutNameDelimiter); RcppExport SEXP 
_mutscan_compareCodonPositions(SEXP aSEXP, SEXP bSEXP, SEXP mutNameDelimiterSEXP) { @@ -74,6 +85,26 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// test_compareToWildtype +Rcpp::List test_compareToWildtype(const std::string varSeq, const std::string wtSeq, const std::vector varIntQual, const std::vector forbiddenCodons_vect, const double mutatedPhredMin, const int nbrMutatedCodonsMax, const std::string codonPrefix, const int nbrMutatedBasesMax, const std::string mutNameDelimiter, const bool collapseToWT); +RcppExport SEXP _mutscan_test_compareToWildtype(SEXP varSeqSEXP, SEXP wtSeqSEXP, SEXP varIntQualSEXP, SEXP forbiddenCodons_vectSEXP, SEXP mutatedPhredMinSEXP, SEXP nbrMutatedCodonsMaxSEXP, SEXP codonPrefixSEXP, SEXP nbrMutatedBasesMaxSEXP, SEXP mutNameDelimiterSEXP, SEXP collapseToWTSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string >::type varSeq(varSeqSEXP); + Rcpp::traits::input_parameter< const std::string >::type wtSeq(wtSeqSEXP); + Rcpp::traits::input_parameter< const std::vector >::type varIntQual(varIntQualSEXP); + Rcpp::traits::input_parameter< const std::vector >::type forbiddenCodons_vect(forbiddenCodons_vectSEXP); + Rcpp::traits::input_parameter< const double >::type mutatedPhredMin(mutatedPhredMinSEXP); + Rcpp::traits::input_parameter< const int >::type nbrMutatedCodonsMax(nbrMutatedCodonsMaxSEXP); + Rcpp::traits::input_parameter< const std::string >::type codonPrefix(codonPrefixSEXP); + Rcpp::traits::input_parameter< const int >::type nbrMutatedBasesMax(nbrMutatedBasesMaxSEXP); + Rcpp::traits::input_parameter< const std::string >::type mutNameDelimiter(mutNameDelimiterSEXP); + Rcpp::traits::input_parameter< const bool >::type collapseToWT(collapseToWTSEXP); + rcpp_result_gen = Rcpp::wrap(test_compareToWildtype(varSeq, wtSeq, varIntQual, forbiddenCodons_vect, mutatedPhredMin, nbrMutatedCodonsMax, codonPrefix, nbrMutatedBasesMax, mutNameDelimiter, 
collapseToWT)); + return rcpp_result_gen; +END_RCPP +} // test_decomposeRead List test_decomposeRead(const std::string sseq, const std::string squal, const std::string elements, const std::vector elementLengths, const std::vector primerSeqs, std::string umiSeq, std::string varSeq, std::string varQual, std::vector varLengths, std::string constSeq, std::string constQual, int nNoPrimer, int nReadWrongLength); RcppExport SEXP _mutscan_test_decomposeRead(SEXP sseqSEXP, SEXP squalSEXP, SEXP elementsSEXP, SEXP elementLengthsSEXP, SEXP primerSeqsSEXP, SEXP umiSeqSEXP, SEXP varSeqSEXP, SEXP varQualSEXP, SEXP varLengthsSEXP, SEXP constSeqSEXP, SEXP constQualSEXP, SEXP nNoPrimerSEXP, SEXP nReadWrongLengthSEXP) { @@ -282,10 +313,12 @@ RcppExport SEXP _rcpp_module_boot_mod_BKtree(); static const R_CallMethodDef CallEntries[] = { {"_mutscan_calcNearestStringDist", (DL_FUNC) &_mutscan_calcNearestStringDist, 3}, + {"_mutscan_complement", (DL_FUNC) &_mutscan_complement, 1}, {"_mutscan_compareCodonPositions", (DL_FUNC) &_mutscan_compareCodonPositions, 3}, {"_mutscan_translateString", (DL_FUNC) &_mutscan_translateString, 1}, {"_mutscan_makeBaseHGVS", (DL_FUNC) &_mutscan_makeBaseHGVS, 4}, {"_mutscan_test_makeAAHGVS", (DL_FUNC) &_mutscan_test_makeAAHGVS, 3}, + {"_mutscan_test_compareToWildtype", (DL_FUNC) &_mutscan_test_compareToWildtype, 10}, {"_mutscan_test_decomposeRead", (DL_FUNC) &_mutscan_test_decomposeRead, 13}, {"_mutscan_test_mergeReadPairPartial", (DL_FUNC) &_mutscan_test_mergeReadPairPartial, 12}, {"_mutscan_findClosestRefSeq", (DL_FUNC) &_mutscan_findClosestRefSeq, 4}, diff --git a/src/digestFastqs.cpp b/src/digestFastqs.cpp index 1659b37..e8f9631 100644 --- a/src/digestFastqs.cpp +++ b/src/digestFastqs.cpp @@ -37,14 +37,14 @@ bool reached_end_of_file(gzFile file, char *ret) { if (gzeof(file)) { return true; } else { - const char *error_string; + const char *error_string; // # nocov start error_string = gzerror(file, &err); if (err) { stop(error_string); - } + } // # nocov 
end } } - // Check if we have read until a newline character. Otherwise, the read is + // Check if we have read until a newline character. Otherwise, the read is // too long -> break if (std::string(ret).back() != '\n') { stop("Encountered a read exceeding the maximal allowed length"); @@ -65,21 +65,22 @@ bool get_next_seq(gzFile file, char *seq, char *qual, size_t BUFFER_SIZE) { } // sequence if (reached_end_of_file(file, gzgets(file, seq, BUFFER_SIZE))) { - return true; + return true; // # nocov } // quality identifier if (reached_end_of_file(file, gzgets(file, qual, BUFFER_SIZE))) { - return true; + return true; // # nocov } // quality if (reached_end_of_file(file, gzgets(file, qual, BUFFER_SIZE))) { - return true; + return true; // # nocov } return false; } // create the complement of a base +// [[Rcpp::export]] char complement(char n) { switch(n) { case 'A': @@ -176,7 +177,7 @@ std::string translateString(const std::string& s) { size_t i = 0, j = 0; const int pow4[] = {1, 4, 16}; int codonVal = 0; - + for (i = 0; i < s.size(); i++) { if (s[i] == '_') { j = 0; @@ -187,29 +188,29 @@ std::string translateString(const std::string& s) { case 'a': // codonVal += 0 * pow4[j]; break; - + case 'C': case 'c': codonVal += 1 * pow4[j]; break; - + case 'G': case 'g': codonVal += 2 * pow4[j]; break; - + case 'T': case 't': case 'u': case 'U': codonVal += 3 * pow4[j]; break; - + default: codonVal += 64; break; } - + if (j == 2) { if (codonVal > 63) { aa.push_back('X'); @@ -221,9 +222,9 @@ std::string translateString(const std::string& s) { } else { j++; } - + } - + } return aa; } @@ -238,33 +239,33 @@ bool hasEnding (std::string const &fullString, std::string const &ending) { } } -// Helper function to create the ID for a single mutation group (delins or +// Helper function to create the ID for a single mutation group (delins or // single mutation) -std::string makeSingleBaseHGVSid(int posMin, int posMax, const std::string wtSeq, +std::string makeSingleBaseHGVSid(int posMin, 
int posMax, const std::string wtSeq, const std::string varSeq) { std::string id = ""; if (posMax > posMin) { // delins - id += std::to_string(posMin) + "_" + std::to_string(posMax) + - "delins" + + id += std::to_string(posMin) + "_" + std::to_string(posMax) + + "delins" + varSeq.substr((size_t)(posMin - 1), (size_t)(posMax - posMin + 1)) + ";"; } else { // single base mutation - id += std::to_string(posMin) + wtSeq[posMin - 1] + + id += std::to_string(posMin) + wtSeq[posMin - 1] + ">" + varSeq[posMin - 1] + ";"; } return id; } // [[Rcpp::export]] -std::string makeBaseHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, +std::string makeBaseHGVS(const std::vector mutationsSorted, + const std::string mutNameDelimiter, const std::string wtSeq, const std::string varSeq) { int prevPosMin = -5, prevPosMax = -5; // something far enough from the first mutation position to not cause a delins bool moreThanOne = false; int pos; std::string mutId = ""; - + if (mutationsSorted.size() == 0) { // no mutations - return only _ (all variants should end with this character, // as it will be stripped off later) @@ -306,15 +307,15 @@ std::string makeBaseHGVS(const std::vector mutationsSorted, return mutId; } -std::string makeAAHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, - const std::string wtSeq, +std::string makeAAHGVS(const std::vector mutationsSorted, + const std::string mutNameDelimiter, + const std::string wtSeq, std::map &threeAA) { int pos; std::string mutAA; char wtAA; std::string mutId = ""; - + if (mutationsSorted.size() == 0) { // no mutations - return only _ (all variants should end with this character, // as it will be stripped off later) @@ -342,13 +343,13 @@ std::string makeAAHGVS(const std::vector mutationsSorted, } // [[Rcpp::export]] -std::string test_makeAAHGVS(const std::vector mutationsSorted, - const std::string mutNameDelimiter, +std::string test_makeAAHGVS(const std::vector mutationsSorted, + const 
std::string mutNameDelimiter, const std::string wtSeq) { std::map threeAA = initializeThreeAA(); return makeAAHGVS(mutationsSorted, mutNameDelimiter, wtSeq, threeAA); -} - +} + // compare read to wildtype sequence, // identify mutated bases/codons, filter, update counters // and add to the name (mutantName* may be empty if no mutations are found, @@ -374,11 +375,11 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, const int nbrMutatedCodonsMax, const std::set &forbiddenCodons, const std::string codonPrefix, const int nbrMutatedBasesMax, int &nMutQualTooLow, int &nTooManyMutCodons, int &nForbiddenCodons, - int &nTooManyMutBases, std::string &mutantName, + int &nTooManyMutBases, std::string &mutantName, std::string &mutantNameBase, std::string &mutantNameCodon, std::string &mutantNameBaseHGVS, std::string &mutantNameAA, std::string &mutantNameAAHGVS, int &nMutBases, - int &nMutCodons, int &nMutAAs, std::set &mutationTypes, + int &nMutCodons, int &nMutAAs, std::set &mutationTypes, const std::string mutNameDelimiter, const bool collapseToWT, std::map &threeAA) { @@ -401,7 +402,7 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, aaPrefixHGVS.pop_back(); aaPrefixHGVS = aaPrefixHGVS + "p"; } - + // filter if there are too many mutated codons // mutatedCodons.clear(); hasLowQualMutation = false; @@ -433,9 +434,9 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, wtAA = translateString(wtCodon); if (varAA != wtAA) { // add to mutatedAA - mutatedAAs.insert(codonPrefix + mutNameDelimiter + + mutatedAAs.insert(codonPrefix + mutNameDelimiter + std::to_string((int)(i / 3) + 1) + mutNameDelimiter + - varAA + + varAA + std::string("_")); // nonsynonymous or stop if (varAA == "*") { @@ -509,19 +510,19 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, mutantNameAAHGVS += aaPrefixHGVS + "_"; } else { std::vector mutatedBasesSorted, mutatedCodonsSorted, mutatedAAsSorted; - + 
mutatedBasesSorted.assign(mutatedBases.begin(), mutatedBases.end()); std::sort(mutatedBasesSorted.begin(), mutatedBasesSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + mutatedCodonsSorted.assign(mutatedCodons.begin(), mutatedCodons.end()); std::sort(mutatedCodonsSorted.begin(), mutatedCodonsSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + mutatedAAsSorted.assign(mutatedAAs.begin(), mutatedAAs.end()); std::sort(mutatedAAsSorted.begin(), mutatedAAsSorted.end(), std::bind(compareCodonPositions, _1, _2, *(mutNameDelimiter.c_str()))); - + for (size_t i = 0; i < mutatedBasesSorted.size(); i++) { mutantNameBase += mutatedBasesSorted[i]; if (nbrMutatedCodonsMax == (-1)) { @@ -544,15 +545,15 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, mutantName += codonPrefix + mutNameDelimiter + "0" + mutNameDelimiter + "WT_"; mutantNameBaseHGVS += basePrefixHGVS + "_"; } else { - mutantNameBaseHGVS += basePrefixHGVS + "." + + mutantNameBaseHGVS += basePrefixHGVS + "." + makeBaseHGVS(mutatedBasesSorted, mutNameDelimiter, wtSeq, varSeq); } if (mutatedAAsSorted.size() == 0) { mutantNameAA += codonPrefix + mutNameDelimiter + "0" + mutNameDelimiter + "WT_"; mutantNameAAHGVS += aaPrefixHGVS + "_"; } else { - mutantNameAAHGVS += aaPrefixHGVS + "." + - makeAAHGVS(mutatedAAsSorted, mutNameDelimiter, + mutantNameAAHGVS += aaPrefixHGVS + "." 
+ + makeAAHGVS(mutatedAAsSorted, mutNameDelimiter, translateString(wtSeq), threeAA); } } @@ -560,6 +561,62 @@ bool compareToWildtype(const std::string varSeq, const std::string wtSeq, return false; } +// [[Rcpp::export]] +Rcpp::List test_compareToWildtype( + const std::string varSeq, + const std::string wtSeq, + const std::vector varIntQual, + const std::vector forbiddenCodons_vect, + const double mutatedPhredMin = 0.0, + const int nbrMutatedCodonsMax = -1, + const std::string codonPrefix = "c", + const int nbrMutatedBasesMax = -1, + const std::string mutNameDelimiter = ".", + const bool collapseToWT = false) { + // allocate C++ only variables + std::map threeAA = initializeThreeAA(); + std::set forbiddenCodons(forbiddenCodons_vect.begin(), + forbiddenCodons_vect.end()); + int nMutQualTooLow = 0, nTooManyMutCodons = 0, nForbiddenCodons = 0; + int nTooManyMutBases = 0, nMutBases = 0, nMutCodons = 0, nMutAAs = 0; + std::string mutantName(""), mutantNameBase(""); + std::string mutantNameCodon(""), mutantNameBaseHGVS(""); + std::string mutantNameAA(""), mutantNameAAHGVS(""); + std::set mutationTypes; + + // call compareToWildtype + if (compareToWildtype(varSeq, wtSeq, varIntQual, mutatedPhredMin, + nbrMutatedCodonsMax, forbiddenCodons, + codonPrefix, nbrMutatedBasesMax, nMutQualTooLow, + nTooManyMutCodons, nForbiddenCodons, + nTooManyMutBases, mutantName, + mutantNameBase, mutantNameCodon, + mutantNameBaseHGVS, mutantNameAA, + mutantNameAAHGVS, nMutBases, + nMutCodons, nMutAAs, mutationTypes, + mutNameDelimiter, collapseToWT, threeAA)) { + // read should be filtered out -> return empty list + return Rcpp::List::create(); + } else { + std::vector mutationTypes_vect(mutationTypes.begin(), + mutationTypes.end()); + return Rcpp::List::create(Rcpp::_["nMutQualTooLow"] = nMutQualTooLow, + Rcpp::_["nTooManyMutCodons"] = nTooManyMutCodons, + Rcpp::_["nForbiddenCodons"] = nForbiddenCodons, + Rcpp::_["nTooManyMutBases"] = nTooManyMutBases, + Rcpp::_["nMutBases"] = nMutBases, + 
Rcpp::_["nMutCodons"] = nMutCodons, + Rcpp::_["nMutAAs"] = nMutAAs, + Rcpp::_["mutantName"] = mutantName, + Rcpp::_["mutantNameBase"] = mutantNameBase, + Rcpp::_["mutantNameCodon"] = mutantNameCodon, + Rcpp::_["mutantNameBaseHGVS"] = mutantNameBaseHGVS, + Rcpp::_["mutantNameAA"] = mutantNameAA, + Rcpp::_["mutantNameAAHGVS"] = mutantNameAAHGVS, + Rcpp::_["mutationTypes"] = mutationTypes_vect); + } +} + // compare constSeq to constant sequence and update // counter by match/mismatch and by base Phred quality // return true @@ -607,12 +664,12 @@ struct mutantInfo { gzFile openFastq(std::string filename, const char* mode = "rb") { gzFile file = gzopen(filename.c_str(), mode); if (!file) { - if (errno) { + if (errno) { // # nocov start stop("Failed to open file '", filename, "': ", strerror(errno), " (errno=", errno, ")"); } else { stop("Failed to open file '", filename, "': zlib out of memory"); - } + } // # nocov end } return file; } @@ -694,7 +751,7 @@ bool decomposeRead(const std::string sseq, umiSeq, varSeq, varQual, varLengths, constSeq, constQual, nNoPrimer, nReadWrongLength); if (!post) { - return false; + return false; // # nocov } break; } @@ -971,7 +1028,7 @@ void removeEOL(std::string &seq) { seq.pop_back(); } if (seq.back() == '\r') { - seq.pop_back(); + seq.pop_back(); // # nocov } } @@ -986,10 +1043,12 @@ int findClosestRefSeq(std::string &varSeq, std::vector &wtSeq, int maxsim = 0; int nbrbesthits = 0; int currsim; + size_t minl; for (size_t i = 0; i < wtSeq.size(); i++) { currsim = 0; std::string currSeq = wtSeq[i]; - for (size_t j = 0; j < currSeq.length(); j++) { + minl = std::min(varSeq.size(), currSeq.size()); + for (size_t j = 0; j < minl; j++) { currsim += (currSeq[j] == varSeq[j]); } if (((int)varSeq.size() - currsim <= (int)upperBoundMismatch)) { @@ -997,7 +1056,7 @@ int findClosestRefSeq(std::string &varSeq, std::vector &wtSeq, nbrbesthits++; } else if (currsim > maxsim) { nbrbesthits = 1; - idx = i; + idx = (int)i; maxsim = currsim; } } @@ 
-1085,11 +1144,11 @@ int findClosestRefSeqTree(std::string &varSeq, BKtree &wtTree, // Convert mutant summary to DataFrame DataFrame mutantSummaryToDataFrame(std::map mutantSummary) { std::map::iterator mutantSummaryIt; - + size_t dfLen = mutantSummary.size(); std::vector dfSeq(dfLen, ""), dfName(dfLen, ""); std::vector dfReads(dfLen, 0), dfUmis(dfLen, 0), dfMaxReads(dfLen, 0); - std::vector dfMutBases(dfLen, ""), dfMutCodons(dfLen, ""); + std::vector dfMutBases(dfLen, ""), dfMutCodons(dfLen, ""); std::vector dfMutAAs(dfLen, ""), dfVarLengths(dfLen, ""); std::vector dfMutantNameBase(dfLen, ""), dfMutantNameCodon(dfLen, ""); std::vector dfMutantNameBaseHGVS(dfLen, ""), dfMutantNameAA(dfLen, ""); @@ -1107,7 +1166,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedSequence.empty()) { collapsedSequence.pop_back(); // remove final "," } - + // mutantNameBase std::vector mutantNameBaseVector((*mutantSummaryIt).second.mutantNameBase.begin(), (*mutantSummaryIt).second.mutantNameBase.end()); @@ -1118,7 +1177,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameBase.empty()) { collapsedMutantNameBase.pop_back(); // remove final "," } - + // mutantNameCodon std::vector mutantNameCodonVector((*mutantSummaryIt).second.mutantNameCodon.begin(), (*mutantSummaryIt).second.mutantNameCodon.end()); @@ -1129,7 +1188,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameCodon.empty()) { collapsedMutantNameCodon.pop_back(); // remove final "," } - + // mutantNameBaseHGVS std::vector mutantNameBaseHGVSVector((*mutantSummaryIt).second.mutantNameBaseHGVS.begin(), (*mutantSummaryIt).second.mutantNameBaseHGVS.end()); @@ -1140,7 +1199,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameBaseHGVS.empty()) { collapsedMutantNameBaseHGVS.pop_back(); // remove final "," } - + // mutantNameAA std::vector mutantNameAAVector((*mutantSummaryIt).second.mutantNameAA.begin(), 
(*mutantSummaryIt).second.mutantNameAA.end()); @@ -1151,7 +1210,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameAA.empty()) { collapsedMutantNameAA.pop_back(); // remove final "," } - + // mutantNameAAHGVS std::vector mutantNameAAHGVSVector((*mutantSummaryIt).second.mutantNameAAHGVS.begin(), (*mutantSummaryIt).second.mutantNameAAHGVS.end()); @@ -1162,7 +1221,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutantNameAAHGVS.empty()) { collapsedMutantNameAAHGVS.pop_back(); // remove final "," } - + // mutationTypes std::vector mutationTypesVector((*mutantSummaryIt).second.mutationTypes.begin(), (*mutantSummaryIt).second.mutationTypes.end()); @@ -1173,7 +1232,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedMutationTypes.empty()) { collapsedMutationTypes.pop_back(); // remove final "," } - + // collapse all aa sequences associated with the mutant std::vector sequenceAAVector((*mutantSummaryIt).second.sequenceAA.begin(), (*mutantSummaryIt).second.sequenceAA.end()); @@ -1184,7 +1243,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedSequenceAA.empty()) { collapsedSequenceAA.pop_back(); // remove final "," } - + // collapse all observed nMutBases/nMutCodons std::vector nMutBasesVector((*mutantSummaryIt).second.nMutBases.begin(), (*mutantSummaryIt).second.nMutBases.end()); @@ -1195,7 +1254,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutBases.empty()) { collapsedNMutBases.pop_back(); } - + // codons std::vector nMutCodonsVector((*mutantSummaryIt).second.nMutCodons.begin(), (*mutantSummaryIt).second.nMutCodons.end()); @@ -1206,7 +1265,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutCodons.empty()) { collapsedNMutCodons.pop_back(); } - + // AAs std::vector nMutAAsVector((*mutantSummaryIt).second.nMutAAs.begin(), (*mutantSummaryIt).second.nMutAAs.end()); @@ -1217,7 +1276,7 @@ DataFrame 
mutantSummaryToDataFrame(std::map mutantSumma if (!collapsedNMutAAs.empty()) { collapsedNMutAAs.pop_back(); } - + dfName[j] = (*mutantSummaryIt).first; dfSeq[j] = collapsedSequence; dfReads[j] = (*mutantSummaryIt).second.nReads; @@ -1236,7 +1295,7 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma dfVarLengths[j] = (*mutantSummaryIt).second.varLengths; j++; } - + DataFrame df = DataFrame::create(Named("mutantName") = dfName, Named("sequence") = dfSeq, Named("nbrReads") = dfReads, @@ -1254,64 +1313,64 @@ DataFrame mutantSummaryToDataFrame(std::map mutantSumma Named("mutationTypes") = dfMutationTypes, Named("sequenceAA") = dfSeqAA, Named("stringsAsFactors") = false); - + return df; } //' Create a conversion table for collapsing similar sequences -//' @param seqs Character vector with nucleotide sequences (or pairs of -//' sequences concatenated with "_") to be collapsed. The sequences must +//' @param seqs Character vector with nucleotide sequences (or pairs of +//' sequences concatenated with "_") to be collapsed. The sequences must //' all be of the same length. //' @param scores Numeric vector of "scores" for the sequences. Typically -//' the total read/UMI count. A higher score will be preferred when -//' deciding which sequence to use as the representative for a group of +//' the total read/UMI count. A higher score will be preferred when +//' deciding which sequence to use as the representative for a group of //' collapsed sequences. -//' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing -//' similar sequences. If the value is in [0, 1), it defines the maximal +//' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing +//' similar sequences. If the value is in [0, 1), it defines the maximal //' Hamming distance in terms of a fraction of sequence length: //' (\code{round(collapseMaxDist * nchar(sequence))}). 
//' A value greater or equal to 1 is rounded and directly used as the maximum //' allowed Hamming distance. Note that sequences can only be -//' collapsed if they are all of the same length. -//' @param collapseMinScore Numeric scalar, indicating the minimum score -//' required for a sequence to be considered as a representative for a -//' group of similar sequences (i.e., to allow other sequences to be -//' collapsed into it). +//' collapsed if they are all of the same length. The default value is 0. +//' @param collapseMinScore Numeric scalar, indicating the minimum score +//' required for a sequence to be considered as a representative for a +//' group of similar sequences (i.e., to allow other sequences to be +//' collapsed into it). The default value is 0. //' @param collapseMinRatio Numeric scalar. During collapsing of -//' similar sequences, a low-frequency sequence will be collapsed -//' with a higher-frequency sequence only if the ratio between the -//' high-frequency and the low-frequency scores is at least this +//' similar sequences, a low-frequency sequence will be collapsed +//' with a higher-frequency sequence only if the ratio between the +//' high-frequency and the low-frequency scores is at least this //' high. A value of 0 indicates that no such check is performed. //' @param verbose Logical scalar, whether to print progress messages. -//' -//' @return A data.frame with two columns, containing the input sequences +//' +//' @return A data.frame with two columns, containing the input sequences //' and the representatives for the groups resulting from grouping similar //' sequences, respectively. 
-//' +//' //' @examples //' seqs <- c("AACGTAGCA", "ACCGTAGCA", "AACGGAGCA", "ATCGGAGCA", "TGAGGCATA") //' scores <- c(5, 1, 3, 1, 8) -//' groupSimilarSequences(seqs = seqs, scores = scores, -//' collapseMaxDist = 1, collapseMinScore = 0, +//' groupSimilarSequences(seqs = seqs, scores = scores, +//' collapseMaxDist = 1, collapseMinScore = 0, //' collapseMinRatio = 0, verbose = FALSE) -//' +//' //' @export //' @author Michael Stadler, Charlotte Soneson // [[Rcpp::export]] Rcpp::DataFrame groupSimilarSequences(std::vector seqs, - std::vector scores, - double collapseMaxDist, - double collapseMinScore, - double collapseMinRatio, - bool verbose) { - + std::vector scores, + double collapseMaxDist=0.0, + double collapseMinScore=0.0, + double collapseMinRatio=0.0, + bool verbose=false) { + // combine seqs and scores std::map seqsScores; std::map::iterator seqsScoresIt, seqsScoresSimIt; for (size_t i = 0; i < seqs.size(); i++) { seqsScores.insert(std::pair(seqs[i], scores[i])); } - + // get sequence length size_t seqlen = seqs[0].length(); @@ -1323,11 +1382,11 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, tol = (collapseMaxDist * (seqs[0].find("_") != std::string::npos ? seqlen-1 : seqlen)); } - + if (verbose) { Rcout << "start collapsing sequences (tolerance: " << tol << ")..."; } - + // sort seqsScores decreasingly by scores // ... 
create an empty intermediate vector std::vector> vec; @@ -1344,7 +1403,7 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, return l.second > r.second; return l.first < r.first; }); - + // store sequences in BK tree BKtree tree; for (vecIt = vec.begin(); vecIt != vec.end(); vecIt++) { @@ -1357,16 +1416,16 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, } } vec.clear(); // remove temporary vector - + std::string querySeq, representativeSeq; std::vector simSeqs; std::map single2representative; - + // start querying in the order of tree.items (ordered decreasingly by scores) size_t start_size = (double)tree.size; while (tree.size > 0) { querySeq = tree.first(); - // check in seqsScores if score for querySeq is < variableCollapseMinReads + // check in seqsScores if score for querySeq is < collapseMinScore seqsScoresIt = seqsScores.find(querySeq); if (collapseMinScore > 0 && seqsScoresIt != seqsScores.end() && @@ -1396,20 +1455,20 @@ Rcpp::DataFrame groupSimilarSequences(std::vector seqs, tree.remove(simSeqs[i]); } } - + // check for user interruption and print progress if ((start_size - tree.size) % 2000 == 0) { // every 2,000 queries (every ~1-2s) - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... check for user interrupt # nocov start // ... 
and give an update if (verbose && (start_size - tree.size) % 2000 == 0) { Rcout << " " << std::setprecision(4) << (100.0 * ((double)(start_size - tree.size) / (double)start_size)) << - "% done" << std::endl; + "% done" << std::endl; // # nocov end } } } } - + // return grouping results as a DataFrame std::vector repseqs(seqs.size()); std::unordered_set uniqueRepseqs; @@ -1589,7 +1648,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, // if maxNReads has been reached in the previous file, break bool done; if (maxNReads != (-1) && nTot >= maxNReads) { - done = true; + done = true; // # nocov } else { done = false; } @@ -1630,7 +1689,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, nTot++; } if (nTot % 200000 == 0) { // every 200,000 reads (every ~1.6 seconds) - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... check for user interrupt # nocov } // if maxNReads has been reached, break @@ -1831,8 +1890,8 @@ List digestFastqsCpp(std::vector fastqForwardVect, } std::string wtForward = wildTypeForward[idxForward]; std::string wtNameForward = wildTypeForwardNames[idxForward]; - - // if the variable region is longer than the (best matching) WT sequence, + + // if the variable region is longer than the (best matching) WT sequence, // skip the read (consider it to have the wrong length) if (varSeqForward.length() > wtForward.length()) { #ifdef _OPENMP @@ -1840,16 +1899,16 @@ List digestFastqsCpp(std::vector fastqForwardVect, #endif nReadWrongLength++; chunkBuffer->write_seq(ci, outfile1, outfile2, nTot-(int)iChunk+(int)ci, "noPrimer_readWrongLength"); - continue; + continue; } - + if (compareToWildtype(varSeqForward, wtForward, varIntQualForward, mutatedPhredMinForward, nbrMutatedCodonsMaxForward, forbiddenCodonsForward, wtNameForward, nbrMutatedBasesMaxForward, nMutQualTooLow, - nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, + nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, mutantName, mutantNameBase, 
mutantNameCodon, mutantNameBaseHGVS, mutantNameAA, mutantNameAAHGVS, - nMutBases, nMutCodons, + nMutBases, nMutCodons, nMutAAs, mutationTypes, mutNameDelimiter, collapseToWTForward, threeAA)) { // read is to be filtered out @@ -1908,8 +1967,8 @@ List digestFastqsCpp(std::vector fastqForwardVect, } std::string wtReverse = wildTypeReverse[idxReverse]; std::string wtNameReverse = wildTypeReverseNames[idxReverse]; - - // if the variable region is longer than the (best matching) WT sequence, + + // if the variable region is longer than the (best matching) WT sequence, // skip the read (consider it to have the wrong length) if (varSeqReverse.length() > wtReverse.length()) { #ifdef _OPENMP @@ -1917,16 +1976,16 @@ List digestFastqsCpp(std::vector fastqForwardVect, #endif nReadWrongLength++; chunkBuffer->write_seq(ci, outfile1, outfile2, nTot-(int)iChunk+(int)ci, "noPrimer_readWrongLength"); - continue; + continue; } - + if (compareToWildtype(varSeqReverse, wtReverse, varIntQualReverse, mutatedPhredMinReverse, nbrMutatedCodonsMaxReverse, forbiddenCodonsReverse, wtNameReverse, nbrMutatedBasesMaxReverse, nMutQualTooLow, - nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, + nTooManyMutCodons, nForbiddenCodons, nTooManyMutBases, mutantName, mutantNameBase, mutantNameCodon, mutantNameBaseHGVS, mutantNameAA, mutantNameAAHGVS, - nMutBases, nMutCodons, + nMutBases, nMutCodons, nMutAAs, mutationTypes, mutNameDelimiter, collapseToWTReverse, threeAA)) { // read is to be filtered out @@ -2066,13 +2125,13 @@ List digestFastqsCpp(std::vector fastqForwardVect, mutantNameBase.pop_back(); // remove '_' at the end mutantNameCodon.pop_back(); // remove '_' at the end } else { - // will we ever go in here? + // will we ever go in here? 
// # nocov start if (wildTypeForward[0].compare("") != 0 || (!noReverse && wildTypeReverse[0].compare("") != 0)) { mutantName = "WT"; mutantNameBase = "WT"; mutantNameCodon = "WT"; - } + } // # nocov end } if (mutantNameBaseHGVS.length() > 0) { // we have a (closest) wildtype name mutantNameBaseHGVS.pop_back(); // remove '_' at the end @@ -2081,11 +2140,11 @@ List digestFastqsCpp(std::vector fastqForwardVect, if (mutantNameAA.length() > 0) { // we have a least one mutation, or sequence-based name mutantNameAA.pop_back(); // remove '_' at the end } else { - // will we ever go in here? + // will we ever go in here? // # nocov start if (wildTypeForward[0].compare("") != 0 || (!noReverse && wildTypeReverse[0].compare("") != 0)) { mutantNameAA = "WT"; - } + } // # nocov end } if (!noReverse) { // "trans" experiment varSeqForward += (std::string("_") + varSeqReverse); @@ -2217,7 +2276,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, mutCounter++; // check for user interruption and print progress if (mutCounter % 200 == 0) { // every 200 queries - Rcpp::checkUserInterrupt(); // ... check for user interrupt + Rcpp::checkUserInterrupt(); // ... check for user interrupt # nocov } } @@ -2235,7 +2294,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, Rcpp::StringVector wildTypeReverseRcpp(wildTypeReverse.size()); wildTypeReverseRcpp = wildTypeReverse; wildTypeReverseRcpp.attr("names") = wildTypeReverseNames; - + std::vector forbiddenCodonsUsedForward(forbiddenCodonsForward.begin(), forbiddenCodonsForward.end()); std::vector forbiddenCodonsUsedReverse(forbiddenCodonsReverse.begin(), forbiddenCodonsReverse.end()); List param; @@ -2295,7 +2354,7 @@ List digestFastqsCpp(std::vector fastqForwardVect, Named("nbrMismatchForward") = nPhredMismatchForward, Named("nbrMatchReverse") = nPhredCorrectReverse, Named("nbrMismatchReverse") = nPhredMismatchReverse); - + // ... 
filter statistics DataFrame filt = DataFrame::create(Named("nbrTotal") = nTot, Named("f1_nbrAdapter") = nAdapter, @@ -2313,10 +2372,10 @@ List digestFastqsCpp(std::vector fastqForwardVect, Named("f12_nbrTooManyMutConstant") = nTooManyMutConstant, Named("f13_nbrTooManyBestConstantHits") = nTooManyBestConstantHits, Named("nbrRetained") = nRetain); - + // ... main data frame DataFrame df = mutantSummaryToDataFrame(mutantSummary); - + // ... pack into list List L = List::create(Named("parameters") = param, Named("filterSummary") = filt, diff --git a/tests/testthat/test_digestFastqs_cis.R b/tests/testthat/test_digestFastqs_cis.R index 5f0c328..10aaef2 100644 --- a/tests/testthat/test_digestFastqs_cis.R +++ b/tests/testthat/test_digestFastqs_cis.R @@ -36,10 +36,50 @@ test_that("digestFastqs works as expected for cis experiments", { filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 1024 ) + ## hit maxNReads + Ldef0 <- Ldef + Ldef0$maxNReads <- 10 + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$nbrTotal, 10L) + + ## hit nTooManyNinUMI + Ldef0 <- Ldef + Ldef0$elementsForward <- "UV" + Ldef0$elementsReverse <- "UVS" + Ldef0$elementLengthsForward <- c(115, 10) + Ldef0$elementLengthsReverse = c(111, 10, -1) + Ldef0$adapterForward <- Ldef0$adapterReverse <- "" + Ldef0$primerForward <- Ldef0$primerReverse <- "" + Ldef0$wildTypeForward <- "GGAAAAACTA" + Ldef0$constantForward <- Ldef0$constantReverse <- "" + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$f7_nbrTooManyNinUMI, 497L) + + ## hit nTooManyMutBases + Ldef0 <- Ldef + Ldef0$nbrMutatedBasesMaxForward <- 0 + Ldef0$nbrMutatedBasesMaxReverse <- 0 + Ldef0$nbrMutatedCodonsMaxForward <- -1 + Ldef0$nbrMutatedCodonsMaxReverse <- -1 + res0 <- do.call(digestFastqs, Ldef0) + expect_identical(res0$filterSummary$f10b_nbrTooManyMutBases, 753L) + + ## 
hit not all WT sequences are of the same length + Ldef0 <- Ldef + Ldef0$maxNReads <- 100 + Ldef0$useTreeWTmatch <- TRUE + Ldef0$mergeForwardReverse <- FALSE + Ldef0$wildTypeForward <- "" + Ldef0$wildTypeReverse <- c(w1 = "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTCTGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA", + w2 = "GACAGACCAACTAGAAGTTACATGAGAAGTCTGCTTTGCAGACCGAGATTGCCA") + res0 <- do.call(digestFastqs, Ldef0) + expect_true(any(grepl("_w1", res0$summaryTable$mutantNameBase))) + + ## now run with the unmodified arguments res <- do.call(digestFastqs, Ldef) ## Specify reverse WT - check that it's ignored @@ -72,10 +112,10 @@ test_that("digestFastqs works as expected for cis experiments", { expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { - expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), ignore_attr = TRUE) } - + expect_equal(sum(res$summaryTable$nbrReads), res$filterSummary$nbrRetained) expect_equal(sum(res$summaryTable$nbrReads == 2), 11L) expect_equal(sort(res$summaryTable$mutantName[res$summaryTable$nbrReads == 2]), @@ -116,11 +156,11 @@ test_that("digestFastqs works as expected for cis experiments", { "f:c.11T>G") expect_equal(res$summaryTable$mutantNameAAHGVS[res$summaryTable$sequence == example_seq], "f:p.(Leu4Arg)") - - expect_equal(res$summaryTable$sequenceAA, + + expect_equal(res$summaryTable$sequenceAA, as.character(Biostrings::translate( Biostrings::DNAStringSet(res$summaryTable$sequence)))) - expect_equal(as.numeric(res$summaryTable$varLengths), + expect_equal(as.numeric(res$summaryTable$varLengths), nchar(res$summaryTable$sequence)) expect_equal(sum(grepl("WT", res$summaryTable$mutantNameAA) & res$summaryTable$nbrMutBases > 0), 13) expect_true(all(grepl("silent", res$summaryTable$mutationTypes[grepl("WT", res$summaryTable$mutantNameAA) & res$summaryTable$nbrMutBases 
> 0]))) @@ -198,7 +238,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 125 ) @@ -233,7 +273,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { - expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), ignore_attr = TRUE) } @@ -270,7 +310,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas "GCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA") expect_equal(res$summaryTable$mutantName[res$summaryTable$sequence == example_seq], "f.11.G") - expect_equal(res$summaryTable$mutantNameAA[res$summaryTable$sequence == example_seq], + expect_equal(res$summaryTable$mutantNameAA[res$summaryTable$sequence == example_seq], "f.4.R") expect_equal(res$summaryTable$mutantNameBase[res$summaryTable$sequence == example_seq], "f.11.G") @@ -280,7 +320,7 @@ test_that("digestFastqs works as expected when specifying max nbr of mutated bas "f:c.11T>G") expect_equal(res$summaryTable$mutantNameAAHGVS[res$summaryTable$sequence == example_seq], "f:p.(Leu4Arg)") - + expect_equal(sum(res$errorStatistics$nbrMatchForward + res$errorStatistics$nbrMismatchForward), nchar(Ldef$constantForward[1]) * res$filterSummary$nbrRetained) expect_equal(sum(res$errorStatistics$nbrMatchReverse + res$errorStatistics$nbrMismatchReverse), @@ -348,7 +388,7 @@ test_that("digestFastqs gives the same results regardless of how WT matching is filteredReadsFastqForward = "", filteredReadsFastqReverse = "", maxNReads = -1, verbose = FALSE, - nThreads = 1, chunkSize = 1000, + nThreads = 1, chunkSize = 1000, maxReadLength = 1024 ) 
diff --git a/tests/testthat/test_digestFastqs_helpers.R b/tests/testthat/test_digestFastqs_helpers.R index 031946e..cef832f 100644 --- a/tests/testthat/test_digestFastqs_helpers.R +++ b/tests/testthat/test_digestFastqs_helpers.R @@ -1,3 +1,21 @@ +## ---------------------------------------------------------------------------- +## complement +## ---------------------------------------------------------------------------- +test_that("complement works", { + expect_identical(complement("a"), "T") + expect_identical(complement("A"), "T") + expect_identical(complement("c"), "G") + expect_identical(complement("C"), "G") + expect_identical(complement("g"), "C") + expect_identical(complement("G"), "C") + expect_identical(complement("t"), "A") + expect_identical(complement("T"), "A") + expect_identical(complement("n"), "N") + expect_identical(complement("N"), "N") + expect_error(complement("B"), "Invalid DNA base") + expect_error(complement("Q"), "Invalid DNA base") +}) + ## ---------------------------------------------------------------------------- ## compareCodonPositions ## ---------------------------------------------------------------------------- @@ -365,7 +383,7 @@ test_that("mergeReadPairsPartial works", { expect_identical(res2e$mergedSeq, "TTACAACACACA") expect_identical(res2e$mergedQual, rep(c(10L, 40L), c(5, 7))) expect_identical(res2e$mergedLengths, 12L) - + ## invalid overlaps specified/overlaps modified internally ## minOverlap > lenF -> no valid overlap res0a <- mutscan:::test_mergeReadPairPartial(sF2, qF2, sR2, qR2, lF2, lR2, 8, 7, 9, 9, 1, FALSE) @@ -397,7 +415,7 @@ test_that("mergeReadPairsPartial works", { expect_identical(res0d$mergedQual, qF2) expect_identical(res0d$mergedLengths, 7L) expect_true(res0d$return) - + ## minMergedLength > lenF + lenR -> no valid overlap res0e <- mutscan:::test_mergeReadPairPartial(sF2, qF2, sR2, qR2, lF2, lR2, 1, 7, 15, 9, 1, FALSE) expect_type(res0e, "list") @@ -412,7 +430,7 @@ test_that("mergeReadPairsPartial works", { 
expect_identical(res0f$mergedQual, rep(c(10L, 40L), c(2, 5))) expect_identical(res0f$mergedLengths, 7L) expect_false(res0f$return) - + ## padded reads for (i in 1:10) { sF <- paste(rep(c("C","A"), c(i, 6)), collapse = "") @@ -495,6 +513,10 @@ test_that("makeBaseHGVS works", { expect_equal(makeBaseHGVS(c("f.1.A", "r.4.A", "f.6.C"), ".", "TAGTGTAGTCCGT", "AAGAGCAGTCCGT"), "[1T>A;4_6delinsAGC]_") expect_equal(makeBaseHGVS(c("r.4.A"), ".", "TAGTGTAGTCCGT", "TAGAGTAGTCCGT"), "4T>A_") + expect_equal(makeBaseHGVS(character(0), ".", "AAA", "AAA"), "_") + expect_equal(makeBaseHGVS(c("f.1.A", "r.4.A", "r.8.A"), ".", + "TACTTTAT", "AAGAAAAA"), + "[1T>A;4T>A;8T>A]_") }) test_that("makeAAHGVS works", { @@ -506,4 +528,86 @@ test_that("makeAAHGVS works", { "[(Thr1Leu);(Asp2Met)]_") expect_equal(test_makeAAHGVS(c("f.1.L", "f.2.M", "f.7.M"), ".", "TDTLQAETDQLEDEKSALQTEIANLLKEKEKL"), "[(Thr1Leu);(Asp2Met);(Glu7Met)]_") + expect_equal(test_makeAAHGVS(character(0), ".", "TDTL"), "_") }) + +## ---------------------------------------------------------------------------- +## compareToWildtype +## ---------------------------------------------------------------------------- +test_that("compareToWildtype works", { + # read is kept + expect_identical( + test_compareToWildtype(varSeq = "AAAGGACGA", wtSeq = "AATGGACGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list(nMutQualTooLow = 0L, nTooManyMutCodons = 0L, nForbiddenCodons = 0L, + nTooManyMutBases = 0L, nMutBases = 2L, nMutCodons = 2L, + nMutAAs = 1L, mutantName = "xyz_", mutantNameBase = "xyz_", + mutantNameCodon = "xyz_", mutantNameBaseHGVS = "xyz:c_", + mutantNameAA = "xyz_", mutantNameAAHGVS = "xyz:p_", + mutationTypes = c("nonsynonymous", "silent"))) + expect_identical( + test_compareToWildtype(varSeq = "AAAGGACGA", wtSeq = "AATGGACGT", + varIntQual = 
rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz:c", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list(nMutQualTooLow = 0L, nTooManyMutCodons = 0L, nForbiddenCodons = 0L, + nTooManyMutBases = 0L, nMutBases = 2L, nMutCodons = 2L, + nMutAAs = 1L, mutantName = "xyz:c_", mutantNameBase = "xyz:c_", + mutantNameCodon = "xyz:c_", mutantNameBaseHGVS = "xyz:c_", + mutantNameAA = "xyz:c_", mutantNameAAHGVS = "xyz:p_", + mutationTypes = c("nonsynonymous", "silent"))) + # read is filtered out + expect_identical( + test_compareToWildtype(varSeq = "CGTCGTCGA", wtSeq = "CGTCGTCGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 33.0, # <--- + nbrMutatedCodonsMax = 3L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 3L, + mutNameDelimiter = ".", + collapseToWT = TRUE), + list()) + expect_identical( + test_compareToWildtype(varSeq = "CGACGACGA", wtSeq = "CGTCGTCGT", + varIntQual = rep(32L, 9L), + forbiddenCodons_vect = character(0), + mutatedPhredMin = 0.0, + nbrMutatedCodonsMax = -1L, + codonPrefix = "xyz", + nbrMutatedBasesMax = 2L, # <--- + mutNameDelimiter = ".", + collapseToWT = TRUE), + list()) +}) + +## ---------------------------------------------------------------------------- +## groupSimilarSequences +## ---------------------------------------------------------------------------- +test_that("groupSimilarSequences works", { + expect_identical( + groupSimilarSequences(c("AA", "AT", "AC", "TT"), 1:4, collapseMaxDist = 0), + data.frame(sequence = c("AA", "AT", "AC", "TT"), + representative = c("AA", "AT", "AC", "TT")) + ) + expect_identical( + groupSimilarSequences(c("AA", "AT", "AC", "TT"), 1:4, collapseMaxDist = 1), + data.frame(sequence = c("AA", "AT", "AC", "TT"), + representative = c("AC", "TT", "AC", "TT")) + ) + expect_warning(groupSimilarSequences(c("AA", "AT", "AC", "TTT"), 1:4, + collapseMaxDist = 1), + "not all of 
the same length") +}) \ No newline at end of file diff --git a/tests/testthat/test_digestFastqs_primers.R b/tests/testthat/test_digestFastqs_primers.R index 0360baf..d138ca6 100644 --- a/tests/testthat/test_digestFastqs_primers.R +++ b/tests/testthat/test_digestFastqs_primers.R @@ -228,4 +228,151 @@ test_that("digestFastqs works as expected for primer experiments", { expect_equal(sum(res1$summaryTable[-idx, "nbrReads"]), res2$summaryTable[res2$summaryTable$mutantName == paste0(names(leu2)[1], ".0.WT_r.0.WT"), "nbrReads"]) + + ## --------------------------------------------------------------------------- + ## Swap forward and reverse reads, specify max nbr of mutated bases + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt2, fastqReverse = fqt1, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPSS", elementsReverse = "SPV", + elementLengthsForward = c(-1, -1, 96, -1), + elementLengthsReverse = c(-1, -1, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GAAAAAGGAAGCTGGAGAGA", + primerReverse = "GTCAGGTGGAGGCGGATCC", + wildTypeForward = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + wildTypeReverse = leu, + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = -1, + nbrMutatedCodonsMaxReverse = -1, + nbrMutatedBasesMaxForward = 0, + nbrMutatedBasesMaxReverse = 0, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + 
filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + leu2 <- leu + names(leu2)[1:5] <- "ATF2" + Ldef$wildTypeReverse <- leu2 + res2 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary, res2$filterSummary) + expect_identical(res1$errorStatistics, res2$errorStatistics) + expect_equal(nrow(res1$summaryTable), nrow(res2$summaryTable) + 4) + idx <- which(!res1$summaryTable$mutantName %in% paste0("f.0.WT_", names(leu)[1:5], ".0.WT")) + expect_identical(res1$summaryTable[idx, ], + res2$summaryTable[match(res1$summaryTable$mutantName[idx], + res2$summaryTable$mutantName), ], + ignore_attr = TRUE) + expect_equal(sum(res1$summaryTable[-idx, "nbrReads"]), + res2$summaryTable[res2$summaryTable$mutantName == + paste0("f.0.WT_", names(leu2)[1], ".0.WT"), "nbrReads"]) + + ## --------------------------------------------------------------------------- + ## Too many WT hits, reverse reads + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPV", elementsReverse = "SPSS", + elementLengthsForward = c(-1, -1, -1), + elementLengthsReverse = c(-1, -1, 96, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GTCAGGTGGAGGCGGATCC", + primerReverse = "GAAAAAGGAAGCTGGAGAGA", + wildTypeForward = leu, + wildTypeReverse = c(A = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", B = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTTA"), + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + 
variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 0, + nbrMutatedCodonsMaxReverse = 0, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary$f8_nbrTooManyBestWTHits, 809L) + + ## --------------------------------------------------------------------------- + ## Too short WT hit, reverse reads + ## --------------------------------------------------------------------------- + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SPV", elementsReverse = "SPVS", + elementLengthsForward = c(-1, -1, -1), + elementLengthsReverse = c(-1, -1, 96, -1), + adapterForward = "", + adapterReverse = "", + primerForward = "GTCAGGTGGAGGCGGATCC", + primerReverse = "GAAAAAGGAAGCTGGAGAGA", + wildTypeForward = leu, + wildTypeReverse = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCT", + constantForward = "", + constantReverse = "", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 0, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = 
FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + res1 <- do.call(digestFastqs, Ldef) + + expect_identical(res1$filterSummary$f3_nbrReadWrongLength, 439L) + expect_identical(res1$filterSummary$nbrRetained, 0L) }) diff --git a/tests/testthat/test_digestFastqs_trans.R b/tests/testthat/test_digestFastqs_trans.R index a21b61e..73ae347 100644 --- a/tests/testthat/test_digestFastqs_trans.R +++ b/tests/testthat/test_digestFastqs_trans.R @@ -1261,3 +1261,153 @@ test_that("digestFastqs works as expected for trans experiments, if collapsing t expect_equal(res$summaryTable$varLengths, "80,16_16,80") }) +test_that("reads are filtered out if there are too many constant hits", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SUCVV", elementsReverse = "SUCVV", + elementLengthsForward = c(1, 10, 18, 80, 16), + elementLengthsReverse = c(1, 8, 20, 16, 80), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "ACTGATACACTCCAAGCGGAGACAGACCAACTAGAAGATGAGAAGTCTGCTTTGCAGACCGAGATTGCCAACCTGCTGAAGGAGAAGGAAAAACTA", + wildTypeReverse = "ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + constantForward = "AACCGGAGGAGGGAGCTG", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 
20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + collapseToWTForward = TRUE, + collapseToWTReverse = TRUE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + ## Forward + Ldef1 <- Ldef; Ldef1$constantForward <- c(c1 = "AACCGGAGGAGGGAGCTA", c2 = "AACCGGAGGAGGGAGCTC") + res1 <- do.call(digestFastqs, Ldef1) + + expect_equal(res1$filterSummary$nbrTotal, 1000L) + expect_equal(res1$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res1$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res1$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res1$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res1$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res1$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res1$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res1$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res1$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res1$filterSummary$f10a_nbrTooManyMutCodons, 192L + 95L + 68L + 37L) + expect_equal(res1$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res1$filterSummary$f11_nbrForbiddenCodons, 6L + 2L) + expect_equal(res1$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res1$filterSummary$f13_nbrTooManyBestConstantHits, 279L) + expect_equal(res1$filterSummary$nbrRetained, 0L) + + ## Reverse + Ldef1 <- Ldef; Ldef1$constantReverse <- c(c1 = "GAAAAAGGAAGCTGGAGAGG", c2 = "GAAAAAGGAAGCTGGAGAGT") 
+ res1 <- do.call(digestFastqs, Ldef1) + + expect_equal(res1$filterSummary$nbrTotal, 1000L) + expect_equal(res1$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res1$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res1$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res1$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res1$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res1$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res1$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res1$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res1$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res1$filterSummary$f10a_nbrTooManyMutCodons, 192L + 95L + 68L + 37L) + expect_equal(res1$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res1$filterSummary$f11_nbrForbiddenCodons, 6L + 2L) + expect_equal(res1$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res1$filterSummary$f13_nbrTooManyBestConstantHits, 279L) + expect_equal(res1$filterSummary$nbrRetained, 0L) +}) + +test_that("digestFastqs runs with revComplForward = TRUE", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = TRUE, revComplReverse = FALSE, + elementsForward = "SUCVV", elementsReverse = "SUCVV", + elementLengthsForward = c(1, 10, 18, 80, 16), + elementLengthsReverse = c(1, 8, 20, 16, 80), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "TAGTTTTTCCTTCTCCTTCAGCAGGTTGGCAATCTCGGTCTGCAAAGCAGACTTCTCATCTTCTAGTTGGTCTGTCTCCGCTTGGAGTGTATCAGT", + wildTypeReverse = 
"ATCGCCCGGCTGGAGGAAAAAGTGAAAACCTTGAAAGCTCAGAACTCGGAGCTGGCGTCCACGGCCAACATGCTCAGGGAACAGGTGGCACAGCTT", + constantForward = "CAGCTCCCTCCTCCGGTT", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "", + forbiddenMutatedCodonsReverse = "", + useTreeWTmatch = FALSE, + collapseToWTForward = TRUE, + collapseToWTReverse = TRUE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = FALSE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + res <- do.call(digestFastqs, Ldef) + + expect_equal(res$filterSummary$nbrTotal, 1000L) + expect_equal(res$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res$filterSummary$f10a_nbrTooManyMutCodons, 394L) + expect_equal(res$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res$filterSummary$f11_nbrForbiddenCodons, 0L) + expect_equal(res$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res$filterSummary$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(res$filterSummary$nbrRetained, 285L) +}) diff --git 
a/tests/testthat/test_digestFastqs_trans_collapse.R b/tests/testthat/test_digestFastqs_trans_collapse.R index 44751c0..146a354 100644 --- a/tests/testthat/test_digestFastqs_trans_collapse.R +++ b/tests/testthat/test_digestFastqs_trans_collapse.R @@ -42,25 +42,6 @@ test_that("digestFastqs works as expected for trans experiments, when similar se res <- do.call(digestFastqs, Ldef) - ## First check that we get deprecation warnings if deprecated arguments are - ## specified. Results should be the same as if these arguments are not used - lifecycle::expect_deprecated({ - resdep1 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMaxDist = 2)))}) - lifecycle::expect_deprecated({ - resdep2 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMinReads = 2)))}) - lifecycle::expect_deprecated({ - resdep3 <- do.call(digestFastqs, c(Ldef, list(variableCollapseMinRatio = 2)))}) - - expect_equal(res$filterSummary, resdep1$filterSummary) - expect_equal(res$summaryTable, resdep1$summaryTable) - expect_equal(res$errorStatistics, resdep1$errorStatistics) - expect_equal(res$filterSummary, resdep2$filterSummary) - expect_equal(res$summaryTable, resdep2$summaryTable) - expect_equal(res$errorStatistics, resdep2$errorStatistics) - expect_equal(res$filterSummary, resdep3$filterSummary) - expect_equal(res$summaryTable, resdep3$summaryTable) - expect_equal(res$errorStatistics, resdep3$errorStatistics) - ## Summarize single sample and collapse se <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), countType = "reads") @@ -128,6 +109,117 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_true(all(res$summaryTable$varLengths == "96_96")) }) +test_that("digestFastqs works as expected for trans experiments, when similar sequences are collapsed (relative UMI distance)", { + fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") + fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") + ## 
default arguments + Ldef <- list( + fastqForward = fqt1, fastqReverse = fqt2, + mergeForwardReverse = FALSE, + minOverlap = 0, maxOverlap = 0, maxFracMismatchOverlap = 0, greedyOverlap = TRUE, + revComplForward = FALSE, revComplReverse = FALSE, + elementsForward = "SUCV", elementsReverse = "SUCV", + elementLengthsForward = c(1, 10, 18, 96), + elementLengthsReverse = c(1, 8, 20, 96), + adapterForward = "GGAAGAGCACACGTC", + adapterReverse = "GGAAGAGCGTCGTGT", + primerForward = "", + primerReverse = "", + wildTypeForward = "", + wildTypeReverse = "", + constantForward = "AACCGGAGGAGGGAGCTG", + constantReverse = "GAAAAAGGAAGCTGGAGAGA", + avePhredMinForward = 20.0, avePhredMinReverse = 20.0, + variableNMaxForward = 0, variableNMaxReverse = 0, + umiNMax = 0, + nbrMutatedCodonsMaxForward = 1, + nbrMutatedCodonsMaxReverse = 1, + nbrMutatedBasesMaxForward = -1, + nbrMutatedBasesMaxReverse = -1, + forbiddenMutatedCodonsForward = "NNW", + forbiddenMutatedCodonsReverse = "NNW", + useTreeWTmatch = FALSE, + mutatedPhredMinForward = 0.0, mutatedPhredMinReverse = 0.0, + mutNameDelimiter = ".", + constantMaxDistForward = -1, + constantMaxDistReverse = -1, + umiCollapseMaxDist = 0.23, + filteredReadsFastqForward = "", + filteredReadsFastqReverse = "", + maxNReads = -1, verbose = TRUE, + nThreads = 1, chunkSize = 1000, + maxReadLength = 1024 + ) + + res <- do.call(digestFastqs, Ldef) + + ## Summarize single sample and collapse + se <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), + countType = "reads") + secoll <- collapseMutantsBySimilarity(se, assayName = "counts", + collapseMaxDist = 6, + collapseMinScore = 0, collapseMinRatio = 0, + verbose = FALSE) + seumi <- summarizeExperiment(list(s1 = res), coldata = data.frame(Name = "s1"), + countType = "umis") + secollumi <- collapseMutantsBySimilarity(seumi, assayName = "counts", + scoreMethod = "rowMean", + collapseMaxDist = 6, + collapseMinScore = 0, collapseMinRatio = 0, + verbose = TRUE) + + 
expect_equal(res$filterSummary$nbrTotal, 1000L) + expect_equal(res$filterSummary$f1_nbrAdapter, 314L) + expect_equal(res$filterSummary$f2_nbrNoPrimer, 0L) + expect_equal(res$filterSummary$f3_nbrReadWrongLength, 0L) + expect_equal(res$filterSummary$f4_nbrNoValidOverlap, 0L) + expect_equal(res$filterSummary$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(res$filterSummary$f6_nbrTooManyNinVar, 0L) + expect_equal(res$filterSummary$f7_nbrTooManyNinUMI, 0L) + expect_equal(res$filterSummary$f8_nbrTooManyBestWTHits, 0L) + expect_equal(res$filterSummary$f9_nbrMutQualTooLow, 0L) + expect_equal(res$filterSummary$f10a_nbrTooManyMutCodons, 0L) + expect_equal(res$filterSummary$f10b_nbrTooManyMutBases, 0L) + expect_equal(res$filterSummary$f11_nbrForbiddenCodons, 0L) + expect_equal(res$filterSummary$f12_nbrTooManyMutConstant, 0L) + expect_equal(res$filterSummary$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(res$filterSummary$nbrRetained, 679L) + + expect_equal(SummarizedExperiment::colData(secoll)$nbrTotal, 1000L) + expect_equal(SummarizedExperiment::colData(secoll)$f1_nbrAdapter, 314L) + expect_equal(SummarizedExperiment::colData(secoll)$f2_nbrNoPrimer, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f3_nbrReadWrongLength, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f4_nbrNoValidOverlap, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f5_nbrAvgVarQualTooLow, 7L) + expect_equal(SummarizedExperiment::colData(secoll)$f6_nbrTooManyNinVar, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f7_nbrTooManyNinUMI, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f8_nbrTooManyBestWTHits, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f9_nbrMutQualTooLow, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f10a_nbrTooManyMutCodons, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f10b_nbrTooManyMutBases, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f11_nbrForbiddenCodons, 0L) + 
expect_equal(SummarizedExperiment::colData(secoll)$f12_nbrTooManyMutConstant, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$f13_nbrTooManyBestConstantHits, 0L) + expect_equal(SummarizedExperiment::colData(secoll)$nbrRetained, 679L) + + for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) + } + for (nm in c("fastqForward", "fastqReverse")) { + expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), + ignore_attr = TRUE) + } + + expect_equal(sum(res$summaryTable$nbrReads), res$filterSummary$nbrRetained) + expect_equal(sum(SummarizedExperiment::assay(secoll, "counts")), + res$filterSummary$nbrRetained) + expect_equal(nrow(res$summaryTable), 677L) + expect_equal(sum(res$summaryTable$nbrUmis), 679L) + expect_equal(nrow(secoll), 294L) + expect_equal(nrow(secollumi), 294L) + expect_true(all(res$summaryTable$varLengths == "96_96")) +}) + test_that("digestFastqs works as expected for trans experiments, when similar sequences are collapsed - extreme case", { fqt1 <- system.file("extdata/transInput_1.fastq.gz", package = "mutscan") fqt2 <- system.file("extdata/transInput_2.fastq.gz", package = "mutscan") @@ -382,12 +474,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "nThreads", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), @@ -480,12 
+567,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_equal(res$filterSummary$nbrRetained, 679L) for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), @@ -573,12 +655,7 @@ test_that("digestFastqs works as expected for trans experiments, when similar se expect_equal(res$filterSummary$nbrRetained, 679L) for (nm in setdiff(names(Ldef), c("forbiddenMutatedCodonsForward", "forbiddenMutatedCodonsReverse", "verbose", "fastqForward", "fastqReverse"))) { - if (nm == "variableCollapseMinReads") { - expect_equal(res$parameters[[nm]], as.integer(ceiling(Ldef[[nm]])), - ignore_attr = TRUE) - } else { - expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) - } + expect_equal(res$parameters[[nm]], Ldef[[nm]], ignore_attr = TRUE) } for (nm in c("fastqForward", "fastqReverse")) { expect_equal(res$parameters[[nm]], normalizePath(Ldef[[nm]], mustWork = FALSE), diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R index f075f29..6d9e58b 100644 --- a/tests/testthat/test_linkMultipleVariants.R +++ b/tests/testthat/test_linkMultipleVariants.R @@ -93,12 +93,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> 
dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -196,12 +196,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("C.*mut|N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -302,14 +302,14 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% + correct <- truth$truth[keep, ] |> dplyr::mutate(trueV2 = gsub("\\..*", "", trueV2), - trueV3 = gsub("\\..*", "", trueV3)) %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + trueV3 = gsub("\\..*", "", trueV3)) |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -384,12 +384,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("N|del", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(trueBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- 
truth$truth[keep, ] |> + dplyr::group_by(trueBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$trueBarcode, obs$barcode) @@ -482,12 +482,12 @@ test_that("linkMultipleVariants works", { keep <- !grepl("C.*mut|N|del|V2|V3", truth$truth$status) ## Truth - correct <- truth$truth[keep, ] %>% - dplyr::group_by(obsBarcode, trueV2, trueV3) %>% dplyr::tally() %>% + correct <- truth$truth[keep, ] |> + dplyr::group_by(obsBarcode, trueV2, trueV3) |> dplyr::tally() |> dplyr::arrange(dplyr::desc(n), obsBarcode, trueV2, trueV3) ## Obs - obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + obs <- res$countAggregated |> dplyr::arrange(desc(nbrReads), barcode, V2, V3) expect_equal(sum(obs$nbrReads), sum(keep)) expect_equal(correct$obsBarcode, obs$barcode) @@ -535,6 +535,60 @@ test_that("linkMultipleVariants works", { expect_equal(res$filtSeparate$V3[, "nbrRetained"], sum(!grepl("C.*mut|del|V3\\.", truth$truth$status))) + # -------------------------------------------------------------------------- + # Get warning when summing UMIs + expect_warning({ + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "UVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), 
+ barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, + collapseMaxDist = 1, collapseMinScore = 1, collapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE) + )}, "Aggregating UMI counts") }) diff --git a/tests/testthat/test_plotDistributions.R b/tests/testthat/test_plotDistributions.R 
index db5d386..bcb85d6 100644 --- a/tests/testthat/test_plotDistributions.R +++ b/tests/testthat/test_plotDistributions.R @@ -32,78 +32,99 @@ test_that("plotDistributions fails with incorrect arguments", { test_that("plotDistributions works as expected", { ## Defaults - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "density", + facet = FALSE, pseudocount = 0))) ## Change plot type - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "histogram", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "knee", + facet = FALSE, pseudocount = 0))) ## groupBy Name - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "density", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "histogram", + facet = 
FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "knee", + facet = FALSE, pseudocount = 0))) ## groupBy Condition - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = FALSE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = FALSE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = FALSE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = FALSE, pseudocount = 0))) ## Facet - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "histogram", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = NULL, plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Name", plotType = "histogram", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = 
"counts", - groupBy = "Name", plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = TRUE, pseudocount = 0), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = TRUE, pseudocount = 0), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = NULL, plotType = "knee", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Name", plotType = "knee", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = TRUE, pseudocount = 0))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = TRUE, 
pseudocount = 0))) ## Increase pseudocount - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "density", - facet = FALSE, pseudocount = 2), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "histogram", - facet = FALSE, pseudocount = 3), "ggplot") - expect_s3_class(plotDistributions(se = se, selAssay = "counts", - groupBy = "Condition", plotType = "knee", - facet = FALSE, pseudocount = 4), "ggplot") + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "density", + facet = FALSE, pseudocount = 2))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "histogram", + facet = FALSE, pseudocount = 3))) + expect_true(ggplot2::is_ggplot( + plotDistributions(se = se, selAssay = "counts", + groupBy = "Condition", plotType = "knee", + facet = FALSE, pseudocount = 4))) }) \ No newline at end of file diff --git a/tests/testthat/test_plotFiltering.R b/tests/testthat/test_plotFiltering.R index 4c4b676..d542ff4 100644 --- a/tests/testthat/test_plotFiltering.R +++ b/tests/testthat/test_plotFiltering.R @@ -36,69 +36,80 @@ test_that("plotFiltering fails with incorrect arguments", { test_that("plotFiltering works with correct arguments", { ## Defaults - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Fractions - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + 
expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Only active filters - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = TRUE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = TRUE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Don't display numbers - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = FALSE, numberSize = 4, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = FALSE, numberSize = 4, + plotType = "remaining", facetBy = "sample"))) ## Change size of displayed numbers - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = FALSE, numberSize = 2, - plotType = "remaining", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = FALSE, numberSize = 2, + plotType = "remaining", facetBy = "sample"))) ## Reads + Filtered + Sample - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "sample"))) ## Reads + Filtered + Step - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = 
TRUE, numberSize = 4, - plotType = "filtered", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "step"))) ## Reads + Remaining + Step - expect_s3_class(plotFiltering(se = se, valueType = "reads", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "reads", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "step"))) ## Fractions + Filtered + Sample - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "sample"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "sample"))) ## Fractions + Filtered + Step - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "filtered", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "filtered", facetBy = "step"))) ## Fractions + Remaining + Step - expect_s3_class(plotFiltering(se = se, valueType = "fractions", - onlyActiveFilters = FALSE, - displayNumbers = TRUE, numberSize = 4, - plotType = "remaining", facetBy = "step"), "ggplot") + expect_true(ggplot2::is_ggplot( + plotFiltering(se = se, valueType = "fractions", + onlyActiveFilters = FALSE, + displayNumbers = TRUE, numberSize = 4, + plotType = "remaining", facetBy = "step"))) }) diff --git 
a/tests/testthat/test_plotResults.R b/tests/testthat/test_plotResults.R index 1eb32fc..f7f9f7f 100644 --- a/tests/testthat/test_plotResults.R +++ b/tests/testthat/test_plotResults.R @@ -113,29 +113,33 @@ test_that("plotResults functions work as expected", { pseudocount = 1, method = "limma", normMethod = "sum") - expect_s3_class(plotMeanDiff(res1), "ggplot") - expect_s3_class(plotMeanDiff(res2), "ggplot") - expect_s3_class(plotVolcano(res1), "ggplot") - expect_s3_class(plotVolcano(res2), "ggplot") - - expect_s3_class(plotMeanDiff(res1, meanCol = "logCPM", logFCCol = "logFC", - padjCol = "FDR", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotMeanDiff(res2, meanCol = "AveExpr", logFCCol = "logFC", - padjCol = "adj.P.Val", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotVolcano(res1, logFCCol = "logFC", pvalCol = "PValue", - padjCol = "FDR", padjThreshold = 0.1), "ggplot") - expect_s3_class(plotVolcano(res2, logFCCol = "logFC", pvalCol = "P.Value", - padjCol = "adj.P.Val", padjThreshold = 0.1), "ggplot") - - expect_s3_class(plotMeanDiff(res1, nTopToLabel = 5), "ggplot") - expect_s3_class(plotMeanDiff(res2, nTopToLabel = 5), "ggplot") - expect_s3_class(plotVolcano(res1, nTopToLabel = 5), "ggplot") - expect_s3_class(plotVolcano(res2, nTopToLabel = 5), "ggplot") - - expect_s3_class(plotMeanDiff(res1, pointSize = "large"), "ggplot") - expect_s3_class(plotMeanDiff(res2, pointSize = "large"), "ggplot") - expect_s3_class(plotVolcano(res1, pointSize = "large"), "ggplot") - expect_s3_class(plotVolcano(res2, pointSize = "large"), "ggplot") + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2))) + + expect_true(ggplot2::is_ggplot( + plotMeanDiff(res1, meanCol = "logCPM", logFCCol = "logFC", + padjCol = "FDR", padjThreshold = 0.1))) + expect_true(ggplot2::is_ggplot( + plotMeanDiff(res2, meanCol = "AveExpr", 
logFCCol = "logFC", + padjCol = "adj.P.Val", padjThreshold = 0.1))) + expect_true(ggplot2::is_ggplot( + plotVolcano(res1, logFCCol = "logFC", pvalCol = "PValue", + padjCol = "FDR", padjThreshold = 0.1))) + expect_true(ggplot2::is_ggplot( + plotVolcano(res2, logFCCol = "logFC", pvalCol = "P.Value", + padjCol = "adj.P.Val", padjThreshold = 0.1))) + + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1, nTopToLabel = 5))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2, nTopToLabel = 5))) + + expect_true(ggplot2::is_ggplot(plotMeanDiff(res1, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotMeanDiff(res2, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotVolcano(res1, pointSize = "large"))) + expect_true(ggplot2::is_ggplot(plotVolcano(res2, pointSize = "large"))) ## These tests won't run if the X11 display connection can not be opened # skip_if_not_installed("plotly") diff --git a/tests/testthat/test_plotTotals.R b/tests/testthat/test_plotTotals.R index 19dd8d6..ff3b89b 100644 --- a/tests/testthat/test_plotTotals.R +++ b/tests/testthat/test_plotTotals.R @@ -19,11 +19,11 @@ test_that("plotTotals fails with incorrect arguments", { test_that("plotTotals works as expected", { ## Defaults - expect_s3_class(plotTotals(se = se, selAssay = "counts", - groupBy = NULL), "ggplot") + expect_true(ggplot2::is_ggplot(plotTotals(se = se, selAssay = "counts", + groupBy = NULL))) ## Group by column - expect_s3_class(plotTotals(se = se, selAssay = "counts", - groupBy = "categ"), "ggplot") + expect_true(ggplot2::is_ggplot(plotTotals(se = se, selAssay = "counts", + groupBy = "categ"))) }) \ No newline at end of file diff --git a/vignettes/mutscan.Rmd b/vignettes/mutscan.Rmd index a83c002..a988eeb 100644 --- a/vignettes/mutscan.Rmd +++ b/vignettes/mutscan.Rmd @@ -14,7 +14,7 @@ editor_options: chunk_output_type: console --- -```{r, 
include = FALSE} +```{r chunk-opts, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" @@ -48,7 +48,7 @@ knitr::include_graphics("mutscan-overview.001.png") The `mutscan` package contains several small example FASTQ files representing different types of experiments: -```{r readTrans} +```{r list-files} datadir <- system.file("extdata", package = "mutscan") dir(datadir) ``` @@ -102,33 +102,34 @@ As an example, a read with the following composition: would be specified to `digestFastqs()` by an elements string `"SUCV"`, and an element length vector `c(1, 10, 18, 96)`. -The package can also accommodate designs with primer sequences. In this situation, -the primer acts as an 'anchor', and the read composition before and after the -primer is specified. For example, a read with the following composition: +The package can also accommodate designs with primer sequences. In this +situation, the primer acts as an 'anchor', and the read composition before and +after the primer is specified. For example, a read with the following +composition: [unknown sequence] - [10 nt primer] - [variable region, constituting the remainder of the read] would be represented by an elements string `"SPV"`, and an element length vector `c(-1, 10, -1)`, where the -1 indicates that the corresponding read part -consists of the remaining part of the read, not accounted for by any of the other -specified parts. In addition, the sequence of the primer must be specified, and -any read where the primer is not present will be discarded. +consists of the remaining part of the read, not accounted for by any of the +other specified parts. In addition, the sequence of the primer must be +specified, and any read where the primer is not present will be discarded. The forward and reverse reads can have different compositions. 
The user can also specify whether the variable parts of the forward and/or -reverse reads should be reverse complemented before being processed, and whether the -variable regions of the forward and reverse reads should be merged into a single -consensus sequence. +reverse reads should be reverse complemented before being processed, and +whether the variable regions of the forward and reverse reads should be merged +into a single consensus sequence. ## Filtering -In addition to reading the FASTQ files, the `digestFastqs()` function will perform -a series of filtering steps, in the following order: +In addition to reading the FASTQ files, the `digestFastqs()` function will +perform a series of filtering steps, in the following order: * Search for perfect matches to forward/reverse adapter sequences, filter out the read pair if a match is found in either the forward or reverse read. -* Search for perfect matches to forward/reverse primer sequences, filter out the -read pair if not both are found. +* Search for perfect matches to forward/reverse primer sequences, filter out +the read pair if not both are found. * Filter out reads whose length is not compatible with the indicated composition. * If forward and reverse variable regions should be merged, filter out read @@ -161,16 +162,16 @@ other reasons). # Processing TRANS data Here, we illustrate the processing of the provided TRANS experiment example -data. We filter out reads with any adapter match, an average phred quality below -20, any Ns in the UMI or variable sequence, more than one mutated codon, or a -mutated codon ending with A or T (represented by the `NNW` value of the +data. We filter out reads with any adapter match, an average phred quality +below 20, any Ns in the UMI or variable sequence, more than one mutated codon, +or a mutated codon ending with A or T (represented by the `NNW` value of the `forbiddenMutatedCodonsForward` and `forbiddenMutatedCodonsReverse` arguments). 
In order to identify the number of mutations, we have to provide the wild type sequences of _FOS_ and _JUN_ (forward and reverse reads, respectively). This will additionally name the final mutants according to the positions and sequences of the mutated codons. -```{r} +```{r trans-digest} transInput <- digestFastqs( fastqForward = file.path(datadir, "transInput_1.fastq.gz"), fastqReverse = file.path(datadir, "transInput_2.fastq.gz"), @@ -203,20 +204,20 @@ transInput <- digestFastqs( ) ``` -The `digestFastqs()` function returns a list with four elements. The `parameters` -list records all parameter values used during the processing, as well as the -`mutscan` version and time of processing. +The `digestFastqs()` function returns a list with four elements. The +`parameters` list records all parameter values used during the processing, as +well as the `mutscan` version and time of processing. -```{r} +```{r trans-pars} transInput$parameters ``` The `filterSummary` data.frame contains a summary of the number of reads filtered out in each step. Note that for any given filtering step, only the -reads retained by the previous filters are considered. The numbers in the filter -column names indicate the order of the filters. +reads retained by the previous filters are considered. The numbers in the +filter column names indicate the order of the filters. -```{r} +```{r trans-filter} transInput$filterSummary ``` @@ -225,15 +226,16 @@ each variable sequence pair. The underscore in the strings in the sequence column indicate the separation between the forward and reverse wild type sequences. In addition, the table contains a column `mutantName`, which provides a shorthand notation for each mutant. 
The format of the values in this -column is a combination of `WTS.xx.NNN` (separated by `_`), where `WTS` provides -the name of the closest matching wild type sequence (if only one, unnamed wild -type sequence is provided, the name will be `f/r` depending on if it corresponds -to the forward/reverse read, respectively). The `xx` part indicates the mutated -codon or nucleotide number, and `NNN` gives the observed sequence for the mutated -codon or nucleotide. A sequence without mutations is named `WTS.0.WT`, where, -again, `WTS` is the name of the matching wild type sequence. - -```{r} +column is a combination of `WTS.xx.NNN` (separated by `_`), where `WTS` +provides the name of the closest matching wild type sequence (if only one, +unnamed wild type sequence is provided, the name will be `f/r` depending on +if it corresponds to the forward/reverse read, respectively). The `xx` part +indicates the mutated codon or nucleotide number, and `NNN` gives the +observed sequence for the mutated codon or nucleotide. A sequence without +mutations is named `WTS.0.WT`, where, again, `WTS` is the name of the +matching wild type sequence. + +```{r trans-summary} head(transInput$summaryTable) ``` @@ -241,17 +243,19 @@ Finally, the `errorStatistics` data.frame lists the number of matching and mismatching bases in the constant sequences, stratified by the phred quality score (from 0 to 99). 
-```{r} +```{r trans-error} transInput$errorStatistics[rowSums(transInput$errorStatistics[, -1]) != 0, ] ``` The `errorStatistics` output can be used to estimate the sequencing error rate: -```{r errStats} +```{r trans-error-stats} (propErrorsConstantF <- sum(transInput$errorStatistics$nbrMismatchForward) / - (nchar(transInput$parameters$constantForward) * transInput$filterSummary$nbrRetained)) + (nchar(transInput$parameters$constantForward) * + transInput$filterSummary$nbrRetained)) (propErrorsConstantR <- sum(transInput$errorStatistics$nbrMismatchReverse) / - (nchar(transInput$parameters$constantReverse) * transInput$filterSummary$nbrRetained)) + (nchar(transInput$parameters$constantReverse) * + transInput$filterSummary$nbrRetained)) ``` # Processing CIS data @@ -269,7 +273,7 @@ forward and reverse variable sequences are merged into one variable sequence, only one wild type sequence is provided (the `wildTypeReverse` argument will be ignored if specified). -```{r} +```{r cis-digest} cisInput <- digestFastqs( fastqForward = file.path(datadir, "cisInput_1.fastq.gz"), fastqReverse = file.path(datadir, "cisInput_2.fastq.gz"), @@ -308,17 +312,17 @@ cisInput <- digestFastqs( ) ``` -```{r} +```{r cis-out} cisInput$parameters cisInput$filterSummary cisInput$errorStatistics[rowSums(cisInput$errorStatistics[, -1]) != 0, ] ``` -The `summaryTable` now provides the number of reads and unique UMIs observed for -each variable sequence, and all values in the `mutNames` column will start +The `summaryTable` now provides the number of reads and unique UMIs observed +for each variable sequence, and all values in the `mutNames` column will start with `FOS`. -```{r} +```{r cis-summary} head(cisInput$summaryTable) ``` @@ -331,7 +335,7 @@ sequences for the forward read. If multiple wild type sequences are provided like here, `mutscan` will match each read against all of them and find the most similar one for each read. 
-```{r} +```{r def-primers} leu <- c( ATF2 = "GATCCTGATGAAAAAAGGAGAAAGTTTTTAGAGCGAAATAGAGCAGCAGCTTCAAGATGCCGACAAAAAAGGAAAGTCTGGGTTCAGTCTTTAGAGAAGAAAGCTGAAGACTTGAGTTCATTAAATGGTCAGCTGCAGAGTGAAGTCACCCTGCTGAGAAATGAAGTGGCACAGCTGAAACAGCTTCTTCTGGCT", ATF7 = "GATCCAGATGAGCGACGGCAGCGCTTTCTGGAGCGCAACCGGGCTGCAGCCTCCCGCTGCCGCCAAAAGCGAAAGCTGTGGGTGTCCTCCCTAGAGAAGAAGGCCGAAGAACTCACTTCTCAGAACATTCAGCTGAGTAATGAAGTCACATTACTACGCAATGAGGTGGCCCAGTTGAAACAGCTACTGTTAGCT", @@ -382,14 +386,14 @@ leu <- c( ) ``` -Next we process the data, not allowing any mismatches in the forward read, but 1 -mismatching codon in the reverse read. Now, we assume that the variable sequence -starts immediately after the provided primers, and hence we don't specify any -UMI/constant sequence lengths. For the forward read, the variable region is -taken to be the remainder of the read (after the primer), whereas for the -reverse read, we specify the variable sequence length to 96. +Next we process the data, not allowing any mismatches in the forward read, but +1 mismatching codon in the reverse read. Now, we assume that the variable +sequence starts immediately after the provided primers, and hence we don't +specify any UMI/constant sequence lengths. For the forward read, the variable +region is taken to be the remainder of the read (after the primer), whereas +for the reverse read, we specify the variable sequence length to 96. -```{r} +```{r primers-digest} leujunt0 <- digestFastqs( fastqForward = file.path(datadir, "leujunt0_1.fastq.gz"), fastqReverse = file.path(datadir, "leujunt0_2.fastq.gz"), @@ -424,12 +428,12 @@ leujunt0 <- digestFastqs( ) ``` -```{r} +```{r primers-out} leujunt0$parameters leujunt0$filterSummary ``` -```{r} +```{r primers-summary} head(leujunt0$summaryTable) ``` @@ -445,7 +449,7 @@ process also the output sample for the TRANS experiment for which we processed the input sample above, and feed both outputs to `summarizeExperiment()`. 
-```{r combine}
+```{r combine-digest}
 transOutput <- digestFastqs(
   fastqForward = file.path(datadir, "transOutput_1.fastq.gz"),
   fastqReverse = file.path(datadir, "transOutput_2.fastq.gz"),
@@ -503,13 +507,13 @@ metadata(se)$countType
 
 # Collapsing count matrix to amino acids
 
-The \link{\code[SummarizedExperiment]{SummarizedExperiment}} object generated by
-`summarizeExperiment()` contains one row for each observed variant
+The `SummarizedExperiment` object generated by `summarizeExperiment()`
+contains one row for each observed variant
 (combination). This can be further collapsed by replacing the mutated codon by
 the corresponding amino acid, and aggregating the counts corresponding to the
 same mutated amino acid (combination).
 
-```{r}
+```{r collapse-aa}
 se_collapsed <- collapseMutantsByAA(se)
 head(assay(se_collapsed, "counts"))
 Matrix::colSums(assay(se_collapsed, "counts"))
@@ -524,7 +528,7 @@ full CIS data set from [@Diss2018], which has been processed using
 `digestFastqs()` as illustrated above, and summarized in a
 `SummarizedExperiment` object provided with the package.
 
-```{r}
+```{r read-se}
 se <- readRDS(file.path(datadir, "GSE102901_cis_se.rds"))
 ```
 
@@ -532,12 +536,12 @@ First, we can plot a summary of the filtering process, indicating the number of
 reads that were filtered out by (or retained after) each step of the `mutscan`
 filtering. 
-```{r, fig.height=10, fig.width=8} +```{r plot-filter-reads, fig.height=10, fig.width=8} plotFiltering(se, valueType = "reads", onlyActiveFilters = TRUE, plotType = "remaining", facetBy = "sample", numberSize = 3) ``` -```{r, fig.height=10, fig.width=8} +```{r plot-filter-frac, fig.height=10, fig.width=8} plotFiltering(se, valueType = "fractions", onlyActiveFilters = TRUE, plotType = "filtered", facetBy = "step", numberSize = 3) ``` @@ -545,14 +549,14 @@ plotFiltering(se, valueType = "fractions", onlyActiveFilters = TRUE, We can also generate a pairs plot displaying the correlation among the samples in the data set. -```{r, fig.height=7} +```{r plot-pairs, fig.height=7} plotPairs(se, selAssay = "counts") ``` Additional plot functions can be used to visualize the total count per sample, across all variants, or the distribution of variant counts per sample. -```{r} +```{r plot-more} plotTotals(se, selAssay = "counts") plotDistributions(se, selAssay = "counts", plotType = "density", pseudocount = 1) @@ -560,33 +564,35 @@ plotDistributions(se, selAssay = "counts", plotType = "density", Finally, we can create a full QC report as follows: -```{r, eval=FALSE} +```{r make-qc-report, eval=FALSE} generateQCReport(se, outFile = tempfile(fileext = ".html")) ``` # Calculating fitness scores -The function `calculateFitnessScore()` can be used to calculate fitness scores as -described in [@Diss2018]. The function requires the user to specify a `pairingCol`, -containing the replicate ID for each sample; one or more `ODCols`, containing -the optical density for each sample, and a `comparison`, which is a character -vector of length 3 specifying the comparison to perform, of the form -(`groupColumn`, `numerator`, `denominator`). Here, `groupColumn` is the name of -the column in `colData(se)` that contains the grouping information, and -`numerator` and `denominator` specify the values of this column representing the -two groups to be compared. 
+The function `calculateFitnessScore()` can be used to calculate fitness scores +as described in [@Diss2018]. The function requires the user to specify a +`pairingCol`, containing the replicate ID for each sample; one or more +`ODCols`, containing the optical density for each sample, and a +`comparison`, which is a character vector of length 3 specifying the +comparison to perform, of the form (`groupColumn`, `numerator`, +`denominator`). Here, `groupColumn` is the name of the column in `colData(se)` +that contains the grouping information, and `numerator` and `denominator` +specify the values of this column representing the two groups to be compared. Here, we illustrate the application of `calculateFitnessScore()` on the SummarizedExperiment object containing all the three CIS replicates from [@Diss2018]. -```{r} +```{r calc-ppi} se_collapsed <- collapseMutantsByAA(se) -ppis <- calculateFitnessScore(se = se_collapsed, pairingCol = "Replicate", - ODCols = c("OD1", "OD2"), - comparison = c("Condition", "cis_output", "cis_input"), - WTrows = "f.0.WT") +ppis <- calculateFitnessScore( + se = se_collapsed, pairingCol = "Replicate", + ODCols = c("OD1", "OD2"), + comparison = c("Condition", "cis_output", "cis_input"), + WTrows = "f.0.WT" +) head(ppis[order(abs(rowMeans(ppis)), decreasing = TRUE), , drop = FALSE]) ## The PPI score for the WT sequence is 1, by construction @@ -595,20 +601,21 @@ ppis["f.0.WT", , drop = FALSE] # Scoring mutations with `edgeR` or `limma` -As an alternative to the fitness scoring, `mutscan` can be used to model the observed -counts using a generalized linear model (with `edgeR`) or a general linear model -(with `limma`) and estimate a logFC and a p-value for the enrichment of each -variant betwen two conditions (or more generally, in association with any -predictor), compared to one or more WT sequences. Note that for this, at least -two replicates are required. 
+As an alternative to the fitness scoring, `mutscan` can be used to model the
+observed counts using a generalized linear model (with `edgeR`) or a general
+linear model (with `limma`) and estimate a logFC and a p-value for the
+enrichment of each variant between two conditions (or more generally, in
+association with any predictor), compared to one or more WT sequences. Note
+that for this, at least two replicates are required.
 
 We start by looking at the design matrix, in order to determine which of the
 coefficients to specify for the testing in `calculateRelativeFC()`. For more
 information about how to set up and interpret design matrices in `edgeR` or
-`limma`, see e.g. [Law et al (2020)](https://f1000research.com/articles/9-1444),
+`limma`, see e.g.
+[Law et al (2020)](https://f1000research.com/articles/9-1444),
 or [Soneson et al (2020)](https://f1000research.com/articles/9-512).
 
-```{r}
+```{r def-mm}
 model.matrix(~ Replicate + Condition,
              data = colData(se_collapsed))
 ```
 
@@ -616,7 +623,7 @@ Next, we apply either `edgeR` or `limma` to extract the logFCs of the mutants,
 compared to the wildtype sequence.
 
-```{r}
+```{r calc-rel-fc}
 ## edgeR
 edger_scores <- calculateRelativeFC(
     se = se_collapsed,
@@ -644,7 +651,7 @@ limma_scores["f.0.WT", , drop = FALSE]
 testing - in particular, MA (mean-difference) plots and volcano plots can be
 easily generated.
 
-```{r}
+```{r plot-rel-fc}
 plotMeanDiff(edger_scores, pointSize = "large")
 plotVolcano(edger_scores, pointSize = "large")
 ```
 
@@ -662,7 +669,7 @@ the forward and reverse FASTQ files in the same order.
 
 This vignette was compiled on the following system:
 
-```{r}
+```{r session-info}
 sessionInfo()
 ```