From 4720e5cae556c12057f92ba98624fa74a2258796 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 11 Apr 2022 20:18:38 +0200 Subject: [PATCH 1/2] Allow collapsing to AA in linkMultipleVariants() --- R/linkMultipleVariants.R | 42 +++- .../extdata/multipleVariableRegions_truth.rds | Bin 1385 -> 1661 bytes ...nerateExampleDataForLinkMultipleVariants.R | 43 ++++ man/linkMultipleVariants.Rd | 9 +- tests/testthat/test_linkMultipleVariants.R | 187 +++++++++++++++++- 5 files changed, 274 insertions(+), 7 deletions(-) diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R index 9f303fb..2aeb503 100644 --- a/R/linkMultipleVariants.R +++ b/R/linkMultipleVariants.R @@ -62,6 +62,11 @@ #' @param ... Additional arguments providing arguments to \code{digestFastqs} #' for the separate runs (processing each variable sequence in turn). #' Each argument must be a named list of arguments to \code{digestFastqs}. +#' @param collapseToAA Either \code{TRUE}, \code{FALSE}, or a character vector +#' indicating for which of the separate runs the sequences should be +#' collapsed to the amino acid mutant name rather than the codon- or +#' nucleotide-level name. \code{TRUE} is equivalent to providing the +#' names of all lists provided in \code{...}. #' #' @author Charlotte Soneson #' @@ -75,13 +80,15 @@ #' \item convSeparate - a list of conversion tables from the respective #' separate runs. #' \item outCombined - the \code{digestFastqs} output for the combined run. +#' \item filtSeparate - a list of filtering tables for the separate runs. #' } #' #' @importFrom dplyr select rename group_by summarize across matches mutate #' @importFrom tidyr separate separate_rows #' @importFrom rlang .data #' -linkMultipleVariants <- function(combinedDigestParams = list(), ...) { +linkMultipleVariants <- function(combinedDigestParams = list(), ..., + collapseToAA = FALSE) { ## Process additional arguments paramsSeparate <- list(...) @@ -111,6 +118,24 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { as.list(defaults[!(names(defaults) %in% names(combinedDigestParams))])) + ## Make sure collapseToAA is a vector containing the names of the + ## runs where sequences should be collapsed to AAs + if (length(collapseToAA) == 1) { + if (is.logical(collapseToAA)) { + if (collapseToAA) { + collapseToAA <- names(paramsSeparate) + } else { + collapseToAA <- character(0) + } + } else { + .assertVector(x = collapseToAA, type = "character", + validValues = names(paramsSeparate)) + } + } else { + .assertVector(x = collapseToAA, type = "character", + validValues = names(paramsSeparate)) + } + ## --------------------------------------------------------------------- ## ## Checks ## --------------------------------------------------------------------- ## @@ -227,7 +252,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { ## Conversion tables convSeparate <- lapply(outSeparate, function(out) { out$summaryTable %>% - dplyr::select(.data$mutantName, .data$sequence) %>% + dplyr::select(.data$mutantName, .data$mutantNameAA, + .data$sequence) %>% tidyr::separate_rows(.data$sequence, sep = ",") }) @@ -235,9 +261,15 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) { ## Replace naive sequences with corrected ones ## --------------------------------------------------------------------- ## for (i in names(paramsSeparate)) { - countCombined[[i]] <- - convSeparate[[i]]$mutantName[match(countCombined[[i]], - convSeparate[[i]]$sequence)] + if (i %in% collapseToAA) { + countCombined[[i]] <- + convSeparate[[i]]$mutantNameAA[match(countCombined[[i]], + convSeparate[[i]]$sequence)] + } else { + countCombined[[i]] <- + convSeparate[[i]]$mutantName[match(countCombined[[i]], + convSeparate[[i]]$sequence)] + } } ## Filter out rows with NAs (the sequence was not retained) diff --git a/inst/extdata/multipleVariableRegions_truth.rds b/inst/extdata/multipleVariableRegions_truth.rds index fa1243cf3edc1771a16a1da7e070b33cdc0f80bd..a86cab19fc1feccbf8c2c5f7732f71b7968c1914 100644 GIT binary patch literal 1661 zcmZwHeLNEg7zgmgm83|aG%2I&vqd>$%{n!eniR^CjY)*<(vI4iSQ1(+ZmDsSaT@Zn z^0gx5u4%bB7QwT3`x^7-Bah zoFlmW?qT*bc!j`f-KxaFb6hCEu*nl}Cw^Ka;H^qr7Lji z=&o-;39B%J>ERB_E(tBnusmMBmT2IUd~Exv+ZKl{3v}9(A`p6MH{1^b^EZg~BlI9E zEp)!cNBKVDT@jlg1{Oc0-E>EMi;(|uMf^QO@%KE%-`g$z-bL|$O(Z#=uZRV)ZS&wv z#lkDOnxlx}C@Kg|7qlJGO+@l7dF%u=i|kr947Gov5D=~h#%3yAUY2}>1)u1-{Cihb z9A>$8^lE>6k6cDvx)K=_7?Z1g5h;5SzYQ)REc0?@l&0_=1*S)FTDHWQ?ISJK*ekD5 zKJ^`-C}i`~D49mNsD7+m7{hB#WQW%4P9ac5Z?+z|hZr^lg+DfEH$B?yU5sl)^a|JLCD1a2?b@rPs_eM%j zf)f1=0g|28xSaMU>5rTTk9&ZpnI7ym^Gs3O%^MS!)}zwBY6tqQKU?9V6<6sI)Cxv6 zSI&ROQg3GRB#vuh7u1~Av>5X}+wT_ehImt0qI*|y;Zj#At+?m6IU{d2oD%TFikQ&$ zaY9YPZ_?uxEmR(S(1q|*9O`4tDrk$p!fskgXs^T#xxX-byL+)j8{ z9bDUzc{<>NW{D?(@vNQqChzDQ;wOU4M?`B#c>IiT=4cGcj$JcZpJI!Mf6Hnbg z5lyE-6nyZ#@5&9G!9rUl(8!3q&A{Zr8T1Su*?;~=Z4FTI zVAUmTNQt+p)E9jgW(x*JHPxd_316TcM@^?JU*fqg&QVNG;H0LPP0-{5z>D+B-X~R3 z9NbkyahW51t@`2DWs9>E_TUa|?E_-;>e^JTY>;{GCmt5O`7Ks-2kpYpl@qvj8I3yryJe%#HSQZ_Vgu}4c)C^S8B#?Aq`TcM3GwRyCIYA@+0kfTRAoGk zC1!Q#PIQ>Dn(;#Xcs0?V%8k&cCb*Q5>RWr;TNp)-J0wXe2Epk>-3B_>6b*2QJB!Vi zAnDR@lV9Stct!m6Q}uvN8&qA;X~WEQTd-_;b)U1}v|71Op{qAPT}7=MBcD0{J}Rtm z7a#=JB`kYbH{ASIR@HfBZyE5^tI94&#%1U|VZq;EaX zjv5`i2J_0C`KVCjj6kOqtz`{AshmXyx9J`l%pk(oId3iQLD6pOd z%8EF_^!`24#AO@jAk_ijQ}hIaiBi>1iY~|4GCy~`B3sHv4%jQRl(g~yp-{ ztwTaKt5k2Azh3!HV@gbrgF#=8KQ*B9_G^Yw3{z{F(!Z0%=mP`#bmVVGWOKU61;;g`E|Iuwm8-!6hMuRV@P}(@Crz`$%>rl_ z!!ol@Mw!LaMCPYAKhlrEjWr%jx7rKe3IyK@4F5mplAl^<7!kzI6)QCzv6Tp+ejo9& z%^cXutE4M=TIhh<(yUZDrLA!=)+{eF;T-W1DDQW@`&L!DcRf*#x!{OI;YOVlPm~KO zg>@YmOt5&5z}_eAFjJfpRLja~*YoZ!~@%nb+fNIgH6v8R*ClUpssy~FnE7bs^5hPF>E<$s26t7FOWMrH-Tnf&X~7lKM~F!jnlS_uoF1> zJ?Gr<0E)NM>|*_yKKwGl1Mg3@c#{(9P1Cogn|Ub>nOz*VyCv1&uktcViG_BbF|k^2 zIkvbX@Wgag5tVY9gsAN+H;TXz+_uA}aXpFIaU}Wja9ekG zfM9L()HE!aD>}w7;6+LsUqs%daMA-TIqG%$3&Dg$s{>(La+=etjUL7&*j>p-#`kCb z)3gsBu;K&K!V@r&@a4w!dR@!C&=%B8CK-Jmq7_r`#bU?4_UFbms)e5o@}M-UqL>tX ze*26i-q@EJjM7>%<&GruaL-OeJw~c>$Gic!?qck5?QW0~!Fco0brhFmG5d{>Gow>c zf6CqYV}%^*9(Gx*4^_XKIy#6w0=(pEsGM);AhNj@K@}{eR>*?Pd`B#BI7AmR)&HLx zw3?@`fp+&~uj#%`qh4cEaK4Qn(~B13G+>h)=EC;75^cN~IuWe=sluwWXmIslKyUS> zd_wQwpA3=?q~eS!ue&uja3AvxDK{Q$)DQx3%0_lf=U~rGUQ2%GFnHOkhh>qnzdj+$ zo84JSK{-*MMVjt?rxbxEFpPMN8q^_WC*y+Rw!%qp?(9WbZW=FCLfg%`9_@<7$Bni+A0&^oKX zKf|EYd5kZU$#`Sq&Yz)8)8^EX>Q$9J;?h;Bxbz7BmCZxGKcjs(DBjB`BFTD&2}nz; zw@IgpT=Tv{W*1%LJE}wC=aZ02Y9j3F`N>X_ZF>LhTl6%#XN+x60DaejIbwGhYsx2Z z-6g%fBQ$+MV~J%S$7%Nvt}u%+Zc@b*a?1qE;Oy!rji0al!tycU8InE6)dT^1u5#BL n4c`zMO1H8F>h?_>SwAzQk$}KEO+EiTY&~UN0RZ#dB(4UvR diff --git a/inst/scripts/generateExampleDataForLinkMultipleVariants.R b/inst/scripts/generateExampleDataForLinkMultipleVariants.R index a38c9f5..c33b147 100644 --- a/inst/scripts/generateExampleDataForLinkMultipleVariants.R +++ b/inst/scripts/generateExampleDataForLinkMultipleVariants.R @@ -187,10 +187,53 @@ WTV3 <- c(V3 = V3) constFwd <- "ACGTGATTGATCCCGGTAAATACCGG" constRev <- "TAAATACCGGACCTGA" +## Get AA mutant name as well +V2varAA <- vapply(V2var, function(w) { + w <- strsplit(w, "\\.")[[1]] + if (w[3] != "WT") { + pos0 <- ceiling(as.numeric(w[2]) / 3) - 1 + wt <- strsplit(WTV2[w[1]], "")[[1]] + wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + wt_aa <- mutscan:::translateString(wt_codon) + wt[as.numeric(w[2])] <- w[3] + mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + mut_aa <- mutscan:::translateString(mut_codon) + if (mut_aa == wt_aa) { + paste(c(w[1], 0, "WT"), collapse = ".") + } else { + paste(c(w[1], pos0 + 1, mut_aa), collapse = ".") + } + } else { + paste(w, collapse = ".") + } +}, "") +V3varAA <- vapply(V3var, function(w) { + w <- strsplit(w, "\\.")[[1]] + if (w[3] != "WT") { + pos0 <- ceiling(as.numeric(w[2]) / 3) - 1 + wt <- strsplit(WTV3[w[1]], "")[[1]] + wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + wt_aa <- mutscan:::translateString(wt_codon) + wt[as.numeric(w[2])] <- w[3] + mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "") + mut_aa <- mutscan:::translateString(mut_codon) + if (mut_aa == wt_aa) { + paste(c(w[1], 0, "WT"), collapse = ".") + } else { + paste(c(w[1], pos0 + 1, mut_aa), collapse = ".") + } + } else { + paste(w, collapse = ".") + } +}, "") + + truth <- data.frame(read = paste0("R", seq_len(nReads)), trueBarcode = barcode, trueV2 = V2var, trueV3 = V3var, + trueV2aa = V2varAA, + trueV3aa = V3varAA, obsBarcode = v1, status = status) diff --git a/man/linkMultipleVariants.Rd b/man/linkMultipleVariants.Rd index cb60207..b9a8122 100644 --- a/man/linkMultipleVariants.Rd +++ b/man/linkMultipleVariants.Rd @@ -4,7 +4,7 @@ \alias{linkMultipleVariants} \title{Process an experiment with multiple variable sequences} \usage{ -linkMultipleVariants(combinedDigestParams = list(), ...) +linkMultipleVariants(combinedDigestParams = list(), ..., collapseToAA = FALSE) } \arguments{ \item{combinedDigestParams}{A named list of arguments to @@ -13,6 +13,12 @@ linkMultipleVariants(combinedDigestParams = list(), ...) \item{...}{Additional arguments providing arguments to \code{digestFastqs} for the separate runs (processing each variable sequence in turn). Each argument must be a named list of arguments to \code{digestFastqs}.} + +\item{collapseToAA}{Either \code{TRUE}, \code{FALSE}, or a character vector +indicating for which of the separate runs the sequences should be +collapsed to the amino acid mutant name rather than the codon- or +nucleotide-level name. \code{TRUE} is equivalent to providing the +names of all lists provided in \code{...}.} } \value{ A list with the following elements: @@ -23,6 +29,7 @@ A list with the following elements: \item convSeparate - a list of conversion tables from the respective separate runs. \item outCombined - the \code{digestFastqs} output for the combined run. +\item filtSeparate - a list of filtering tables for the separate runs. } } \description{ diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R index 933db0b..f476375 100644 --- a/tests/testthat/test_linkMultipleVariants.R +++ b/tests/testthat/test_linkMultipleVariants.R @@ -16,6 +16,10 @@ test_that("linkMultipleVariants works", { expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), secondArg = list(unknown = 1)), "All values in 'namesparms' must be one of") + expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), + secondArg = list(fastqForward = 1), + collapseToAA = 1), + "'collapseToAA' must be of class 'character'") expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1), secondArg = list(elementsForward = "CC")), @@ -86,7 +90,8 @@ test_that("linkMultipleVariants works", { wildTypeForward = truth$WTV3, nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, - verbose = verbose, useTreeWTmatch = TRUE) + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = FALSE ) ## Filter out only reads with N/deletions in the variable sequence @@ -124,6 +129,186 @@ test_that("linkMultipleVariants works", { expect_equal(res$filtSeparate$V3[, "nbrRetained"], sum(!grepl("del", truth$truth$status))) + ## ------------------------------------------------------------------------ + ## Don't filter by mutations in constant sequence, collapse by AA + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = c("V2", "V3") + ) + + ## Filter out only reads with N/deletions in the variable sequence + keep <- !grepl("N|del", truth$truth$status) + + ## Truth + correct <- truth$truth[keep, ] %>% + dplyr::group_by(trueBarcode, trueV2aa, trueV3aa) %>% dplyr::tally() %>% + dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3aa) + + ## Obs + obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + + expect_equal(sum(obs$nbrReads), sum(keep)) + expect_equal(correct$trueBarcode, obs$barcode) + expect_equal(correct$trueV2aa, obs$V2) + expect_equal(correct$trueV3aa, obs$V3) + expect_equal(correct$n, obs$nbrReads) + expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) ## del + expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], + sum(grepl("N", truth$truth$status))) ## N + expect_equal(res$outCombined$filterSummary[, "nbrRetained"], + sum(!grepl("N|del", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], + sum(grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "nbrRetained"], + sum(!grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + + ## ------------------------------------------------------------------------ + ## Don't filter by mutations in constant sequence, collapse by AA (only V2) + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = "V2" + ) + + ## Filter out only reads with N/deletions in the variable sequence + keep <- !grepl("N|del", truth$truth$status) + + ## Truth + correct <- truth$truth[keep, ] %>% + dplyr::group_by(trueBarcode, trueV2aa, trueV3) %>% dplyr::tally() %>% + dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3) + + ## Obs + obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3) + + expect_equal(sum(obs$nbrReads), sum(keep)) + expect_equal(correct$trueBarcode, obs$barcode) + expect_equal(correct$trueV2aa, obs$V2) + expect_equal(correct$trueV3, obs$V3) + expect_equal(correct$n, obs$nbrReads) + expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) ## del + expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], + sum(grepl("N", truth$truth$status))) ## N + expect_equal(res$outCombined$filterSummary[, "nbrRetained"], + sum(!grepl("N|del", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], + sum(grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$barcode[, "nbrRetained"], + sum(!grepl("V1N", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V2[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], + sum(grepl("del", truth$truth$status))) + expect_equal(res$filtSeparate$V3[, "nbrRetained"], + sum(!grepl("del", truth$truth$status))) + ## ------------------------------------------------------------------------ ## Filter out mutations in constant sequences res <- linkMultipleVariants( From 36f78d8f4e47064e61025159bdf354b417a835e2 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Mon, 11 Apr 2022 20:29:57 +0200 Subject: [PATCH 2/2] Add associateBarcodes() function --- NAMESPACE | 2 + R/associateBarcodes.R | 114 ++++++++++++++++ man/associateBarcodes.Rd | 40 ++++++ tests/testthat/test-associateBarcodes.R | 168 ++++++++++++++++++++++++ 4 files changed, 324 insertions(+) create mode 100644 R/associateBarcodes.R create mode 100644 man/associateBarcodes.Rd create mode 100644 tests/testthat/test-associateBarcodes.R diff --git a/NAMESPACE b/NAMESPACE index 22df1e7..b35770a 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(associateBarcodes) export(calcNearestStringDist) export(calculateFitnessScore) export(calculateRelativeFC) @@ -76,6 +77,7 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_text) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) +importFrom(ggplot2,scale_color_manual) importFrom(ggplot2,scale_fill_continuous) importFrom(ggplot2,scale_fill_discrete) importFrom(ggplot2,scale_x_continuous) diff --git a/R/associateBarcodes.R b/R/associateBarcodes.R new file mode 100644 index 0000000..e4e81d6 --- /dev/null +++ b/R/associateBarcodes.R @@ -0,0 +1,114 @@ +#' Associate barcode(s) to variants +#' +#' @param countTable A data frame containing barcode sequences, variant +#' sequences and corresponding read counts for each barcode-variant pair. +#' @param barcodeCol,variantCol,countCol Character scalars indicating the +#' names of the columns of \code{countTable} containing barcode +#' sequences, variant IDs and read counts, respectively. +#' @param minCount Numeric scalar. Pairs with fewer reads will be flagged +#' for exclusion. +#' @param minFracOfBarcodeCount Numeric scalar. Pairs where the read count +#' is lower than \code{minFracOfBarcodeCount} * total barcode count will +#' be flagged for exclusion. +#' +#' @export +#' @author Charlotte Soneson +#' +#' @return A list with two elements: \code{counts} (a data frame with counts) +#' and \code{plots} (a list of ggplot objects). +#' +#' @importFrom ggplot2 scale_color_manual +#' @importFrom dplyr %>% group_by mutate ungroup filter arrange desc +#' summarize +#' @importFrom rlang .data +#' @importFrom ggplot2 ggplot aes geom_histogram theme_bw labs +#' scale_color_manual scale_x_log10 scale_y_log10 geom_point +#' +associateBarcodes <- function(countTable, barcodeCol, variantCol, + countCol = "nbrReads", minCount = 10, + minFracOfBarcodeCount = 0.5) { + .assertScalar(x = barcodeCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[barcodeCol]], type = "character") + .assertScalar(x = variantCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[variantCol]], type = "character") + .assertScalar(x = countCol, type = "character", + validValues = colnames(countTable)) + .assertVector(x = countTable[[countCol]], type = "numeric") + .assertScalar(x = minCount, type = "numeric", rngIncl = c(0, Inf)) + .assertScalar(x = minFracOfBarcodeCount, type = "numeric", rngIncl = c(0, 1)) + + ## Initialize plot list + plots <- list() + + ## Add columns with total counts for barcode/variant, with and without + ## filtering + countTable <- countTable %>% + ## first aggregate all rows with the same barcode/variant pair, + ## in case that wasn't already done + dplyr::group_by(.data[[barcodeCol]], .data[[variantCol]]) %>% + dplyr::summarize("{countCol}" := sum(.data[[countCol]])) %>% + dplyr::ungroup() %>% + ## flag pairs with < minCount reads + dplyr::mutate(keep = (.data[[countCol]] >= minCount)) %>% + ## add total barcode counts + dplyr::group_by(.data[[barcodeCol]]) %>% + dplyr::mutate(totalBarcodeCountAll = sum(.data[[countCol]])) %>% + dplyr::mutate(totalBarcodeCountFilt = sum(.data[[countCol]][.data$keep])) %>% + dplyr::ungroup() %>% + ## add total variant counts + dplyr::group_by(.data[[variantCol]]) %>% + dplyr::mutate(totalVariantCountAll = sum(.data[[countCol]])) %>% + dplyr::mutate(totalVariantCountFilt = sum(.data[[countCol]][.data$keep])) %>% + dplyr::ungroup() + plots[["totalBcCountAll"]] <- + ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), + aes(x = .data$totalBarcodeCountAll)) + + geom_histogram(bins = 100, fill = "lightgrey") + + theme_bw() + scale_x_log10() + + labs(title = "Total barcode count (all)", x = "Total barcode count") + plots[["totalBcCountFiltered"]] <- + ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), + aes(x = .data$totalBarcodeCountFilt)) + + geom_histogram(bins = 100, fill = "lightgrey") + + theme_bw() + scale_x_log10() + + labs(title = "Total barcode count (filtered)", x = "Total barcode count") + + ## Calculate relative frequency for each pair as the ratio between the + ## count for the pair and the total count for the barcode or variant, + ## respectively + countTable <- countTable %>% + dplyr::mutate(fracOfTotalBarcodeCountFilt = + .data[[countCol]]/.data$totalBarcodeCountFilt, + fracOfTotalVariantCountFilt = + .data[[countCol]]/.data$totalVariantCountFilt) + + ## Order by decreasing frequency and keep only the top hit for each + ## barcode + countTable <- countTable %>% + dplyr::arrange(dplyr::desc(.data[[countCol]])) %>% + dplyr::mutate(keep = .data$keep & !duplicated(.data[[barcodeCol]])) + + ## Flag pairs where the count/total barcode count is too low + countTable <- countTable %>% + dplyr::mutate(keep = .data$keep & + (.data$fracOfTotalBarcodeCountFilt >= + minFracOfBarcodeCount)) + + ## Plot count for pair vs total count for barcode + plots[["pairVsBarcodeCount"]] <- + ggplot(countTable, aes(x = .data$totalBarcodeCountFilt, + y = .data[[countCol]], + color = .data$keep)) + + geom_point(size = 0.5) + theme_bw() + + scale_color_manual(values = c(`TRUE` = "black", `FALSE` = "red"), + name = "Keep") + + scale_x_log10() + scale_y_log10() + + labs(x = "Total barcode count", y = "Barcode/variant pair count") + + return(list(counts = countTable, plots = plots)) +} + + + diff --git a/man/associateBarcodes.Rd b/man/associateBarcodes.Rd new file mode 100644 index 0000000..d198dc7 --- /dev/null +++ b/man/associateBarcodes.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/associateBarcodes.R +\name{associateBarcodes} +\alias{associateBarcodes} +\title{Associate barcode(s) to variants} +\usage{ +associateBarcodes( + countTable, + barcodeCol, + variantCol, + countCol = "nbrReads", + minCount = 10, + minFracOfBarcodeCount = 0.5 +) +} +\arguments{ +\item{countTable}{A data frame containing barcode sequences, variant +sequences and corresponding read counts for each barcode-variant pair.} + +\item{barcodeCol, variantCol, countCol}{Character scalars indicating the +names of the columns of \code{countTable} containing barcode +sequences, variant IDs and read counts, respectively.} + +\item{minCount}{Numeric scalar. Pairs with fewer reads will be flagged +for exclusion.} + +\item{minFracOfBarcodeCount}{Numeric scalar. Pairs where the read count +is lower than \code{minFracOfBarcodeCount} * total barcode count will +be flagged for exclusion.} +} +\value{ +A list with two elements: \code{counts} (a data frame with counts) +and \code{plots} (a list of ggplot objects). +} +\description{ +Associate barcode(s) to variants +} +\author{ +Charlotte Soneson +} diff --git a/tests/testthat/test-associateBarcodes.R b/tests/testthat/test-associateBarcodes.R new file mode 100644 index 0000000..e8c6140 --- /dev/null +++ b/tests/testthat/test-associateBarcodes.R @@ -0,0 +1,168 @@ +test_that("associateBarcodes works", { + ## Read truth (for WT sequences) + truth <- readRDS(system.file("extdata", "multipleVariableRegions_truth.rds", + package = "mutscan")) + verbose <- FALSE + + ## Don't filter by mutations in constant sequence + res <- linkMultipleVariants( + combinedDigestParams = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 20, + maxOverlap = 30, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + verbose = verbose), + barcode = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + avePhredMinForward = 20, variableCollapseMaxDist = 1, + variableCollapseMinReads = 1, variableCollapseMinRatio = 1, + verbose = verbose), + V2 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 10, + maxOverlap = 20, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV2, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + V3 = list( + fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz", + package = "mutscan"), + fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz", + package = "mutscan"), + elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1), + elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1), + mergeForwardReverse = TRUE, minOverlap = 5, + maxOverlap = 15, maxFracMismatchOverlap = 0, + revComplForward = FALSE, revComplReverse = TRUE, + avePhredMinForward = 20, avePhredMinReverse = 20, + wildTypeForward = truth$WTV3, + nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, + forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE, + verbose = verbose, useTreeWTmatch = TRUE), + collapseToAA = FALSE + ) + + ## Fails with the wrong input + ## ------------------------------------------------------------------------ + expect_error(associateBarcodes(res$countAggregated, barcodeCol = 1, + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'barcodeCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = c("barcode", "V2"), + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'barcodeCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "missing", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'barcodeCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "nbrReads", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablebarcodeCol' must be of class 'character'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = 1, countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'variantCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = c("V2", "barcode"), + countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'variantCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "missing", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'variantCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "nbrReads", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablevariantCol' must be of class 'character'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = 1, minCount = 2, + minFracOfBarcodeCount = 0.25), + "'countCol' must be of class 'character'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = c("barcode", "V2"), minCount = 2, + minFracOfBarcodeCount = 0.25), + "'countCol' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "missing", minCount = 2, + minFracOfBarcodeCount = 0.25), + "All values in 'countCol' must be one of") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "barcode", minCount = 2, + minFracOfBarcodeCount = 0.25), + "'[[countTablecountCol' must be of class 'numeric'", fixed = TRUE) + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = "2", + minFracOfBarcodeCount = 0.25), + "'minCount' must be of class 'numeric'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = c(1, 2), + minFracOfBarcodeCount = 0.25), + "'minCount' must have length 1") + + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = "0.25"), + "'minFracOfBarcodeCount' must be of class 'numeric'") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = c(0.1, 0.25)), + "'minFracOfBarcodeCount' must have length 1") + expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", countCol = "nbrReads", minCount = 2, + minFracOfBarcodeCount = 2.5), + "'minFracOfBarcodeCount' must be within [0,1]", fixed = TRUE) + + + ## Works with the correct input + ## ------------------------------------------------------------------------ + ## Run barcode association + out <- associateBarcodes(res$countAggregated, barcodeCol = "barcode", + variantCol = "V2", minCount = 2, + minFracOfBarcodeCount = 0.25) + + ## Calculate expected values manually + correct <- truth$truth[!grepl("N|del", truth$truth$status), ] %>% + ## get frequency for each barcode/variant pair + dplyr::group_by(trueBarcode, trueV2) %>% + dplyr::tally() %>% + ## filter rare pairs + dplyr::filter(n >= 2) %>% + ## filter pairs with too low fraction of total bc counts + dplyr::group_by(trueBarcode) %>% + dplyr::mutate(relFrac = n/sum(n)) %>% + dplyr::filter(relFrac >= 0.25) %>% + ## keep top variant for each barcode %>% + dplyr::arrange(dplyr::desc(n)) %>% + dplyr::filter(!duplicated(trueBarcode)) + + expect_equal(sum(out$counts$keep), nrow(correct)) + expect_true(all(paste0(out$counts$barcode[out$counts$keep], + out$counts$V2[out$counts$keep]) %in% + paste0(correct$trueBarcode, correct$trueV2))) + tmp <- out$counts[match(paste0(correct$trueBarcode, correct$trueV2), + paste0(out$counts$barcode, out$counts$V2)), ] + expect_equal(tmp$nbrReads, correct$n) + expect_equal(tmp$fracOfTotalBarcodeCountFilt, correct$relFrac) +})